In [1]:
import pandas as pd

# Read the raw CKD export into a DataFrame.
file_path = '/content/CKD Data.csv'
df = pd.read_csv(file_path)

# Skip the first two rows below the header (presumably attribute-type
# metadata rather than patient records -- TODO confirm against the raw file),
# renumber the index, turn every "?" placeholder into a missing value and
# finally discard each row that still contains a missing value.
df_cleaned = (
    df.iloc[2:]
    .reset_index(drop=True)
    .replace("?", pd.NA)
    .dropna()
)

# Cleaning numerical columns that have ranges or inconsistent formats
def clean_numerical_column(column):
    """Convert a column of text values to floats, filling gaps with the mode.

    The first run of digits and dots found in each value is taken as the
    number, so a range such as "1.019 - 1.021" becomes 1.019 (a leading sign
    is not captured). Values without a usable number become NaN and are then
    replaced by the most frequent parsed value; when several values tie, the
    smallest of them is used.

    Args:
        column: pandas Series, normally of strings. A Series that already has
            a numeric dtype is converted to float without text parsing.

    Returns:
        A float Series with the same index as ``column``. If no value at all
        can be parsed, the result is all NaN because there is no mode to
        fill with.
    """
    if pd.api.types.is_numeric_dtype(column):
        # The .str accessor is unavailable on numeric data; nothing to parse.
        numeric = column.astype(float)
    else:
        # expand=False keeps the result a Series rather than a one-column
        # DataFrame.
        extracted = column.str.extract(r'([0-9.]+)', expand=False)
        # The pattern also accepts tokens such as "." or "1.2.3"; coerce them
        # to NaN instead of raising.
        numeric = pd.to_numeric(extracted, errors='coerce').astype(float)

    modes = numeric.mode()
    if modes.empty:
        return numeric
    return numeric.fillna(modes.iloc[0])

# Columns whose raw text is converted to numbers by clean_numerical_column.
numerical_columns_to_clean = [
    "sg",
    "al",
    "bgr",
    "sod",
    "hemo",
    "pcv",
    "rbcc",
    "wbcc",
    "grf",
    "age",
]
for column_name in numerical_columns_to_clean:
    cleaned_values = clean_numerical_column(df_cleaned[column_name])
    df_cleaned[column_name] = cleaned_values

# Lookup tables that translate the raw codes of each categorical column into
# readable labels. The outer key is the column name; the inner key is the raw
# value compared as a string.
# NOTE(review): the binned columns ("sc", "bu", "pot") do not list every
# consecutive range -- verify the keys against the values present in the data.

# Templates for the two-valued columns; each column gets its own copy.
_NORMAL_ABNORMAL = {"0": "Normal", "1": "Abnormal"}
_ABSENT_PRESENT = {"0": "Not present", "1": "Present"}
_NO_YES = {"0": "No", "1": "Yes"}

categorical_mappings = {
    "bp (Diastolic)": {"0": "Low", "1": "Normal", "2": "High"},
    "bp limit": {
        "0": "Normal",
        "1": "Stage 1 Hypertension",
        "2": "Stage 2 Hypertension",
    },
    "class": {"ckd": "Yes", "notckd": "No"},
    "rbc": dict(_NORMAL_ABNORMAL),
    "su": {
        "< 0": "No Trace",
        "1": "Trace",
        "2": "Moderate",
        "3": "High",
        "4": "Very High",
        "> 4": "Extremely High",
    },
    "sc": {
        "< 3.65": "Very Low",
        "3.65 - 6.8": "Low to Moderate",
        "6.8 - 9.95": "Moderate to high",
        "9.95 - 13.1": "High",
        "13.1 - 16.25": "Very High",
        "16.25 - 19.4": "Severe",
        "> 28.85": "Critical",
    },
    "bu": {
        "< 48.1": "Low",
        "48.1 - 86.2": "Slightly Elevated",
        "86.2 - 124.3": "Moderate",
        "124.3 - 162.4": "High",
        "162.4 - 200.5": "Very High",
        "200.5 - 238.6": "Severe",
        "238.6 - 276.7": "Critical",
        "> 352.9": "Extremely Critical",
    },
    "pot": {
        "< 7.31": "Low",
        "7.31-11.72": "Moderate",
        "38.18-42.59": "Very High",
        "> 42.59": "Extremely High",
    },
    "pc": dict(_NORMAL_ABNORMAL),
    "pcc": dict(_ABSENT_PRESENT),
    "ba": dict(_ABSENT_PRESENT),
    "htn": dict(_NO_YES),
    "dm": dict(_NO_YES),
    "cad": dict(_NO_YES),
    "appet": {"0": "Good", "1": "Poor"},
    "pe": dict(_NO_YES),
    "ane": dict(_NO_YES),
    "stage": {
        "s1": "Stage 1",
        "s2": "Stage 2",
        "s3": "Stage 3",
        "s4": "Stage 4",
        "s5": "Stage 5",
    },
    "affected": {"0": "Left or Right", "1": "Both"},
}

# Translate the raw codes of every mapped column into labels and store the
# result as a pandas categorical.
# Series.map() turns each value that is missing from the mapping into NaN, so
# report those values instead of losing them silently; the resulting data is
# unchanged (unmapped values still become NaN).
for col, mapping in categorical_mappings.items():
    raw_values = df_cleaned[col].astype(str)
    unmapped_values = sorted(set(raw_values) - set(mapping))
    if unmapped_values:
        print(
            f"Warning: column '{col}' contains values without a mapping "
            f"(set to NaN): {unmapped_values}"
        )
    df_cleaned[col] = raw_values.map(mapping).astype('category')

# Human-readable description for every attribute abbreviation in the dataset.
attribute_info = {
    # Measurements and age
    "sg": "Specific Gravity",
    "al": "Albumin",
    "bgr": "Blood Glucose Random",
    "bu": "Blood Urea",
    "sod": "Sodium",
    "sc": "Serum Creatinine",
    "pot": "Potassium",
    "hemo": "Hemoglobin",
    "pcv": "Packed Cell Volume",
    "rbcc": "Red Blood Cell Count",
    "wbcc": "White Blood Cell Count",
    "grf": "Glomerular Filtration Rate",
    "age": "Age of the Patient",
    # Coded attributes
    "bp (Diastolic)": "Diastolic Blood Pressure",
    "bp limit": "Blood Pressure Limit",
    "class": "Chronic Kidney Disease Status",
    "rbc": "Red Blood Cells",
    "pc": "Pus Cell",
    "pcc": "Pus Cell Clumps",
    "ba": "Bacteria",
    "htn": "Hypertension",
    "dm": "Diabetes Mellitus",
    "cad": "Coronary Artery Disease",
    "appet": "Appetite",
    "pe": "Pedal Edema",
    "ane": "Anemia",
    "stage": "Stage of Kidney Disease",
    "affected": "Affected Kidney (Left/Right/Both)",
}

# Print one "abbreviation: description" line per attribute under a heading.
print("\nAttribute Information:")
print("\n".join(f"{name}: {text}" for name, text in attribute_info.items()))

# Write the cleaned table to disk without the index column.
output_file_path = '/content/CKD_Data_Cleaned.csv'
df_cleaned.to_csv(output_file_path, index=False)

# describe() summaries: one over the cleaned numeric columns, one over the
# columns that were mapped to categories.
mapped_columns = list(categorical_mappings)
numerical_summary = df_cleaned[numerical_columns_to_clean].describe()
categorical_summary = df_cleaned[mapped_columns].describe()

# Show the first rows of the cleaned data, then both summaries.
print("Cleaned Chronic Kidney Disease Dataset:")
print(df_cleaned.head())
print("Numerical Summary Statistics:\n", numerical_summary)
print("\nCategorical Summary Statistics:\n", categorical_summary)



Attribute Information:
sg: Specific Gravity
al: Albumin
bgr: Blood Glucose Random
bu: Blood Urea
sod: Sodium
sc: Serum Creatinine
pot: Potassium
hemo: Hemoglobin
pcv: Packed Cell Volume
rbcc: Red Blood Cell Count
wbcc: White Blood Cell Count
grf: Glomerular Filtration Rate
age: Age of the Patient
bp (Diastolic): Diastolic Blood Pressure
bp limit: Blood Pressure Limit
class: Chronic Kidney Disease Status
rbc: Red Blood Cells
pc: Pus Cell
pcc: Pus Cell Clumps
ba: Bacteria
htn: Hypertension
dm: Diabetes Mellitus
cad: Coronary Artery Disease
appet: Appetite
pe: Pedal Edema
ane: Anemia
stage: Stage of Kidney Disease
affected: Affected Kidney (Left/Right/Both)
Cleaned Chronic Kidney Disease Dataset:
  bp (Diastolic)              bp limit     sg   al class       rbc        su  \
0            Low                Normal  1.019  1.0   Yes    Normal  No Trace   
1            Low                Normal  1.009  0.0   Yes    Normal  No Trace   
2            Low                Normal  1.009  4.0   Yes