In [6]:
import pandas as pd

# Load the dataset
file_path = '/content/CKD Data.csv'
df = pd.read_csv(file_path)

# Data Cleaning: Handling missing or inconsistent data
# Removing the first row since it appears to be attribute type information
df_cleaned = df.iloc[2:].reset_index(drop=True)

# Handling missing values - replacing missing values with NaN and dropping unnecessary rows
df_cleaned.replace("?", pd.NA, inplace=True)
df_cleaned.dropna(inplace=True)

# Cleaning numerical columns that have ranges or inconsistent formats
def clean_numerical_column(column):
    # Replace ranges and other inconsistent values with NaN
    column = column.str.extract(r'([0-9.]+)').astype(float)
    return column

numerical_columns_to_clean = ["sg", "al", "bgr", "bu", "sod", "sc", "pot", "hemo", "pcv", "rbcc", "wbcc", "grf", "age"]
for col in numerical_columns_to_clean:
    df_cleaned[col] = clean_numerical_column(df_cleaned[col])

# Converting all variables to categorical types with specified categories
# Categorizing discrete and numerical attributes
categorical_mappings = {
    "bp (Diastolic)": {
        "0": "Low",
        "1": "Normal",
        "2": "High"
    },
    "bp limit": {
        "0": "Normal",
        "1": "Stage 1 Hypertension",
        "2": "Stage 2 Hypertension"
    },
    "class": {
        "ckd": "Yes",
        "notckd": "No"
    },
    "rbc": {
        "0": "Normal",
        "1": "Abnormal"
    },
    "su": {
        "<1.007": "None",
        "1.009-1.011": "Trace",
        "1.015-1.017": "Moderate",
        "1.019-1.021": "High",
        ">1.023": "Very High"
    },
    "pc": {
        "0": "Normal",
        "1": "Abnormal"
    },
    "pcc": {
        "0": "Not present",
        "1": "Present"
    },
    "ba": {
        "0": "Not present",
        "1": "Present"
    },
    "htn": {
        "0": "No",
        "1": "Yes"
    },
    "dm": {
        "0": "No",
        "1": "Yes"
    },
    "cad": {
        "0": "No",
        "1": "Yes"
    },
    "appet": {
        "0": "Good",
        "1": "Poor"
    },
    "pe": {
        "0": "No",
        "1": "Yes"
    },
    "ane": {
        "0": "No",
        "1": "Yes"
    },
    "stage": {
        "s1": "Stage 1",
        "s2": "Stage 2",
        "s3": "Stage 3",
        "s4": "Stage 4",
        "s5": "Stage 5"
    },
    "affected": {
        "0": "Left or Right",
        "1": "Both"
    }
}

for col, mapping in categorical_mappings.items():
    df_cleaned[col] = df_cleaned[col].astype(str).map(mapping)
    df_cleaned[col] = df_cleaned[col].astype('category')

# Converting numerical variables to categorical with specified ranges, updated age ranges
numerical_to_categorical_mappings = {
    "sg": ["<1.007", "1.009-1.011", "1.015-1.017", "1.019-1.021", ">1.023"],
    "al": ["<0", "1", "2", "3", "4", ">4"],
    "bgr": ["<112", "112-154", "154-196", "196-238", "238-280", "280-322", "322-364", "364-406", "406-448", ">448"],
    "bu": ["<48.1", "48.1-86.2", "86.2-124.3", "124.3-162.4", "162.4-200.5", "200.5-238.6", "238.6-276.7", ">352.9"],
    "sod": ["<118", "118-123", "123-128", "128-133", "133-138", "138-143", "143-148", "148-153", ">158"],
    "sc": ["<3.65", "3.65-6.8", "6.8-9.95", "9.95-13.1", "13.1-16.25", "16.25-19.4", ">28.85"],
    "pot": ["<7.31", "7.31-11.72", "38.18-42.59", ">42.59"],
    "hemo": ["<6.1", "6.1-7.4", "7.4-8.7", "8.7-10", "10-11.3", "11.3-12.6", "12.6-13.9", "13.9-15.2", "15.2-16.5", ">16.5"],
    "pcv": ["<17.9", "17.9-21.8", "21.8-25.7", "25.7-29.6", "29.6-33.5", "33.5-37.4", "37.4-41.3", "41.3-45.2", "45.2-49.1"],
    "rbcc": ["<2.69", "2.69-3.28", "3.28-3.87", "3.87-4.46", "4.46-5.05", "5.05-5.64", "5.64-6.23", "6.23-6.82", ">7.41"],
    "wbcc": ["<4980", "4980-7360", "7360-9740", "9740-12120", "12120-14500", "14500-16880", "16880-19260", "19260-21640", ">24020"],
    "grf": ["<26.6175", "26.6175-51.7832", "51.7832-76.949", "76.949-102.115", "102.115-127.281", "127.281-152.446", "152.446-177.612", "177.612-202.778", "202.778-227.944", ">227.944"],
    "age": ["<12", "12-20", "20-27", "27-35", "35-43", "43-51", "51-59", "59-66", "66-74", ">74"]
}

for col, bins in numerical_to_categorical_mappings.items():
    df_cleaned[col] = pd.cut(df_cleaned[col].astype(float), bins=len(bins), labels=bins)
    df_cleaned[col] = df_cleaned[col].astype('category')

# Summary Statistics: Descriptive statistics for key attributes
summary_statistics = df_cleaned.describe(include='category')

# Display cleaned dataset and summary statistics
print("Cleaned Chronic Kidney Disease Dataset:")
print(df_cleaned.head())

print("\nSummary Statistics:")
print(summary_statistics)

# Attribute Information: Description of available attributes in the dataset
attribute_info = {
    "bp (Diastolic)": "Diastolic blood pressure values categorized as 'Low', 'Normal', or 'High'.",
    "bp limit": "Blood pressure limit category (e.g., 'Normal', 'Stage 1 Hypertension', 'Stage 2 Hypertension').",
    "sg": "Specific gravity of urine categorized into discrete ranges.",
    "al": "Albumin concentration categorized into discrete ranges.",
    "class": "Patient classification, either chronic kidney disease (ckd) or non-ckd (Yes or No).",
    "rbc": "Red Blood Cells categorized as 'Normal' or 'Abnormal'.",
    "su": "Sugar levels in the urine categorized as 'None', 'Trace', 'Moderate', 'High', or 'Very High'.",
    "pc": "Pus Cell categorized as 'Normal' or 'Abnormal'.",
    "pcc": "Pus Cell Clumps categorized as 'Present' or 'Not present'.",
    "ba": "Bacteria in urine categorized as 'Present' or 'Not present'.",
    "htn": "Hypertension categorized as 'Yes' or 'No'.",
    "dm": "Diabetes Mellitus categorized as 'Yes' or 'No'.",
    "cad": "Coronary Artery Disease categorized as 'Yes' or 'No'.",
    "appet": "Appetite status categorized as 'Good' or 'Poor'.",
    "pe": "Pedal Edema categorized as 'Yes' or 'No'.",
    "ane": "Anemia categorized as 'Yes' or 'No'.",
    "stage": "CKD stages categorized as 'Stage 1', 'Stage 2', etc.",
    "affected": "Affected area categorized as 'Left', 'Right', or 'Both'.",
    "age": "Patient age range categorized.",
    "bgr": "Random blood glucose level categorized into discrete ranges.",
    "bu": "Blood urea level categorized into discrete ranges.",
    "sod": "Serum sodium level categorized into discrete ranges.",
    "sc": "Serum creatinine level categorized into discrete ranges.",
    "pot": "Serum potassium level categorized into discrete ranges.",
    "hemo": "Hemoglobin level categorized into discrete ranges.",
    "pcv": "Packed cell volume categorized into discrete ranges.",
    "rbcc": "Red blood cell count categorized into discrete ranges.",
    "wbcc": "White blood cell count categorized into discrete ranges.",
    "grf": "Glomerular filtration rate categorized into discrete ranges."
}

# Display attribute information
print("\nAttribute Information:")
for attribute, description in attribute_info.items():
    print(f"{attribute}: {description}")


Cleaned Chronic Kidney Disease Dataset:
  bp (Diastolic)              bp limit           sg  al class       rbc   su  \
0            Low                Normal  1.019-1.021   1   Yes    Normal  NaN   
1            Low                Normal       <1.007  <0   Yes    Normal  NaN   
2            Low                Normal       <1.007  >4   Yes  Abnormal  NaN   
3         Normal  Stage 1 Hypertension       <1.007   4   Yes    Normal  NaN   
4            Low                Normal  1.015-1.017  <0   Yes    Normal  NaN   

         pc          pcc           ba  ... htn   dm cad appet   pe ane  \
0    Normal  Not present  Not present  ...  No   No  No  Good   No  No   
1    Normal  Not present  Not present  ...  No   No  No  Good   No  No   
2  Abnormal  Not present      Present  ...  No   No  No  Poor   No  No   
3    Normal  Not present  Not present  ...  No   No  No  Good   No  No   
4    Normal  Not present  Not present  ...  No  Yes  No  Poor  Yes  No   

               grf    stage affect