In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
print("Libraries imported.")

Libraries imported.


In [2]:
# --- Load Data Using a More Robust Method ---
kidney_path = os.path.join("..", "data", "raw", "kidney_disease", "chronic_kidney_disease.arff")

# Find where the actual data starts
with open(kidney_path, 'r') as f:
    for i, line in enumerate(f):
        if line.strip().lower().startswith('@data'):
            data_start_line = i + 1
            break

# Define the column names manually
column_names = [
    'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn',
    'dm', 'cad', 'appet', 'pe', 'ane', 'class'
]

# --- THIS IS THE FIX ---
# Read the data using the more flexible 'python' engine and handle bad lines
df = pd.read_csv(
    kidney_path,
    header=None,
    names=column_names,
    skiprows=data_start_line,
    na_values=['?', '\t?'],
    engine='python',            # Use the more flexible python engine
    on_bad_lines='skip'         # Skip rows that are still problematic
)

print("Data loaded successfully using robust method.")
print(f"Initial data shape: {df.shape}")

# --- Data Cleaning ---
# The data is already loaded with NaNs, so we can proceed with cleaning.

# Identify categorical and numerical columns
categorical_cols = [
    'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'
]
numerical_cols = [
    'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc'
]

# Clean up string values in categorical columns (e.g., remove spaces)
for col in categorical_cols:
    df[col] = df[col].str.strip()

# Clean up the target variable 'class'
df['class'] = df['class'].str.strip().replace({'ckd': 1, 'notckd': 0})
df['class'] = pd.to_numeric(df['class'], errors='coerce')


# --- Imputation ---
# Impute numerical with median and categorical with mode
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)
    
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


# --- Final Encoding ---
# One-hot encode the categorical columns
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop rows where the target is still missing
df_processed.dropna(subset=['class'], inplace=True)
# Ensure target is integer
df_processed['class'] = df_processed['class'].astype(int)


print("\nData cleaning complete. Final shape:", df_processed.shape)
display(df_processed.head())

Data loaded successfully using robust method.
Initial data shape: (397, 25)

Data cleaning complete. Final shape: (397, 25)


  df['class'] = df['class'].str.strip().replace({'ckd': 1, 'notckd': 0})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will 

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,class,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_poor,pe_yes,ane_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,138.0,4.4,15.4,44.0,7800.0,5.2,1,True,True,False,False,True,True,False,False,False,False
1,7.0,50.0,1.02,4.0,0.0,121.0,18.0,0.8,138.0,4.4,11.3,38.0,6000.0,4.8,1,True,True,False,False,False,False,False,False,False,False
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,138.0,4.4,9.6,31.0,7500.0,4.8,1,True,True,False,False,False,True,False,True,False,True
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1,True,False,True,False,True,False,False,True,True,True
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,138.0,4.4,11.6,35.0,7300.0,4.6,1,True,True,False,False,False,False,False,False,False,False


In [None]:
# --- Corrected Code for Cell 4 (Aggressive Feature Dropping) ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define Features (X) and Target (y)
X = df_processed.drop('class', axis=1)
y = df_processed['class']

# --- THIS IS THE FINAL FIX ---
# Based on our deep dive, many features are diagnostic, not predictive.
# Let's remove all direct urinalysis and key blood markers that define CKD.
features_to_drop = [
    # Previously identified
    'hemo', 'sc', 'pcv', 'sg', 'rc',
    
    # Newly identified potential leakers from EDA
    'al', # Albumin
    'su', # Sugar
    'rbc_normal', # Red Blood Cells (normal/abnormal)
    'pc_normal',  # Pus Cell (normal/abnormal)
    'pcc_present',# Pus Cell Clumps
    'ba_present'  # Bacteria
] 

# Drop the leaky features from our feature set
X_fixed = X.drop(columns=features_to_drop, errors='ignore')

print(f"Dropped {len(features_to_drop)} potentially leaky/diagnostic features.")
print(f"Final features for model: {X_fixed.columns.tolist()}")
print(f"New training data shape: {X_fixed.shape}")


# --- Split the Data (using the fixed feature set) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_fixed, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale All Remaining Features ---
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_fixed.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_fixed.columns)

print("\nData splitting and scaling complete on the final fixed dataset.")

Dropped 11 potentially leaky/diagnostic features.
Final features for model: ['age', 'bp', 'bgr', 'bu', 'sod', 'pot', 'wc', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_poor', 'pe_yes', 'ane_yes']
New training data shape: (397, 13)

Data splitting and scaling complete on the final fixed dataset.


In [4]:
# Create directories
KIDNEY_PROCESSED_DIR = os.path.join("..", "data", "processed", "kidney_disease")
KIDNEY_MODELS_DIR = os.path.join("..", "models", "kidney_disease")
os.makedirs(KIDNEY_PROCESSED_DIR, exist_ok=True)
os.makedirs(KIDNEY_MODELS_DIR, exist_ok=True)

# Save data
joblib.dump(X_train, os.path.join(KIDNEY_PROCESSED_DIR, "X_train.joblib"))
joblib.dump(X_test, os.path.join(KIDNEY_PROCESSED_DIR, "X_test.joblib"))
joblib.dump(y_train, os.path.join(KIDNEY_PROCESSED_DIR, "y_train.joblib"))
joblib.dump(y_test, os.path.join(KIDNEY_PROCESSED_DIR, "y_test.joblib"))

# Save the scaler and feature list
joblib.dump(scaler, os.path.join(KIDNEY_MODELS_DIR, "kidney_disease_scaler.joblib"))
joblib.dump(X.columns.tolist(), os.path.join(KIDNEY_MODELS_DIR, "kidney_disease_features.joblib"))

print("Chronic Kidney Disease data artifacts saved successfully!")

Chronic Kidney Disease data artifacts saved successfully!
