***This notebook is dedicated to evaluated the kidney.csv dataset on the following classification models: logistic regression, k-nearest neighbor and random forest***

# Imports

In [1]:
# DATA MANIPULATION
import pandas as pd
import numpy as np

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')



In [2]:
# sklearn preproc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
#from sklearn.feature_selection import SequentialFeatureSelector

from data import get_cleaned_data
from randomforest import RandomForest 



# Dataset loading

In [3]:
def get_data(url):
    df = pd.read_csv(url)
    return df

kidneys = get_data("../raw_data/kidney_disease.csv")
kidneys.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35.0,7300,4.6,no,no,no,good,no,no,ckd


# Dataset preprocess

## cleaning

In [4]:
def replacing_numerical_features(X):
    '''cleaning: strips \t at beginning of number and replaces ? with nan values'''
    X['pcv'] = X['pcv'].astype(str).str.lstrip("\t")
    X['pcv'] = X['pcv'].replace('?',np.nan).astype(float)
    X['wc'] = X['wc'].astype(str).str.lstrip("\t")
    X['wc'] = X['wc'].replace('?',np.nan).astype(float)
    X['rc'] = X['rc'].astype(str).str.lstrip("\t")
    X['rc'] = X['rc'].replace('?',np.nan).astype(float)
    return X

In [5]:
def preprocessing_1(df):
    # convert wc and rc columns to float
    df[['wc', 'rc']] = df[['wc','rc']].apply(pd.to_numeric, errors='coerce')
    df = replacing_numerical_features(df)
    return df


In [6]:
kidneys = preprocessing_1(kidneys)
kidneys.head(4)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd


In [7]:
#def replacing_binary_features(X):
 #   '''encoding: replacing Yes --> 1 no --> 0'''
kidneys[['classification']] = kidneys[['classification']].replace(to_replace={'ckd':1,'notckd':0, 'ckd\t': 1})
kidneys[['htn','dm','cad','pe','ane']] = kidneys[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
kidneys[['pe']] = kidneys[['pe']].replace(to_replace={'good':1})
kidneys[['htn']] = kidneys[['htn']].replace(to_replace={'4':4, '8':8, '?':np.nan, '5.2':5.2})
kidneys[['rbc','pc']] = kidneys[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
kidneys[['pcc','ba']] = kidneys[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
kidneys[['appet']] = kidneys[['appet']].replace(to_replace={'good':2,'poor':1,'no':0})
    ## replacing t_values to 0 or 1, by assuming it s close to 0 or 1, respectively
kidneys[['cad']] = kidneys[['cad']].replace(to_replace='\tno',value=0)
kidneys[['dm']] = kidneys[['dm']].replace(to_replace={'\tno':0,'\tyes':1,' yes':1})
   # return X

In [8]:
def split_num_cat(df):
    # splitting numerical  and categorical features
    feat_num = list(df.nunique()[df.nunique() > 7].index)
    feat_cat = list(df.nunique()[df.nunique() <=7].index)
    feat_cat_bin = list(df[feat_cat].nunique()[df[feat_cat].nunique() ==2].index)
    feat_cat_ord = list(set(df.columns) - set(feat_num) - set(feat_cat_bin))

    #feat_num = df.select_dtypes(include=['float64', 'int64'])
    #feat_cat = df.drop(columns = feat_num.columns)
    # cleaning categorical features 
    
    #df_new = pd.concat([feat_cat, feat_num], axis = 1)
    return (feat_num, feat_cat_bin, feat_cat_ord)


In [9]:

feat_num = split_num_cat(kidneys)[0]
feat_cat_bin = split_num_cat(kidneys)[1]
feat_cat_ord = split_num_cat(kidneys)[2]

kidneys[feat_num]

Unnamed: 0,id,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc
0,0,48.0,80.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,1,7.0,50.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,2,62.0,80.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,3,48.0,70.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,4,51.0,80.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9
396,396,42.0,70.0,75.0,31.0,1.2,141.0,3.5,16.5,54.0,7800.0,6.2
397,397,12.0,80.0,100.0,26.0,0.6,137.0,4.4,15.8,49.0,6600.0,5.4
398,398,17.0,60.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [10]:
def feat_target(a,b,c):
    df_new = pd.concat([pd.concat([a, b], axis =1),c], axis=1)
    return df_new

In [11]:
kidneys_new = feat_target(kidneys[feat_num], kidneys[feat_cat_bin],kidneys[feat_cat_ord])
kidneys_new.head(4)

Unnamed: 0,id,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,...,dm,cad,pe,ane,classification,sg,htn,su,al,appet
0,0,48.0,80.0,121.0,36.0,1.2,,,15.4,44.0,...,1.0,0.0,0.0,0.0,1,1.02,1.0,0.0,1.0,2.0
1,1,7.0,50.0,,18.0,0.8,,,11.3,38.0,...,0.0,0.0,0.0,0.0,1,1.02,0.0,0.0,4.0,2.0
2,2,62.0,80.0,423.0,53.0,1.8,,,9.6,31.0,...,1.0,0.0,0.0,1.0,1,1.01,0.0,3.0,2.0,1.0
3,3,48.0,70.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,...,0.0,0.0,1.0,1.0,1,1.005,1.0,0.0,4.0,1.0


##  imputing and scaling (pipeline) 

In [12]:
def feat_target_split(df):
    # defining X and y
    X = df.drop(columns=['classification','id'])
    y = df['classification']
    X_train, X_test, y_train, y_test =  train_test_split(X,y, test_size = 0.2)
    return (X,y, X_train, X_test, y_train, y_test)

In [13]:
X = feat_target_split(kidneys_new)[0]
y = feat_target_split(kidneys_new)[1]
X_train = feat_target_split(kidneys_new)[2]
X_test = feat_target_split(kidneys_new)[3]
y_train = feat_target_split(kidneys_new)[4]
y_test = feat_target_split(kidneys_new)[5]
X_train.head()

Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wc,...,ba,dm,cad,pe,ane,sg,htn,su,al,appet
119,60.0,70.0,140.0,27.0,1.2,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.01,0.0,0.0,0.0,2.0
221,66.0,70.0,248.0,30.0,1.7,138.0,5.3,,,,...,0.0,1.0,0.0,0.0,0.0,1.02,1.0,0.0,1.0,2.0
308,43.0,80.0,81.0,46.0,0.6,135.0,4.9,13.9,48.0,6900.0,...,0.0,0.0,0.0,0.0,0.0,1.025,0.0,0.0,0.0,2.0
349,35.0,70.0,82.0,36.0,1.1,150.0,3.5,14.5,52.0,9400.0,...,0.0,0.0,0.0,0.0,0.0,1.025,0.0,0.0,0.0,2.0
359,74.0,60.0,88.0,50.0,0.6,147.0,3.7,17.2,53.0,6000.0,...,0.0,0.0,0.0,0.0,0.0,1.02,0.0,0.0,0.0,2.0


In [14]:
def preprocessing_2(a,b,X):
    #remove y from the categorical features
    feat_cat_bin = b.drop(columns = 'classification')
    #remove id from the numerical features
    feat_num = a.drop(columns = 'id')
    return (list(feat_num.columns), list(feat_cat_bin.columns))

In [15]:

a = preprocessing_2(kidneys[feat_num], kidneys[feat_cat_bin], X)[0]
b = preprocessing_2(kidneys[feat_num], kidneys[feat_cat_bin], X)[1]
c = feat_cat_ord = split_num_cat(kidneys)[2]

In [36]:
def pipe(a,b,c):
    
    # imputing and scaling numerical features
    preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())
    
    # imputing categorical features with unique values < 5
    preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"))
    # scaling remaining (ordinal)
    preproc_cat_ord = make_pipeline(SimpleImputer(strategy="most_frequent"), StandardScaler())

    preproc_baseline = make_column_transformer((preproc_num, a),
    (preproc_cat, b),
    (preproc_cat_ord,c))

    return preproc_baseline

In [37]:
pipe = pipe(a,b,c)
pipe

# Cleaned and scaled X

In [41]:
X_transformed = pipe.fit_transform(X_train)
X_transformed.shape

(320, 24)

## renaming the columns

In [42]:

SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)


In [45]:
pipe.get_feature_names_out()
X_final = pd.DataFrame(X_transformed, 
             columns=pipe.get_feature_names_out()
            )

In [46]:
X_final.shape

(320, 24)

# Add a logistic regression estimator and crossvalidate

## split data into train and test

In [32]:
# Instantiating the model
logreg = LogisticRegression(max_iter = 200)

# adding an estimator to the pipeline
baseline_pipe = make_pipeline(pipe, logreg)   # adding an estimator to the pipeline
baseline_pipe

In [48]:
# crossvalidating the logistic regression
baseline_score = cross_val_score(pipe, X, y, cv = 5, scoring='accuracy').mean() 
baseline_score

Traceback (most recent call last):
  File "/home/jeanne/.pyenv/versions/3.8.12/envs/kidney_kids/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/jeanne/.pyenv/versions/3.8.12/envs/kidney_kids/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/jeanne/.pyenv/versions/3.8.12/envs/kidney_kids/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/jeanne/.pyenv/versions/3.8.12/envs/kidney_kids/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
AttributeError: 'ColumnTransformer' object has no attribute 'predict'

Traceback (most recent call last):
  File "/home/jeanne/.pyenv/versions/3.8.12/envs/kidney_kids/lib/py

nan

# Models evaluation

## feature selection (univariate -  based on their mutual information with  y)

In [24]:
preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())
preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))


preproc_transformer = make_column_transformer(
    (preproc_num, list(feat_num.columns)),
    (preproc_cat, feat_cat_small),
    remainder="passthrough")
    
preproc_selector = SelectPercentile(
    mutual_info_regression,
    percentile=30, # keep only 30% of all features (filters-out features that, taken individually, least explain y)
)

preproc = make_pipeline(
    preproc_transformer,
    preproc_selector
)
preproc

AttributeError: 'list' object has no attribute 'columns'

In [None]:
preproc.fit_transform(X,y).shape



## feature selection (multivariate - based their combined relationship with y)

In [None]:
preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())
preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))


preproc_transformer = make_column_transformer(
    (preproc_num, list(feat_num.columns)),
    (preproc_cat, feat_cat_small),
    remainder="passthrough")
    
preproc_selector = SequentialFeatureSelector(
    mutual_info_regression,
    percentile=30, # keep only 30% of all features (filters-out features that, taken individually, least explain y)
)

preproc = make_pipeline(
    preproc_transformer,
    preproc_selector
)
preproc

In [None]:
selector.estimator_.coef_