***This notebook is dedicated to evaluated the kidney.csv dataset on the following classification models: logistic regression, k-nearest neighbor and random forest***

# Imports

In [136]:
# DATA MANIPULATION
import pandas as pd
import numpy as np

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns


In [209]:
# sklearn preproc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
#from sklearn.feature_selection import SequentialFeatureSelector



# Dataset loading

In [170]:
kidneys =pd.read_csv("/home/jeanne/Projet_kidney_kids/raw_data/kidney_disease.csv")

# Inspecting data

In [171]:
kidneys.head(4)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700,3.9,yes,no,no,poor,yes,yes,ckd


In [172]:
kidneys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             328 non-null    float64
 17  wc              293 non-null    obj

In [173]:
## convert wc column to float

kidneys[['wc', 'rc']] = kidneys[['wc','rc']].apply(pd.to_numeric, errors='coerce')

In [174]:
kidneys.dtypes.value_counts()


float64    14
object     11
int64       1
dtype: int64

### striping '\t' at beginning of number and replaces '?' with nan values

In [175]:
def replacing_numerical_features(X):
    '''cleaning: strips \t at beginning of number and replaces ? with nan values'''
    X['pcv'] = X['pcv'].astype(str).str.lstrip("\t")
    X['pcv'] = X['pcv'].replace('?',np.nan).astype(float)
    X['wc'] = X['wc'].astype(str).str.lstrip("\t")
    X['wc'] = X['wc'].replace('?',np.nan).astype(float)
    X['rc'] = X['rc'].astype(str).str.lstrip("\t")
    X['rc'] = X['rc'].replace('?',np.nan).astype(float)
    return X
kidneys = replacing_numerical_features(kidneys)
kidneys.head(4)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd


## numerical features

In [176]:
# splitting numerical  and categorical features
feat_num = kidneys.select_dtypes(include=['float64', 'int64'])
feat_num.head(10)

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6
5,5,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4
6,6,68.0,70.0,1.01,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,
7,7,24.0,,1.015,2.0,4.0,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0
8,8,52.0,100.0,1.015,3.0,0.0,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0
9,9,53.0,90.0,1.02,2.0,0.0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7


## Categorical features

In [177]:


feat_cat = kidneys.drop(columns = feat_num.columns)
feat_cat

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...
395,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
396,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
397,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
398,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd


### encoding categorical features

In [178]:
def replacing_binary_features(X):
    X['classification'] = X['classification'].map({"ckd":1,"notckd":0, "ckd\t": 1})
    X['rbc'] = X['rbc'].map({"normal":0,"abnormal":1})
    X['pc'] = X['pc'].map({"normal":0,"abnormal":1})
    X['pcc'] = X['pcc'].map({"present":1,"notpresent":0})
    X['ba'] = X['ba'].map({"present":1,"notpresent":0})
    X['appet'] = X['appet'].map({"good":2,"poor":1, "bad":0})
    X['dm'] = X['dm'].map({"yes":1,"no":0, "\tno":0,"\tyes":1})
    X['cad'] = X['cad'].map({"yes":1,"no":0,"\tno":0})
    X['pe'] = X['pe'].map({"yes":1,"no":0})
    X['htn'] = X['htn'].map({"yes":1,"no":0})
    X['ane'] = X['ane'].map({"yes":1,"no":0})
    return X
feat_cat = replacing_binary_features(feat_cat)
feat_cat

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1
1,,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0


### combining categorical and numerical

In [179]:
kidneys_new = pd.concat([feat_num, feat_cat], axis = 1)
kidneys_new.head(4)

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1
1,1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1


# Pipeline

In [180]:
# defining X and y
X = kidneys_new.drop(columns=['classification','id'])
y = kidneys_new['classification']

In [181]:
feat_cat

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1
1,,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0


In [182]:
#remove y from the categorical features
feat_cat = feat_cat.drop(columns = 'classification')
#remove id from the numerical features
feat_num = feat_num.drop(columns = 'id')

In [183]:
feat_cat_nunique = feat_cat.nunique()

In [184]:
# categorical features to one-hot-encode
feat_cat_small = list(feat_cat_nunique[feat_cat_nunique < 5].index)
feat_cat_small

['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

In [185]:
preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())

preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))

preproc_baseline = make_column_transformer((preproc_num, list(feat_num.columns)),
    (preproc_cat, feat_cat_small),
    remainder="passthrough")

preproc_baseline

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('pipeline-1',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0)),
                                                 ('minmaxscaler',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=

In [186]:
#shape_preproc_baseline = preproc_baseline.fit_transform(X).shape
#shape_preproc_baseline

# Add a logistic regression estimator and crossvalidate

## split data into train and test

In [187]:
X_train, X_test, y_train, y_test =  train_test_split(X,y, test_size = 0.2)

In [188]:
# Instantiating the model
logreg = LogisticRegression(max_iter = 200)

# adding an estimator to the pipeline
baseline_pipe = make_pipeline(preproc_baseline, logreg)   # adding an estimator to the pipeline
baseline_pipe

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                             

In [189]:
# crossvalidating the logistic regression
baseline_score = cross_val_score(baseline_pipe, X, y, cv = 5, scoring = 'recall').mean() 
baseline_score

0.9404255319148935

# Models evaluation

## feature selection (univariate -  based on their mutual information with  y)

In [199]:
preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())
preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))


preproc_transformer = make_column_transformer(
    (preproc_num, list(feat_num.columns)),
    (preproc_cat, feat_cat_small),
    remainder="passthrough")
    
preproc_selector = SelectPercentile(
    mutual_info_regression,
    percentile=30, # keep only 30% of all features (filters-out features that, taken individually, least explain y)
)

preproc = make_pipeline(
    preproc_transformer,
    preproc_selector
)
preproc

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                             

In [200]:
preproc.fit_transform(X,y).shape



(400, 10)

## feature selection (multivariate - based their combined relationship with y)

In [203]:
preproc_num = make_pipeline(SimpleImputer(strategy = "most_frequent"), MinMaxScaler())
preproc_cat = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))


preproc_transformer = make_column_transformer(
    (preproc_num, list(feat_num.columns)),
    (preproc_cat, feat_cat_small),
    remainder="passthrough")
    
preproc_selector = SequentialFeatureSelector(
    mutual_info_regression,
    percentile=30, # keep only 30% of all features (filters-out features that, taken individually, least explain y)
)

preproc = make_pipeline(
    preproc_transformer,
    preproc_selector
)
preproc

In [204]:
selector.estimator_.coef_

AttributeError: 'Pipeline' object has no attribute 'coef_'