In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn import set_config; set_config(display='diagram')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

### Load the data, clean it for preprocessing and split it into X_train, X_test, y_train, y_test

In [7]:
path = '../raw_data/kidney_disease.csv'
SEED = 42  # fixed seed so the train/test split is identical on every run


def get_cleaned_data(path=path, test_size=0.2, random_state=SEED):
    '''Load the kidney-disease csv, clean it and split it into train/test sets.

    The target is the ``classification`` column; ``id`` is dropped from the
    features. Cleaning is delegated to ``replacing_numerical_features`` and
    ``replacing_binary_features``.

    Parameters
    ----------
    path : str
        Location of the csv file.
    test_size : float
        Share of the rows that goes into the test split.
    random_state : int or None
        Seed handed to ``train_test_split``; pass ``None`` for a different
        split on every call.

    Returns
    -------
    X_train, X_test, y_train, y_test
    '''
    df = pd.read_csv(path)
    y = df['classification']
    X = df.drop(columns=['classification', 'id'])

    X = replacing_numerical_features(X)
    X, y = replacing_binary_features(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

# NOTE: get_cleaned_data() calls the cleaning helpers that are defined in the
# cells further down, so on a fresh kernel those cells have to be run first.
X_train, X_test, y_train, y_test = get_cleaned_data()
X_train

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane
153,55.0,90.0,1.010,2.0,1.0,1.0,1.0,0.0,0.0,273.0,...,8.3,22.0,14600.0,2.9,1.0,1.0,0.0,1.0,1.0,1.0
48,73.0,70.0,1.005,0.0,0.0,0.0,0.0,0.0,0.0,70.0,...,10.0,29.0,18900.0,3.5,1.0,1.0,0.0,2.0,1.0,0.0
246,48.0,110.0,1.015,3.0,0.0,1.0,0.0,1.0,0.0,106.0,...,8.6,26.0,5000.0,2.5,1.0,0.0,1.0,2.0,0.0,1.0
292,30.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,89.0,...,16.7,52.0,10200.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0
122,34.0,70.0,,,,,,0.0,0.0,,...,6.0,,,,1.0,0.0,0.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,34.0,60.0,1.020,0.0,0.0,,0.0,0.0,0.0,117.0,...,,,,,0.0,0.0,0.0,2.0,1.0,0.0
120,72.0,90.0,1.025,1.0,3.0,,0.0,0.0,0.0,323.0,...,12.6,,,,0.0,1.0,1.0,1.0,0.0,0.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0
260,30.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,131.0,...,14.1,45.0,9400.0,5.3,0.0,0.0,0.0,2.0,0.0,0.0


### helper functions to clean the data

In [8]:
def replacing_numerical_features(X, columns=('pcv', 'wc', 'rc')):
    '''Convert numeric columns that were read as text into float columns.

    Surrounding whitespace (e.g. a leading tab) is stripped and the
    placeholder ``?`` is turned into NaN before casting to float. Columns
    that are already numeric are only cast to float, so calling the function
    a second time is harmless.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is not modified.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the given columns as float.
    '''
    # work on a copy so the caller's frame keeps its raw values
    X = X.copy()
    for col in columns:
        values = X[col]
        if not pd.api.types.is_numeric_dtype(values):
            # the .str accessor only exists on text columns
            values = values.str.strip().replace(to_replace='?', value=np.nan)
        X[col] = values.astype(float)
    return X


In [9]:
def _encode_labels(series, mapping):
    '''Replace the string labels of a Series by the numbers in ``mapping``.

    Labels are matched after stripping surrounding whitespace, so variants
    such as a leading tab or blank are encoded like the clean label. Values
    that are not strings (e.g. NaN) and labels missing from ``mapping`` are
    left untouched.
    '''
    def encode(value):
        if isinstance(value, str):
            return mapping.get(value.strip(), value)
        return value

    # infer_objects gives a numeric dtype once no string label is left
    return series.map(encode).infer_objects()


def replacing_binary_features(X, y):
    '''Encode the categorical features and the target as numbers.

    Encodings:
      * htn, dm, cad, pe, ane: yes -> 1, no -> 0
      * rbc, pc: abnormal -> 1, normal -> 0
      * pcc, ba: present -> 1, notpresent -> 0
      * appet: good -> 2, poor -> 1, no -> 0
      * target: ckd -> 1, notckd -> 0

    Labels with stray whitespace (tabs or blanks around the label) are
    treated like the clean label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the columns listed above; it is not modified.
    y : pandas.Series
        Target labels.

    Returns
    -------
    X, y
        Encoded copy of the features and the target cast to int.
    '''
    encodings = (
        (['htn', 'dm', 'cad', 'pe', 'ane'], {'yes': 1, 'no': 0}),
        (['rbc', 'pc'], {'abnormal': 1, 'normal': 0}),
        (['pcc', 'ba'], {'present': 1, 'notpresent': 0}),
        (['appet'], {'good': 2, 'poor': 1, 'no': 0}),
    )

    # work on a copy so the caller's frame keeps its raw labels
    X = X.copy()
    for columns, mapping in encodings:
        for col in columns:
            X[col] = _encode_labels(X[col], mapping)

    # encoding the target:
    y = _encode_labels(y, {'ckd': 1, 'notckd': 0}).astype(int)
    return X, y

### preprocessing of the data

In [39]:
def preproc(X_train, X_test=None):
    '''Impute and scale the features for the log-reg and tree models.

    (Two features are scaled differently than for knn, hence the separate
    preprocessing.)

    The transformers are fitted on ``X_train`` only. To preprocess a second
    frame (e.g. the test split) pass it as ``X_test``: it is transformed with
    the transformers fitted on ``X_train``. Calling ``preproc`` on the test
    split alone would fit a new transformer on it, which gives differently
    scaled columns and can even give a different number of columns.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Cleaned training features; must contain the columns listed below.
    X_test : pandas.DataFrame, optional
        Further frame with the same columns, transformed but never fitted on.

    Returns
    -------
    X_proc
        Preprocessed ``X_train`` when ``X_test`` is None.
    (X_proc, X_test_proc)
        Both preprocessed arrays when ``X_test`` is given.
    '''
    # creating feat_lists for pipeline
    feat_binary = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
    feat_ordered = ['sg', 'al', 'su']
    feat_continuous = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc',
       'rc']

    # ordered categories: fill gaps with the mode, then scale
    ordered_transformer = Pipeline([
                                ('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                ('mm_scaler', MinMaxScaler())
                                ])

    # encoded categories: only fill gaps with the mode
    binary_transformer = Pipeline([
                                ('cat_imputer', SimpleImputer(strategy='most_frequent'))
                                ])

    # continuous values: fill gaps with SimpleImputer's default strategy, then scale
    cont_transformer = Pipeline([
                                ('num_imputer', SimpleImputer()),
                                ('mm_scaler', MinMaxScaler())
                                ])

    preproc_pipe = ColumnTransformer([
                                        ('ord_trans', ordered_transformer, feat_ordered),
                                        ('bin_trans', binary_transformer, feat_binary),
                                        ('cont_trans', cont_transformer, feat_continuous)
                                    ])

    X_proc = preproc_pipe.fit_transform(X_train)

    if X_test is None:
        return X_proc

    # transform only: reuse the imputation values and scaling learnt on X_train
    X_test_proc = preproc_pipe.transform(X_test)
    return X_proc, X_test_proc


In [11]:
def forest_model(X_proc, y_train, param_grid=None, random_state=None):
    '''Grid-search a RandomForestClassifier and return the best model.

    The search is scored on recall.

    Parameters
    ----------
    X_proc : array-like
        Preprocessed training features.
    y_train : array-like
        Training target.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``; the default grid below is used when
        omitted.
    random_state : int or None
        Seed for the random forest. Pass an int to make the search
        reproducible; ``None`` keeps the forest unseeded.

    Returns
    -------
    list
        ``[best_estimator, cv_results, best_params]`` where ``cv_results`` is
        the ``cv_results_`` of the search as a DataFrame.
    '''
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 300, 500],
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, 10, 15],
            'min_samples_split': [2, 3, 5, 7]
        }

    rfc = RandomForestClassifier(random_state=random_state)

    search = GridSearchCV(rfc, param_grid=param_grid, scoring='recall')
    result = search.fit(X_proc, y_train)

    cv_results = pd.DataFrame(result.cv_results_)

    return [result.best_estimator_, cv_results, result.best_params_]

    

# X_preproc is not assigned in any other cell, so build it here from the
# training split; otherwise this cell fails on a fresh kernel.
X_preproc = preproc(X_train)
model, df, best_prams = forest_model(X_preproc, y_train)
model

In [12]:
# NOTE(review): preproc() calls fit_transform on the frame it is given, so
# this fits a new transformer on X_test instead of reusing the one fitted on
# X_train - presumably the cause of the recorded ValueError (23 vs 24 features).
model.predict(preproc(X_test))


ValueError: X has 23 features, but RandomForestClassifier is expecting 24 features as input.

In [26]:
# Debug view: first rows of preproc() applied to the test split
# (the recorded output has columns 0..22).
pd.DataFrame(preproc(X_test)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.460452,0.093351,0.018543,0.826277,0.065329,0.545125,0.366243,0.540142,0.02885867,0.044953
1,0.25,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.213255,0.242021,0.072848,0.835962,0.060606,0.057143,0.209302,0.540142,-1.962389,0.820773
2,0.25,0.25,0.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.316384,1.0,0.417219,1.0,1.0,0.545125,0.366243,0.540142,6.31637e-16,0.820773
3,0.75,0.5,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.19774,0.106383,0.02649,0.826277,0.065329,0.342857,0.366243,0.540142,-0.5400693,0.403024
4,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.064972,0.37234,0.090066,0.804416,0.060606,0.545125,0.366243,0.540142,6.31637e-16,1.357879


In [27]:
# Debug view: first rows of preproc() applied to the training split
# (the recorded output has columns 0..23).
pd.DataFrame(preproc(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.25,0.4,0.2,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.536325,0.728549,0.289308,0.608696,0.020225,0.288889,0.512397,0.135593,-1.682267,0.232635
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.102564,0.095164,0.010482,0.456522,0.033708,0.444444,0.690083,0.237288,-1.039732,1.293001
2,0.5,0.6,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.179487,0.666147,0.310273,0.347826,0.07191,0.377778,0.115702,0.067797,-1.568878,-0.179729
3,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.143162,0.126365,0.002096,0.76087,0.05618,0.955556,0.330579,0.491525,1.492609,-1.240095
4,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.270523,0.678627,0.247379,0.565217,0.029213,0.678864,0.255375,0.449038,-2.551578,-1.004458


## Model Mass Production

### LogReg

In [None]:
# function which produces all logreg models that can be chosen on the website:
log_reg_prams = {'penalty': ['l1', 'l2', 'elasticnet'], 'C': [0.001, 0.1, 1]}
for penalty in log_reg_prams['penalty']:
    
