In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn import set_config; set_config(display='diagram')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

### load data, split it and clean it for preoprocessing, retrieve X_train, X_test, y_train, y_test

In [96]:
path = '../raw_data/kidney_disease.csv'
url = "https://storage.googleapis.com/kidney_disaese/raw_data/kidney_disease.csv"
def get_cleaned_data(path=path):
    '''load data from csv
    and use cleaning fct to clean them'''
    df = pd.read_csv(path)
    y = df['classification']
    X = df.drop(columns= {'classification', 'id'})

    X = replacing_numerical_features(X)
    X,y = replacing_binary_features(X,y)

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_cleaned_data("https://storage.googleapis.com/kidney_disaese/raw_data/kidney_disease.csv")
X_train.columns


Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'],
      dtype='object')

### helper functions to clean the data

In [7]:
def replacing_numerical_features(X):
    '''cleaning: strips \t at beginning of number and replaces ? with nan values'''
    X['pcv'] = X['pcv'].str.lstrip('\t')
    X['pcv'] = X['pcv'].replace(to_replace='?',value=np.nan).astype(float)
    X['wc'] = X['wc'].str.lstrip('\t')
    X['wc'] = X['wc'].replace(to_replace='?',value=np.nan).astype(float)
    X['rc'] = X['rc'].str.lstrip('\t')
    X['rc'] = X['rc'].replace(to_replace='?',value=np.nan).astype(float)
    return X


In [8]:
def replacing_binary_features(X,y):
    '''encoding: replacing Yes --> 1 no --> 0'''
    X[['htn','dm','cad','pe','ane']] = X[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
    X[['rbc','pc']] = X[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
    X[['pcc','ba']] = X[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
    X[['appet']] = X[['appet']].replace(to_replace={'good':2,'poor':1,'no':0})
    ## replacing t_values to 0 or 1, by assuming it s close to 0 or 1, respectively
    X['cad'] = X['cad'].replace(to_replace='\tno',value=0)
    X['dm'] = X['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1})

    #encoding the target:
    y= y.replace(to_replace={'ckd':1,'notckd':0, 'ckd\t': 1}).astype(int)
    return X,y

### preprocessing of the data

In [76]:
def preproc(X_train):
    '''takes in a df with column names. returns preprocessed data as numpy array with 24 columns'''
    # creating feat_lists for pipeline
    feat_binary = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
    feat_ordered = ['sg', 'al', 'su']
    feat_continuous = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc',
       'rc']

 

    ordered_transformer = Pipeline([
                                ('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                ('mm_scaler', MinMaxScaler())
                                ])

    binary_transformer = Pipeline([
                                ('cat_imputer', SimpleImputer(strategy='most_frequent'))
                                ])

    cont_transformer = Pipeline([
                                ('num_imputer', SimpleImputer()),
                                ('mm_scaler', MinMaxScaler())
                                ])

    preproc_pipe = ColumnTransformer([
                                        ('ord_trans', ordered_transformer, feat_ordered),
                                        ('bin_trans', binary_transformer, feat_binary),
                                        ('cont_trans', cont_transformer, feat_continuous)
                                    ])


    X_proc = preproc_pipe.fit_transform(X_train)

    return X_proc
X_preproc = preproc(X_train)
X_preproc.shape


(320, 24)

In [49]:
model = RandomForestClassifier()
model.fit(preproc(X_train), y_train)
numpy_array = np.array([0.75, 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0.29885057, 0.23076923, 0.13034188, 0.12195122, 0.00529801, 0.84858044, 0.01797753, 0.97692308, 0.65, 0.31818182, 0.44067797]).reshape(1, -1)
model.predict(numpy_array)  

array([0])

In [14]:
def forest_model(X_proc,y_train):
    '''create the model, do the gridsearch
    and return fitted model with best params'''
    rfc=RandomForestClassifier()

    param_grid = {
    'n_estimators': [100, 300, 500],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3,5,7,10,15],
    'min_samples_split' : [2, 3, 5, 7]
    }

    
    search = GridSearchCV(rfc, param_grid=param_grid, scoring='recall')
    result = search.fit(X_proc,y_train)

    df = pd.DataFrame(result.cv_results_)

    return [result.best_estimator_, df, result.best_params_]

    

model, df, best_prams = forest_model(X_preproc, y_train)
model

KeyboardInterrupt: 

In [12]:
model.predict(preproc(X_test))


NameError: name 'model' is not defined

In [26]:
pd.DataFrame(preproc(X_test)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.460452,0.093351,0.018543,0.826277,0.065329,0.545125,0.366243,0.540142,0.02885867,0.044953
1,0.25,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.213255,0.242021,0.072848,0.835962,0.060606,0.057143,0.209302,0.540142,-1.962389,0.820773
2,0.25,0.25,0.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.316384,1.0,0.417219,1.0,1.0,0.545125,0.366243,0.540142,6.31637e-16,0.820773
3,0.75,0.5,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.19774,0.106383,0.02649,0.826277,0.065329,0.342857,0.366243,0.540142,-0.5400693,0.403024
4,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.064972,0.37234,0.090066,0.804416,0.060606,0.545125,0.366243,0.540142,6.31637e-16,1.357879


## Bausteine für die api

### PredictEndpoint: erste möglichkeit ( weiß nicht, ob es fkt)

In [67]:
from urllib.parse import urlparse
from urllib.parse import parse_qs

url = 'https://www.example.com/some_path?some_key=6&age=9'
param_query = parse_qs(urlparse(url).query)
param_list = []
for i in param_query.values():
    param_list.append(int(i[0]))
param_list


[6, 9]

### zweite möglichkeit: retrieve all paramters passed to function as list

In [112]:
import inspect
def foo(a=0,b=9,c=4): 
    dict = {}
    args = inspect.getfullargspec(foo)
    arg_val_list = list(args.defaults)
    args_list = ['age', 'bp', 'sg']
    for i,j in zip(args_list, arg_val_list):
        dict[i] = [j]
    print(dict)
    df = pd.DataFrame.from_dict(dict)
    return df
foo()

{'age': [0], 'bp': [9], 'sg': [4]}


Unnamed: 0,age,bp,sg
0,0,9,4


### produce query for api request to test

In [111]:
names = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane']
values = [0.75, 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0.29885057, 0.23076923, 0.13034188, 0.12195122, 0.00529801, 0.84858044, 0.01797753, 0.97692308, 0.65, 0.31818182, 0.44067797]
query=''
for i,j in zip(names, values):
    query = query + i + '=' + str(j) + '&'
query   

'age=0.75&bp=0.0&sg=0.0&al=0.0&su=0.0&rbc=0.0&pc=0.0&pcc=0.0&ba=0.0&bgr=0.0&bu=2.0&sc=0.0&sod=0.0&pot=0.29885057&hemo=0.23076923&pcv=0.13034188&wc=0.12195122&rc=0.00529801&htn=0.84858044&dm=0.01797753&cad=0.97692308&appet=0.65&pe=0.31818182&ane=0.44067797&'

## ScatterEndpoint

In [None]:
from io import BytesIO
from starlette.responses import StreamingResponse
...


@router.get("/graph/{id_file}", name="Return the graph obtained")
async def create_graph(id_file: str):
    data = HAR.createGraph(id_file)
    graph = HAR.scatterplot(data['dateTimes'], data['label'], "Time", "Activity")
    
    # create a buffer to store image data
    buf = BytesIO()
    graph.savefig(buf, format="png")
    buf.seek(0)
        
    return StreamingResponse(buf, media_type="image/png")