In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics

import time

import os

# import eli5

from sklearn import tree
# import graphviz

from matplotlib import pyplot as plt
# from pdpbox import pdp, get_dataset, info_plots

import shap

import warnings

import scipy as sp

import lime
from lime import lime_tabular
# import lightgbm as lgb
import seaborn as sns

import imblearn


import plotly

In [2]:
data = pd.read_csv("../data/alge_data/epoct_ezvir_05dec2018_unlabeled_Jan.csv")

In [3]:
data_labels = pd.read_csv("../data/alge_data/epoct_ezvir_05dec2018_dictionary.csv", encoding="unicode_escape")

In [5]:

selected=['symp_fever_dur',
 'dem_sex',
 'date_d30_01',
 'sign_skininf',
 'sign_respdistress',
 'days_of_fever',
 'dem_child_house',
 'date_season_meteo',
 'signv_temp_d0',
 'signv_hr1_d0',
 'symp_cough',
 'signv_muac_d0',
 'symp_vomit',
 'symp_loa',
 'lab_malaria_any_d0',
 'dem_age_int',
 'symp_diarrhea',
 'symp_skin_any',
 'signv_waz_d0',
 'signv_rr1_d0',
 'symp_pharyngitis',
 'sign_danger',
 'symp_abdopain',
 'symp_feveronly',
 'dem_mat_educat']

In [6]:
data_selected = data[selected]

In [7]:

n_patients=len(data_selected)


In [8]:
data_selected = data_selected[data_selected["lab_malaria_any_d0"].isnull()==False]

In [10]:
data_selected.dropna(axis=1,thresh=n_patients*0.8, inplace=True)
data_selected.fillna(data_selected.median(), inplace=True)

In [11]:
# doing this manually to be sure but should not be

numerical = ["dem_age_int", "signv_temp_d0","days_of_fever","signv_rr1_d0"]
null_values = ["symp_liquidstool","dem_ward_kin"]
non_binary = ["date_season_meteo"]
confounding = ["dxlab_malaria_hi","dxlab_malaria_low", "dxlab_malaria_hsrdt"]
categorical = list(set(selected)-set(numerical)-set(null_values)-set(non_binary)-set(confounding))
all_categorical = categorical+["dry","rainy","post-rainy"]


categorical_confounded = list(set(selected)-set(numerical)-set(null_values)-set(non_binary))
all_categorical_confounded =  categorical_confounded+["date_season_meteo_1","date_season_meteo_2","date_season_meteo_3"]


In [12]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
le = LabelEncoder()
mms = MinMaxScaler()


data_numerical = pd.DataFrame(mms.fit_transform(data_selected[numerical]), columns=numerical)
data_ml = data_selected[categorical]
data_ml = pd.get_dummies(data_selected["date_season_meteo"], 
                         prefix="season")\
                        .merge(data_ml,
                               left_index=True,
                               right_index=True)\
                        .merge(data_numerical,
                               left_index=True,
                               right_index=True)\
                        .rename({"season_1":"dry",
                                "season_2":"rainy",
                                "season_3":"post-rainy"},
                               axis=1)

Here we separate the data between label (y) and features (X)

In [16]:
y_label= ["lab_malaria_any_d0"]

X_labels = list(set(data_ml.columns)-set(y_label))


X = data_ml[X_labels]
y = data_ml[y_label]



In [18]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.combine import SMOTETomek

In [21]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [28]:
ros_auto = RandomOverSampler(random_state=0, sampling_strategy="auto")
train_X_ros, train_y_ros = ros_auto.fit_sample(train_X, train_y["lab_malaria_any_d0"])

smote_auto = SMOTE(random_state=0, sampling_strategy="auto")
train_X_smote, train_y_smote = smote_auto.fit_sample(train_X, train_y["lab_malaria_any_d0"])

smotenc = SMOTENC(random_state=0, sampling_strategy="auto", categorical_features=categorical_index)
train_X_smotenc, train_y_smotenc = smote_nc.fit_sample(train_X, train_y["lab_malaria_any_d0"])

smote_tomek = SMOTETomek(random_state=0, sampling_strategy="auto")
train_X_smote_tomek, train_y_smote_tomek = smote_tomek.fit_sample(train_X, train_y["lab_malaria_any_d0"])


In [29]:
import time

In [30]:
start = time.time()
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X, np.ravel(train_y))
stop = time.time()
t = stop-start

start = time.time()
rf_ros = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X_ros, np.ravel(train_y_ros))
stop = time.time()
t_ros = stop-start

start = time.time()
rf_smote = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X_smote, np.ravel(train_y_smote))
stop = time.time()
t_smote = stop-start

start = time.time()
rf_smotenc = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X_smotenc, np.ravel(train_y_smotenc))
stop = time.time()
t_smotenc = stop-start

start = time.time()
rf_smote_tomek = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X_smote_tomek, np.ravel(train_y_smote_tomek))
stop = time.time()
t_smote_tomek = stop-start


In [32]:
import sklearn

In [34]:
models = [ rf, rf_ros, rf_smote, rf_smotenc, rf_smote_tomek]

names = [ "rf", "rf_ros", "rf_smote", "rf_smotenc", "rf_smote_tomek"]


for idx, val in enumerate(models):
    preds = val.predict(val_X)
    accuracy = sklearn.metrics.accuracy_score(y_pred = preds,y_true = val_y)
    precision = sklearn.metrics.precision_score(y_pred = preds,y_true = val_y)
    recall = sklearn.metrics.recall_score(y_pred = preds,y_true = val_y)
    roc_auc = sklearn.metrics.roc_auc_score(y_score = preds, y_true = val_y)

    print("{} \naccuracy {} \nprecision {} \nrecall {}\nAUC {}\n\n".format(names[idx],
                                                                           accuracy, 
                                                                           precision, 
                                                                           recall, 
                                                                           roc_auc,))
    
        
        

rf 
accuracy 0.8753541076487252 
precision 1.0 
recall 0.011235955056179775
AUC 0.5056179775280899


rf_ros 
accuracy 0.8753541076487252 
precision 0.5555555555555556 
recall 0.056179775280898875
AUC 0.5248483965545498


rf_smote 
accuracy 0.8611898016997167 
precision 0.3783783783783784 
recall 0.15730337078651685
AUC 0.5600131116493362


rf_smotenc 
accuracy 0.8243626062322946 
precision 0.21311475409836064 
recall 0.14606741573033707
AUC 0.5341358148343743


rf_smote_tomek 
accuracy 0.8555240793201133 
precision 0.3142857142857143 
recall 0.12359550561797752
AUC 0.5423488062935917




ok what about if we also oversample the test set and do the test train split afterwards?