In [27]:
# Data Exploration
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Pickle
import pickle

# Classification models
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn import neighbors

from sklearn.neural_network import MLPClassifier

# Onehotencoding
from sklearn.preprocessing import OneHotEncoder

# MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# train_test_split
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import classification_report, f1_score, precision_score, confusion_matrix, recall_score

# sample
from sklearn.utils import resample

# SMOTE
from imblearn.over_sampling import SMOTE

In [28]:
data = pd.read_csv("data/diabetes_clean.csv")
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi_score,haemoglobin_level,blood_glucose_level,is_diabetic
0,Female,80,No,Yes,never,25.19,6.6,140,0
1,Female,54,No,No,no info,27.32,6.6,80,0
2,Male,28,No,No,never,27.32,5.7,158,0
3,Female,36,No,No,current,23.45,5.0,155,0
4,Male,76,Yes,Yes,current,20.14,4.8,155,0


In [29]:
data.shape

(100000, 9)

In [30]:
X_train_processed = pd.read_csv("data/train-test/X_train_processed.csv")
X_train_processed

Unnamed: 0,gender_Male,gender_Other,hypertension_Yes,heart_disease_Yes,smoking_history_former,smoking_history_never,smoking_history_no info,age,bmi_score,haemoglobin_level,blood_glucose_level
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.9125,0.180418,0.000000,0.000000
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0000,0.178340,0.400000,0.295455
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4750,0.175040,0.090909,0.354545
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.3250,0.108300,0.272727,0.090909
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.7625,0.147904,0.181818,0.022727
...,...,...,...,...,...,...,...,...,...,...,...
79995,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.6125,0.280773,0.400000,0.000000
79996,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1875,0.221122,0.272727,0.359091
79997,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5250,0.197164,0.418182,0.022727
79998,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4625,0.182740,0.490909,0.354545


In [31]:
y_train = pd.read_csv("data/train-test/y_train.csv")
y_train

Unnamed: 0,is_diabetic
0,0
1,1
2,0
3,0
4,0
...,...
79995,0
79996,0
79997,0
79998,0


In [32]:
X_test_processed = pd.read_csv("data/train-test/X_test_processed.csv")
X_test_processed

Unnamed: 0,gender_Male,gender_Other,hypertension_Yes,heart_disease_Yes,smoking_history_former,smoking_history_never,smoking_history_no info,age,bmi_score,haemoglobin_level,blood_glucose_level
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1625,0.132135,0.418182,0.209091
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0375,0.134336,0.272727,0.295455
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.7875,0.187141,0.000000,0.545455
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0250,0.090698,0.472727,0.209091
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.4125,0.367559,0.490909,0.545455
...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5500,0.145948,0.454545,0.359091
19996,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.7625,0.390784,0.000000,0.045455
19997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.6125,0.201687,0.236364,0.090909
19998,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.9125,0.211588,0.563636,0.090909


In [33]:
y_test = pd.read_csv("data/train-test/y_test.csv")
y_test

Unnamed: 0,is_diabetic
0,0
1,0
2,0
3,0
4,1
...,...
19995,0
19996,0
19997,0
19998,0


# SMOTE

In [34]:
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_processed,y_train)

In [35]:
display(X_train_SMOTE.shape)
display(y_train_SMOTE.shape)

(146416, 11)

(146416, 1)

In [36]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred_smote = LR.predict(X_test_processed)

smote_precision = precision_score(y_test,pred_smote)
smote_recall_score = recall_score(y_test,pred_smote)
smote_f1_score = f1_score(y_test,pred_smote)

print("SMOTE precision: ", smote_precision)
print("SMOTE recall: ", smote_recall_score)
print("SMOTE f1: ", smote_f1_score)

  y = column_or_1d(y, warn=True)


SMOTE precision:  0.4287339971550498
SMOTE recall:  0.8823185011709602
SMOTE f1:  0.5770629906184186


In [37]:
LR.score(X_test_processed, y_test)

0.88955

In [38]:
array = confusion_matrix(y_test, pred_smote)

print('             Predicted Labels')
print('             |   0    |     1')
print('---------------------------------')
print('True label 0 | ',array[0][0],'|   ', (array[0][1]))
print('---------------------------------')
print('           1 | ',array[1][0],' |   ', (array[1][1]))

             Predicted Labels
             |   0    |     1
---------------------------------
True label 0 |  16284 |    2008
---------------------------------
           1 |  201  |    1507


In [39]:
total = y_test.shape[0]

array = confusion_matrix(y_test, pred_smote)
print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',round((array[0][0]/total),2),'|   ', (array[0][1])/total)
print('---------------------------------')
print('           B | ',round((array[1][0]/total),2),' |   ', (array[1][1])/total)

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  0.81 |    0.1004
---------------------------------
           B |  0.01  |    0.07535


# SMOTE sampling techniques:
    
SMOTE score: 0.88
SMOTE precision:  0.42
SMOTE recall:  0.88
SMOTE f1:  0.57

# Oversampling/Undersampling technique

In [40]:
# Oversampling/ Undersampling has to be done on train set.

train = pd.concat([X_train_processed,y_train], axis = 1)
print(train.shape)
train.head()

(80000, 12)


Unnamed: 0,gender_Male,gender_Other,hypertension_Yes,heart_disease_Yes,smoking_history_former,smoking_history_never,smoking_history_no info,age,bmi_score,haemoglobin_level,blood_glucose_level,is_diabetic
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.9125,0.180418,0.0,0.0,0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.17834,0.4,0.295455,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.475,0.17504,0.090909,0.354545,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.325,0.1083,0.272727,0.090909,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.7625,0.147904,0.181818,0.022727,0


In [41]:
# Seperating diabetic and non-diabetic data.

no_diabetic = train[train["is_diabetic"] == 0]
yes_diabetic = train[train["is_diabetic"] == 1]

In [42]:
display(no_diabetic.shape)
display(yes_diabetic.shape)

(73208, 12)

(6792, 12)

In [43]:
# We oversample yes_diabetic data since they are low in numbers.

yes_diabetic_oversampled = resample(yes_diabetic, #<- sample from here
                                    replace=True, #<- we need replacement, since we don't have enough data otherwise
                                    n_samples = len(no_diabetic),#<- make both sets the same size
                                    random_state=0)

In [44]:
# After oversampling now the no and yes diabetic dataset are of same size.

display(no_diabetic.shape)
display(yes_diabetic_oversampled.shape)
yes_diabetic_oversampled.head(30)

(73208, 12)

(73208, 12)

Unnamed: 0,gender_Male,gender_Other,hypertension_Yes,heart_disease_Yes,smoking_history_former,smoking_history_never,smoking_history_no info,age,bmi_score,haemoglobin_level,blood_glucose_level,is_diabetic
32337,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.7,0.283706,0.545455,0.363636,1
30769,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.8625,0.211588,0.4,0.909091,1
19929,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9125,0.211588,0.418182,0.359091,1
38241,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.8875,0.319521,0.490909,0.636364,1
57760,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.7375,0.211588,0.963636,0.909091,1
56894,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.975,0.252536,0.472727,1.0,1
68666,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,0.264515,0.454545,0.545455,1
12510,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.675,0.253636,0.6,0.227273,1
51401,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.7125,0.230901,0.6,0.340909,1
69338,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.675,0.195086,0.963636,0.227273,1


In [45]:
# Now we concat no and yes oversampled churn dataset

train_oversampled = pd.concat([no_diabetic, yes_diabetic_oversampled], axis = 0)

display(train_oversampled.shape)
train_oversampled.head()

(146416, 12)

Unnamed: 0,gender_Male,gender_Other,hypertension_Yes,heart_disease_Yes,smoking_history_former,smoking_history_never,smoking_history_no info,age,bmi_score,haemoglobin_level,blood_glucose_level,is_diabetic
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.9125,0.180418,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.475,0.17504,0.090909,0.354545,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.325,0.1083,0.272727,0.090909,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.7625,0.147904,0.181818,0.022727,0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.425,0.115512,0.4,0.209091,0


In [46]:
# Now creating target and feature columns for model training

y_train_over = train_oversampled["is_diabetic"]
X_train_over = train_oversampled.drop("is_diabetic", axis = 1)

# Logistic Regression

In [47]:
LR_over = LogisticRegression(random_state=0, solver='lbfgs')
LR_over.fit(X_train_over, y_train_over)
pred_oversampled = LR_over.predict(X_test_processed)

oversample_precision = precision_score(y_test,pred_oversampled)
oversample_recall = recall_score(y_test,pred_oversampled)
oversample_f1score = f1_score(y_test,pred_oversampled)

print("Oversampling precision: ", oversample_precision)
print("Oversampling recall: ",oversample_recall)
print("Oversampling f1: ",oversample_f1score)

Oversampling precision:  0.4287339971550498
Oversampling recall:  0.8823185011709602
Oversampling f1:  0.5770629906184186


In [48]:
LR_over.score(X_test_processed, y_test)

0.88955

In [49]:
pickle.dump(LR_over, open('Pickle/Oversampled_ML_Models/LR_model.p', 'wb'))

# Linear Regression results:

score: 0.88
precision:  0.42
recall:  0.88
f1:  0.57

# DecisionTreeClassifier

In [50]:
tree_model_over = DecisionTreeClassifier(max_depth = 2)
tree_model_over.fit(X_train_over, y_train_over)

In [51]:
y_pred_tree = tree_model_over.predict(X_test_processed)

In [52]:
tree_model_over.score(X_test_processed, y_test)

0.97215

In [53]:
array_tree = confusion_matrix(y_test, y_pred_tree)
total = y_test.shape[0]
print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',round((array_tree[0][0]/total),2),'|   ', (array_tree[0][1])/total)
print('---------------------------------')
print('           B | ',round((array_tree[1][0]/total),2),' |   ', (array_tree[1][1])/total)

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  0.91 |    0.0
---------------------------------
           B |  0.03  |    0.05755


In [54]:
oversample_precision = precision_score(y_test,y_pred_tree)
oversample_recall = recall_score(y_test,y_pred_tree)
oversample_f1score = f1_score(y_test,y_pred_tree)

print("Oversampling precision: ", oversample_precision)
print("Oversampling recall: ",oversample_recall)
print("Oversampling f1: ",oversample_f1score)

Oversampling precision:  1.0
Oversampling recall:  0.6738875878220141
Oversampling f1:  0.8051766351871283


In [55]:
pickle.dump(tree_model_over, open('Pickle/Oversampled_ML_Models/DecisionTree_model.p', 'wb'))

# DecisionTree Oversampling

Model score: 0.97
Precision:  1.0
Recall:  0.67
f1:  0.80

# KNN neighbours

In [56]:
KNN_model_over = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
KNN_model_over.fit(X_train_over, y_train_over)

In [57]:
y_pred_knn = KNN_model_over.predict(X_test_processed)

In [58]:
KNN_model_over.score(X_test_processed, y_test)

0.9357

In [59]:
array_KNN = confusion_matrix(y_test, y_pred_knn)
total = y_test.shape[0]

print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',round((array_KNN[0][0]/total),2),'|   ', (array_KNN[0][1])/total)
print('---------------------------------')
print('           B | ',round((array_KNN[1][0]/total),2),' |   ', (array_KNN[1][1])/total)

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  0.87 |    0.04175
---------------------------------
           B |  0.02  |    0.06285


In [60]:
oversample_precision = precision_score(y_test,y_pred_knn)
oversample_recall = recall_score(y_test,y_pred_knn)
oversample_f1score = f1_score(y_test,y_pred_knn)

print("Oversampling precision: ", oversample_precision)
print("Oversampling recall: ",oversample_recall)
print("Oversampling f1: ",oversample_f1score)

Oversampling precision:  0.6008604206500956
Oversampling recall:  0.7359484777517564
Oversampling f1:  0.661578947368421


In [61]:
pickle.dump(KNN_model_over, open('Pickle/Oversampled_ML_Models/KNN_model.p', 'wb'))

# KNN Oversampling:
   
Model score: 0.93
precision:  0.60
recall:  0.73
f1:  0.66

# MLPClassifier

In [62]:
mlp = MLPClassifier(random_state = 42)
mlp.fit(X_train_over, y_train_over)
print(mlp.score(X_test_processed, y_test))

0.9059


In [63]:
y_pred_mlp = mlp.predict(X_test_processed)

In [64]:
array_mlp = confusion_matrix(y_test, y_pred_mlp)
total = y_test.shape[0]

print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',round((array_mlp[0][0]/total),2),'|   ', (array_mlp[0][1])/total)
print('---------------------------------')
print('           B | ',round((array_mlp[1][0]/total),2),' |   ', (array_mlp[1][1])/total)

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  0.83 |    0.08585
---------------------------------
           B |  0.01  |    0.07715


In [65]:
oversample_precision = precision_score(y_test,y_pred_mlp)
oversample_recall = recall_score(y_test,y_pred_mlp)
oversample_f1score = f1_score(y_test,y_pred_mlp)

print("MLPClassifier precision: ", oversample_precision)
print("MLPClassifier recall: ",oversample_recall)
print("MLPClassifier f1: ",oversample_f1score)

MLPClassifier precision:  0.4733128834355828
MLPClassifier recall:  0.9033957845433255
MLPClassifier f1:  0.6211755233494364


In [66]:
pickle.dump(mlp, open('Pickle/Oversampled_ML_Models/MLP_model.p', 'wb'))

# Results After Oversampling

In [67]:
titles = ["ModelName", "Test data score", "F1", "Precision", "Recall"]
models = ["Logistic Regression", "KNN Classifier", "DecisionTreeClassifier", "MLPClassifier"]    
test_score = [0.88, 0.93, 0.97, 0.90]
f1 = [0.57, 0.66, 0.80, 0.62]
precision = [0.42, 0.60, 1.0, 0.47]
recall = [0.88, 0.73, 0.67, 0.90]

results_after_oversampling = pd.DataFrame(zip(models, test_score, f1, precision, recall),columns = titles)
results_after_oversampling

Unnamed: 0,ModelName,Test data score,F1,Precision,Recall
0,Logistic Regression,0.88,0.57,0.42,0.88
1,KNN Classifier,0.93,0.66,0.6,0.73
2,DecisionTreeClassifier,0.97,0.8,1.0,0.67
3,MLPClassifier,0.9,0.62,0.47,0.9


In [68]:
results_after_oversampling.to_csv("results/results_after_oversampling.csv", index = False)

In [69]:
results_before_sampling = pd.read_csv("results/ML_results_before_Sampling.csv")

# Results Before Oversampling

In [70]:
results_before_sampling

Unnamed: 0,ModelName,Test data score,F1,Precision,Recall
0,Logistic Regression,0.95,0.71,0.86,0.61
1,KNN Classifier,0.95,0.72,0.85,0.63
2,DecisionTreeClassifier,0.97,0.8,1.0,0.67
3,MLPClassifier,0.97,0.8,0.99,0.67


# Conclusion:
    
Based on above results, model scores and their metrics have relatively improved.
And among all models, DecisionTree and MLP Classifier have performed better after oversampling.