# Models

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# %run /content/drive/MyDrive/DentistDataAnalysis/Experiments/Mappings.ipynb
# %run /content/drive/MyDrive/DentistDataAnalysis/Experiments/Helpers.ipynb
%run Mappings.ipynb
%run Helpers.ipynb

In [3]:
# DIR = '/content/drive/MyDrive/DentistDataAnalysis/Experiments/'
DIR = ''

In [4]:
data = pd.read_csv(DIR+'dataset/final_data.csv')

In [5]:
X_cols = ['Que1', 'Que2', 'Que3', 'Que4', 'Que5', 'Que6', 'Que10_a', 'Que10_b', 'Que10_c', 'Que10_d', 'Que10_e', 'Que10_f', 'Que14', 'Que15', 'Que17', 'Que18_age', 'Que19', 'Que20', 'Que21', 'Que22', 'Que_smoking']
y_col = ['Que16']

# Strategy 1 - dropping all missing

In [6]:
data_dropped = data.dropna().reset_index(drop=True)
data_dropped = perform_mapping(data_dropped, questions_map_EN)
model_data = data_dropped.drop(columns=['Que16'])
target_data = data_dropped['Que16']
X_train, X_test, y_train, y_test = train_test_split(model_data.values, target_data.values, test_size=0.20, random_state=42)

## Logistic Regression

### No balance

In [7]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col)

Optimization terminated successfully.
         Current function value: 0.568071
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                  748
Model:                          Logit   Df Residuals:                      727
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1293
Time:                        13:28:57   Log-Likelihood:                -424.92
converged:                       True   LL-Null:                       -488.02
Covariance Type:            nonrobust   LLR p-value:                 2.008e-17
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.4411      0.244      1.808      0.071      -0.037       0.919
Que2           -0.3173    

### Undersampling

In [8]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='under')

Optimization terminated successfully.
         Current function value: 0.609852
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                  536
Model:                          Logit   Df Residuals:                      515
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1202
Time:                        13:28:59   Log-Likelihood:                -326.88
converged:                       True   LL-Null:                       -371.53
Covariance Type:            nonrobust   LLR p-value:                 9.854e-11
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.6981      0.273      2.553      0.011       0.162       1.234
Que2           -0.4400    

### Oversampling

In [9]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='over')

Optimization terminated successfully.
         Current function value: 0.609854
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                  960
Model:                          Logit   Df Residuals:                      939
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1202
Time:                        13:28:59   Log-Likelihood:                -585.46
converged:                       True   LL-Null:                       -665.42
Covariance Type:            nonrobust   LLR p-value:                 7.768e-24
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.4833      0.203      2.382      0.017       0.086       0.881
Que2           -0.4388    

### SMOTEENN

In [10]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='smoteen')

Optimization terminated successfully.
         Current function value: 0.568071
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                  748
Model:                          Logit   Df Residuals:                      727
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1293
Time:                        13:29:00   Log-Likelihood:                -424.92
converged:                       True   LL-Null:                       -488.02
Covariance Type:            nonrobust   LLR p-value:                 2.008e-17
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.4411      0.244      1.808      0.071      -0.037       0.919
Que2           -0.3173    

## Random Forest

### No balance

In [11]:
perform_random_forest(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 400}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6383772721800891
Test accuracy:  0.6808510638297872
Test precision:  0.7142857142857143
Test recall:  0.811965811965812
Test F1 score:  0.7600000000000001
Test confusion matrix: 
[[33 38]
 [22 95]]


### Undersampling

In [12]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 300}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6613097387745275
Test accuracy:  0.6542553191489362
Test precision:  0.7708333333333334
Test recall:  0.6324786324786325
Test F1 score:  0.6948356807511736
Test confusion matrix: 
[[49 22]
 [43 74]]


### Oversampling

In [13]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 300}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6325990128807031
Test accuracy:  0.6702127659574468
Test precision:  0.7131782945736435
Test recall:  0.7863247863247863
Test F1 score:  0.7479674796747968
Test confusion matrix: 
[[34 37]
 [25 92]]


### SMOTEENN

In [14]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='smoteen')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6567352834958469
Test accuracy:  0.6968085106382979
Test precision:  0.7272727272727273
Test recall:  0.8205128205128205
Test F1 score:  0.7710843373493976
Test confusion matrix: 
[[35 36]
 [21 96]]


## Decision Tree

### No balance

In [15]:
perform_decision_tree(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 110,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.66076802696521
Test accuracy:  0.6914893617021277
Test precision:  0.736
Test recall:  0.7863247863247863
Test F1 score:  0.7603305785123966
Test confusion matrix: 
[[38 33]
 [25 92]]


### Undersampling

In [16]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'splitter': 'best'}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6165884194053208
Test accuracy:  0.574468085106383
Test precision:  0.7761194029850746
Test recall:  0.4444444444444444
Test F1 score:  0.5652173913043478
Test confusion matrix: 
[[56 15]
 [65 52]]


### Oversampling

In [17]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.5317804261466234
Test accuracy:  0.5585106382978723
Test precision:  0.646551724137931
Test recall:  0.6410256410256411
Test F1 score:  0.6437768240343348
Test confusion matrix: 
[[30 41]
 [42 75]]


### SMOTEENN

In [18]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='SMOTEEN')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([ 71, 117]))
Test ROC AUC:  0.6675695196821958
Test accuracy:  0.6861702127659575
Test precision:  0.75
Test recall:  0.7435897435897436
Test F1 score:  0.7467811158798283
Test confusion matrix: 
[[42 29]
 [30 87]]


# Strategy 2 - median

In [19]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
data_median = pd.DataFrame(imputer.fit_transform(data))
data_median.columns = data.columns
data_median.index = data.index
data_median = perform_mapping(data_median, questions_map_EN)
model_data = data_median.drop(columns=['Que16'])
target_data = data_median['Que16']
X_train, X_test, y_train, y_test = train_test_split(model_data.values, target_data.values, test_size=0.20, random_state=42)

## Logistic Regression

### No balance

In [20]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col)

Optimization terminated successfully.
         Current function value: 0.550389
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1143
Time:                        13:41:43   Log-Likelihood:                -1897.7
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 3.800e-91
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.0987      0.119      0.830      0.407      -0.135       0.332
Que2           -0.0075    

### Undersampling

In [21]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='under')

Optimization terminated successfully.
         Current function value: 0.619735
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 2158
Model:                          Logit   Df Residuals:                     2137
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1059
Time:                        13:41:43   Log-Likelihood:                -1337.4
converged:                       True   LL-Null:                       -1495.8
Covariance Type:            nonrobust   LLR p-value:                 2.891e-55
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.1821      0.140      1.304      0.192      -0.092       0.456
Que2           -0.1110    

### Oversampling

In [22]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='over')

Optimization terminated successfully.
         Current function value: 0.616760
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 4738
Model:                          Logit   Df Residuals:                     4717
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1102
Time:                        13:41:43   Log-Likelihood:                -2922.2
converged:                       True   LL-Null:                       -3284.1
Covariance Type:            nonrobust   LLR p-value:                1.986e-140
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.1886      0.094      1.997      0.046       0.003       0.374
Que2           -0.1282    

### SMOTEENN

In [23]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='smoteen')

Optimization terminated successfully.
         Current function value: 0.550389
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1143
Time:                        13:41:43   Log-Likelihood:                -1897.7
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 3.800e-91
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.0987      0.119      0.830      0.407      -0.135       0.332
Que2           -0.0075    

## Random Forest

### No balance

In [24]:
perform_random_forest(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 900}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6385358413828165
Test accuracy:  0.7146171693735499
Test precision:  0.7534039334341907
Test recall:  0.8571428571428571
Test F1 score:  0.8019323671497585
Test confusion matrix: 
[[118 163]
 [ 83 498]]


### Undersampling

In [25]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 70,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 800}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6851268827215318
Test accuracy:  0.6832946635730859
Test precision:  0.8195020746887967
Test recall:  0.6798623063683304
Test F1 score:  0.7431796801505174
Test confusion matrix: 
[[194  87]
 [186 395]]


### Oversampling

In [26]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6492824373242845
Test accuracy:  0.7018561484918794
Test precision:  0.7673267326732673
Test recall:  0.8003442340791739
Test F1 score:  0.7834877843302442
Test confusion matrix: 
[[140 141]
 [116 465]]


### SMOTEENN

In [27]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='smoteen')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 800}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6376170671501461
Test accuracy:  0.7146171693735499
Test precision:  0.7526395173453997
Test recall:  0.8588640275387264
Test F1 score:  0.8022508038585209
Test confusion matrix: 
[[117 164]
 [ 82 499]]


## Decision Tree

### No balance

In [28]:
perform_decision_tree(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6092177556183045
Test accuracy:  0.6763341067285383
Test precision:  0.7396825396825397
Test recall:  0.802065404475043
Test F1 score:  0.7696118909991743
Test confusion matrix: 
[[117 164]
 [115 466]]


### Undersampling

In [29]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6584487415855593
Test accuracy:  0.6473317865429234
Test precision:  0.8070953436807096
Test recall:  0.6265060240963856
Test F1 score:  0.7054263565891473
Test confusion matrix: 
[[194  87]
 [217 364]]


### Oversampling

In [30]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6728796222000355
Test accuracy:  0.7076566125290024
Test precision:  0.789103690685413
Test recall:  0.7728055077452668
Test F1 score:  0.7808695652173913
Test confusion matrix: 
[[161 120]
 [132 449]]


### SMOTEENN

In [31]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='SMOTEEN')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6565744421509117
Test accuracy:  0.703016241299304
Test precision:  0.7740303541315345
Test recall:  0.7900172117039587
Test F1 score:  0.7819420783645655
Test confusion matrix: 
[[147 134]
 [122 459]]


# Strategy 3 - mean

In [32]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data_mean = pd.DataFrame(imputer.fit_transform(data))
data_mean.columns = data.columns
data_mean.index = data.index
# mean values are not integer numbers so rounding to int was necessary
data_mean = data_mean.round()
data_mean = perform_mapping(data_mean, questions_map_EN)
model_data = data_mean.drop(columns=['Que16'])
target_data = data_mean['Que16']
X_train, X_test, y_train, y_test = train_test_split(model_data.values, target_data.values, test_size=0.20, random_state=42)

## Logistic Regression

### No balance

In [33]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col)

Optimization terminated successfully.
         Current function value: 0.545146
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1228
Time:                        14:03:30   Log-Likelihood:                -1879.7
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 1.012e-98
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.2643      0.105      2.508      0.012       0.058       0.471
Que2           -0.0023    

### Undersampling

In [34]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='under')

Optimization terminated successfully.
         Current function value: 0.614073
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 2158
Model:                          Logit   Df Residuals:                     2137
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1141
Time:                        14:03:30   Log-Likelihood:                -1325.2
converged:                       True   LL-Null:                       -1495.8
Covariance Type:            nonrobust   LLR p-value:                 2.774e-60
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.2976      0.122      2.432      0.015       0.058       0.537
Que2           -0.0948    

### Oversampling

In [35]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='over')

Optimization terminated successfully.
         Current function value: 0.609498
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 4738
Model:                          Logit   Df Residuals:                     4717
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1207
Time:                        14:03:30   Log-Likelihood:                -2887.8
converged:                       True   LL-Null:                       -3284.1
Covariance Type:            nonrobust   LLR p-value:                5.109e-155
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.3320      0.083      3.997      0.000       0.169       0.495
Que2           -0.1092    

### SMOTEENN

In [36]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='smoteen')

Optimization terminated successfully.
         Current function value: 0.545146
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                  0.1228
Time:                        14:03:30   Log-Likelihood:                -1879.7
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 1.012e-98
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.2643      0.105      2.508      0.012       0.058       0.471
Que2           -0.0023    

## Random Forest

### No balance

In [37]:
perform_random_forest(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6507586012581081
Test accuracy:  0.7273781902552204
Test precision:  0.7597597597597597
Test recall:  0.8709122203098106
Test F1 score:  0.8115477145148355
Test confusion matrix: 
[[121 160]
 [ 75 506]]


### Undersampling

In [38]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 500}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6867898640826653
Test accuracy:  0.6867749419953596
Test precision:  0.8193018480492813
Test recall:  0.6867469879518072
Test F1 score:  0.747191011235955
Test confusion matrix: 
[[193  88]
 [182 399]]


### Oversampling

In [39]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6686226349219961
Test accuracy:  0.7192575406032483
Test precision:  0.7792421746293245
Test recall:  0.8141135972461274
Test F1 score:  0.7962962962962963
Test confusion matrix: 
[[147 134]
 [108 473]]


### SMOTEENN

In [40]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='smoteen')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 400}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6535149239561193
Test accuracy:  0.7273781902552204
Test precision:  0.7621212121212121
Test recall:  0.8657487091222031
Test F1 score:  0.8106365834004835
Test confusion matrix: 
[[124 157]
 [ 78 503]]


## Decision Tree

### No balance

In [41]:
perform_decision_tree(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6647852212102094
Test accuracy:  0.7041763341067285
Test precision:  0.7820069204152249
Test recall:  0.7779690189328744
Test F1 score:  0.7799827437446074
Test confusion matrix: 
[[155 126]
 [129 452]]


### Undersampling

In [42]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6360612761161576
Test accuracy:  0.6357308584686775
Test precision:  0.7834394904458599
Test recall:  0.6351118760757315
Test F1 score:  0.7015209125475286
Test confusion matrix: 
[[179 102]
 [212 369]]


### Oversampling

In [43]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.64791040113683
Test accuracy:  0.6925754060324826
Test precision:  0.7696245733788396
Test recall:  0.7762478485370051
Test F1 score:  0.7729220222793488
Test confusion matrix: 
[[146 135]
 [130 451]]


### SMOTEENN

In [44]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='SMOTEEN')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 110,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6599004048731786
Test accuracy:  0.7099767981438515
Test precision:  0.7744610281923715
Test recall:  0.8037865748709122
Test F1 score:  0.7888513513513513
Test confusion matrix: 
[[145 136]
 [114 467]]


# Strategy 4 - most frequent

In [45]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_most_frequent = pd.DataFrame(imputer.fit_transform(data))
data_most_frequent.columns = data.columns
data_most_frequent.index = data.index
# mean values are not integer numbers so rounding to int was necessary
data_most_frequent = perform_mapping(data_most_frequent, questions_map_EN)
model_data = data_most_frequent.drop(columns=['Que16'])
target_data = data_most_frequent['Que16']
X_train, X_test, y_train, y_test = train_test_split(model_data.values, target_data.values, test_size=0.20, random_state=42)

## Logistic Regression

### No balance

In [46]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col)

Optimization terminated successfully.
         Current function value: 0.574586
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                 0.07538
Time:                        14:24:50   Log-Likelihood:                -1981.2
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 1.564e-56
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.1369      0.115      1.189      0.234      -0.089       0.363
Que2           -0.0607    

### Undersampling

In [47]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='under')

Optimization terminated successfully.
         Current function value: 0.647708
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 2158
Model:                          Logit   Df Residuals:                     2137
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                 0.06556
Time:                        14:24:50   Log-Likelihood:                -1397.8
converged:                       True   LL-Null:                       -1495.8
Covariance Type:            nonrobust   LLR p-value:                 6.585e-31
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.1861      0.135      1.379      0.168      -0.078       0.451
Que2           -0.1732    

### Oversampling

In [48]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='over')

Optimization terminated successfully.
         Current function value: 0.648641
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 4738
Model:                          Logit   Df Residuals:                     4717
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                 0.06421
Time:                        14:24:50   Log-Likelihood:                -3073.3
converged:                       True   LL-Null:                       -3284.1
Covariance Type:            nonrobust   LLR p-value:                 6.240e-77
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.2137      0.091      2.353      0.019       0.036       0.392
Que2           -0.1699    

### SMOTEENN

In [49]:
perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance='smoteen')

Optimization terminated successfully.
         Current function value: 0.574586
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  Que16   No. Observations:                 3448
Model:                          Logit   Df Residuals:                     3427
Method:                           MLE   Df Model:                           20
Date:                Wed, 18 Aug 2021   Pseudo R-squ.:                 0.07538
Time:                        14:24:50   Log-Likelihood:                -1981.2
converged:                       True   LL-Null:                       -2142.7
Covariance Type:            nonrobust   LLR p-value:                 1.564e-56
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Que1            0.1369      0.115      1.189      0.234      -0.089       0.363
Que2           -0.0607    

## Random Forest

### No balance

In [50]:
perform_random_forest(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6252779292053828
Test accuracy:  0.7041763341067285
Test precision:  0.7454819277108434
Test recall:  0.8519793459552496
Test F1 score:  0.7951807228915664
Test confusion matrix: 
[[112 169]
 [ 86 495]]


### Undersampling

In [51]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 700}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6610764358909965
Test accuracy:  0.6682134570765661
Test precision:  0.7967806841046278
Test recall:  0.6815834767641996
Test F1 score:  0.7346938775510204
Test confusion matrix: 
[[180 101]
 [185 396]]


### Oversampling

In [52]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6385480917059188
Test accuracy:  0.6960556844547564
Test precision:  0.759349593495935
Test recall:  0.8037865748709122
Test F1 score:  0.7809364548494984
Test confusion matrix: 
[[133 148]
 [114 467]]


### SMOTEENN

In [53]:
perform_random_forest(X_train, X_test, y_train, y_test, balance='smoteen')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 400}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6260803253685816
Test accuracy:  0.7064965197215777
Test precision:  0.7455089820359282
Test recall:  0.8571428571428571
Test F1 score:  0.7974379503602882
Test confusion matrix: 
[[111 170]
 [ 83 498]]


## Decision Tree

### No balance

In [54]:
perform_decision_tree(X_train, X_test, y_train, y_test)

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6568071982898549
Test accuracy:  0.6983758700696056
Test precision:  0.7762478485370051
Test recall:  0.7762478485370051
Test F1 score:  0.7762478485370051
Test confusion matrix: 
[[151 130]
 [130 451]]


### Undersampling

In [55]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='under')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.5918345471361808
Test accuracy:  0.5649651972157773
Test precision:  0.7627551020408163
Test recall:  0.5146299483648882
Test F1 score:  0.6145940390544707
Test confusion matrix: 
[[188  93]
 [282 299]]


### Oversampling

In [56]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='over')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.6290939048517405
Test accuracy:  0.6647331786542924
Test precision:  0.7616487455197133
Test recall:  0.7314974182444062
Test F1 score:  0.7462686567164178
Test confusion matrix: 
[[148 133]
 [156 425]]


### SMOTEENN

In [57]:
perform_decision_tree(X_train, X_test, y_train, y_test, balance='SMOTEEN')

Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'criterion': 'entropy',
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'splitter': 'best'}
(array([0., 1.]), array([281, 581]))
Test ROC AUC:  0.573572377971469
Test accuracy:  0.654292343387471
Test precision:  0.7166921898928025
Test recall:  0.8055077452667814
Test F1 score:  0.7585089141004863
Test confusion matrix: 
[[ 96 185]
 [113 468]]
