4 different datasets</br>
1. smote</br>
2. smote with PCA</br>
3. RandomSampling</br>
4. RandomSampling with PCA</br>
</br>
We will try to find the best dataset for logistic regression

In [1]:
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RFC

In [2]:
# this function aims to measure some metrics
def metric(clf, x, y, mode=None, return_output=False):  
    
    preds = clf.predict(x)
    prec = precision_score(y, preds)
    reca = recall_score(y, preds)
    roc = roc_auc_score(y, preds)
    f1 = f1_score(y, preds)
    acc = accuracy_score(y, preds)
    conf_mat = confusion_matrix(y, preds)
    if mode is not None:
        print(f'=============={mode}=============')
        print("Precision: ", prec)
        print("Recall: ", reca)
        print("ROC score: ", roc)
        print("F1 score: ", f1)
        print("Accuracy score: ", acc)
        print("confusion matrix")
        print(conf_mat)
    
    if return_output is True:
        return [prec, reca, roc, f1, acc]

In [9]:
tr = pd.read_excel('./smote_dataset/tr.xlsx')# (82044, 51)
# te = pd.read_excel('./smote_dataset/test_final.xlsx')

In [10]:
# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)


train x shape: (65635, 50)
train y shape: (65635,)
val x shape: (16409, 50)
val y shape: (16409,)


# SMOTE

In [12]:
n_estimators = [50, 100, 200, 250, 300, 350]
criterion = ['gini', 'entropy']
min_samples_leaf = [50, 60, 70, 80, 90]


final_records = []
for n in n_estimators:
    for c in criterion: 
        for m in min_samples_leaf:
            record = []
            record.append(n)
            record.append(c)
            record.append(m)
            clf = RFC(random_state=0,
                     n_estimators=n,
                     criterion=c,
                     min_samples_leaf=m,
                     )
            clf.fit(x_train, y_train)

            me_list = metric(clf, x_val, y_val, None, True)
            record += me_list
            final_records.append(record)

df_smote = pd.DataFrame(final_records, columns=['n_estimators', 'criterion', 'min_sample_leaf', 'precision', 'recall', 'ROC', 'f1', 'acc'])


In [13]:
df_smote

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.937515,0.943571,0.940337,0.940533,0.940338
1,50,gini,60,0.928399,0.937112,0.932415,0.932735,0.932415
2,50,gini,70,0.923467,0.932358,0.927539,0.927891,0.92754
3,50,gini,80,0.921723,0.925655,0.923517,0.923685,0.923518
4,50,gini,90,0.915386,0.921633,0.918215,0.918499,0.918216
5,50,entropy,50,0.941531,0.951859,0.946371,0.946667,0.946371
6,50,entropy,60,0.934851,0.94613,0.940093,0.940457,0.940094
7,50,entropy,70,0.923965,0.936015,0.92949,0.929951,0.92949
8,50,entropy,80,0.921592,0.936868,0.928575,0.929167,0.928576
9,50,entropy,90,0.912887,0.929799,0.920531,0.921266,0.920531


In [14]:
df_smote.to_csv('./RF/RF_result_smote.csv')

# SMOTE w/ PCA

In [15]:
pca = PCA(n_components = 0.98)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

train x shape: (65635, 31)
train y shape: (65635,)
val x shape: (16409, 31)
val y shape: (16409,)


In [16]:
n_estimators = [50, 100, 200, 250, 300, 350]
criterion = ['gini', 'entropy']
min_samples_leaf = [50, 60, 70, 80, 90]


final_records = []
for n in n_estimators:
    for c in criterion: 
        for m in min_samples_leaf:
            record = []
            record.append(n)
            record.append(c)
            record.append(m)
            clf = RFC(random_state=0,
                     n_estimators=n,
                     criterion=c,
                     min_samples_leaf=m,
                     )
            clf.fit(x_train, y_train)

            me_list = metric(clf, x_val, y_val, None, True)
            record += me_list
            final_records.append(record)

df_smote_pca = pd.DataFrame(final_records, columns=['n_estimators', 'criterion', 'min_sample_leaf', 'precision', 'recall', 'ROC', 'f1', 'acc'])


In [17]:
df_smote_pca

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.920587,0.925411,0.922786,0.922993,0.922786
1,50,gini,60,0.914422,0.914199,0.914315,0.91431,0.914315
2,50,gini,70,0.906402,0.907617,0.906941,0.907009,0.906941
3,50,gini,80,0.900646,0.900427,0.900542,0.900536,0.900542
4,50,gini,90,0.896476,0.896039,0.896276,0.896257,0.896276
5,50,entropy,50,0.923522,0.934552,0.928575,0.929004,0.928576
6,50,entropy,60,0.915759,0.922121,0.918642,0.918929,0.918642
7,50,entropy,70,0.909058,0.91493,0.911695,0.911984,0.911695
8,50,entropy,80,0.90354,0.905302,0.904321,0.90442,0.904321
9,50,entropy,90,0.896321,0.899817,0.897861,0.898066,0.897861


In [18]:
df_smote.to_csv('./RF/RF_result_smote_pca.csv')

# RandomSampling

In [19]:
tr = pd.read_excel('./downsampling_dataset/tr.xlsx')# (1406, 51)
te = pd.read_excel('./downsampling_dataset/te.xlsx')

In [20]:
# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)


train x shape: (1124, 50)
train y shape: (1124,)
val x shape: (282, 50)
val y shape: (282,)


In [21]:
n_estimators = [50, 100, 200, 250, 300, 350]
criterion = ['gini', 'entropy']
min_samples_leaf = [50, 60, 70, 80, 90]


final_records = []
for n in n_estimators:
    for c in criterion: 
        for m in min_samples_leaf:
            record = []
            record.append(n)
            record.append(c)
            record.append(m)
            clf = RFC(random_state=0,
                     n_estimators=n,
                     criterion=c,
                     min_samples_leaf=m,
                     )
            clf.fit(x_train, y_train)

            me_list = metric(clf, x_val, y_val, None, True)
            record += me_list
            final_records.append(record)

df_rs = pd.DataFrame(final_records, columns=['n_estimators', 'criterion', 'min_sample_leaf', 'precision', 'recall', 'ROC', 'f1', 'acc'])


In [22]:
df_rs

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.65,0.737589,0.670213,0.69103,0.670213
1,50,gini,60,0.63354,0.723404,0.652482,0.675497,0.652482
2,50,gini,70,0.64557,0.723404,0.663121,0.682274,0.663121
3,50,gini,80,0.624242,0.730496,0.64539,0.673203,0.64539
4,50,gini,90,0.615854,0.716312,0.634752,0.662295,0.634752
5,50,entropy,50,0.654088,0.737589,0.673759,0.693333,0.673759
6,50,entropy,60,0.656051,0.730496,0.673759,0.691275,0.673759
7,50,entropy,70,0.641509,0.723404,0.659574,0.68,0.659574
8,50,entropy,80,0.626506,0.737589,0.648936,0.677524,0.648936
9,50,entropy,90,0.616766,0.730496,0.638298,0.668831,0.638298


In [23]:
df_rs.to_csv('./RF/RF_result_rs.csv')

# RandomSampling w/ PCA

In [24]:
pca = PCA(n_components = 0.98)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

train x shape: (1124, 32)
train y shape: (1124,)
val x shape: (282, 32)
val y shape: (282,)


In [25]:
n_estimators = [50, 100, 200, 250, 300, 350]
criterion = ['gini', 'entropy']
min_samples_leaf = [50, 60, 70, 80, 90]


final_records = []
for n in n_estimators:
    for c in criterion: 
        for m in min_samples_leaf:
            record = []
            record.append(n)
            record.append(c)
            record.append(m)
            clf = RFC(random_state=0,
                     n_estimators=n,
                     criterion=c,
                     min_samples_leaf=m,
                     )
            clf.fit(x_train, y_train)

            me_list = metric(clf, x_val, y_val, None, True)
            record += me_list
            final_records.append(record)

df_rs_pca = pd.DataFrame(final_records, columns=['n_estimators', 'criterion', 'min_sample_leaf', 'precision', 'recall', 'ROC', 'f1', 'acc'])


In [26]:
df_rs_pca

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.597561,0.695035,0.613475,0.642623,0.613475
1,50,gini,60,0.606061,0.70922,0.624113,0.653595,0.624113
2,50,gini,70,0.597561,0.695035,0.613475,0.642623,0.613475
3,50,gini,80,0.598802,0.70922,0.617021,0.649351,0.617021
4,50,gini,90,0.585366,0.680851,0.599291,0.629508,0.599291
5,50,entropy,50,0.621118,0.70922,0.638298,0.662252,0.638298
6,50,entropy,60,0.606061,0.70922,0.624113,0.653595,0.624113
7,50,entropy,70,0.60241,0.70922,0.620567,0.651466,0.620567
8,50,entropy,80,0.60241,0.70922,0.620567,0.651466,0.620567
9,50,entropy,90,0.595092,0.687943,0.609929,0.638158,0.609929


In [27]:
df_rs_pca.to_csv('./RF/RF_result_rs_pca.csv')

#### After calculate four different datasets' metrics
#### we want to use the best one 
#### so, let us check the performance of each dataset

In [28]:
df_smote

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.937515,0.943571,0.940337,0.940533,0.940338
1,50,gini,60,0.928399,0.937112,0.932415,0.932735,0.932415
2,50,gini,70,0.923467,0.932358,0.927539,0.927891,0.92754
3,50,gini,80,0.921723,0.925655,0.923517,0.923685,0.923518
4,50,gini,90,0.915386,0.921633,0.918215,0.918499,0.918216
5,50,entropy,50,0.941531,0.951859,0.946371,0.946667,0.946371
6,50,entropy,60,0.934851,0.94613,0.940093,0.940457,0.940094
7,50,entropy,70,0.923965,0.936015,0.92949,0.929951,0.92949
8,50,entropy,80,0.921592,0.936868,0.928575,0.929167,0.928576
9,50,entropy,90,0.912887,0.929799,0.920531,0.921266,0.920531


In [29]:
df_smote_pca

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.920587,0.925411,0.922786,0.922993,0.922786
1,50,gini,60,0.914422,0.914199,0.914315,0.91431,0.914315
2,50,gini,70,0.906402,0.907617,0.906941,0.907009,0.906941
3,50,gini,80,0.900646,0.900427,0.900542,0.900536,0.900542
4,50,gini,90,0.896476,0.896039,0.896276,0.896257,0.896276
5,50,entropy,50,0.923522,0.934552,0.928575,0.929004,0.928576
6,50,entropy,60,0.915759,0.922121,0.918642,0.918929,0.918642
7,50,entropy,70,0.909058,0.91493,0.911695,0.911984,0.911695
8,50,entropy,80,0.90354,0.905302,0.904321,0.90442,0.904321
9,50,entropy,90,0.896321,0.899817,0.897861,0.898066,0.897861


In [30]:
df_rs

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.65,0.737589,0.670213,0.69103,0.670213
1,50,gini,60,0.63354,0.723404,0.652482,0.675497,0.652482
2,50,gini,70,0.64557,0.723404,0.663121,0.682274,0.663121
3,50,gini,80,0.624242,0.730496,0.64539,0.673203,0.64539
4,50,gini,90,0.615854,0.716312,0.634752,0.662295,0.634752
5,50,entropy,50,0.654088,0.737589,0.673759,0.693333,0.673759
6,50,entropy,60,0.656051,0.730496,0.673759,0.691275,0.673759
7,50,entropy,70,0.641509,0.723404,0.659574,0.68,0.659574
8,50,entropy,80,0.626506,0.737589,0.648936,0.677524,0.648936
9,50,entropy,90,0.616766,0.730496,0.638298,0.668831,0.638298


In [31]:
df_rs_pca

Unnamed: 0,n_estimators,criterion,min_sample_leaf,precision,recall,ROC,f1,acc
0,50,gini,50,0.597561,0.695035,0.613475,0.642623,0.613475
1,50,gini,60,0.606061,0.70922,0.624113,0.653595,0.624113
2,50,gini,70,0.597561,0.695035,0.613475,0.642623,0.613475
3,50,gini,80,0.598802,0.70922,0.617021,0.649351,0.617021
4,50,gini,90,0.585366,0.680851,0.599291,0.629508,0.599291
5,50,entropy,50,0.621118,0.70922,0.638298,0.662252,0.638298
6,50,entropy,60,0.606061,0.70922,0.624113,0.653595,0.624113
7,50,entropy,70,0.60241,0.70922,0.620567,0.651466,0.620567
8,50,entropy,80,0.60241,0.70922,0.620567,0.651466,0.620567
9,50,entropy,90,0.595092,0.687943,0.609929,0.638158,0.609929


### now, we want to choose a dataset which performed best in this task

In [3]:
tr = pd.read_excel('./smote_dataset/tr.xlsx')# (82044, 51)
te = pd.read_excel('./smote_dataset/test_final.xlsx')

# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
pca = PCA(n_components = 0.98)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)
x_te = pca.transform(te)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

            
clf = RFC(random_state=0,
         n_estimators=300,
         criterion='entropy',
         min_samples_leaf=50,
         )
clf.fit(x_train, y_train)

metric(clf, x_val, y_val, True, False)


train x shape: (65635, 31)
train y shape: (65635,)
val x shape: (16409, 31)
val y shape: (16409,)
Precision:  0.9288566243194193
Recall:  0.9356489945155393
ROC score:  0.9319883197833669
F1 score:  0.9322404371584699
Accuracy score:  0.9319885428728137
confusion matrix
[[7616  588]
 [ 528 7677]]


In [60]:
from sklearn.tree import export_graphviz
import os

cols = ["feature"+str(i) for i in range(1, x_te.shape[1]+1)]
le=3 #choose three tree to be example
for i in range(1, le+1):
    with open(f"./RF/Tree{i}.dot", 'w') as f:
        export_graphviz(clf.estimators_[i],
                        feature_names=cols,
                        filled=True,
                        rounded=True,
                        out_file=f)


In [4]:
x_te[0].reshape(-1, 1)# the first datapoint

array([[ 0.45357885],
       [ 0.38423615],
       [ 0.57077094],
       [ 0.48654608],
       [ 1.34554126],
       [ 0.22057829],
       [-0.44774681],
       [ 0.52645516],
       [-0.88663745],
       [ 0.41766925],
       [-0.71471069],
       [-0.10084639],
       [ 0.18070588],
       [-0.432354  ],
       [ 0.00285949],
       [-0.18977016],
       [ 0.13479382],
       [ 0.25130629],
       [ 0.05869713],
       [-0.51888306],
       [ 0.14043726],
       [ 0.52810122],
       [-0.04767489],
       [-0.1102922 ],
       [ 0.46409199],
       [ 0.20167183],
       [-0.02295221],
       [ 0.0142072 ],
       [-0.10034815],
       [-0.00514255],
       [ 0.31183918]])

In [5]:
clf.estimators_[0].predict(x_te[0].reshape(1, -1))

array([0.])

In [6]:
print('final prediction of testing data')
clf.predict(x_te)

final prediction of testing data


array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
x_te[0]

array([ 0.45357885,  0.38423615,  0.57077094,  0.48654608,  1.34554126,
        0.22057829, -0.44774681,  0.52645516, -0.88663745,  0.41766925,
       -0.71471069, -0.10084639,  0.18070588, -0.432354  ,  0.00285949,
       -0.18977016,  0.13479382,  0.25130629,  0.05869713, -0.51888306,
        0.14043726,  0.52810122, -0.04767489, -0.1102922 ,  0.46409199,
        0.20167183, -0.02295221,  0.0142072 , -0.10034815, -0.00514255,
        0.31183918])