4 different datasets<br>
1. SMOTE<br>
2. SMOTE with PCA<br>
3. RandomSampling<br>
4. RandomSampling with PCA<br>
<br>
We will try to find the best dataset for a decision tree classifier (the model trained in the cells below).

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

In [2]:
# this function aims to measure some metrics
def metric(clf, x, y, mode=None, return_output=False):  
    
    preds = clf.predict(x)
    prec = precision_score(y, preds)
    reca = recall_score(y, preds)
    roc = roc_auc_score(y, preds)
    f1 = f1_score(y, preds)
    acc = accuracy_score(y, preds)
    conf_mat = confusion_matrix(y, preds)
    if mode is not None:
        print(f'=============={mode}=============')
        print("Precision: ", prec)
        print("Recall: ", reca)
        print("ROC score: ", roc)
        print("F1 score: ", f1)
        print("Accuracy score: ", acc)
        print("confusion matrix")
        print(conf_mat)
    
    if return_output is True:
        return [prec, reca, roc, f1, acc]

In [9]:
# Load the SMOTE dataset files: the training table and the final test table.
tr = pd.read_excel(io='./smote_dataset/tr.xlsx')  # (82044, 51)
te = pd.read_excel(io='./smote_dataset/test_final.xlsx')

In [10]:
# Features are every column of `tr` except the last one
# (assumed to be 'outcome' -- TODO confirm); the target is 'outcome'.
col = tr.columns.to_list()[:-1]
x_tr = tr[col]
y_tr = tr['outcome']

# 80/20 train/validation split, stratified on the target, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x_tr,
    y_tr,
    test_size=0.2,
    random_state=8,
    stratify=tr[['outcome']],
)

for caption, part in (
    ('train x shape:', x_train),
    ('train y shape:', y_train),
    ('val x shape:', x_val),
    ('val y shape:', y_val),
):
    print(caption, part.shape)


train x shape: (65635, 50)
train y shape: (65635,)
val x shape: (16409, 50)
val y shape: (16409,)


# SMOTE

In [37]:
# Train one decision tree per split criterion on the current train split
# and collect its validation metrics.
criterion = ['gini', 'entropy']

final_records = []
for c in criterion:
    # Pass the criterion by keyword: scikit-learn estimator constructors are
    # keyword-only, so a positional value fails on current releases.
    clf = DecisionTreeClassifier(criterion=c)
    clf.fit(x_train, y_train)

    # One row per criterion: [criterion, precision, recall, ROC, f1, acc].
    final_records.append([c] + metric(clf, x_val, y_val, None, True))

df_smote = pd.DataFrame(
    final_records,
    columns=['criterion', 'precision', 'recall', 'ROC', 'f1', 'acc'],
)
        



In [38]:
# Display the per-criterion validation metrics collected in df_smote.
df_smote


Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.963649,0.982206,0.972575,0.972839,0.972576
1,entropy,0.967893,0.980987,0.974221,0.974396,0.974221


In [39]:
# Persist the SMOTE results table as CSV.
df_smote.to_csv(path_or_buf='./c45/c45_result_smote.csv')

# SMOTE w/ PCA

In [40]:
# Reduce dimensionality with PCA (n_components=0.98). The projection is
# fitted on the training split and then applied to the validation split.
# NOTE: x_train / x_val are overwritten with their PCA-transformed versions.
pca = PCA(n_components=0.98)
x_train, x_val = pca.fit_transform(x_train), pca.transform(x_val)

for caption, part in (
    ('train x shape:', x_train),
    ('train y shape:', y_train),
    ('val x shape:', x_val),
    ('val y shape:', y_val),
):
    print(caption, part.shape)

train x shape: (65635, 31)
train y shape: (65635,)
val x shape: (16409, 31)
val y shape: (16409,)


In [16]:
# Train one decision tree per split criterion on the current (PCA-projected)
# train split and collect its validation metrics.
criterion = ['gini', 'entropy']

final_records = []
for c in criterion:
    # Pass the criterion by keyword: scikit-learn estimator constructors are
    # keyword-only, so a positional value fails on current releases.
    clf = DecisionTreeClassifier(criterion=c)
    clf.fit(x_train, y_train)

    # One row per criterion: [criterion, precision, recall, ROC, f1, acc].
    final_records.append([c] + metric(clf, x_val, y_val, None, True))

df_smote_pca = pd.DataFrame(
    final_records,
    columns=['criterion', 'precision', 'recall', 'ROC', 'f1', 'acc'],
)
        



In [17]:
# Display the per-criterion validation metrics collected in df_smote_pca.
df_smote_pca

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.963295,0.981962,0.972271,0.972539,0.972271
1,entropy,0.966935,0.980134,0.973307,0.97349,0.973307


In [18]:
# Persist the SMOTE + PCA results table as CSV.
df_smote_pca.to_csv(path_or_buf='./c45/c45_result_smote_pca.csv')

# RandomSampling

In [19]:
# Load the downsampling dataset files. This rebinds `tr` and `te`, replacing
# the SMOTE tables loaded earlier.
tr = pd.read_excel(io='./downsampling_dataset/tr.xlsx')  # (1406, 51)
te = pd.read_excel(io='./downsampling_dataset/te.xlsx')

In [20]:
# Features are every column of `tr` except the last one
# (assumed to be 'outcome' -- TODO confirm); the target is 'outcome'.
col = tr.columns.to_list()[:-1]
x_tr = tr[col]
y_tr = tr['outcome']

# 80/20 train/validation split, stratified on the target, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x_tr,
    y_tr,
    test_size=0.2,
    random_state=8,
    stratify=tr[['outcome']],
)

for caption, part in (
    ('train x shape:', x_train),
    ('train y shape:', y_train),
    ('val x shape:', x_val),
    ('val y shape:', y_val),
):
    print(caption, part.shape)

train x shape: (1124, 50)
train y shape: (1124,)
val x shape: (282, 50)
val y shape: (282,)


In [21]:
# Train one decision tree per split criterion on the current train split
# and collect its validation metrics.
criterion = ['gini', 'entropy']

final_records = []
for c in criterion:
    # Pass the criterion by keyword: scikit-learn estimator constructors are
    # keyword-only, so a positional value fails on current releases.
    clf = DecisionTreeClassifier(criterion=c)
    clf.fit(x_train, y_train)

    # One row per criterion: [criterion, precision, recall, ROC, f1, acc].
    final_records.append([c] + metric(clf, x_val, y_val, None, True))

df_rs = pd.DataFrame(
    final_records,
    columns=['criterion', 'precision', 'recall', 'ROC', 'f1', 'acc'],
)
        



In [22]:
# Display the per-criterion validation metrics collected in df_rs.
df_rs

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.642857,0.638298,0.641844,0.640569,0.641844
1,entropy,0.678571,0.673759,0.677305,0.676157,0.677305


In [23]:
# Persist the random-sampling results table as CSV.
df_rs.to_csv(path_or_buf='./c45/c45_result_rs.csv')

# RandomSampling w/ PCA

In [24]:
# Reduce dimensionality with PCA (n_components=0.98). The projection is
# fitted on the training split and then applied to the validation split.
# NOTE: x_train / x_val are overwritten with their PCA-transformed versions.
pca = PCA(n_components=0.98)
x_train, x_val = pca.fit_transform(x_train), pca.transform(x_val)

for caption, part in (
    ('train x shape:', x_train),
    ('train y shape:', y_train),
    ('val x shape:', x_val),
    ('val y shape:', y_val),
):
    print(caption, part.shape)

train x shape: (1124, 32)
train y shape: (1124,)
val x shape: (282, 32)
val y shape: (282,)


In [25]:
# Train one decision tree per split criterion on the current (PCA-projected)
# train split and collect its validation metrics.
criterion = ['gini', 'entropy']

final_records = []
for c in criterion:
    # Pass the criterion by keyword: scikit-learn estimator constructors are
    # keyword-only, so a positional value fails on current releases.
    clf = DecisionTreeClassifier(criterion=c)
    clf.fit(x_train, y_train)

    # One row per criterion: [criterion, precision, recall, ROC, f1, acc].
    final_records.append([c] + metric(clf, x_val, y_val, None, True))

df_rs_pca = pd.DataFrame(
    final_records,
    columns=['criterion', 'precision', 'recall', 'ROC', 'f1', 'acc'],
)
        



In [26]:
# Display the per-criterion validation metrics collected in df_rs_pca.
df_rs_pca

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.625954,0.58156,0.617021,0.602941,0.617021
1,entropy,0.622222,0.595745,0.617021,0.608696,0.617021


In [27]:
# Persist the random-sampling + PCA results table as CSV.
df_rs_pca.to_csv(path_or_buf='./c45/c45_result_rs_pca.csv')

#### After calculating the metrics for the four datasets,
#### we want to use the best one,
#### so let us compare the performance on each dataset.

In [41]:
df_smote
# In the recorded run, SMOTE without PCA scores at least as high as the other
# three result tables on every metric, so this is the dataset we choose.

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.963649,0.982206,0.972575,0.972839,0.972576
1,entropy,0.967893,0.980987,0.974221,0.974396,0.974221


In [29]:
# Re-display the SMOTE + PCA results for side-by-side comparison.
df_smote_pca

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.963295,0.981962,0.972271,0.972539,0.972271
1,entropy,0.966935,0.980134,0.973307,0.97349,0.973307


In [30]:
# Re-display the random-sampling results for side-by-side comparison.
df_rs

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.642857,0.638298,0.641844,0.640569,0.641844
1,entropy,0.678571,0.673759,0.677305,0.676157,0.677305


In [31]:
# Re-display the random-sampling + PCA results for side-by-side comparison.
df_rs_pca

Unnamed: 0,criterion,precision,recall,ROC,f1,acc
0,gini,0.625954,0.58156,0.617021,0.602941,0.617021
1,entropy,0.622222,0.595745,0.617021,0.608696,0.617021


### Best dataset: we choose SMOTE without PCA

In [27]:
# Final model: retrain a decision tree on the chosen dataset (SMOTE, no PCA).
#
# `tr` and `te` were last bound to the downsampling files earlier in the
# notebook, so reload the SMOTE files here. Without this, a top-to-bottom run
# would train on the downsampled data instead of the 65635-row SMOTE split
# this cell is meant to use.
tr = pd.read_excel('./smote_dataset/tr.xlsx')
te = pd.read_excel('./smote_dataset/test_final.xlsx')

# get labels
y_tr = tr['outcome']
# get data: every column except the last (assumed to be 'outcome' -- TODO confirm)
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

# Same stratified 80/20 split and seed as in the dataset comparison above.
x_train, x_val, y_train, y_val = train_test_split(
    x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']]
)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

# Default hyper-parameters; no random_state is set, so scores can vary
# slightly between runs.
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

# Print the validation report and return the scores as the cell output.
metric(clf, x_val, y_val, True, True)

train x shape: (65635, 50)
train y shape: (65635,)
val x shape: (16409, 50)
val y shape: (16409,)
Precision:  0.9643412707909537
Recall:  0.982205971968312
ROC score:  0.9729411137267207
F1 score:  0.9731916435213137
Accuracy score:  0.9729416783472484
confusion matrix
[[7906  298]
 [ 146 8059]]


[0.9643412707909537,
 0.982205971968312,
 0.9729411137267207,
 0.9731916435213137,
 0.9729416783472484]

In [10]:
from sklearn.tree import export_graphviz
import os

# Export the fitted tree `clf` to a Graphviz DOT file.
# Feature names are every column of `tr` except the last, converted to a
# plain list: some scikit-learn releases reject a pandas Index for
# `feature_names`.
cols = tr.columns[:-1].to_list()
with open("./c45/Tree.dot", 'w', encoding='utf-8') as f:
    export_graphviz(
        clf,
        out_file=f,
        feature_names=cols,
        filled=True,
        rounded=True,
    )

In [23]:
# Inspect the first row of the test table, labelled by column name.
te.iloc[0, :]

Fluid and Electrolyte Disorders            0.000000
cci3                                       0.000000
Hypothyroidism                             0.000000
elx3                                       0.000000
Valvular Disease                           1.000000
Diagnosis_5                                0.000000
Anemia_0                                   1.000000
Other Neurological Disorders               0.000000
Obesity                                    0.000000
Congestive Heart Failure                   0.000000
Psyciatric disorder_0                      1.000000
Renal Failure                              0.000000
Diagnosis_2                                1.000000
Diagnosis_3                                0.000000
Lung disease_0                             1.000000
Solid Tumor without Metastasis             0.000000
Peptic Ulcer Disease excluding bleeding    0.000000
Rheumatoid Arthritis/collagen              1.000000
Liver Disease                              0.000000
Heart diseas

In [24]:
# The same first row as a bare NumPy array (no column labels).
te.values[0]

array([0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 1.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 1.        ,
       0.6289394 , 0.21569802, 0.3122449 , 0.06896552, 0.14634146,
       0.56551724, 0.51898734, 0.47783251, 0.63278689, 0.56578947,
       1.        , 0.41224152, 0.4175    , 0.38576779, 0.30909091,
       0.17123288, 0.40955631, 0.13034623, 0.13807107, 0.14705882])

In [28]:
# Predict the class of the first test row. Selecting with a list keeps a
# one-row DataFrame, so the row carries the same column names the model was
# fitted with (a bare ndarray drops them and triggers a feature-name warning
# on current scikit-learn). This matches how the full table is passed below.
clf.predict(te.iloc[[0]])



array([1], dtype=int64)

In [29]:
# Classify every row of the test table with the final model.
print('final prediction of testing data')
clf.predict(X=te)

final prediction of testing data


array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,