4 different datasets</br>
1. smote</br>
2. smote with PCA</br>
3. RandomSampling</br>
4. RandomSampling with PCA</br>
</br>
We will try to find the best dataset for logistic regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [5]:
# this function aims to measure some metrics
def metric(clf, x, y, mode=None, return_output=False):  
    
    preds = clf.predict(x)
    prec = precision_score(y, preds)
    reca = recall_score(y, preds)
    roc = roc_auc_score(y, preds)
    f1 = f1_score(y, preds)
    acc = accuracy_score(y, preds)
    conf_mat = confusion_matrix(y, preds)
    if mode is not None:
        print(f'=============={mode}=============')
        print("Precision: ", prec)
        print("Recall: ", reca)
        print("ROC score: ", roc)
        print("F1 score: ", f1)
        print("Accuracy score: ", acc)
        print("confusion matrix")
        print(conf_mat)
    
    if return_output is True:
        return [prec, reca, roc, f1, acc]

In [6]:
tr = pd.read_excel('./smote_dataset/tr.xlsx')# (82044, 51)
# te = pd.read_excel('./smote_dataset/te.xlsx')

KeyboardInterrupt: 

In [None]:
# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)


    

# SMOTE

In [5]:
C = [0.1, 0.5, 1, 5, 10, 15, 20]
penalty=['l1', 'l2', 'elasticnet']

final_records = []
for c in C:
    for p in penalty:  
        record = []
        record.append(c)
        record.append(p)
        clf = LogisticRegression(C=c,
                                 random_state=0,
                                 max_iter=500,
                                 penalty=p,
                                 solver='saga',
                                 l1_ratio=0.5)
        clf.fit(x_train, y_train)
        
        me_list = metric(clf, x_val, y_val, None, True)
        record += me_list
        final_records.append(record)
        
df_smote = pd.DataFrame(final_records, columns=['C', 'penalty', 'precision', 'recall', 'ROC', 'f1', 'acc'])
        



In [6]:
df_smote

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.680537,0.679707,0.680297,0.680122,0.680297
1,0.1,l2,0.675816,0.68117,0.677189,0.678483,0.677189
2,0.1,elasticnet,0.676471,0.68117,0.677677,0.678812,0.677677
3,0.5,l1,0.685224,0.685558,0.685295,0.685391,0.685295
4,0.5,l2,0.68276,0.682511,0.682674,0.682635,0.682674
5,0.5,elasticnet,0.683274,0.683608,0.683344,0.683441,0.683345
6,1.0,l1,0.686117,0.687873,0.686574,0.686994,0.686574
7,1.0,l2,0.683897,0.685314,0.684259,0.684605,0.684259
8,1.0,elasticnet,0.684748,0.686167,0.685112,0.685457,0.685112
9,5.0,l1,0.68561,0.688117,0.68627,0.686861,0.68627


In [7]:
df_smote.to_csv('./LR/LR_result_smote.csv')

# SMOTE w/ PCA

In [8]:
pca = PCA(n_components = 0.98)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

train x shape: (65635, 31)
train y shape: (65635,)
val x shape: (16409, 31)
val y shape: (16409,)


In [9]:
C = [0.1, 0.5, 1, 5, 10, 15, 20]
penalty=['l1', 'l2', 'elasticnet']

final_records = []
for c in C:
    for p in penalty:  
        record = []
        record.append(c)
        record.append(p)
        clf = LogisticRegression(C=c,
                                 random_state=0,
                                 max_iter=500,
                                 penalty=p,
                                 solver='saga',
                                 l1_ratio=0.5)
        clf.fit(x_train, y_train)
        
        me_list = metric(clf, x_val, y_val, None, True)
        record += me_list
        final_records.append(record)
        
df_smote_pca = pd.DataFrame(final_records, columns=['C', 'penalty', 'precision', 'recall', 'ROC', 'f1', 'acc'])
        



In [10]:
df_smote_pca.to_csv('./LR/LR_result_smote_pca.csv')

# RandomSampling

In [11]:
tr = pd.read_excel('./downsampling_dataset/tr.xlsx')# (1406, 51)
te = pd.read_excel('./downsampling_dataset/te.xlsx')

In [12]:
# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

train x shape: (1124, 50)
train y shape: (1124,)
val x shape: (282, 50)
val y shape: (282,)


In [13]:
C = [0.1, 0.5, 1, 5, 10, 15, 20]
penalty=['l1', 'l2', 'elasticnet']

final_records = []
for c in C:
    for p in penalty:  
        record = []
        record.append(c)
        record.append(p)
        clf = LogisticRegression(C=c,
                                 random_state=0,
                                 max_iter=500,
                                 penalty=p,
                                 solver='saga',
                                 l1_ratio=0.5)
        clf.fit(x_train, y_train)
        
        me_list = metric(clf, x_val, y_val, None, True)
        record += me_list
        final_records.append(record)
        
df_rs = pd.DataFrame(final_records, columns=['C', 'penalty', 'precision', 'recall', 'ROC', 'f1', 'acc'])
        



In [14]:
df_rs

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.597403,0.652482,0.606383,0.623729,0.606383
1,0.1,l2,0.613636,0.574468,0.606383,0.593407,0.606383
2,0.1,elasticnet,0.604027,0.638298,0.609929,0.62069,0.609929
3,0.5,l1,0.636364,0.64539,0.638298,0.640845,0.638298
4,0.5,l2,0.631206,0.631206,0.631206,0.631206,0.631206
5,0.5,elasticnet,0.635714,0.631206,0.634752,0.633452,0.634752
6,1.0,l1,0.631206,0.631206,0.631206,0.631206,0.631206
7,1.0,l2,0.661972,0.666667,0.663121,0.664311,0.663121
8,1.0,elasticnet,0.65,0.64539,0.648936,0.647687,0.648936
9,5.0,l1,0.634483,0.652482,0.638298,0.643357,0.638298


In [15]:
df_rs.to_csv('./LR/LR_result_rs.csv')

# RandomSampling w/ PCA

In [16]:
pca = PCA(n_components = 0.98)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)

print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)

train x shape: (1124, 32)
train y shape: (1124,)
val x shape: (282, 32)
val y shape: (282,)


In [17]:
C = [0.1, 0.5, 1, 5, 10, 15, 20]
penalty=['l1', 'l2', 'elasticnet']

final_records = []
for c in C:
    for p in penalty:  
        record = []
        record.append(c)
        record.append(p)
        clf = LogisticRegression(C=c,
                                 random_state=0,
                                 max_iter=500,
                                 penalty=p,
                                 solver='saga',
                                 l1_ratio=0.5)
        clf.fit(x_train, y_train)
        
        me_list = metric(clf, x_val, y_val, None, True)
        record += me_list
        final_records.append(record)
        
df_rs_pca = pd.DataFrame(final_records, columns=['C', 'penalty', 'precision', 'recall', 'ROC', 'f1', 'acc'])
        



In [18]:
df_rs_pca

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.659091,0.617021,0.648936,0.637363,0.648936
1,0.1,l2,0.607692,0.560284,0.599291,0.583026,0.599291
2,0.1,elasticnet,0.635036,0.617021,0.631206,0.625899,0.631206
3,0.5,l1,0.585714,0.58156,0.585106,0.58363,0.585106
4,0.5,l2,0.586957,0.574468,0.585106,0.580645,0.585106
5,0.5,elasticnet,0.597122,0.588652,0.595745,0.592857,0.595745
6,1.0,l1,0.604478,0.574468,0.599291,0.589091,0.599291
7,1.0,l2,0.586466,0.553191,0.58156,0.569343,0.58156
8,1.0,elasticnet,0.601504,0.567376,0.595745,0.583942,0.595745
9,5.0,l1,0.605839,0.588652,0.602837,0.597122,0.602837


In [19]:
df_rs_pca.to_csv('./LR/LR_result_rs_pca.csv')

#### After calculate four different datasets' metrics
#### we want to use the best one 
#### so, let us check the performance of each dataset

In [20]:
df_smote
# we can choose 

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.680537,0.679707,0.680297,0.680122,0.680297
1,0.1,l2,0.675816,0.68117,0.677189,0.678483,0.677189
2,0.1,elasticnet,0.676471,0.68117,0.677677,0.678812,0.677677
3,0.5,l1,0.685224,0.685558,0.685295,0.685391,0.685295
4,0.5,l2,0.68276,0.682511,0.682674,0.682635,0.682674
5,0.5,elasticnet,0.683274,0.683608,0.683344,0.683441,0.683345
6,1.0,l1,0.686117,0.687873,0.686574,0.686994,0.686574
7,1.0,l2,0.683897,0.685314,0.684259,0.684605,0.684259
8,1.0,elasticnet,0.684748,0.686167,0.685112,0.685457,0.685112
9,5.0,l1,0.68561,0.688117,0.68627,0.686861,0.68627


In [21]:
df_smote_pca

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.633716,0.639122,0.634834,0.636408,0.634835
1,0.1,l2,0.632611,0.636929,0.633494,0.634763,0.633494
2,0.1,elasticnet,0.633394,0.638026,0.634347,0.635701,0.634347
3,0.5,l1,0.635551,0.641438,0.636784,0.638481,0.636785
4,0.5,l2,0.635519,0.64156,0.636784,0.638525,0.636785
5,0.5,elasticnet,0.635497,0.641926,0.636845,0.638695,0.636846
6,1.0,l1,0.635738,0.642169,0.637089,0.638938,0.637089
7,1.0,l2,0.635706,0.642291,0.637089,0.638982,0.637089
8,1.0,elasticnet,0.635706,0.642291,0.637089,0.638982,0.637089
9,5.0,l1,0.63564,0.642535,0.637089,0.639069,0.637089


In [22]:
df_rs

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.597403,0.652482,0.606383,0.623729,0.606383
1,0.1,l2,0.613636,0.574468,0.606383,0.593407,0.606383
2,0.1,elasticnet,0.604027,0.638298,0.609929,0.62069,0.609929
3,0.5,l1,0.636364,0.64539,0.638298,0.640845,0.638298
4,0.5,l2,0.631206,0.631206,0.631206,0.631206,0.631206
5,0.5,elasticnet,0.635714,0.631206,0.634752,0.633452,0.634752
6,1.0,l1,0.631206,0.631206,0.631206,0.631206,0.631206
7,1.0,l2,0.661972,0.666667,0.663121,0.664311,0.663121
8,1.0,elasticnet,0.65,0.64539,0.648936,0.647687,0.648936
9,5.0,l1,0.634483,0.652482,0.638298,0.643357,0.638298


In [23]:
df_rs_pca

Unnamed: 0,C,penalty,precision,recall,ROC,f1,acc
0,0.1,l1,0.659091,0.617021,0.648936,0.637363,0.648936
1,0.1,l2,0.607692,0.560284,0.599291,0.583026,0.599291
2,0.1,elasticnet,0.635036,0.617021,0.631206,0.625899,0.631206
3,0.5,l1,0.585714,0.58156,0.585106,0.58363,0.585106
4,0.5,l2,0.586957,0.574468,0.585106,0.580645,0.585106
5,0.5,elasticnet,0.597122,0.588652,0.595745,0.592857,0.595745
6,1.0,l1,0.604478,0.574468,0.599291,0.589091,0.599291
7,1.0,l2,0.586466,0.553191,0.58156,0.569343,0.58156
8,1.0,elasticnet,0.601504,0.567376,0.595745,0.583942,0.595745
9,5.0,l1,0.605839,0.588652,0.602837,0.597122,0.602837


### we can choos e smote as our final dataset

In [27]:
tr = pd.read_excel('./smote_dataset/tr.xlsx')# (82044, 51)
te = pd.read_excel('./smote_dataset/test_final.xlsx')

# get labels
y_tr = tr['outcome']
# get data
col = tr.columns.to_list()
del col[-1]
x_tr = tr[col]

x_train, x_val, y_train, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=8, stratify=tr[['outcome']])
print('train x shape:', x_train.shape)
print('train y shape:', y_train.shape)
print('val x shape:', x_val.shape)
print('val y shape:', y_val.shape)


clf = LogisticRegression(C=5.0,
                         random_state=0,
                         max_iter=500,
                         penalty='l2',
                        solver='saga',
                        l1_ratio=0.5)
clf.fit(x_train, y_train)
print(clf.coef_)
metric(clf, x_val, y_val, mode='val', return_output=True)


train x shape: (65635, 50)
train y shape: (65635,)
val x shape: (16409, 50)
val y shape: (16409,)




[[-1.29998608  0.07292377 -3.73342141 -0.13370137 -0.7824367  -1.11292553
  -0.54754667 -2.90481882 -3.09079728 -0.85897628 -0.36022314 -1.0935697
   0.18416164 -0.63728874  0.57036501 -0.97363995 -1.02621664 -0.90479444
  -0.92730987 -0.06294193 -1.08872504 -0.85449013 -0.48859873 -0.38118438
   0.46212003 -0.08377186  0.13924756 -0.38992408 -0.97799253 -0.59110222
   7.04134692  5.23979818 -1.66696381 -0.27388497  0.74934403 -6.40495526
   3.36887013  6.51721503  0.5898311  -0.99317769  0.19275062  0.35279324
   7.07810031 14.99291497  4.30281938 -1.52381976  2.8396113  -7.47043259
  -2.4969501   8.46713896]]
Precision:  0.6871345029239766
Recall:  0.6873857404021938
ROC score:  0.6871838502108482
F1 score:  0.6872600987022482
Accuracy score:  0.6871838625144737
confusion matrix
[[5636 2568]
 [2565 5640]]


[0.6871345029239766,
 0.6873857404021938,
 0.6871838502108482,
 0.6872600987022482,
 0.6871838625144737]

In [32]:
import numpy as np
clf.predict_proba(te),2

(array([[1.88007317e-03, 9.98119927e-01],
        [3.55935792e-05, 9.99964406e-01],
        [2.78800643e-04, 9.99721199e-01],
        [2.73808801e-05, 9.99972619e-01],
        [7.60080965e-05, 9.99923992e-01],
        [3.90098311e-03, 9.96099017e-01],
        [1.09439604e-04, 9.99890560e-01],
        [1.53505625e-03, 9.98464944e-01],
        [2.74361460e-01, 7.25638540e-01],
        [9.36825704e-05, 9.99906317e-01],
        [7.85705803e-06, 9.99992143e-01],
        [1.43809874e-01, 8.56190126e-01],
        [5.79298415e-04, 9.99420702e-01],
        [1.22861173e-02, 9.87713883e-01],
        [2.62247111e-04, 9.99737753e-01],
        [9.17334037e-04, 9.99082666e-01],
        [5.56582757e-03, 9.94434172e-01],
        [4.36149885e-05, 9.99956385e-01],
        [1.04320868e-04, 9.99895679e-01],
        [5.84860699e-04, 9.99415139e-01],
        [2.04045466e-01, 7.95954534e-01],
        [3.94583511e-04, 9.99605416e-01],
        [3.69486612e-05, 9.99963051e-01],
        [1.43548950e-03, 9.9856451

In [29]:
clf.intercept_

array([-7.76094227])

In [34]:

print(clf.coef_.T)
pred_0 = te@clf.coef_.T + clf.intercept_
prob = 1/(1 + np.exp(pred_0))
prob

[[-1.29998608]
 [ 0.07292377]
 [-3.73342141]
 [-0.13370137]
 [-0.7824367 ]
 [-1.11292553]
 [-0.54754667]
 [-2.90481882]
 [-3.09079728]
 [-0.85897628]
 [-0.36022314]
 [-1.0935697 ]
 [ 0.18416164]
 [-0.63728874]
 [ 0.57036501]
 [-0.97363995]
 [-1.02621664]
 [-0.90479444]
 [-0.92730987]
 [-0.06294193]
 [-1.08872504]
 [-0.85449013]
 [-0.48859873]
 [-0.38118438]
 [ 0.46212003]
 [-0.08377186]
 [ 0.13924756]
 [-0.38992408]
 [-0.97799253]
 [-0.59110222]
 [ 7.04134692]
 [ 5.23979818]
 [-1.66696381]
 [-0.27388497]
 [ 0.74934403]
 [-6.40495526]
 [ 3.36887013]
 [ 6.51721503]
 [ 0.5898311 ]
 [-0.99317769]
 [ 0.19275062]
 [ 0.35279324]
 [ 7.07810031]
 [14.99291497]
 [ 4.30281938]
 [-1.52381976]
 [ 2.8396113 ]
 [-7.47043259]
 [-2.4969501 ]
 [ 8.46713896]]


Unnamed: 0,0
0,0.001880
1,0.000036
2,0.000279
3,0.000027
4,0.000076
...,...
484,0.000894
485,0.000147
486,0.002558
487,0.000018


In [35]:
print('final prediction of testing data')
clf.predict(te)

final prediction of testing data


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,