In [2]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import export_graphviz
from collections import Counter
from sklearn import metrics

In [3]:
data = pd.read_csv("https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv")

In [4]:
data.head()

Unnamed: 0,censor,event,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,gender,str2,strat,symptom,cd40,cd420,cd496,r,cd80,cd820
0,0,1090,43,66.679,0,1,0,100,0,1,...,1,1,3,0,504,353,660,1,870,782
1,1,794,31,73.03,0,1,0,100,0,1,...,1,1,3,0,244,225,106,1,708,699
2,0,957,41,66.226,0,1,1,100,0,1,...,1,1,3,0,401,366,453,1,889,720
3,1,188,35,78.019,0,1,0,100,0,1,...,1,1,3,0,221,132,-1,0,221,759
4,1,308,40,83.009,0,1,0,100,0,1,...,1,1,3,1,150,90,20,1,1730,1160


In [5]:
data['censor'].value_counts()

censor
0    351
1    181
Name: count, dtype: int64

In [6]:
Y = data['censor']
X = data.drop(columns=['censor'])

In [7]:
SKFold = StratifiedKFold(n_splits=5, shuffle=True)

In [8]:
train_idx = []
valid_idx = []
data_split = SKFold.split(X, Y)
data_split=next(data_split)
train_idx = data_split[0]
valid_idx = data_split[1]

In [9]:
model = GradientBoostingClassifier(random_state=0)
grid = dict()
grid['n_estimators'] = range(50,150,10)
grid['learning_rate'] = [0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1]
grid['subsample'] = [0.5, 0.75, 1]
grid['max_features'] = [0.7, 0.5, 0.9]
grid['max_depth'] = [3, 5, 7, 9]
model = GridSearchCV(model, grid, scoring='f1')
model.fit(X.iloc[train_idx], Y.iloc[train_idx])

In [10]:
best_params = model.best_params_
best_params

{'learning_rate': 0.08,
 'max_depth': 7,
 'max_features': 0.5,
 'n_estimators': 80,
 'subsample': 0.5}

In [11]:
model.best_score_

0.8608398705450266

In [12]:
model = GradientBoostingClassifier(learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators']
                                  , subsample=best_params['subsample'], max_features=best_params['max_features'], max_depth=best_params['max_depth'])
model.fit(X.iloc[train_idx], Y.iloc[train_idx])

In [13]:
y_pre_test = model.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum())) 
print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))

Test Confusion Matrix
[[63  8]
 [ 7 29]]
TesT Acc : 0.8598130841121495
Test F1-Score : 0.7945205479452055


In [14]:
feature_map = pd.DataFrame(sorted(zip(model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)

       Score  Feature
0   0.379264    event
1   0.122631    cd420
2   0.079200    cd496
3   0.061371     wtkg
4   0.060930      age
5   0.054316     cd80
6   0.044592  preanti
7   0.044226     cd40
8   0.042872    cd820
9   0.042006        r
10  0.017503     race
11  0.012368   karnof
12  0.006896     hemo
13  0.005706    drugs
14  0.005041  symptom
15  0.004991     homo
16  0.004342     str2
17  0.004107    strat
18  0.004066      z30
19  0.002824   gender
20  0.000747   oprior
21  0.000000   zprior


In [15]:
feature_map = pd.DataFrame(sorted(zip(model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)

       Score  Feature
0   0.379264    event
1   0.122631    cd420
2   0.079200    cd496
3   0.061371     wtkg
4   0.060930      age
5   0.054316     cd80
6   0.044592  preanti
7   0.044226     cd40
8   0.042872    cd820
9   0.042006        r
10  0.017503     race
11  0.012368   karnof
12  0.006896     hemo
13  0.005706    drugs
14  0.005041  symptom
15  0.004991     homo
16  0.004342     str2
17  0.004107    strat
18  0.004066      z30
19  0.002824   gender
20  0.000747   oprior
21  0.000000   zprior


In [16]:
metrics.roc_auc_score(Y[valid_idx], y_pre_test)

0.8464397496087638