In [19]:
import os
import gc
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score

from xgboost import XGBClassifier, XGBRegressor

from sklearn.tree import export_graphviz
from collections import Counter
from sklearn import metrics

In [20]:
data = pd.read_csv("https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv")

In [21]:
data.head()

Unnamed: 0,censor,event,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,gender,str2,strat,symptom,cd40,cd420,cd496,r,cd80,cd820
0,0,1090,43,66.679,0,1,0,100,0,1,...,1,1,3,0,504,353,660,1,870,782
1,1,794,31,73.03,0,1,0,100,0,1,...,1,1,3,0,244,225,106,1,708,699
2,0,957,41,66.226,0,1,1,100,0,1,...,1,1,3,0,401,366,453,1,889,720
3,1,188,35,78.019,0,1,0,100,0,1,...,1,1,3,0,221,132,-1,0,221,759
4,1,308,40,83.009,0,1,0,100,0,1,...,1,1,3,1,150,90,20,1,1730,1160


In [22]:
data['censor'].value_counts()

censor
0    351
1    181
Name: count, dtype: int64

In [23]:
Y = data['censor']
X = data.drop(columns=['censor'])

In [24]:
SKFold = StratifiedKFold(n_splits=5, shuffle=True)

In [25]:
train_idx = []
valid_idx = []
data_split = SKFold.split(X, Y)
data_split=next(data_split)
train_idx = data_split[0]
valid_idx = data_split[1]

quality

In [26]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
data.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

In [27]:
data.columns

Index(['censor', 'event', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'zprior', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'cd40', 'cd420', 'cd496', 'r', 'cd80', 'cd820'],
      dtype='object')

In [28]:
[regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

['censor',
 'event',
 'age',
 'wtkg',
 'hemo',
 'homo',
 'drugs',
 'karnof',
 'oprior',
 'z30',
 'zprior',
 'preanti',
 'race',
 'gender',
 'str2',
 'strat',
 'symptom',
 'cd40',
 'cd420',
 'cd496',
 'r',
 'cd80',
 'cd820']

In [29]:
# Data Quality Checking
col = []
missing = []
level = [] 
for name in data.columns:
    
    # Missing
    missper = data[name].isnull().sum() / data.shape[0]
    missing.append(round(missper, 4))

    # Leveling
    lel = data[name].dropna()
    level.append(len(list(set(lel))))

    # Columns
    col.append(name)

summary = pd.concat([pd.DataFrame(col, columns=['name']), 
                     pd.DataFrame(missing, columns=['Missing Percentage']), 
                     pd.DataFrame(level, columns=['Level'])], axis=1)

drop_col = summary['name'][(summary['Level'] <= 1) | (summary['Missing Percentage'] >= 0.8)]
data.drop(columns=drop_col, inplace=True)
print(">>>> Data Shape : {}".format(data.shape))

>>>> Data Shape : (532, 22)


In [30]:
summary

Unnamed: 0,name,Missing Percentage,Level
0,censor,0.0,2
1,event,0.0,358
2,age,0.0,52
3,wtkg,0.0,312
4,hemo,0.0,2
5,homo,0.0,2
6,drugs,0.0,2
7,karnof,0.0,4
8,oprior,0.0,2
9,z30,0.0,2


In [31]:
model = XGBClassifier(random_state=0)
grid = dict()
grid['n_estimators'] = range(50,150,10)
grid['learning_rate'] = [0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1]
grid['reg_alpha'] = [0.1, 0.3, 0.5]
grid['max_depth'] = [3, 5, 7, 9]
model = GridSearchCV(model, grid, scoring='f1')
model.fit(X.iloc[train_idx], Y.iloc[train_idx])

In [32]:
best_params = model.best_params_
best_params

{'learning_rate': 0.08, 'max_depth': 5, 'n_estimators': 60, 'reg_alpha': 0.1}

In [33]:
model.best_score_

0.8626929392446634

In [34]:
model = XGBClassifier(learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'], 
                      reg_alpha=best_params['reg_alpha'], max_depth=best_params['max_depth'])
model.fit(X.iloc[train_idx], Y.iloc[train_idx])

In [35]:
y_pre_test = model.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum())) 
print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))

Test Confusion Matrix
[[61 10]
 [ 8 28]]
TesT Acc : 0.8317757009345794
Test F1-Score : 0.7567567567567567


In [36]:
feature_map = pd.DataFrame(sorted(zip(model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)

       Score  Feature
0   0.294254    event
1   0.110652    cd496
2   0.071143    cd420
3   0.068844   gender
4   0.061916      age
5   0.061380  preanti
6   0.059102     race
7   0.044209     wtkg
8   0.039110    drugs
9   0.036779      z30
10  0.034878    cd820
11  0.034722     cd80
12  0.031012     homo
13  0.028827     cd40
14  0.023174   karnof
15  0.000000   zprior
16  0.000000  symptom
17  0.000000    strat
18  0.000000     str2
19  0.000000        r
20  0.000000   oprior
21  0.000000     hemo


In [38]:
metrics.roc_auc_score(Y[valid_idx], y_pre_test)

0.8184663536776212