In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
cd ./drive/MyDrive/미니_프로젝트/data

/content/drive/MyDrive/미니_프로젝트/data


In [2]:
!pip install xgboost



In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 4.3 MB/s 
Collecting alembic
  Downloading alembic-1.7.6-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 43.9 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.2 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.2 MB/s 
Collecting autopage>=0.4.0
  Downloading autopage-0.5.0-py3-none-any.whl (29 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.5.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 6.2 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.4.0-py3-none-any.whl (150 kB)
[K     |████████████

In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

In [5]:
import os, random # random seed 고정 
def set_seeds(seed): 
    os.environ['PYTHONHASHSEED'] = str(seed) 
    random.seed(seed) 
    np.random.seed(seed) 
    # tf.random.set_seed(seed) # Tensorflow 사용시 
    
SEED = 555
set_seeds(SEED)

In [10]:
train_fast = pd.read_csv('./train_fast_final.csv')
test_fast = pd.read_csv('./test_fast_final.csv')
train_slow = pd.read_csv('./train_slow_final.csv')
test_slow = pd.read_csv('./test_slow_final.csv')

In [11]:
train_fast.shape, test_fast.shape, train_slow.shape, test_slow.shape

((102316, 68), (25575, 68), (102294, 68), (25575, 68))

In [12]:
X_fast_train = train_fast.drop(['fast_exist'], axis=1)
y_fast_train = train_fast.fast_exist

X_fast_test = test_fast.drop(['fast_exist'], axis=1)
y_fast_test = test_fast.fast_exist

In [13]:
X_slow_train = train_slow.drop(['slow_exist'], axis=1)
y_slow_train = train_slow.slow_exist

X_slow_test = test_slow.drop(['slow_exist'], axis=1)
y_slow_test = test_slow.slow_exist

## non-selection

In [14]:
XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1,
              n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1)

### fast

In [None]:
XGB.fit(X_fast_train, y_fast_train)
y_fast_prob = XGB.predict_proba(X_fast_test)

for threshold in range(1,10):
    threshold = round(threshold*0.1, 1)
    y_fast_pred = np.where(y_fast_prob>=threshold, 1, 0)
    print(f'{threshold} :{f1_score(y_fast_test, y_fast_pred[:,1])}')

0.1 :0.6086956521739131
0.2 :0.6885245901639344
0.3 :0.6909090909090909
0.4 :0.7037037037037037
0.5 :0.6923076923076923
0.6 :0.6666666666666666
0.7 :0.6956521739130435
0.8 :0.6666666666666665
0.9 :0.6666666666666665


### slow

In [None]:
XGB.fit(X_slow_train, y_slow_train)
y_slow_prob = XGB.predict_proba(X_slow_test)

for threshold in range(1,10):
    threshold = round(threshold*0.1, 1)
    y_slow_pred = np.where(y_slow_prob>=threshold, 1, 0)
    print(f'{threshold} :{f1_score(y_slow_test, y_slow_pred[:,1])}')

0.1 :0.6451612903225806
0.2 :0.6976744186046512
0.3 :0.6904761904761905
0.4 :0.7160493827160493
0.5 :0.7272727272727273
0.6 :0.767123287671233
0.7 :0.7714285714285714
0.8 :0.7575757575757577
0.9 :0.7419354838709677


## tuning 후 feature selection (10~65)

In [16]:
ks = [10,15,20,25,30,35,40,45,50,55,60,65]

### fast
* 최적 score = 0.7777
* 최적 k = 60
* 최적 threshold = 0.4

In [18]:
fit = SelectKBest(chi2, k=45).fit(X_fast_train, y_fast_train)
new_X_fast_train = fit.transform(X_fast_train)
new_X_fast_test = fit.transform(X_fast_test)

XGB.fit(new_X_fast_train, y_fast_train)
y_fast_prob = XGB.predict_proba(new_X_fast_test)
print(f'================================{45}================================')
for threshold in range(1,10):
    threshold = round(threshold*0.1, 1)
    y_fast_pred = np.where(y_fast_prob>=threshold, 1, 0)
    print(f'{threshold} :{f1_score(y_fast_test, y_fast_pred[:,1])}')

0.1 :0.6268656716417911
0.2 :0.6885245901639344
0.3 :0.7241379310344828
0.4 :0.7407407407407408
0.5 :0.7547169811320756
0.6 :0.7692307692307692
0.7 :0.6938775510204083
0.8 :0.6666666666666666
0.9 :0.6521739130434783


In [19]:
import joblib

In [21]:
pwd

'/content/drive/MyDrive/미니_프로젝트/data'

In [22]:
joblib.dump(XGB, '../model/fast_XGB_60k.pkl')

['../model/fast_XGB_60k.pkl']

In [17]:
for k in ks:
    fit = SelectKBest(chi2, k=k).fit(X_fast_train, y_fast_train)
    new_X_fast_train = fit.transform(X_fast_train)
    new_X_fast_test = fit.transform(X_fast_test)
    
    XGB.fit(new_X_fast_train, y_fast_train)
    y_fast_prob = XGB.predict_proba(new_X_fast_test)
    print(f'================================{k}================================')
    for threshold in range(1,10):
        threshold = round(threshold*0.1, 1)
        y_fast_pred = np.where(y_fast_prob>=threshold, 1, 0)
        print(f'{threshold} :{f1_score(y_fast_test, y_fast_pred[:,1])}')

0.1 :0.028530670470756067
0.2 :0.4044943820224719
0.3 :0.4473684210526316
0.4 :0.5074626865671642
0.5 :0.5614035087719298
0.6 :0.5454545454545454
0.7 :0.5454545454545454
0.8 :0.5714285714285714
0.9 :0.5957446808510639
0.1 :0.5121951219512195
0.2 :0.6176470588235294
0.3 :0.6774193548387097
0.4 :0.6428571428571428
0.5 :0.6296296296296295
0.6 :0.6
0.7 :0.6122448979591836
0.8 :0.5531914893617023
0.9 :0.5333333333333332
0.1 :0.5833333333333334
0.2 :0.6363636363636365
0.3 :0.6557377049180328
0.4 :0.6779661016949153
0.5 :0.7272727272727272
0.6 :0.6792452830188679
0.7 :0.72
0.8 :0.7083333333333334
0.9 :0.6363636363636364
0.1 :0.6551724137931034
0.2 :0.7169811320754718
0.3 :0.7307692307692308
0.4 :0.7058823529411765
0.5 :0.72
0.6 :0.72
0.7 :0.6938775510204083
0.8 :0.7083333333333334
0.9 :0.6818181818181819
0.1 :0.6333333333333332
0.2 :0.6909090909090909
0.3 :0.7169811320754718
0.4 :0.7169811320754718
0.5 :0.6923076923076923
0.6 :0.6530612244897959
0.7 :0.6382978723404256
0.8 :0.6511627906976745

### slow
* 최적 score = 0.7936
* 최적 k = 65
* 최적 threshold = 0.9

In [23]:
for k in ks:
    fit = SelectKBest(chi2, k=k).fit(X_slow_train, y_slow_train)
    new_X_slow_train = fit.transform(X_slow_train)
    new_X_slow_test = fit.transform(X_slow_test)
    
    XGB.fit(new_X_slow_train, y_slow_train)
    y_slow_prob = XGB.predict_proba(new_X_slow_test)
    print(f'================================{k}================================')
    for threshold in range(1,10):
        threshold = round(threshold*0.1, 1)
        y_slow_pred = np.where(y_slow_prob>=threshold, 1, 0)
        print(f'{threshold} :{f1_score(y_slow_test, y_slow_pred[:,1])}')

0.1 :0.5769230769230769
0.2 :0.6741573033707865
0.3 :0.6419753086419753
0.4 :0.6493506493506493
0.5 :0.684931506849315
0.6 :0.676470588235294
0.7 :0.7076923076923077
0.8 :0.6984126984126984
0.9 :0.711864406779661
0.1 :0.6451612903225806
0.2 :0.7058823529411764
0.3 :0.725
0.4 :0.7272727272727273
0.5 :0.7297297297297297
0.6 :0.7323943661971831
0.7 :0.7352941176470588
0.8 :0.7384615384615385
0.9 :0.7096774193548386
0.1 :0.631578947368421
0.2 :0.6506024096385542
0.3 :0.6575342465753425
0.4 :0.6567164179104478
0.5 :0.65625
0.6 :0.6666666666666666
0.7 :0.7
0.8 :0.7
0.9 :0.711864406779661
0.1 :0.6741573033707865
0.2 :0.7012987012987012
0.3 :0.7105263157894737
0.4 :0.6933333333333332
0.5 :0.7323943661971831
0.6 :0.7761194029850748
0.7 :0.78125
0.8 :0.78125
0.9 :0.78125
0.1 :0.5436893203883495
0.2 :0.6292134831460674
0.3 :0.6190476190476191
0.4 :0.6419753086419753
0.5 :0.65
0.6 :0.6486486486486486
0.7 :0.6666666666666666
0.8 :0.6176470588235295
0.9 :0.6557377049180327
0.1 :0.5714285714285715
0.

In [24]:
fit = SelectKBest(chi2, k=25).fit(X_slow_train, y_slow_train)
new_X_slow_train = fit.transform(X_slow_train)
new_X_slow_test = fit.transform(X_slow_test)

XGB.fit(new_X_slow_train, y_slow_train)
y_slow_prob = XGB.predict_proba(new_X_slow_test)
print(f'================================{k}================================')
for threshold in range(1,10):
    threshold = round(threshold*0.1, 1)
    y_slow_pred = np.where(y_slow_prob>=threshold, 1, 0)
    print(f'{threshold} :{f1_score(y_slow_test, y_slow_pred[:,1])}')

0.1 :0.6741573033707865
0.2 :0.7012987012987012
0.3 :0.7105263157894737
0.4 :0.6933333333333332
0.5 :0.7323943661971831
0.6 :0.7761194029850748
0.7 :0.78125
0.8 :0.78125
0.9 :0.78125


In [26]:
joblib.dump(XGB, '../model/slow_XGB_25k.pkl')

['../model/slow_XGB_25k.pkl']

## 기본 모델로 feature selection

In [None]:
XGB = XGBClassifier()

## fast

In [None]:
for k in ks:
    fit = SelectKBest(chi2, k=k).fit(X_slow_train, y_slow_train)
    new_X_slow_train = fit.transform(X_slow_train)
    new_X_slow_test = fit.transform(X_slow_test)
    
    XGB.fit(new_X_slow_train, y_slow_train)
    y_slow_prob = XGB.predict_proba(new_X_slow_test)
    print(f'================================{k}================================')
    for threshold in range(1,10):
        threshold = round(threshold*0.1, 1)
        y_slow_pred = np.where(y_slow_prob>=threshold, 1, 0)
        print(f'{threshold} :{f1_score(y_slow_test, y_slow_pred[:,1])}')

0.1 :0.09933774834437085
0.2 :0.12244897959183672
0.3 :0.1477832512315271
0.4 :0.16901408450704225
0.5 :0.1948051948051948
0.6 :0.21212121212121213
0.7 :0.23684210526315788
0.8 :0.27807486631016043
0.9 :0.3968253968253969
0.1 :0.11131725417439704
0.2 :0.1348314606741573
0.3 :0.15706806282722513
0.4 :0.18237082066869303
0.5 :0.2028985507246377
0.6 :0.24561403508771928
0.7 :0.288659793814433
0.8 :0.37241379310344824
0.9 :0.4444444444444445
0.1 :0.11049723756906078
0.2 :0.14218009478672988
0.3 :0.17341040462427745
0.4 :0.19999999999999998
0.5 :0.2426778242677824
0.6 :0.27999999999999997
0.7 :0.345679012345679
0.8 :0.3941605839416058
0.9 :0.5252525252525253
0.1 :0.11494252873563218
0.2 :0.15424164524421594
0.3 :0.18867924528301885
0.4 :0.22304832713754646
0.5 :0.25641025641025644
0.6 :0.3005181347150259
0.7 :0.31952662721893493
0.8 :0.41600000000000004
0.9 :0.5106382978723405
0.1 :0.11695906432748539
0.2 :0.145985401459854
0.3 :0.17804154302670622
0.4 :0.20863309352517986
0.5 :0.2478632478

## slow

In [None]:
for k in ks:
    fit = SelectKBest(chi2, k=k).fit(X_fast_train, y_fast_train)
    new_X_fast_train = fit.transform(X_fast_train)
    new_X_fast_test = fit.transform(X_fast_test)
    
    XGB.fit(new_X_fast_train, y_fast_train)
    y_fast_prob = XGB.predict_proba(new_X_fast_test)
    print(f'================================{k}================================')
    for threshold in range(1,10):
        threshold = round(threshold*0.1, 1)
        y_fast_pred = np.where(y_fast_prob>=threshold, 1, 0)
        print(f'{threshold} :{f1_score(y_fast_test, y_fast_pred[:,1])}')

0.1 :0.01694915254237288
0.2 :0.03881700554528651
0.3 :0.05517241379310345
0.4 :0.06041335453100159
0.5 :0.0670391061452514
0.6 :0.08579088471849865
0.7 :0.11494252873563218
0.8 :0.1346153846153846
0.9 :0.2696629213483146
0.1 :0.05234460196292258
0.2 :0.07603305785123966
0.3 :0.09954751131221719
0.4 :0.10644257703081232
0.5 :0.13620071684587814
0.6 :0.16521739130434782
0.7 :0.2
0.8 :0.26277372262773724
0.9 :0.36363636363636365
0.1 :0.05700123915737299
0.2 :0.07299270072992702
0.3 :0.0970873786407767
0.4 :0.1162079510703364
0.5 :0.14400000000000002
0.6 :0.18947368421052632
0.7 :0.2411347517730496
0.8 :0.3300970873786408
0.9 :0.3947368421052632
0.1 :0.09341825902335456
0.2 :0.1301775147928994
0.3 :0.16666666666666669
0.4 :0.22448979591836732
0.5 :0.2763157894736842
0.6 :0.27692307692307694
0.7 :0.3185840707964602
0.8 :0.3655913978494624
0.9 :0.4788732394366197
0.1 :0.09565217391304348
0.2 :0.13749999999999998
0.3 :0.19047619047619047
0.4 :0.25142857142857145
0.5 :0.2837837837837838
0.6 :