In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# data and model files referenced by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd ./drive/MyDrive/미니_프로젝트/data

/content/drive/MyDrive/미니_프로젝트/data


In [3]:
import joblib
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
# import optuna
# from optuna import Trial, visualization
# from optuna.samplers import TPESampler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

In [4]:
import os
import random  # used to pin the random seed


def set_seeds(seed):
    """Seed the random number sources used by this notebook.

    Exports ``PYTHONHASHSEED`` and seeds both the standard-library ``random``
    module and NumPy's global generator with the same value.

    Args:
        seed: Value written to ``PYTHONHASHSEED`` (via ``str()``) and passed
            to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this assignment presumably only matters for processes launched later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # tf.random.set_seed(seed)  # enable when TensorFlow is used


SEED = 555
set_seeds(SEED)

In [5]:
# Load the train/test tables for the "fast" and "slow" targets from the
# current working directory, in the order: fast train, fast test, slow train,
# slow test.
train_fast, test_fast, train_slow, test_slow = map(
    pd.read_csv,
    (
        './train_fast_final.csv',
        './test_fast_final.csv',
        './train_slow_final.csv',
        './test_slow_final.csv',
    ),
)

In [6]:
# Separate the `fast_exist` label column from the remaining feature columns
# for both the training and the test frame.
y_fast_train = train_fast['fast_exist']
X_fast_train = train_fast.drop(columns=['fast_exist'])

y_fast_test = test_fast['fast_exist']
X_fast_test = test_fast.drop(columns=['fast_exist'])

In [7]:
# Separate the `slow_exist` label column from the remaining feature columns
# for both the training and the test frame.
y_slow_train = train_slow['slow_exist']
X_slow_train = train_slow.drop(columns=['slow_exist'])

y_slow_test = test_slow['slow_exist']
X_slow_test = test_slow.drop(columns=['slow_exist'])

In [8]:
# Restore the previously saved classifiers from the sibling `model` directory.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
fast_model = joblib.load('../model/fast_XGB_60k.pkl')
slow_model = joblib.load('../model/slow_XGB_25k.pkl')

In [9]:
# Show the loaded fast classifier's repr to inspect its parameters.
fast_model

XGBClassifier(interaction_constraints='', learning_rate=0.300000012,
              max_depth=6, missing=nan, n_jobs=-1, num_parallel_tree=1,
              random_state=42, tree_method='exact', validate_parameters=1)

In [10]:
# Show the loaded slow classifier's repr to inspect its parameters.
slow_model

XGBClassifier(interaction_constraints='', learning_rate=0.300000012,
              max_depth=6, missing=nan, n_jobs=-1, num_parallel_tree=1,
              random_state=42, tree_method='exact', validate_parameters=1)

In [11]:
# Select the 45 features with the highest chi-squared score on the training
# split and apply the same column selection to train and test.
# NOTE(review): fast_model is loaded from disk and not refit here, so this
# assumes the selector picks the same 45 columns the model was trained on
# (the pickle is named 'fast_XGB_60k.pkl') - confirm.
fit_fast = SelectKBest(chi2, k=45)
fit_fast.fit(X_fast_train, y_fast_train)
new_X_fast_train, new_X_fast_test = (
    fit_fast.transform(features) for features in (X_fast_train, X_fast_test)
)

# Class probabilities for the test split, one column per class.
y_fast_prob = fast_model.predict_proba(new_X_fast_test)
print(f'================================{45}================================')

# Sweep decision thresholds 0.1 .. 0.9 and report F1 computed from column 1.
# NOTE(review): the sweep is scored on the test split, so a threshold chosen
# from these numbers is tuned on the same data it is evaluated on.
for threshold in (round(step * 0.1, 1) for step in range(1, 10)):
    y_fast_pred = (y_fast_prob >= threshold).astype(int)
    print(f'{threshold} :{f1_score(y_fast_test, y_fast_pred[:,1])}')

0.1 :0.6268656716417911
0.2 :0.6885245901639344
0.3 :0.7241379310344828
0.4 :0.7407407407407408
0.5 :0.7547169811320756
0.6 :0.7692307692307692
0.7 :0.6938775510204083
0.8 :0.6666666666666666
0.9 :0.6521739130434783


In [12]:
# Select the 25 features with the highest chi-squared score on the training
# split and apply the same column selection to train and test.
# NOTE(review): slow_model is loaded from disk and not refit here, so this
# assumes the selector picks the same 25 columns the model was trained on -
# confirm.
fit_slow = SelectKBest(chi2, k=25)
fit_slow.fit(X_slow_train, y_slow_train)
new_X_slow_train, new_X_slow_test = (
    fit_slow.transform(features) for features in (X_slow_train, X_slow_test)
)

# Class probabilities for the test split, one column per class.
y_slow_prob = slow_model.predict_proba(new_X_slow_test)
print(f'================================{25}================================')

# Sweep decision thresholds 0.1 .. 0.9 and report F1 computed from column 1.
# NOTE(review): the sweep is scored on the test split, so a threshold chosen
# from these numbers is tuned on the same data it is evaluated on.
for threshold in (round(step * 0.1, 1) for step in range(1, 10)):
    y_slow_pred = (y_slow_prob >= threshold).astype(int)
    print(f'{threshold} :{f1_score(y_slow_test, y_slow_pred[:,1])}')

0.1 :0.6741573033707865
0.2 :0.7012987012987012
0.3 :0.7105263157894737
0.4 :0.6933333333333332
0.5 :0.7323943661971831
0.6 :0.7761194029850748
0.7 :0.78125
0.8 :0.78125
0.9 :0.78125


### Inference

In [13]:
# Load the table to score with the fast model.
inference_fast = pd.read_csv('./inference_fast.csv')

In [14]:
# Load the table to score with the slow model.
inference_slow = pd.read_csv('./inference_slow.csv')

In [18]:
# Remove the label columns so the inference frames contain feature columns
# only, mirroring how the train/test feature frames were built.
X_inference_fast = inference_fast.drop(columns=['fast_exist'])
X_inference_slow = inference_slow.drop(columns=['slow_exist'])

In [19]:
# Keep only the columns chosen by the selector fitted on the fast training split.
new_inference_fast = fit_fast.transform(X_inference_fast)

In [20]:
# Keep only the columns chosen by the selector fitted on the slow training split.
new_inference_slow = fit_slow.transform(X_inference_slow)

In [45]:
# Score the inference rows and keep column 1 of the probability matrix
# (presumably the probability of the positive class - confirm via classes_).
# NOTE: this rebinds y_fast_prob, which earlier held the 2-column test-split
# probabilities, to a 1-D array over the inference rows.
y_fast_prob = fast_model.predict_proba(new_inference_fast)[:,1]
y_fast_prob

array([2.0266297e-07, 3.7643304e-06, 2.8689217e-05, ..., 8.2573399e-07,
       8.2573399e-07, 8.2573399e-07], dtype=float32)

In [46]:
# Collect the 20 highest-scoring inference rows as {row position: probability}.
# A stable argsort on the negated scores visits rows in the same order as
# repeatedly taking np.argmax (ties resolve to the lowest position first), but
# it leaves y_fast_prob untouched.  Zeroing entries in place would make this
# cell return a different result on every re-run and destroy the scores for
# any later use.
fast_top_idx = np.argsort(-y_fast_prob, kind='stable')[:20]
fast_top_k = {idx: y_fast_prob[idx] for idx in fast_top_idx}

In [47]:
# Order the collected (row position, probability) pairs from the highest to
# the lowest probability and print the resulting list.
fast_top_k_sorted = sorted(
    fast_top_k.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(fast_top_k_sorted)

[(23118, 0.9999635), (47294, 0.9999635), (71288, 0.9999635), (95361, 0.9999635), (13171, 0.9967776), (37461, 0.9967776), (61456, 0.9967776), (85415, 0.99598145), (5374, 0.97619677), (29698, 0.97619677), (53692, 0.97619677), (77619, 0.97619677), (13563, 0.9715035), (37844, 0.9715035), (61839, 0.9715035), (85807, 0.9715035), (13964, 0.9039389), (38208, 0.9039389), (62203, 0.9039389), (86208, 0.9039389)]


In [48]:
# Score the inference rows and keep column 1 of the probability matrix
# (presumably the probability of the positive class - confirm via classes_).
# NOTE: this rebinds y_slow_prob, which earlier held the 2-column test-split
# probabilities, to a 1-D array over the inference rows.
y_slow_prob = slow_model.predict_proba(new_inference_slow)[:,1]
y_slow_prob

array([7.5245084e-06, 5.9769087e-07, 5.9769087e-07, ..., 2.2002214e-07,
       2.2002214e-07, 2.2002214e-07], dtype=float32)

In [49]:
# Collect the 20 highest-scoring inference rows as {row position: probability}.
# A stable argsort on the negated scores visits rows in the same order as
# repeatedly taking np.argmax (ties resolve to the lowest position first), but
# it leaves y_slow_prob untouched.  Zeroing entries in place would make this
# cell return a different result on every re-run and destroy the scores for
# any later use.
slow_top_idx = np.argsort(-y_slow_prob, kind='stable')[:20]
slow_top_k = {idx: y_slow_prob[idx] for idx in slow_top_idx}

In [50]:
# Order the collected (row position, probability) pairs from the highest to
# the lowest probability and print the resulting list.
slow_top_k_sorted = sorted(
    slow_top_k.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(slow_top_k_sorted)

[(12416, 0.999894), (36714, 0.999894), (60709, 0.999894), (84657, 0.999894), (12902, 0.9997956), (37196, 0.9997956), (61190, 0.9997956), (85143, 0.9997956), (18580, 0.9957776), (42800, 0.9957776), (66794, 0.9957776), (90821, 0.9957776), (12901, 0.99563664), (37195, 0.99563664), (61189, 0.99563664), (85142, 0.99563664), (14107, 0.995216), (38348, 0.995216), (62342, 0.995216), (86348, 0.995216)]
