# 1. 디렉토리 및 라이브러리, 데이터 불러오기
# 1. Set directory, get related data and libraries

In [1]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import KFold
import pickle
import joblib 

In [2]:
# Load the preprocessed training set; the relative path is resolved against
# the notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='train_preprocessed.csv')

# 2. 모델링
# 2. Modeling

#### 사용 모델 : LGBM 
#### 파라미터 튜닝 방법 : 너무 오래걸려서 손튜닝..^^

#### Model: LGBM (Light GBM)
#### Parameter tuning: GridSearchCV was attempted first, but it was too slow and computationally expensive for the time available, so the parameters were tuned manually in the end

### a. 모델 훈련 및 성능 평가
### a. Train model and check performance

In [3]:
# Separate predictors from the target for modeling: every column except
# 'class' goes into x, and the 'class' column itself becomes y.
x = data.drop(columns='class')
y = data['class']

5-fold CV를 통해 모델의 성능 파악 및 반복 훈련
Check overall performance of the model using 5-fold cross validation

In [4]:
# Evaluate the hand-tuned LightGBM configuration with shuffled 5-fold
# cross-validation. When this cell finishes, `CM` holds the element-wise sum
# of the per-fold confusion matrices and `f1_scorea` the mean per-fold F1.

# Manually tuned hyper-parameters. They are the same for every fold, so the
# dict is built once instead of inside the loop. The seed is supplied only via
# `random_state` on the classifier below; the `seed` key that used to live in
# this dict is a LightGBM alias for the same setting and carried the same value.
params = {'learning_rate': 0.3,
          'num_iterations': 1000,
          'max_depth': -1,
          'boosting': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'is_training_metric': True,
          'num_leaves': 31,
          'feature_fraction': 0.9,
          'bagging_fraction': 1.0,
          'bagging_freq': 5}

# Split the rows into 5 shuffled folds with a fixed seed for reproducibility.
folds = KFold(n_splits=5, shuffle=True, random_state=26)

# Per-fold results are collected under their own names so that `CM` and
# `f1_scorea` only ever refer to the final aggregated values.
fold_confusions = []
fold_f1_scores = []

for trn_idx, val_idx in folds.split(x):
    # Training and validation partitions for this fold.
    train_X, train_y = x.iloc[trn_idx], y.iloc[trn_idx]
    valid_X, valid_y = x.iloc[val_idx], y.iloc[val_idx]

    # The scikit-learn wrapper takes the DataFrames directly, so no
    # lgb.Dataset objects need to be constructed here.
    model = lgb.LGBMClassifier(**params, random_state=26)
    # NOTE(review): the `early_stopping_rounds` and `verbose` keyword arguments
    # of fit() were removed in LightGBM 4.0. When upgrading, pass
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)] instead.
    model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], early_stopping_rounds= 100, verbose=1000)

    # Predict class labels for the held-out fold.
    y_pred = model.predict(valid_X)

    fold_confusions.append(confusion_matrix(valid_y, y_pred))
    fold_f1_scores.append(f1_score(valid_y, y_pred))
    # Print the full report so the F1 score of class 1 can be inspected per fold.
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Pool the confusion matrices of all validation folds into a single matrix.
CM = sum(fold_confusions)

# Average the F1 score over the validation folds.
f1_scorea = np.mean(fold_f1_scores)
print("F1 score : %f" % f1_scorea)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[351]	valid_0's auc: 0.999912
              precision    recall  f1-score   support

     class 0       1.00      0.99      1.00      7932
     class 1       0.99      1.00      1.00      7802

    accuracy                           1.00     15734
   macro avg       1.00      1.00      1.00     15734
weighted avg       1.00      1.00      1.00     15734

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	valid_0's auc: 0.999478
              precision    recall  f1-score   support

     class 0       1.00      0.99      1.00      7898
     class 1       0.99      1.00      1.00      7835

    accuracy                           1.00     15733
   macro avg       1.00      1.00      1.00     15733
weighted avg       1.00      1.00      1.00     15733

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration

In [5]:
# Flatten the pooled confusion matrix (all validation folds combined) and
# unpack it, in row-major order, into tn, fp, fn, tp.
tn, fp, fn, tp = CM.reshape(-1)
print(*(tn, fp, fn, tp))

38966 367 12 39321


In [6]:
# Total misclassification cost from the pooled confusion matrix: each false
# negative is weighted 500 and each false positive 10.
cost = 500 * fn + 10 * fp
cost

9670

# 3. 최종 모델링 및 저장
# 3. Final modeling and save the model

In [7]:
# Build the final model, train it on the full training data and save it.
model = lgb.LGBMClassifier(learning_rate = 0.3,
                           # num_iterations lowered to 600: judged sufficient
                           # based on the 5-fold cross-validation results.
                           num_iterations = 600,
                           max_depth = -1,
                           boosting = 'gbdt',
                           objective = 'binary',
                           metric = 'auc',
                           is_training_metric = True,
                           num_leaves = 31,
                           feature_fraction = 0.9,
                           bagging_fraction = 1.0,
                           bagging_freq = 5,
                           seed = 26)

# Fit on all available rows before persisting. Without this step the pickle
# would contain an untrained estimator that cannot predict after loading.
model.fit(x, y)

# Persist the fitted classifier; the returned list of written file names is
# left as the cell's last expression so it is displayed as output.
joblib.dump(model, 'LGBM.pkl')

['LGBM.pkl']