## Dacon 15회 원자력발전소 상태 판단 경진대회
### 밍둥이
### 2020년 2월 16일
### 1. 라이브러리 및 데이터
### Library & Data

In [1]:
from sklearn.model_selection import KFold
import lightgbm

import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2
import os
import pandas as pd
import numpy as np
import joblib

path= "C:/dacon/nuclear/"
train_folder = path+"train/"
test_folder = path+"test/"
train_label_path = path+"train_label.csv"

### 2. 데이터 전처리
### Data Cleansing & Pre-Processing

In [5]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [6]:
# 모든 csv 파일의 상태_B로 변화는 시점이 같다라고 가정
# 하지만, 개별 csv파일의 상태_B로 변화는 시점은 상이할 수 있음
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    
    return combined_df

In [7]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=60)
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=20, nrows=60)

y = train['label']
train.drop('label',axis=1,inplace=True)

In [25]:
print("train shape : {}".format(train.shape))
print("test shape : {}".format(test.shape))
print("y shape : {}".format(y.shape))

train shape : (41350, 5121)
test shape : (28720, 5121)
y shape : (41350,)


In [12]:
train.head()

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5111,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120
0,30.474394,8.691177,8.714483,8.687399,8.72123,207.697895,165.86573,-6.018876999999999e-19,0.0,-0.002136,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,1.42162e-05,85.4,0.0
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.66508,191.006871,-3.9187579999999997e-19,0.0,0.00171,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.7991789999999997e-19,0.0,0.000493,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636970999999999e-19,0.0,0.000318,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0
0,30.475773,8.790241,8.735125,8.703167,8.72103,193.269046,195.98489,-6.379752e-20,0.0,-9.1e-05,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0


In [13]:
test.head()

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5111,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120
1000,30.466746,8.76634,8.688246,8.725044,8.691887,164.129395,192.704677,4.585456e-19,0.0,0.00262,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-7e-06,85.4,0.0
1000,30.461517,8.615086,8.707004,8.672025,8.726185,200.608939,180.603857,6.708315e-20,0.0,0.001803,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-2e-06,85.4,0.0
1000,30.462584,8.734225,8.711064,8.700966,8.723912,180.84752,170.057365,2.970597e-19,0.0,-0.001302,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-1.8e-05,85.4,0.0
1000,30.486272,8.547549,8.70539,8.67728,8.684212,201.880368,194.272476,1.908473e-19,0.0,0.000687,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-1.8e-05,85.4,0.0
1000,30.4695,8.83884,8.72313,8.707952,8.768843,204.950262,210.994656,-4.367418999999999e-19,0.0,0.000318,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,1.3e-05,85.4,0.0


In [31]:
print("y 앞 5개\n\n{}".format(y.head(5)))
print()
print("y 뒤 5개\n\n{}".format(y.tail(5)))

y 앞 5개

0    110
0    110
0    110
0    110
0    110
Name: label, dtype: int64

y 뒤 5개

99    156
99    156
99    156
99    156
99    156
Name: label, dtype: int64


### 3. 모델 학습, 검증, 저장
### Model Tuning & Evaluation

[Hyper Parameters of XGBoost](https://apple-rbox.tistory.com/6)<br>
[Hyper Parameters of LightGBM 1](http://machinelearningkorea.com/2019/09/29/lightgbm-%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0/)
[Hyper Parameters of LightGBM 2](https://sites.google.com/view/lauraepp/parameters)

In [32]:
parms = {
    'learning_rate' : 0.06,     # default = 0.3, [0, 1] 학습률 : 낮을수록 보수적
    'num_leaves' : 400,         # 최대 잎사귀 수 ex) depth = 4 -> (2의 4제곱 -1) = 15 
    'n_estimators' : 300,       # default = 100, 1000개 정도 해도 좋고, 너무 크면 overfitting
    'max_depth': -1,            # default = 6, [0, ∞] 나무깊이 : -1의 경우 제한이 없다. feature가 많다면 이렇게 설정.
    
    #    - child의 관측(?)에서 요구되는 최소 가중치의 합
    #    - over-fitting vs under-fitting을 조정하기 위한 파라미터.
    #    - 너무 큰 값이 주어지면 under-fitting.    
    'min_child_weight' : 3,     #  min_child_weight [default=1] (Should be tuned using CV)
    
    # subsample : training데이터 셋에서 subset을 만들지 전부를 사용할지 정하는 파라미터, 매번 나무를 만들 때(=iteration) 적용하며 overfitting 문제를 방지하려고 사용.
    'subsample' : 0.8,          # default = 1, (0, 1]
    
    'colsample_bytree' : 0.5,   # default = 1, (0,1] 나무 만들 때 변수를 샘플링할 지
    'objective' : 'multiclass', # 이진분류할 지 다중분류할 지 등등
    'n_jobs': -1                #
}

### LGBM은 왜 이렇게 유명해졌나?
[LGBM에 대한 설명](https://greatjoy.tistory.com/72)
- 데이터의 크기가 커짐에 따라 빠른 결과를 내는 것도 중요해지고 있다. 그런점에서 Light GBM은 'Light'의 접두사와 같이 속도가 빠른 것이 장점이다. 메모리를 적게 차지하고 속도가 빠른다는 장점 외에도, LGBM은 결과의 정확도가 높다는 장점이 있다. 또한, GPU를 활용할 수 있기 때문에 널리 사용되고 있다. <br>
- 하지만, 위에서 말했듯이 overfitting에 민감하여 데이터의 크기가 작을 경우 기존의 머신러닝 알고리즘이 더 좋을 수 있다. 경험저긍로 데이터의 개수(행 수)가 10,000개 이상일 때 추천한다.

In [46]:
# 4FOLD, 3SEED ENSEMBLE
# 총 12개의 모델을 평균내어 예측한다

lucky_seed=[4885,1992,1022]

for num,rs in enumerate(lucky_seed):
    
    kfold = KFold(n_splits=4, random_state = rs, shuffle = True)

    # dacon code
    cv=np.zeros((train.shape[0],198))

    for n, (train_idx, validation_idx) in enumerate(kfold.split(train)):
        
        x_train, x_validation = train.iloc[train_idx], train.iloc[validation_idx]
        y_train, y_validation = y.iloc[train_idx], y.iloc[validation_idx]

        model = lightgbm.LGBMClassifier(**parms, random_state=rs)

        model.fit(x_train, y_train, eval_set=[(x_validation, y_validation)], early_stopping_rounds= 30,
                  verbose=100) 
        #joblib.dump(model, '../2_Code_pred/%s_fold_model_%s.pkl'%(n,rs))
        joblib.dump(model, path+'%s_fold_model_%s.pkl'%(n,rs))

        # CROSS-VALIDATION , EVALUATE CV
        cv[validation_idx,:] = model.predict_proba(x_validation)

Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.327013
Early stopping, best iteration is:
[162]	valid_0's multi_logloss: 0.306125
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.327785
Early stopping, best iteration is:
[155]	valid_0's multi_logloss: 0.307373
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.330042
Early stopping, best iteration is:
[162]	valid_0's multi_logloss: 0.306855
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.331275
Early stopping, best iteration is:
[155]	valid_0's multi_logloss: 0.309593
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.349587
Early stopping, best iteration is:
[160]	valid_0's multi_logloss: 0.327402
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.326719
Early stopping, bes

KeyboardInterrupt: 

In [47]:
# MODEL LOAD & TEST PREDICT
# 12 MODELS 평균 사용
models = os.listdir(path)
models_list = [x for x in models if x.endswith(".pkl")]
assert len(models_list) ==11
temp_predictions = np.zeros((test.shape[0],198))

for m in models_list:
    model = joblib.load(path+m)
    predict_proba = model.predict_proba(test)
    temp_predictions += predict_proba/11

KeyboardInterrupt: 

In [None]:
# dacon code
submission = pd.DataFrame(data=np.zeros((test.shape[0],198)))
submission.index = test.index 
submission.index.name = 'id'
submission+=temp_predictions

submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True)

## 6. 결과 및 결언
## Conclusion & Discussion
## 데이터 전처리
PCA, Feature 정규화, Min-Max Scaling은 성능 향상에 도움이 되지 않음 Object와 NAN 값을 0으로 바꾸어 주는 전처리만 진행

## 모델 학습 검증
- Lgbm 모델 선택<br>
Random Forest, Xgboost, LightGBM 모델 비교 결과 lgbm의 성능이 가장 좋았음<br>

- K-fold & Random seed를 사용한 모델 하이퍼 파라미터 튜닝<br>
Robust 한 모델을 만들기 위해 4Kfold * 3seed 총 12개의 모델을 만듬 Early stopping 값을 작게 설정하여 over-fitting 방지 min_child_weight 값을 CV를 통해 최적화 하여 over-fitting 방지 Soft-voting 예측 방법 선택<br>

- Soft-voting 예측<br>
예측 시 Hard-voting 방식과 Probability를 평균내는 Soft-voting 방식을 실험 evaluation metric이 log-loss였기 때문에 probability를 평균내는 방식의 성능이 좋았음 12개의 모델의 예측을 평균 하는 방식으로 최종 결과물 제출

In [None]:
import os

os.system('shutdown -s')