# 모델링 전 과정
 - 세번째 파이프 라인은 앞서 도출한 훈련데이터로 모델을 학습, 튜닝, 앙상블시켜 최종 제출결과물을 뽑아내는 '모델링 전 과정'입니다.
    
         - 1. 먼저 단일 모델별 기본 성능을 살펴보았습니다.  
         - 2. 모델들을 랜덤서치를 이용하여 튜닝하였습니다.
         - 3. 앙상블에 사용할 튜닝된 모델들의 모든 조합에 대해서 VotingRegressor를 사용하여 좋은 성능을 내었습니다.
         - 이렇게 크게 3가지 과정을 거쳐서 최종 서브미션을 생성해 내었습니다.


***

In [2]:
# Data Handling
import pandas as pd
import numpy as np
import datetime
#pd.set_option('max_columns', 100, 'max_rows', 20)


# Visualization
import matplotlib.pylab as plt
%matplotlib inline


# OS
import os
import time
import warnings; warnings.filterwarnings("ignore")
from tqdm import tqdm


# Modeling
from sklearn.model_selection import KFold
n_splits=5; seed = 42
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# Hyperparameter Optimization
from sklearn.model_selection import RandomizedSearchCV


# Ensemble
from itertools import combinations
from sklearn.ensemble import VotingRegressor


# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from scipy.stats.mstats import gmean


# Saving
import joblib

 - 데이터 로드

In [3]:
train = pd.read_csv('./data/extra_select_train.csv').values
test = pd.read_csv('./data/extra_select_test.csv').values
target = pd.read_csv('./data/target.csv').values

print('train shape:', train.shape)
print('test shape:', test.shape)

pd.read_csv('./data/extra_select_train.csv').head()

train shape: (2891, 72)
test shape: (160, 72)


Unnamed: 0,유역평균강수,강우(A지역),강우(B지역),강우(C지역),강우(D지역),수위(E지역),수위(D지역),유역평균강수.1,강우(A지역).1,강우(B지역).1,...,저수량(예년),방수로수위,강우량_해당시간,자체유입,총방류량,강우_0집단,강우_1집단,강우_2집단,강우_3집단,수위_0집단
0,6.4,7.0,7.0,7.0,8.0,2.54,122.56875,6.3,7.0,7.0,...,1093.9,66.74,0.4,143.889,0.0,7.0,7.0,7.5,8.0,2.54
1,6.3,7.0,8.0,7.0,8.0,2.53,122.5625,6.4,7.0,8.0,...,1093.9,69.94,0.9,129.754,560.87,7.0,8.0,8.5,9.0,2.53
2,6.4,7.0,9.0,7.0,8.0,2.53,122.55625,7.3,7.0,9.0,...,1093.9,70.03,0.9,97.138,671.58,7.0,9.0,8.666667,9.5,2.53
3,7.3,7.0,10.0,7.0,8.0,2.53,122.55625,8.2,7.0,10.0,...,1093.9,70.13,3.1,268.04,698.04,8.0,10.0,11.666667,11.333333,2.53
4,8.2,7.0,12.0,8.0,10.0,2.53,122.55625,11.3,9.0,12.0,...,1093.9,70.13,3.1,416.401,703.07,10.166667,12.0,13.666667,13.166667,2.53


#### 1. 단일 모델 기본 성능 확인

In [3]:
# 단일모델 정의
knn_reg = KNeighborsRegressor(n_jobs=-1)
extra_reg = ExtraTreesRegressor(random_state=seed, n_jobs=-1)
gbm_reg = GradientBoostingRegressor(random_state=seed)
xgb_reg = XGBRegressor(random_state=seed, n_jobs=-1)
lgb_reg = LGBMRegressor(random_state=seed, n_jobs=-1)
cat_reg = CatBoostRegressor(random_state=seed, verbose=False)

regs = [knn_reg, extra_reg, gbm_reg, xgb_reg, lgb_reg, cat_reg]


# Cross_val_score 함수정의
def get_model_cv_prediction(model, feature_data, y_target):
    neg_mse_scores = cross_val_score(model, feature_data, y_target, scoring='neg_mean_squared_error', cv=kfold, n_jobs=-1)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print(f'{model.__class__.__name__} 모델의 평균 RMSE: {avg_rmse:.3f}')


# 단일 모델별 평균 성능 출력
for reg in regs:
    get_model_cv_prediction(reg, train, target)

KNeighborsRegressor 모델의 평균 RMSE: 450.350
ExtraTreesRegressor 모델의 평균 RMSE: 211.593
GradientBoostingRegressor 모델의 평균 RMSE: 263.952
XGBRegressor 모델의 평균 RMSE: 246.192
LGBMRegressor 모델의 평균 RMSE: 257.305
CatBoostRegressor 모델의 평균 RMSE: 173.764


#### 2. 랜덤서치를 활용한 모델 튜닝 

In [4]:
regs = [
    (
        KNeighborsRegressor(),              
        {'n_neighbors': [3,5,7,9,11],        
         'weights': ['uniform','distance']}
    ),
    (
        ExtraTreesRegressor(),
        {'n_estimators': [100, 150, 200, 250, 300],
         'max_depth': [10, 12, 15, 17, 20],
         'max_features': [0.8, 0.85, 0.9, 0.95],
         'min_samples_split': [1, 2, 3, 4, 5],
         'min_samples_leaf': [1, 2, 3, 4, 5]}
    ),
    (
        GradientBoostingRegressor(),
        {
         'n_estimators': [100,300,500,1000],
         'learning_rate': [0.01,0.03,0.05,0.1],
         'max_depth': [3,5,6],
         'min_samples_leaf' : [3,5,7,9,10],
         'min_samples_split' : [2,4,6,8,10],
         'subsample' : [0.8,0.9,0.95,1]
         }
    ),
    (
        XGBRegressor(),
        {
        'n_estimators' :[100,200,300,400,500], # kaggle competition에서 best = 1000 
        'learning_rate': [0.01,0.03,0.05,0.1],
         'max_depth': [3,5,6],
         'colsample_bytree' :[0,0.1,0.3,0.5,0.7,0.9,1],
         'min_child_weight' :[1,3,5,6],
         'subsample' :[0.8,0.9,0.95,1],
         'objective' : ['reg:squarederror']
        }
    ),
    (
        LGBMRegressor(),
        {'n_estimators': [300,500,700,1000,1100],
         'learning_rate': [0.01,0.03,0.05,0.1],
         'max_depth': [3,5,7,9,10],
         #'boosting' : ['gbrt','dart'],
         'colsample_bytree' : [0,0.1,0.3,0.5,0.7,0.9,1],
         'subsample' :[0.8,0.9,0.95,1],
         'num_leaves' :[30,31,33,35,39,40]
         #'feature_fraction' : [0.1,0.3,0.5,0.7,0.9]
        }
    ),
    (
        CatBoostRegressor(),
        {'learning_rate': [0.05, 0.1, 0.2, 1, 1.5],
         'depth': [3, 5, 7, 9, 10],
         'iterations' : [500, 700, 1000, 1200],
         'l2_leaf_reg' : [2, 5, 7, 10, 20],
         'verbose':[False]}
    )
]


RS_tuned_regs = []  # 튜닝된 모델을 저장
for reg, param_grid in regs:
    start = time.time()
    rand_search = RandomizedSearchCV(reg, param_grid, n_iter=20, scoring='neg_mean_squared_error', 
                                     cv=kfold, random_state=seed, n_jobs=-1)
    rand_search.fit(train, target)
    reg_name = reg.__class__.__name__
    reg_score = np.sqrt(-rand_search.best_score_) 
    print(f'{reg_name:30s} mean_rmse: {reg_score:.3f}, takes {time.time() - start:.1f} secs')
    RS_tuned_regs.append((reg_name, rand_search.best_estimator_, reg_score))

KNeighborsRegressor            mean_rmse: 378.961, takes 0.6 secs
ExtraTreesRegressor            mean_rmse: 212.808, takes 72.3 secs
GradientBoostingRegressor      mean_rmse: 184.889, takes 430.7 secs
XGBRegressor                   mean_rmse: 177.448, takes 84.0 secs
LGBMRegressor                  mean_rmse: 171.787, takes 48.7 secs
CatBoostRegressor              mean_rmse: 159.890, takes 4193.1 secs


In [5]:
# 튜닝된 모델들의 결과
RS_tuned_regs

[('KNeighborsRegressor',
  KNeighborsRegressor(n_neighbors=3, weights='distance'),
  378.96128127016635),
 ('ExtraTreesRegressor',
  ExtraTreesRegressor(max_depth=15, max_features=0.85),
  212.80782526155588),
 ('GradientBoostingRegressor',
  GradientBoostingRegressor(min_samples_leaf=7, n_estimators=1000, subsample=0.8),
  184.888704060011),
 ('XGBRegressor',
  XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints=None,
               learning_rate=0.1, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints=None,
               n_estimators=400, n_jobs=0, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
               tree_method=None, validate_parameters=False, verbosity=None),
  177.44764958091315),
 ('LGBMRegressor',
  LGBMReg

 - 튜닝된 모델들 저장

In [6]:
now = datetime.datetime.now()
nowDate = now.strftime('%Y-%m-%d')
joblib.dump(RS_tuned_regs,f'./models/RS_tuned_regs_{nowDate}_Final.pkl')

['./models/RS_tuned_regs_2021-09-15_Final.pkl']

In [4]:
# 튜닝된 모델들 로드
RS_tuned_regs = joblib.load('./models/RS_tuned_regs_2021-09-15_Final.pkl')

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



#### 3. 튜닝된 모델들로 Voting 앙상블

In [5]:
# 모델 훈련 함수 정의
def return_fitted_model(model, train, target):
    rmse_scores = []
    for iter_count, (train_idx, valid_idx) in enumerate(kfold.split(train, target)):

        X_train, X_valid = train[train_idx], train[valid_idx]
        y_train, y_valid = target[train_idx], target[valid_idx]

        model.fit(X_train, y_train)

        pred = model.predict(X_valid)
        rmse_score = np.sqrt(mean_squared_error(y_valid, pred))
        rmse_scores.append(rmse_score)
    return model, np.mean(rmse_scores)

# ex: extra_reg, extra_reg_score = return_fitted_model(extra_reg, train, target)
# print(f'모델의 평균 성능:  {extra_reg_score:.3f}')

 - 튜닝된 모델들 중 성능이 비슷하고 좋은 4개의 모델에 대해서 앙상블을 진행함

In [6]:
selected_reg = [
    #'KNeighborsRegressor', 
    #'ExtraTreesRegressor',
    'CatBoostRegressor', 
    'GradientBoostingRegressor', 
    'XGBRegressor',
    'LGBMRegressor'
]
models_for_ensemble = [(reg[0], reg[1]) for reg in RS_tuned_regs if reg[0] in selected_reg]

 - 모든 조합에 대해서 앙상블을 수행

In [None]:
# 3개의 모델 중 2개씩 averaging, 3개씩 averging 모두 해보기
start = time.time()
best_avg_score = np.inf
for model_nums in range(2, len(models_for_ensemble ) + 1):
    for avg_estimator in (combinations(models_for_ensemble , model_nums)):
        avg_reg = VotingRegressor(estimators = avg_estimator, n_jobs=-1)
        avg_model, avg_score = return_fitted_model(avg_reg, train, target)
        print(f'{"●".join([reg_name for reg_name, _, in avg_estimator])}: {avg_score:.3f}')
        if avg_score < best_avg_score:
            best_avg_score = avg_score
            best_avg_reg = avg_reg
print(time.time() - start)

GradientBoostingRegressor●XGBRegressor: 170.492
GradientBoostingRegressor●LGBMRegressor: 160.247
GradientBoostingRegressor●CatBoostRegressor: 159.566
XGBRegressor●LGBMRegressor: 154.449
XGBRegressor●CatBoostRegressor: 151.652
LGBMRegressor●CatBoostRegressor: 149.236
GradientBoostingRegressor●XGBRegressor●LGBMRegressor: 151.671
GradientBoostingRegressor●XGBRegressor●CatBoostRegressor: 155.294
GradientBoostingRegressor●LGBMRegressor●CatBoostRegressor: 144.646
XGBRegressor●LGBMRegressor●CatBoostRegressor: 144.082


 - 가장 좋은 성능의 모델 출력

In [16]:
best_avg_reg

VotingRegressor(estimators=(('XGBRegressor',
                             XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=0.5, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=6, min_child_weight=1,
                                          missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=400, n_jobs=20,
                                          nu...llel_tree=1, random_state=0,
                                          reg_alpha=0, reg_lambda=1,
                                        

 - 앙상블 모델 저장

In [17]:
now = datetime.datetime.now()
nowDate = now.strftime('%Y-%m-%d')
joblib.dump(best_avg_reg,f'./models/Average_Ensemble_{nowDate}_Final.pkl')

['../Model/Average_Ensemble_2021-09-15_Final.pkl']

In [3]:
# 앙상블 모델 로드
best_avg_reg = joblib.load('./models/Average_Ensemble_2021-09-15_Final.pkl')

#### 최종 서브미션 출력

In [4]:
sub = pd.read_excel('./data/홍수ZERO_데이터/02_평가데이터/2021 빅콘테스트_데이터분석분야_퓨처스리그_홍수ZERO_평가데이터_210803.xlsx')
sub = sub.drop(sub.index[0]).drop('NO',axis=1)

sub['유입량']=  best_avg_reg.predict(test)
sub 

sub.to_csv('2021 빅콘테스트_데이터분석분야_퓨처스리그_홍수ZERO_평가데이터_210803.csv',index=False)

# ───────────────── End of Pipeline 3/4  ─────────────────