In [None]:
'''
Catboost와 Lightgbm만 실험
'''

# 패키지 불러오기

In [1]:
import math

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of folds used by the KFold cross-validation in lgbm_model and cb_model
n_splits = 5

# 모델 정의
 1. Lightgbm
 2. Catboost

In [2]:
def Xy_split(dataset):
    """Separate the '연봉' target column from the features and rename the features.

    Feature columns are renamed by position to 'variable1', 'variable2', ...

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table that contains a '연봉' column plus the feature columns.

    Returns
    -------
    tuple
        ``(column_dict, X, y)`` where ``column_dict`` maps every generated
        'variableN' name back to the original column name, ``X`` is the
        renamed feature frame and ``y`` is ``dataset['연봉']``.
    """
    target = dataset['연봉']
    features = dataset.drop(columns='연봉')

    original_names = list(features.columns)
    generic_names = [f'variable{pos}' for pos in range(1, len(original_names) + 1)]

    # generated name -> original name, kept so results can be mapped back later
    column_dict = dict(zip(generic_names, original_names))
    features = features.rename(columns=dict(zip(original_names, generic_names)))

    return column_dict, features, target

In [3]:
def lgbm_model(data_x, data_y):
    """Evaluate LightGBM with shuffled K-fold cross-validation and report RMSE.

    The number of folds comes from the module-level ``n_splits``. Every fold
    trains for at most 20000 rounds and stops early after 100 rounds without
    improvement on that fold's validation set.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table.
    data_y : pandas.Series
        Target values, row-aligned with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'lgbm' holding one RMSE per fold (index 'cv1', 'cv2', ...)
        followed by their mean (index '평균').
    """
    # LightGBM training parameters
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}

    fold_scores = []

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so the target is sliced with .iloc as
        # well; plain data_y[tr_idx] is a label lookup and only works when
        # the Series carries a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
        lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

        # Early stopping / periodic logging are passed as callbacks: the
        # early_stopping_rounds= and verbose_eval= keywords were removed from
        # lgb.train in LightGBM 4.
        lgb_model = lgb.train(
            lgb_params,
            lgb_dtrain,
            20000,
            valid_sets=[lgb_dvalid],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)],
        )
        pred_valid = lgb_model.predict(X_valid)

        # sqrt(MSE) instead of squared=False, which newer scikit-learn rejects
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        fold_scores.append(rmse)

    # Summarise: per-fold scores followed by their mean. Labels are derived
    # from the actual fold count so changing n_splits does not break the frame.
    fold_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)]
    performance = fold_scores + [np.mean(fold_scores)]

    output = pd.DataFrame({'lgbm': performance}, index=fold_labels + ['평균'])
    return output

In [4]:
def cb_model(data_x, data_y):
    """Evaluate CatBoost with shuffled K-fold cross-validation and report RMSE.

    The number of folds comes from the module-level ``n_splits``. Every fold
    trains for at most 20000 iterations, stops early after 100 iterations
    without improvement on that fold's validation set, and keeps the best
    iteration.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table.
    data_y : pandas.Series
        Target values, row-aligned with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'cb' holding one RMSE per fold (index 'cv1', 'cv2', ...)
        followed by their mean (index '평균').
    """
    # No columns are declared as categorical features
    cat_cols = []
    fold_scores = []

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so the target is sliced with .iloc as
        # well; plain data_y[tr_idx] is a label lookup and only works when
        # the Series carries a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y.iloc[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Named cb_regressor so the local does not shadow this function's name
        cb_regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)

        cb_regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        pred_valid = cb_regressor.predict(X_valid)

        # sqrt(MSE) instead of squared=False, which newer scikit-learn rejects
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        fold_scores.append(rmse)

    # Summarise: per-fold scores followed by their mean. Labels are derived
    # from the actual fold count so changing n_splits does not break the frame.
    fold_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)]
    performance = fold_scores + [np.mean(fold_scores)]

    output = pd.DataFrame({'cb': performance}, index=fold_labels + ['평균'])
    return output

# train, test 분할 후 모델링

In [5]:
# Load the hitter and pitcher modelling tables from CSV.
# NOTE(review): judging by the file names these are the preprocessed,
# one-hot-encoded training sets (ver3) — confirm against the data folder.
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/타자(모델링용_원핫인코딩)_train_ver3.csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/투수(모델링용_원핫인코딩)_train_ver3.csv')

In [6]:
# Add '작년연봉': within each player (ID), the '연봉' value of the following row.
# NOTE(review): shift(-1) reads the NEXT row of the same player, so this is the
# previous season's salary only if each player's rows are ordered newest-first
# — confirm against the source data.
# A grouped shift replaces the former per-ID loop that grew the frame with
# pd.concat on every iteration (quadratic copying).
hitter_prev_salary = hitter.groupby('ID', sort=False)['연봉'].shift(-1)

# Reproduce the row order the per-ID loop produced: players in order of first
# appearance, rows of one player contiguous and in their original order.
hitter_group_order = np.argsort(pd.factorize(hitter['ID'])[0], kind='stable')

# dropna() removes every row with a missing value in ANY column, which includes
# each player's last row (it has no following salary).
hitter_final = (
    hitter.assign(**{'작년연봉': hitter_prev_salary})
          .iloc[hitter_group_order]
          .dropna()
          .reset_index(drop=True)
)
hitter.shape, hitter_final.shape

((3033, 88), (2447, 89))

In [7]:
# Add '작년연봉': within each player (ID), the '연봉' value of the following row.
# NOTE(review): shift(-1) reads the NEXT row of the same player, so this is the
# previous season's salary only if each player's rows are ordered newest-first
# — confirm against the source data.
# A grouped shift replaces the former per-ID loop that grew the frame with
# pd.concat on every iteration (quadratic copying).
pitcher_prev_salary = pitcher.groupby('ID', sort=False)['연봉'].shift(-1)

# Reproduce the row order the per-ID loop produced: players in order of first
# appearance, rows of one player contiguous and in their original order.
pitcher_group_order = np.argsort(pd.factorize(pitcher['ID'])[0], kind='stable')

# dropna() removes every row with a missing value in ANY column, which includes
# each player's last row (it has no following salary).
pitcher_final = (
    pitcher.assign(**{'작년연봉': pitcher_prev_salary})
           .iloc[pitcher_group_order]
           .dropna()
           .reset_index(drop=True)
)
pitcher.shape, pitcher_final.shape

((2345, 83), (1807, 84))

#### 1. 타자

In [8]:
# Drop the identifier columns, then split into features / target.
# errors='ignore' keeps this cell re-runnable: hitter_final is rebound to the
# trimmed frame, so a second execution would otherwise raise KeyError because
# 'ID' and '선수명' are already gone.
hitter_final = hitter_final.drop(columns=['ID', '선수명'], errors='ignore')
col_dict, X, y = Xy_split(hitter_final)

In [9]:
# Hitter results: cross-validated RMSE of both models, rounded to one decimal
# place and placed side by side (LightGBM runs first, then CatBoost).
hitter_lgbm_scores = lgbm_model(X, y).round(1)
hitter_cb_scores = cb_model(X, y).round(1)
hitter_performance = pd.concat([hitter_lgbm_scores, hitter_cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's rmse: 16058.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's rmse: 14363.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's rmse: 16121.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 15456.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 14694.6
Learning rate set to 0.008092
0:	learn: 37998.4727664	test: 37850.2897523	best: 37850.2897523 (0)	total: 159ms	remaining: 52m 49s
1000:	learn: 8958.4437082	test: 15889.9272891	best: 15889.9272891 (1000)	total: 9.75s	remaining: 3m 5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 15627.69381
bestIteration = 1521

Shrink model to first 1522 iteration

Unnamed: 0,lgbm,cb
cv1,16058.5,15627.7
cv2,14363.3,14163.7
cv3,16121.3,17329.3
cv4,15456.8,15829.8
cv5,14694.6,16091.6
평균,15338.9,15808.4


#### 2. 투수

In [10]:
# Drop the identifier columns, then split into features / target.
# errors='ignore' keeps this cell re-runnable: pitcher_final is rebound to the
# trimmed frame, so a second execution would otherwise raise KeyError because
# 'ID' and '선수명' are already gone.
pitcher_final = pitcher_final.drop(columns=['ID', '선수명'], errors='ignore')
col_dict, X, y = Xy_split(pitcher_final)

In [11]:
# Pitcher results: cross-validated RMSE of both models, rounded to one decimal
# place and placed side by side (LightGBM runs first, then CatBoost).
pitcher_lgbm_scores = lgbm_model(X, y).round(1)
pitcher_cb_scores = cb_model(X, y).round(1)
pitcher_performance = pd.concat([pitcher_lgbm_scores, pitcher_cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's rmse: 13133.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's rmse: 12554.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[132]	valid_0's rmse: 14901.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[144]	valid_0's rmse: 14606.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[251]	valid_0's rmse: 15552.2
Learning rate set to 0.007641
0:	learn: 31552.8181701	test: 20230.8839887	best: 20230.8839887 (0)	total: 10.2ms	remaining: 3m 23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10737.80161
bestIteration = 460

Shrink model to first 461 iterations.
Learning rate set to 0.007641
0:	learn: 30576.9446907	test: 25590.0383379	best: 25590.0383379 (0)	to

Unnamed: 0,lgbm,cb
cv1,13133.4,10737.8
cv2,12554.7,12356.1
cv3,14901.3,15922.3
cv4,14606.9,15046.3
cv5,15552.2,15927.4
평균,14149.7,13998.0
