In [None]:
'''
Catboost와 Lightgbm만 실험
'''

# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of folds used by the KFold splitters inside lgbm_model and cb_model.
n_splits = 5

# 모델 정의
 1. LightGBM
 2. CatBoost

In [2]:
def Xy_split(dataset, target='연봉'):
    """Split a modeling table into anonymised features and the target column.

    Feature columns are renamed to ``variable1``, ``variable2``, ... in their
    original order, and a lookup from the new name back to the original
    column name is returned alongside the data.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the target column plus every feature column.
    target : str, default '연봉'
        Name of the column to predict.

    Returns
    -------
    column_dict : dict
        Maps each generated name (``'variable<i>'``) to the original column name.
    X : pandas.DataFrame
        Feature columns only, carrying the generated names.
    y : pandas.Series
        The target column, untouched.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``dataset``.
    """
    # drop() returns a new frame, so relabelling X below never touches `dataset`.
    X = dataset.drop(columns=target)
    column_dict = {f'variable{idx+1}': col for idx, col in enumerate(X.columns)}

    # Assign the new labels positionally; a name-keyed rename would be
    # ambiguous if two feature columns happened to share a name.
    X.columns = list(column_dict)
    y = dataset[target]

    return column_dict, X, y

In [3]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and tabulate the per-fold validation RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for every fold.
    data_y : pandas.Series
        Target values aligned row-for-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        One column, ``'lgbm'``: the best validation RMSE of each fold
        (rows ``'cv1'`` .. ``'cv<k>'``) followed by their mean (row ``'평균'``).
    """
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    fold_scores = []

    # Fold count is taken from the notebook-level `n_splits`; the fixed seed
    # keeps the shuffled folds reproducible between runs.
    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        print(f'\n\n ============================ {step} ============================')

        # KFold yields row positions, so both X and y are indexed with .iloc;
        # label-based lookup on y would only work with a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train)
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid)

        # Up to 20000 boosting rounds, stopping once the validation score has
        # not improved for 100 rounds.
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by
        # older LightGBM releases only; newer ones expect the equivalent
        # callbacks -- confirm against the installed version.
        lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)
        fold_scores.append(lgb_model.best_score['valid_0']['rmse'])

    # Summarise: one row per fold actually run, plus the mean across folds.
    row_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)] + ['평균']
    performance = fold_scores + [np.mean(fold_scores)]

    output = pd.DataFrame({'lgbm':performance}, index=row_labels)
    return output

In [4]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and tabulate the per-fold validation RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for every fold.
    data_y : pandas.Series
        Target values aligned row-for-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        One column, ``'cb'``: the best validation RMSE of each fold
        (rows ``'cv1'`` .. ``'cv<k>'``) followed by their mean (row ``'평균'``).
    """
    # No column is declared categorical to CatBoost.
    cat_cols = []

    fold_scores = []

    # Fold count is taken from the notebook-level `n_splits`; the fixed seed
    # keeps the shuffled folds reproducible between runs.
    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so both X and y are indexed with .iloc;
        # label-based lookup on y would only work with a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y.iloc[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Named `regressor` so the local does not shadow this function's own name.
        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)

        # Up to 20000 iterations with a 100-round early-stopping window; the
        # model is rolled back to its best validation iteration.
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        fold_scores.append(regressor.best_score_['validation']['RMSE'])

    # Summarise: one row per fold actually run, plus the mean across folds.
    row_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)] + ['평균']
    performance = fold_scores + [np.mean(fold_scores)]

    output = pd.DataFrame({'cb':performance}, index=row_labels)
    return output

# train, test분할 후 모델링

In [9]:
# Load the training tables for hitters (타자) and pitchers (투수); going by the
# file names these are the pre-processed, one-hot-encoded "ver3" modeling sets.
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/타자(모델링용_원핫인코딩)_train_ver3.csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/투수(모델링용_원핫인코딩)_train_ver3.csv')

In [23]:
# Hitters: any player ID with at least one row flagged FA여부 == 1 goes to the
# FA group with *all* of that player's rows; every other player is non-FA.
# The identifier columns (ID, 선수명) are dropped from both groups.
fa_ids = hitter.loc[hitter['FA여부'] == 1, 'ID'].unique()
hitter_in_fa = hitter['ID'].isin(list(fa_ids))
fa_hitter = hitter.loc[hitter_in_fa].drop(columns=['ID','선수명']).reset_index(drop=True)
nonfa_hitter = hitter.loc[~hitter_in_fa].drop(columns=['ID','선수명']).reset_index(drop=True)

# Pitchers: same rule, applied to the pitcher table.
fa_ids = pitcher.loc[pitcher['FA여부'] == 1, 'ID'].unique()
pitcher_in_fa = pitcher['ID'].isin(list(fa_ids))
fa_pitcher = pitcher.loc[pitcher_in_fa].drop(columns=['ID','선수명']).reset_index(drop=True)
nonfa_pitcher = pitcher.loc[~pitcher_in_fa].drop(columns=['ID','선수명']).reset_index(drop=True)

#### 1. 타자

In [37]:
# FA-group hitters: separate the features from the target, then show the
# cross-validated RMSE of LightGBM and CatBoost side by side.
col_dict, X, y = Xy_split(fa_hitter)

# Give both parts a fresh 0..n-1 index before they are cut into folds.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

pd.concat([lgbm_model(X, y), cb_model(X, y)], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's rmse: 25603.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[62]	valid_0's rmse: 28754.7


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 29710.3
Early stopping, best iteration is:
[1884]	valid_0's rmse: 29702.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's rmse: 27343.5


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 31153.5
Early stopping, best iteration is:
[1345]	valid_0's rmse: 31146.5
Learning rate set to 0.006969
0:	learn: 55049.8889677	test: 50603.3773949	best: 50603.3773949 (0)	total: 20.7ms	remaining: 6m 54s
1000:	learn: 18675.2293728	test: 27541.3922055	best: 27541.3922055 (1000)	total: 5.12s	remaining: 1m 37s
2000:	learn: 12001.1570007	test: 26660.9683757	best: 26660

Unnamed: 0,lgbm,cb
cv1,25603.200796,26096.542987
cv2,28754.678604,26747.230806
cv3,29702.861914,28690.023013
cv4,27343.525095,26159.412551
cv5,31146.450186,29235.949312
평균,28510.143319,27385.831734


In [38]:
# Non-FA hitters: separate the features from the target, then show the
# cross-validated RMSE of LightGBM and CatBoost side by side.
col_dict, X, y = Xy_split(nonfa_hitter)

# Give both parts a fresh 0..n-1 index before they are cut into folds.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

pd.concat([lgbm_model(X, y), cb_model(X, y)], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 5144


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 4968.9
Early stopping, best iteration is:
[1430]	valid_0's rmse: 4953.56


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 3299.81


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[179]	valid_0's rmse: 7658.27


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 4346.83
Learning rate set to 0.007731
0:	learn: 9990.5115004	test: 9523.5120046	best: 9523.5120046 (0)	total: 7.54ms	remaining: 2m 30s
1000:	learn: 2169.0556987	test: 5107.8360036	best: 5107.6137167 (986)	total: 5.44s	remaining: 1m 43s
2000:	learn: 1394.6714996	test: 5000.0793991	best: 5000.0190297 (1997)	total: 10.9s	remaining: 1m 38s


Unnamed: 0,lgbm,cb
cv1,5144.004903,4972.681102
cv2,4953.562543,5064.683491
cv3,3299.811318,2702.952935
cv4,7658.269677,7431.652447
cv5,4346.834502,4764.979161
평균,5080.496589,4987.389827


#### 2. 투수

In [39]:
# FA-group pitchers: separate the features from the target, then show the
# cross-validated RMSE of LightGBM and CatBoost side by side.
col_dict, X, y = Xy_split(fa_pitcher)

# Give both parts a fresh 0..n-1 index before they are cut into folds.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

pd.concat([lgbm_model(X, y), cb_model(X, y)], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[538]	valid_0's rmse: 28597.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[415]	valid_0's rmse: 24110.1


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[321]	valid_0's rmse: 25294


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[68]	valid_0's rmse: 25259.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[751]	valid_0's rmse: 21186
Learning rate set to 0.006132
0:	learn: 46614.4599141	test: 56680.4976973	best: 56680.4976973 (0)	total: 5.4ms	remaining: 1m 47s
1000:	learn: 13619.4187805	test: 30123.1572478	best: 30123.1572478 (1000)	total: 4.32s	remaining: 1m 22s
2000:	learn: 7485.1100189	test: 29154.4972460	best: 29151.1616796 (1945)	total: 8.99s	remaining: 1m 20s
Stopped by overfitting

Unnamed: 0,lgbm,cb
cv1,28597.839275,29141.468889
cv2,24110.139264,25373.591532
cv3,25293.953093,24262.68995
cv4,25259.772296,23231.237139
cv5,21185.967394,22074.048383
평균,24889.534264,24816.607179


In [40]:
# Non-FA pitchers: separate the features from the target, then show the
# cross-validated RMSE of LightGBM and CatBoost side by side.
col_dict, X, y = Xy_split(nonfa_pitcher)

# Give both parts a fresh 0..n-1 index before they are cut into folds.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

pd.concat([lgbm_model(X, y), cb_model(X, y)], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 5045.75


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	valid_0's rmse: 10359.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 8497.33


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's rmse: 6673.67


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 6187
Learning rate set to 0.00762
0:	learn: 12091.6837535	test: 8564.8943011	best: 8564.8943011 (0)	total: 6.23ms	remaining: 2m 4s
1000:	learn: 3647.0813424	test: 4586.9198487	best: 4586.2796268 (989)	total: 5.26s	remaining: 1m 39s
2000:	learn: 2390.0432454	test: 4425.9792631	best: 4425.5248396 (1996)	total: 10.7s	remaining: 1m 35s
3000:	learn: 1666.0886548	test: 43

Unnamed: 0,lgbm,cb
cv1,5045.745964,4290.245601
cv2,10359.903365,10198.549202
cv3,8497.331948,8537.733295
cv4,6673.666704,5692.411241
cv5,6186.995144,5257.122005
평균,7352.728625,6795.212269
