# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of K-fold cross-validation folds; read as a global by every
# *_model2 function in this file when it builds its KFold splitter.
n_splits = 5

# 모델 정의
 1. Random Forest
 2. LightGBM
 3. XGBoost
 4. CatBoost
 5. KNN
 6. Linear Regression

In [2]:
def Xy_split2(dataset1, dataset2):
    """Split two datasets into renamed feature frames and '연봉' targets.

    The '연봉' column is removed from each dataset to form the features, and
    the remaining columns are renamed positionally to ``variable1``,
    ``variable2``, ... (each dataset is numbered from its own column order).

    Args:
        dataset1: DataFrame containing a '연봉' column.
        dataset2: DataFrame containing a '연봉' column.

    Returns:
        A tuple ``(column_dict, X1, y1, X2, y2)`` where ``column_dict`` maps
        each ``variableN`` alias back to the original column name of
        ``dataset1``, ``X1``/``X2`` are the renamed feature frames and
        ``y1``/``y2`` are the '연봉' columns.
    """
    target = '연봉'

    features1 = dataset1.drop(columns=target)
    # alias -> original name; built from dataset1 only.
    column_dict = {
        f'variable{pos}': name
        for pos, name in enumerate(features1.columns, start=1)
    }
    # Invert the lookup to obtain the original-name -> alias rename mapping.
    features1 = features1.rename(
        columns={name: alias for alias, name in column_dict.items()}
    )
    target1 = dataset1[target]

    features2 = dataset2.drop(columns=target)
    aliases2 = {
        name: f'variable{pos}'
        for pos, name in enumerate(features2.columns, start=1)
    }
    features2 = features2.rename(columns=aliases2)
    target2 = dataset2[target]

    return column_dict, features1, target1, features2, target2

In [3]:
def rf_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a RandomForestRegressor on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate model is fitted per dataset and fold.
    For fold k, the validation labels/predictions of dataset 1 and dataset 2
    are concatenated and scored together, giving one RMSE per fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'rf'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
            X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

            rf_model = RandomForestRegressor(random_state=42)
            rf_model.fit(X_train, y_train)

            pred_list[fold].extend(rf_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'rf': performance}, index=index)

In [4]:
def lgbm_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a LightGBM regressor on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate booster is trained per dataset and
    fold, using the validation fold for early stopping. For fold k, the
    validation labels/predictions of both datasets are concatenated and
    scored together, giving one RMSE per fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'lgbm'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train = data_x.iloc[tr_idx, :].values
            y_train = data_y.iloc[tr_idx].values
            X_valid = data_x.iloc[val_idx, :].values
            y_valid = data_y.iloc[val_idx].values

            lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
            lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

            # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword
            # arguments are the pre-4.0 LightGBM API; on upgrade they need to
            # become `callbacks=[lgb.early_stopping(...), ...]` - confirm the
            # pinned LightGBM version before changing.
            lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)

            pred_list[fold].extend(lgb_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'lgbm': performance}, index=index)

In [5]:
def xgb_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate an XGBoost regressor on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate booster is trained per dataset and
    fold, with the train and validation folds passed as evaluation sets and
    early stopping enabled. For fold k, the validation labels/predictions of
    both datasets are concatenated and scored together, giving one RMSE per
    fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'xgb'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    xgb_final_param = {
        "objective": 'reg:squarederror',
        "random_state": 42,
        "verbosity": 0,
    }

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train = data_x.iloc[tr_idx, :].values
            y_train = data_y.iloc[tr_idx].values
            X_valid = data_x.iloc[val_idx, :].values
            y_valid = data_y.iloc[val_idx].values

            xgb_dtrain = xgb.DMatrix(data=X_train, label=y_train)
            xgb_dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

            xgb_model = xgb.train(params=xgb_final_param, dtrain=xgb_dtrain, num_boost_round=20000, early_stopping_rounds=100, verbose_eval=1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid, 'eval')])

            pred_list[fold].extend(xgb_model.predict(xgb.DMatrix(data=X_valid)))
            label_list[fold].extend(y_valid)

    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'xgb': performance}, index=index)

In [6]:
def cb_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a CatBoostRegressor on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate model is fitted per dataset and fold,
    using the validation fold as the eval set for early stopping. For fold k,
    the validation labels/predictions of both datasets are concatenated and
    scored together, giving one RMSE per fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'cb'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    cat_cols = []  # no columns are declared categorical
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
            X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

            cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
            cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

            cb_model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
            cb_model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

            pred_list[fold].extend(cb_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'cb': performance}, index=index)

In [7]:
def knn_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a KNeighborsRegressor on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate default-configured model is fitted
    per dataset and fold. For fold k, the validation labels/predictions of
    both datasets are concatenated and scored together, giving one RMSE per
    fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'knn'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
            X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

            knn_model = KNeighborsRegressor()
            knn_model.fit(X_train, y_train)

            pred_list[fold].extend(knn_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'knn': performance}, index=index)

In [8]:
def regression_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a LinearRegression on two datasets and pool RMSE.

    Each dataset is split with the same seeded, shuffled K-fold scheme
    (``n_splits`` folds) and a separate model is fitted per dataset and fold.
    For fold k, the validation labels/predictions of both datasets are
    concatenated and scored together, giving one RMSE per fold.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target Series of the first dataset.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target Series of the second dataset.

    Returns:
        DataFrame with a single column ``'regression'`` indexed by
        ``cv1 .. cv<n_splits>`` plus ``'평균'`` (mean of the fold RMSEs).
    """
    # Linear-regression modelling
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    # All folds of dataset 1 are processed first, then all folds of dataset 2.
    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            # KFold yields positional indices, so select targets with .iloc
            # (plain [] is label-based and breaks on a non-default index).
            X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
            X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

            regression_model = LinearRegression()
            regression_model.fit(X_train, y_train)

            pred_list[fold].extend(regression_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Linear-regression performance summary.
    # RMSE via sqrt(MSE): the `squared=False` flag is no longer available in
    # recent scikit-learn releases.
    performance = [
        np.sqrt(mean_squared_error(label_list[fold], pred_list[fold]))
        for fold in range(n_splits)
    ]
    performance.append(np.mean(performance))

    index = [f'cv{fold + 1}' for fold in range(n_splits)] + ['평균']
    return pd.DataFrame({'regression': performance}, index=index)

# train, test 분할 후 모델링

In [9]:
# Load the four preprocessed modelling CSVs: hitters (타자) and pitchers (투수),
# each in an "fa" and a "nonfa" variant. Paths are relative to the notebook.
hitter_fa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/타자_fa(모델링용).csv')
hitter_nonfa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/타자_nonfa(모델링용).csv')
pitcher_fa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/투수_fa(모델링용).csv')
pitcher_nonfa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/투수_nonfa(모델링용).csv')

#### 1. 타자

In [10]:
# Split the hitter fa / nonfa frames into features (X1, X2) and '연봉' targets (y1, y2)
col_dict, X1, y1, X2, y2 = Xy_split2(hitter_fa, hitter_nonfa)

In [11]:
# Hitter results: run all six models and place their per-fold RMSE columns side by side
hitter_performance = pd.concat([rf_model2(X1, y1, X2, y2),
                                  lgbm_model2(X1, y1, X2, y2),
                                  xgb_model2(X1, y1, X2, y2),
                                  cb_model2(X1, y1, X2, y2),
                                  knn_model2(X1, y1, X2, y2),
                                  regression_model2(X1, y1, X2, y2)], axis=1)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's rmse: 37369.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's rmse: 33883.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[97]	valid_0's rmse: 52365
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[97]	valid_0's rmse: 29719.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's rmse: 41615.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[197]	valid_0's rmse: 14422.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 12996.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[211]	valid_0's rmse: 1

1000:	learn: 7387.2848838	test: 8751.8301171	best: 8751.8301171 (1000)	total: 6.58s	remaining: 2m 4s
2000:	learn: 4864.4274611	test: 8202.9999715	best: 8202.9999715 (2000)	total: 13.1s	remaining: 1m 58s
3000:	learn: 3427.2369209	test: 7992.9899190	best: 7990.9397175 (2989)	total: 21.3s	remaining: 2m
4000:	learn: 2546.2230469	test: 7878.7084388	best: 7878.6857296 (3999)	total: 29.9s	remaining: 1m 59s
5000:	learn: 1978.3562898	test: 7800.7860999	best: 7800.6288480 (4998)	total: 38.5s	remaining: 1m 55s
6000:	learn: 1589.1952506	test: 7751.0242247	best: 7751.0242247 (6000)	total: 45.9s	remaining: 1m 47s
7000:	learn: 1330.2612901	test: 7720.9534036	best: 7720.6580019 (6993)	total: 52.6s	remaining: 1m 37s
8000:	learn: 1094.2133393	test: 7696.9941805	best: 7696.9941805 (8000)	total: 59.5s	remaining: 1m 29s
9000:	learn: 915.3022894	test: 7687.2975921	best: 7687.2975921 (9000)	total: 1m 5s	remaining: 1m 20s
10000:	learn: 771.6923955	test: 7678.6750044	best: 7678.5622884 (9990)	total: 1m 11s	rem

#### 2. 투수

In [12]:
# Split the pitcher fa / nonfa frames into features (X1, X2) and '연봉' targets (y1, y2)
# (this rebinds col_dict, X1, y1, X2, y2 from the hitter section)
col_dict, X1, y1, X2, y2 = Xy_split2(pitcher_fa, pitcher_nonfa)

In [13]:
# Pitcher results: run all six models and place their per-fold RMSE columns side by side
pitcher_performance = pd.concat([rf_model2(X1, y1, X2, y2),
                                  lgbm_model2(X1, y1, X2, y2),
                                  xgb_model2(X1, y1, X2, y2),
                                  cb_model2(X1, y1, X2, y2),
                                  knn_model2(X1, y1, X2, y2),
                                  regression_model2(X1, y1, X2, y2)], axis=1)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[167]	valid_0's rmse: 29255.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 29044.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[253]	valid_0's rmse: 52871.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[83]	valid_0's rmse: 32721.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's rmse: 20086.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[89]	valid_0's rmse: 12235.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[142]	valid_0's rmse: 10499.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[499]	valid_0's rm

1000:	learn: 16103.0371750	test: 30575.1667676	best: 30573.9881815 (999)	total: 2.95s	remaining: 55.9s
2000:	learn: 4045.0455414	test: 27104.5385329	best: 27104.5385329 (2000)	total: 5.34s	remaining: 48s
3000:	learn: 1272.0242232	test: 26183.5984005	best: 26183.5984005 (3000)	total: 7.84s	remaining: 44.4s
4000:	learn: 467.5874021	test: 26053.3771343	best: 26049.7119744 (3936)	total: 10.2s	remaining: 40.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 26049.71197
bestIteration = 3936

Shrink model to first 3937 iterations.
Learning rate set to 0.008603
0:	learn: 21064.7122495	test: 22026.7029646	best: 22026.7029646 (0)	total: 6.91ms	remaining: 2m 18s
1000:	learn: 6129.1739784	test: 12921.6585604	best: 12920.9858445 (997)	total: 6.95s	remaining: 2m 11s
2000:	learn: 4001.4708885	test: 12313.2390119	best: 12312.4581250 (1999)	total: 13.7s	remaining: 2m 3s
3000:	learn: 3050.1731553	test: 12174.5155394	best: 12174.5155394 (3000)	total: 20.5s	remaining: 1m 56s
4000:	learn

In [14]:
# Display the hitter RMSE table (rows: folds plus mean, columns: models)
hitter_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,16405.411612,15885.514666,17267.666753,14410.17565,22145.011112,23373.294522
cv2,13940.518782,14333.34964,14126.56177,13617.059918,20989.903946,21471.967119
cv3,17999.342629,16865.628321,16986.159356,16479.137359,22706.473249,19416.498183
cv4,10572.543452,10485.271761,10840.332039,9763.495523,18659.826414,16387.64978
cv5,13287.946102,13847.791233,13623.967795,12229.562948,20726.940909,18259.758665
평균,14441.152515,14283.511124,14568.937543,13299.88628,21045.631126,19781.833654


In [15]:
# Display the pitcher RMSE table (rows: folds plus mean, columns: models)
pitcher_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,14177.155631,12807.256778,13571.298772,12577.445725,17013.181034,19211.621909
cv2,12283.564973,11184.928287,13439.363937,11247.450432,14161.283073,22176.769228
cv3,13950.893091,12650.861876,14510.926725,13491.005187,17734.035519,18047.057972
cv4,9522.37513,10230.674634,10074.869743,9420.055243,12295.378325,23426.527546
cv5,14095.760095,13216.551739,12678.081934,11416.927473,14427.945003,24092.146305
평균,12805.949784,12018.054663,12854.908222,11630.576812,15126.364591,21390.824592
