# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of KFold splits shared by every model function below.
n_splits = 5

# 모델 정의
 1. Random Forest
 2. LightGBM
 3. XGBoost
 4. CatBoost
 5. KNN
 6. Linear Regression

In [2]:
def Xy_split2(dataset1, dataset2):
    """Separate features and the '연봉' target for two datasets.

    In both datasets the feature columns are renamed positionally to
    'variable1', 'variable2', ... so the two feature frames share column names.

    Args:
        dataset1: DataFrame containing a '연봉' column.
        dataset2: DataFrame containing a '연봉' column.

    Returns:
        Tuple ``(column_dict, X1, y1, X2, y2)`` where ``column_dict`` maps each
        generated name back to the original column name of ``dataset1``.
    """
    target = '연봉'

    def _split(frame):
        # Drop the target, then alias the remaining columns by position.
        features = frame.drop(columns=target)
        originals = list(features.columns)
        aliases = [f'variable{pos}' for pos in range(1, len(originals) + 1)]
        renamed = features.rename(columns=dict(zip(originals, aliases)))
        return dict(zip(aliases, originals)), renamed, frame[target]

    column_dict, X1, y1 = _split(dataset1)
    _, X2, y2 = _split(dataset2)

    return column_dict, X1, y1, X2, y2

In [3]:
def rf_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a random forest on two datasets and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'rf' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :]
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :]
            y_valid = targets[val_idx]

            rf_model = RandomForestRegressor(random_state=42)
            rf_model.fit(X_train, y_train)

            pred_list[fold].extend(rf_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'rf': performance}, index=index)
    return output

In [4]:
def lgbm_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate LightGBM on two datasets and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'lgbm' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :].values
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :].values
            y_valid = targets[val_idx]

            lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
            lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

            # The validation fold doubles as the early-stopping set, so the
            # reported score is measured on data that chose the stopping round.
            # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword
            # arguments were removed in LightGBM 4.0; newer versions need the
            # `lgb.early_stopping` / `lgb.log_evaluation` callbacks instead.
            lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)

            pred_list[fold].extend(lgb_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'lgbm': performance}, index=index)
    return output

In [5]:
def xgb_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate XGBoost on two datasets and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'xgb' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    xgb_final_param = {
        "objective": 'reg:squarederror',
        "random_state": 42,
        "verbosity": 0,
    }
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :].values
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :].values
            y_valid = targets[val_idx]

            xgb_dtrain = xgb.DMatrix(data=X_train, label=y_train)
            xgb_dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

            # The validation fold is passed in `evals` for early stopping and
            # is also the set the fold score is computed on.
            xgb_model = xgb.train(params=xgb_final_param, dtrain=xgb_dtrain, num_boost_round=20000, early_stopping_rounds=100, verbose_eval=1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid, 'eval')])

            pred_list[fold].extend(xgb_model.predict(xgb.DMatrix(data=X_valid)))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'xgb': performance}, index=index)
    return output

In [6]:
def cb_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate CatBoost on two datasets and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'cb' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    cat_cols = []  # no column is declared categorical
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :]
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :]
            y_valid = targets[val_idx]

            cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
            cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

            # The validation fold is the eval_set used for early stopping and
            # is also the set the fold score is computed on.
            cb_model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
            cb_model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

            pred_list[fold].extend(cb_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'cb': performance}, index=index)
    return output

In [7]:
def knn_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate a k-nearest-neighbours regressor and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'knn' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :]
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :]
            y_valid = targets[val_idx]

            knn_model = KNeighborsRegressor()
            knn_model.fit(X_train, y_train)

            pred_list[fold].extend(knn_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'knn': performance}, index=index)
    return output

In [8]:
def regression_model2(data_x1, data_y1, data_x2, data_y2):
    """Cross-validate ordinary linear regression and report pooled RMSE.

    Each dataset is split independently with the same shuffled KFold
    configuration. For fold i, the validation predictions from the first
    dataset and from the second dataset are concatenated and a single RMSE is
    computed over the pooled values.

    Args:
        data_x1: Feature DataFrame of the first dataset.
        data_y1: Target values of the first dataset, row-aligned with data_x1.
        data_x2: Feature DataFrame of the second dataset.
        data_y2: Target values of the second dataset, row-aligned with data_x2.

    Returns:
        DataFrame with one column 'regression' holding the RMSE of every fold
        ('cv1' ... 'cv<n_splits>') followed by their mean ('평균').
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One bucket per fold; both datasets contribute to the same bucket.
    label_list = [[] for _ in range(n_splits)]
    pred_list = [[] for _ in range(n_splits)]

    for data_x, data_y in ((data_x1, data_y1), (data_x2, data_y2)):
        # KFold yields positional indices, so slice the target by position.
        # Label-based `series[idx]` breaks when the index is not 0..n-1.
        targets = np.asarray(data_y)

        for fold, (tr_idx, val_idx) in enumerate(cv.split(data_x)):
            X_train = data_x.iloc[tr_idx, :]
            y_train = targets[tr_idx]

            X_valid = data_x.iloc[val_idx, :]
            y_valid = targets[val_idx]

            regression_model = LinearRegression()
            regression_model.fit(X_train, y_train)

            pred_list[fold].extend(regression_model.predict(X_valid))
            label_list[fold].extend(y_valid)

    # Summarise: RMSE per fold, then the mean across folds.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    performance = [np.sqrt(mean_squared_error(labels, preds))
                   for labels, preds in zip(label_list, pred_list)]
    performance.append(np.mean(performance))

    index = [f'cv{fold}' for fold in range(1, n_splits + 1)] + ['평균']
    output = pd.DataFrame({'regression': performance}, index=index)
    return output

# train, test 분할 후 모델링

In [9]:
# Load the four modelling tables: hitters and pitchers, each stored as a
# separate FA file and non-FA file.
hitter_fa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver2/타자_fa(모델링용).csv')
hitter_nonfa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver2/타자_nonfa(모델링용).csv')
pitcher_fa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver2/투수_fa(모델링용).csv')
pitcher_nonfa = pd.read_csv('../선수데이터(전처리완료)/모델링용ver2/투수_nonfa(모델링용).csv')

#### 1. 타자

In [10]:
# Separate features and target for the hitter FA (X1, y1) and non-FA (X2, y2) tables
col_dict, X1, y1, X2, y2 = Xy_split2(hitter_fa, hitter_nonfa)

In [11]:
# Evaluate all six models on the hitter data and place their CV scores side by side
hitter_performance = pd.concat([rf_model2(X1, y1, X2, y2),
                                  lgbm_model2(X1, y1, X2, y2),
                                  xgb_model2(X1, y1, X2, y2),
                                  cb_model2(X1, y1, X2, y2),
                                  knn_model2(X1, y1, X2, y2),
                                  regression_model2(X1, y1, X2, y2)], axis=1)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 27310.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[168]	valid_0's rmse: 22281.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[288]	valid_0's rmse: 22987.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 28922
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[67]	valid_0's rmse: 27681.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 6444.97
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 4924.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 

Learning rate set to 0.007716
0:	learn: 9962.5694678	test: 8731.2286523	best: 8731.2286523 (0)	total: 5.19ms	remaining: 1m 43s
1000:	learn: 2133.1928860	test: 4595.5991221	best: 4595.5991221 (1000)	total: 5.73s	remaining: 1m 48s
2000:	learn: 1369.9664860	test: 4411.0018127	best: 4411.0018127 (2000)	total: 11.1s	remaining: 1m 39s
3000:	learn: 969.1378455	test: 4355.3102298	best: 4354.8391117 (2992)	total: 16.8s	remaining: 1m 35s
4000:	learn: 722.1192871	test: 4334.1252893	best: 4333.9768134 (3983)	total: 22.6s	remaining: 1m 30s
5000:	learn: 550.2900285	test: 4324.8895580	best: 4324.8448962 (4993)	total: 28.4s	remaining: 1m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 4320.826708
bestIteration = 5635

Shrink model to first 5636 iterations.
Learning rate set to 0.007716
0:	learn: 10082.9302969	test: 8178.3960145	best: 8178.3960145 (0)	total: 6.74ms	remaining: 2m 14s
1000:	learn: 2161.4914220	test: 3898.3859864	best: 3898.3859864 (1000)	total: 5.87s	remaining: 1m 

#### 2. 투수

In [12]:
# Separate features and target for the pitcher FA (X1, y1) and non-FA (X2, y2) tables
col_dict, X1, y1, X2, y2 = Xy_split2(pitcher_fa, pitcher_nonfa)

In [13]:
# Evaluate all six models on the pitcher data and place their CV scores side by side
pitcher_performance = pd.concat([rf_model2(X1, y1, X2, y2),
                                  lgbm_model2(X1, y1, X2, y2),
                                  xgb_model2(X1, y1, X2, y2),
                                  cb_model2(X1, y1, X2, y2),
                                  knn_model2(X1, y1, X2, y2),
                                  regression_model2(X1, y1, X2, y2)], axis=1)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[213]	valid_0's rmse: 27140.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 22597.6
Early stopping, best iteration is:
[1807]	valid_0's rmse: 22595.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[441]	valid_0's rmse: 22253.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[282]	valid_0's rmse: 22813.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[438]	valid_0's rmse: 19944.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 5071.22
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 10165.2
Early stopping, best iteration is:
[1476]	valid_0's rmse: 10155.2
Training until validation scores don't improve f

1000:	learn: 3590.8933266	test: 10958.8817143	best: 10958.8817143 (1000)	total: 5.51s	remaining: 1m 44s
2000:	learn: 2291.7391210	test: 10537.0313822	best: 10536.8762805 (1999)	total: 11.2s	remaining: 1m 41s
3000:	learn: 1623.2229411	test: 10461.8522457	best: 10461.7201636 (2993)	total: 17.1s	remaining: 1m 36s
4000:	learn: 1198.2722074	test: 10416.2499857	best: 10416.2499857 (4000)	total: 22.9s	remaining: 1m 31s
5000:	learn: 889.0198635	test: 10394.0144227	best: 10393.5625432 (4945)	total: 28.5s	remaining: 1m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10392.87111
bestIteration = 5043

Shrink model to first 5044 iterations.
Learning rate set to 0.007614
0:	learn: 10749.3147426	test: 8022.8783040	best: 8022.8783040 (0)	total: 6.87ms	remaining: 2m 17s
1000:	learn: 3405.8282848	test: 4796.2683041	best: 4796.2062182 (996)	total: 5.42s	remaining: 1m 42s
2000:	learn: 2155.4990993	test: 4564.8534521	best: 4564.8534521 (2000)	total: 10.4s	remaining: 1m 33s
3000:	lear

In [14]:
# Display the per-fold and mean RMSE table for hitters.
hitter_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,17756.511877,17344.780459,17615.850723,16803.113841,24883.364546,20083.710893
cv2,14739.023109,14084.926387,16057.724276,14045.360562,20386.988808,20107.498774
cv3,13968.258519,14239.525544,17409.261194,14004.013357,25941.790938,20096.2401
cv4,19168.326139,17928.614851,20482.2984,16334.671917,25543.224658,22059.259482
cv5,16411.554127,17348.643264,15849.512035,15017.596038,23176.984634,19163.790041
평균,16408.734754,16189.298101,17482.929326,15240.951143,23986.470717,20302.099858


In [15]:
# Display the per-fold and mean RMSE table for pitchers.
pitcher_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,14278.349071,14048.687759,14932.137469,14162.344062,17663.649946,19593.67482
cv2,17679.692605,14195.185456,20991.016021,14215.29398,21348.173849,19977.069332
cv3,14805.813316,11825.288097,14478.598724,11298.968623,14078.143041,16096.854689
cv4,12523.484782,12189.648393,12442.969417,11728.052739,16100.025268,18404.907935
cv5,11609.102568,11423.31705,11023.18698,11645.887732,15818.664859,19446.543509
평균,14179.288468,12736.425351,14773.581723,12610.109427,17001.731393,18703.810057
