# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Number of folds used by the KFold splitters in the model functions below.
n_splits = 5

# 모델 정의
 1. Random Forest
 2. Lightgbm
 3. Xgboost
 4. Catboost
 5. Knn
 6. Linear regression

In [2]:
def Xy_split(dataset):
    """Separate the '연봉' target from the predictors and rename the predictors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing a '연봉' column plus the predictor columns.

    Returns
    -------
    column_dict : dict
        Maps each generated name ('variable1', 'variable2', ...) to the
        original predictor column name, in column order.
    X : pandas.DataFrame
        Predictors with their columns renamed to the generated names.
    y : pandas.Series
        The '연봉' column of ``dataset``.
    """
    y = dataset['연봉']
    predictors = dataset.drop('연봉', axis=1)

    # Build the alias -> original lookup and the original -> alias renaming
    # in a single pass over the predictor columns.
    column_dict = {}
    renaming = {}
    for position, original_name in enumerate(predictors.columns, start=1):
        alias = f'variable{position}'
        column_dict[alias] = original_name
        renaming[original_name] = alias

    X = predictors.rename(columns=renaming)
    return column_dict, X, y

In [3]:
def rf_model(data_x, data_y):
    """Cross-validate a RandomForestRegressor and report the per-fold RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'rf' holding the validation RMSE of every fold
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for tr_idx, val_idx in cv.split(data_x):
        X_train, y_train = data_x.iloc[tr_idx, :], y_values[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], y_values[val_idx]

        model = RandomForestRegressor(random_state=42)
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) rather than ``squared=False``, which newer scikit-learn
        # releases no longer accept.
        performance.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Row labels follow the actual number of folds instead of assuming five.
    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'rf': performance}, index=row_labels)

In [4]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and report the per-fold best RMSE.

    Each fold trains for at most 20000 rounds and stops once the validation
    score has not improved for 100 rounds.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'lgbm' holding each fold's best validation RMSE
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        print(f'\n\n ============================ {step} ============================')

        X_train = data_x.iloc[tr_idx, :].values
        y_train = y_values[tr_idx]

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = y_values[val_idx]

        lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
        lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older `lgb.train`; recent LightGBM releases expect the
        # `lgb.early_stopping` / `lgb.log_evaluation` callbacks instead.
        # Confirm against the installed version before upgrading.
        booster = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)

        # Collect the score fold by fold so the summary never assumes five folds.
        performance.append(booster.best_score['valid_0']['rmse'])

    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'lgbm': performance}, index=row_labels)

In [5]:
def xgb_model(data_x, data_y):
    """Cross-validate an XGBoost regressor and report the per-fold best score.

    Each fold trains for at most 20000 boosting rounds with early stopping
    after 100 rounds; the 'eval' (validation) set is listed last in ``evals``.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'xgb' holding each fold's ``best_score``
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    xgb_final_param = {
        "objective": 'reg:squarederror',
        "random_state": 42,
        "verbosity": 0,
    }

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for tr_idx, val_idx in cv.split(data_x):

        X_train = data_x.iloc[tr_idx, :].values
        y_train = y_values[tr_idx]

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = y_values[val_idx]

        xgb_dtrain = xgb.DMatrix(data=X_train, label=y_train)
        xgb_dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

        booster = xgb.train(params=xgb_final_param, dtrain=xgb_dtrain, num_boost_round=20000, early_stopping_rounds=100, verbose_eval=1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid, 'eval')])

        # Collect the score fold by fold so the summary never assumes five folds.
        performance.append(booster.best_score)

    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'xgb': performance}, index=row_labels)

In [6]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoostRegressor and report the per-fold best RMSE.

    Each fold trains for at most 20000 iterations with early stopping after
    100 rounds, keeping the best model on the validation pool.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'cb' holding each fold's best validation RMSE
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    # No column is passed to CatBoost as a categorical feature.
    cat_cols = []

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for tr_idx, val_idx in cv.split(data_x):

        X_train = data_x.iloc[tr_idx, :]
        y_train = y_values[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = y_values[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        # Collect the score fold by fold so the summary never assumes five folds.
        performance.append(regressor.best_score_['validation']['RMSE'])

    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'cb': performance}, index=row_labels)

In [7]:
def knn_model(data_x, data_y):
    """Cross-validate a default KNeighborsRegressor and report the per-fold RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'knn' holding the validation RMSE of every fold
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for tr_idx, val_idx in cv.split(data_x):
        X_train, y_train = data_x.iloc[tr_idx, :], y_values[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], y_values[val_idx]

        model = KNeighborsRegressor()
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) rather than ``squared=False``, which newer scikit-learn
        # releases no longer accept.
        performance.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Row labels follow the actual number of folds instead of assuming five.
    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'knn': performance}, index=row_labels)

In [8]:
def regression_model(data_x, data_y):
    """Cross-validate a LinearRegression model and report the per-fold RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected by position for each fold.
    data_y : pandas.Series or array-like
        Target values aligned row-by-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        Single column 'regression' holding the validation RMSE of every fold
        ('cv1', 'cv2', ...) followed by their mean in the '평균' row.
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # KFold yields positional indices, so the target is indexed by position too;
    # label-based ``data_y[idx]`` is only correct when the index is 0..n-1.
    y_values = np.asarray(data_y)

    performance = []
    for tr_idx, val_idx in cv.split(data_x):
        X_train, y_train = data_x.iloc[tr_idx, :], y_values[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], y_values[val_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) rather than ``squared=False``, which newer scikit-learn
        # releases no longer accept.
        performance.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Row labels follow the actual number of folds instead of assuming five.
    row_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']
    performance.append(np.mean(performance))

    return pd.DataFrame({'regression': performance}, index=row_labels)

# train, test 분할 후 모델링

In [9]:
# Load the hitter and pitcher tables (per the file names, the preprocessed,
# modeling-ready versions).
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/타자(모델링용).csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/투수(모델링용).csv')

In [10]:
# Hitter feature rankings: the RFE file plus the two rank columns taken from
# the filter file, placed side by side.
# NOTE(review): axis=1 concat aligns on the row index; this assumes both files
# list the variables in the same row order - confirm.
hitter_score1 = pd.read_csv('../변수중요도데이터/RFE_타자.csv')
hitter_score2 = pd.read_csv('../변수중요도데이터/filter_타자.csv')
hitter_score = pd.concat([hitter_score1, hitter_score2[['fscore순위', 'mutual_info순위']]], axis=1)

# Pitcher feature rankings, assembled the same way.
pitcher_score1 = pd.read_csv('../변수중요도데이터/RFE_투수.csv')
pitcher_score2 = pd.read_csv('../변수중요도데이터/filter_투수.csv')
pitcher_score = pd.concat([pitcher_score1, pitcher_score2[['fscore순위', 'mutual_info순위']]], axis=1)
# Bare expression so the notebook displays the combined pitcher table.
pitcher_score

Unnamed: 0,변수명,실제변수명,rf순위,lgbm순위,xgb순위,cb순위,regression순위,fscore순위,mutual_info순위
0,variable1,데뷔년도,15,3,4,12,32,36.0,3.0
1,variable2,연도,11,19,9,10,31,47.0,5.0
2,variable3,평균자책점,30,29,2,19,64,59.0,48.0
3,variable4,경기,37,39,10,37,27,39.0,10.0
4,variable5,승리,68,44,39,52,45,6.0,15.0
...,...,...,...,...,...,...,...,...,...
75,variable76,팀명_롯데,46,59,60,40,18,73.0,74.0
76,variable77,팀명_삼성,42,71,70,45,17,57.0,75.0
77,variable78,팀명_우리/히어로즈/넥센/키움,78,79,69,75,13,67.0,67.0
78,variable79,팀명_한화,53,64,62,50,60,76.0,64.0


#### 1. 타자

In [11]:
# Start from an empty frame; one column per feature-count setting is appended below.
hitter_rf = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(hitter)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose rf rank falls within the top n_features.
    in_top_ranks = hitter_score['rf순위'].isin(list(range(1, n_features + 1)))
    features = hitter_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = rf_model(X_tmp, y).rename(columns={'rf': f'rf{n_features}'})
    hitter_rf = pd.concat([hitter_rf, cv_scores], axis=1)

In [12]:
# Start from an empty frame; one column per feature-count setting is appended below.
hitter_lgbm = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(hitter)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose lgbm rank falls within the top n_features.
    in_top_ranks = hitter_score['lgbm순위'].isin(list(range(1, n_features + 1)))
    features = hitter_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = lgbm_model(X_tmp, y).rename(columns={'lgbm': f'lgbm{n_features}'})
    hitter_lgbm = pd.concat([hitter_lgbm, cv_scores], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's rmse: 18608.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 14564.3


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[149]	valid_0's rmse: 16810.5


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[39]	valid_0's rmse: 15736.4


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 18629.1


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[106]	valid_0's rmse: 18210.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 14089.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11

Early stopping, best iteration is:
[36]	valid_0's rmse: 15025.2


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15446.3
Early stopping, best iteration is:
[1781]	valid_0's rmse: 15443.4


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[198]	valid_0's rmse: 15665.6


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[188]	valid_0's rmse: 10993.7


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14379.1
Early stopping, best iteration is:
[917]	valid_0's rmse: 14378.6


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 15089.6


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15200
[2000]	valid_0's rmse: 15194
Early stopping, best iteration is:
[2864]	valid_0's rmse: 15193.4


Training until validation scores don't 

Early stopping, best iteration is:
[358]	valid_0's rmse: 13677.4


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 15115.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[712]	valid_0's rmse: 15182.2


In [13]:
# Start from an empty frame; one column per feature-count setting is appended below.
hitter_xgb = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(hitter)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose xgb rank falls within the top n_features.
    in_top_ranks = hitter_score['xgb순위'].isin(list(range(1, n_features + 1)))
    features = hitter_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = xgb_model(X_tmp, y).rename(columns={'xgb': f'xgb{n_features}'})
    hitter_xgb = pd.concat([hitter_xgb, cv_scores], axis=1)

[0]	train-rmse:27579.32617	eval-rmse:30322.42578
[134]	train-rmse:1205.22717	eval-rmse:15472.21582
[0]	train-rmse:28863.91797	eval-rmse:23437.20898
[106]	train-rmse:1649.41382	eval-rmse:13058.66113
[0]	train-rmse:27813.23438	eval-rmse:27877.73633
[114]	train-rmse:1503.58252	eval-rmse:17030.24023
[0]	train-rmse:27880.75781	eval-rmse:28623.58008
[119]	train-rmse:1451.96948	eval-rmse:15758.56836
[0]	train-rmse:27257.52734	eval-rmse:32738.72070
[108]	train-rmse:1493.16711	eval-rmse:17996.31836
[0]	train-rmse:27382.06445	eval-rmse:30109.27148
[116]	train-rmse:813.19452	eval-rmse:15240.43848
[0]	train-rmse:28676.99023	eval-rmse:23411.15430
[105]	train-rmse:1043.24561	eval-rmse:13652.44336
[0]	train-rmse:27769.54492	eval-rmse:27468.09570
[118]	train-rmse:913.79382	eval-rmse:16015.91406
[0]	train-rmse:27613.12305	eval-rmse:28463.67383
[108]	train-rmse:1028.87793	eval-rmse:15907.90430
[0]	train-rmse:27158.96289	eval-rmse:31929.81250
[107]	train-rmse:1138.12805	eval-rmse:18492.29297
[0]	train-rm

[0]	train-rmse:26929.67578	eval-rmse:31118.37500
[213]	train-rmse:92.95282	eval-rmse:13577.94141


In [14]:
# Start from an empty frame; one column per feature-count setting is appended below.
hitter_cb = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(hitter)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose cb rank falls within the top n_features.
    in_top_ranks = hitter_score['cb순위'].isin(list(range(1, n_features + 1)))
    features = hitter_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = cb_model(X_tmp, y).rename(columns={'cb': f'cb{n_features}'})
    hitter_cb = pd.concat([hitter_cb, cv_scores], axis=1)

Learning rate set to 0.009887
0:	learn: 32533.9283553	test: 34465.1945244	best: 34465.1945244 (0)	total: 136ms	remaining: 45m 15s
1000:	learn: 10097.8546760	test: 15325.1317612	best: 15323.3621230 (998)	total: 2.46s	remaining: 46.6s
2000:	learn: 7956.0947652	test: 14811.0435572	best: 14808.3190380 (1962)	total: 4.78s	remaining: 43s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 14770.22329
bestIteration = 2323

Shrink model to first 2324 iterations.
Learning rate set to 0.009887
0:	learn: 34144.7923279	test: 27575.0598527	best: 27575.0598527 (0)	total: 2.67ms	remaining: 53.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13047.74037
bestIteration = 299

Shrink model to first 300 iterations.
Learning rate set to 0.009887
0:	learn: 33014.0258553	test: 32585.0838724	best: 32585.0838724 (0)	total: 2.62ms	remaining: 52.4s
1000:	learn: 10089.2105511	test: 13825.3440124	best: 13825.0680563 (997)	total: 2.27s	remaining: 43.1s
2000:	learn: 7747.9216166	t

2000:	learn: 5583.3106209	test: 12772.1424695	best: 12771.8158383 (1999)	total: 6.04s	remaining: 54.3s
3000:	learn: 4073.9143102	test: 12507.2786057	best: 12506.6394399 (2990)	total: 9.36s	remaining: 53s
4000:	learn: 3119.8068523	test: 12354.6895787	best: 12354.4694874 (3999)	total: 12.7s	remaining: 50.6s
5000:	learn: 2478.1312650	test: 12269.5073271	best: 12269.5062473 (4999)	total: 16s	remaining: 48s
6000:	learn: 2045.5778036	test: 12237.2280888	best: 12237.2280888 (6000)	total: 19.4s	remaining: 45.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12220.60783
bestIteration = 6774

Shrink model to first 6775 iterations.
Learning rate set to 0.009887
0:	learn: 32855.5945959	test: 33228.7982378	best: 33228.7982378 (0)	total: 23.4ms	remaining: 7m 48s
1000:	learn: 7948.2027897	test: 14151.9089907	best: 14151.9089907 (1000)	total: 3.04s	remaining: 57.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 14016.42205
bestIteration = 1365

Shrink model to f

2000:	learn: 5551.4109701	test: 14347.5774914	best: 14347.5774914 (2000)	total: 8.93s	remaining: 1m 20s
3000:	learn: 3982.2544899	test: 14047.3454817	best: 14045.5423772 (2985)	total: 13.4s	remaining: 1m 16s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 14005.9921
bestIteration = 3388

Shrink model to first 3389 iterations.
Learning rate set to 0.009887
0:	learn: 34146.7896950	test: 27587.3959663	best: 27587.3959663 (0)	total: 34.6ms	remaining: 11m 31s
1000:	learn: 8342.4449138	test: 11504.0171257	best: 11495.8287339 (972)	total: 4.49s	remaining: 1m 25s
2000:	learn: 5568.9777106	test: 11007.4553189	best: 11007.4553189 (2000)	total: 8.87s	remaining: 1m 19s
3000:	learn: 3955.5651418	test: 10823.0365188	best: 10823.0365188 (3000)	total: 13.3s	remaining: 1m 15s
4000:	learn: 2972.2894532	test: 10739.3659330	best: 10739.3659330 (4000)	total: 17.8s	remaining: 1m 11s
5000:	learn: 2289.1453537	test: 10675.1009553	best: 10674.5449716 (4969)	total: 22.3s	remaining: 1m 6s
Stop

3000:	learn: 3771.1909679	test: 12478.6460119	best: 12477.8397130 (2997)	total: 16.3s	remaining: 1m 32s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12408.46539
bestIteration = 3786

Shrink model to first 3787 iterations.
Learning rate set to 0.009887
0:	learn: 32866.1346235	test: 33246.3433175	best: 33246.3433175 (0)	total: 15.2ms	remaining: 5m 4s
1000:	learn: 7703.6667774	test: 14401.9707525	best: 14401.9707525 (1000)	total: 5.42s	remaining: 1m 42s
2000:	learn: 5128.4271276	test: 13935.3310701	best: 13935.3310701 (2000)	total: 10.8s	remaining: 1m 37s
3000:	learn: 3717.6413587	test: 13730.8176005	best: 13730.7200705 (2999)	total: 16.2s	remaining: 1m 31s
4000:	learn: 2875.8914752	test: 13652.6817135	best: 13652.4765802 (3944)	total: 21.7s	remaining: 1m 26s
5000:	learn: 2248.3735675	test: 13554.3786015	best: 13554.3283143 (4999)	total: 27.2s	remaining: 1m 21s
6000:	learn: 1794.1339086	test: 13505.4107038	best: 13505.3742576 (5996)	total: 32.7s	remaining: 1m 16s
700

1000:	learn: 7928.6641999	test: 14972.4352798	best: 14972.4352798 (1000)	total: 6.2s	remaining: 1m 57s
2000:	learn: 5256.4209689	test: 14471.5970158	best: 14470.6845398 (1964)	total: 12.3s	remaining: 1m 50s
3000:	learn: 3787.7779887	test: 14290.2632428	best: 14287.8874807 (2951)	total: 18.8s	remaining: 1m 46s
4000:	learn: 2791.2836649	test: 14213.3757517	best: 14213.1826528 (3999)	total: 24.9s	remaining: 1m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 14196.4187
bestIteration = 4390

Shrink model to first 4391 iterations.
Learning rate set to 0.009887
0:	learn: 32548.7811677	test: 34489.2688825	best: 34489.2688825 (0)	total: 29.6ms	remaining: 9m 51s
1000:	learn: 8224.2411798	test: 14950.4274694	best: 14950.3686800 (999)	total: 6.59s	remaining: 2m 5s
2000:	learn: 5342.7192766	test: 14335.1931219	best: 14335.1931219 (2000)	total: 13.1s	remaining: 1m 57s
3000:	learn: 3830.6669076	test: 14158.2524713	best: 14157.8418694 (2997)	total: 19.7s	remaining: 1m 51s
Stoppe

2000:	learn: 5227.7228748	test: 11066.2496476	best: 11063.2306741 (1993)	total: 14.5s	remaining: 2m 10s
3000:	learn: 3707.9013639	test: 10877.0753622	best: 10876.8501761 (2963)	total: 22s	remaining: 2m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10858.41732
bestIteration = 3147

Shrink model to first 3148 iterations.
Learning rate set to 0.009887
0:	learn: 33017.4004582	test: 32586.9377196	best: 32586.9377196 (0)	total: 32.6ms	remaining: 10m 51s
1000:	learn: 7874.2405408	test: 13643.4140440	best: 13643.4140440 (1000)	total: 6.81s	remaining: 2m 9s
2000:	learn: 4959.8197272	test: 13077.9661780	best: 13076.3983862 (1998)	total: 13.6s	remaining: 2m 1s
3000:	learn: 3621.6580517	test: 12904.5740603	best: 12904.5740603 (3000)	total: 20.3s	remaining: 1m 54s
4000:	learn: 2635.3072090	test: 12821.3819598	best: 12819.5043179 (3982)	total: 27.1s	remaining: 1m 48s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12817.19245
bestIteration = 4118

Shrink mo

3000:	learn: 3570.8990408	test: 13873.7447880	best: 13873.7447880 (3000)	total: 21s	remaining: 1m 58s
4000:	learn: 2721.5379251	test: 13722.6910849	best: 13722.6910849 (4000)	total: 27.9s	remaining: 1m 51s
5000:	learn: 2095.2865366	test: 13649.2624670	best: 13649.0629444 (4999)	total: 35s	remaining: 1m 44s
6000:	learn: 1671.1170865	test: 13612.8306130	best: 13612.6490726 (5988)	total: 42s	remaining: 1m 38s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13598.82011
bestIteration = 6684

Shrink model to first 6685 iterations.
Learning rate set to 0.009887
0:	learn: 32064.3329477	test: 36213.4698447	best: 36213.4698447 (0)	total: 30.5ms	remaining: 10m 10s
1000:	learn: 7922.8999704	test: 15127.3799845	best: 15127.3068155 (999)	total: 7.01s	remaining: 2m 13s
2000:	learn: 5125.1944595	test: 14541.9134607	best: 14541.2743629 (1996)	total: 14s	remaining: 2m 5s
3000:	learn: 3659.5735699	test: 14420.4461785	best: 14419.1522506 (2977)	total: 20.9s	remaining: 1m 58s
Stopped by 

In [15]:
# Start from an empty frame; one column per feature-count setting is appended below.
hitter_regression = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(hitter)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose regression rank falls within the top n_features.
    in_top_ranks = hitter_score['regression순위'].isin(list(range(1, n_features + 1)))
    features = hitter_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = regression_model(X_tmp, y).rename(columns={'regression': f'regression{n_features}'})
    hitter_regression = pd.concat([hitter_regression, cv_scores], axis=1)

#### 2. 투수

In [16]:
# Start from an empty frame; one column per feature-count setting is appended below.
pitcher_rf = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(pitcher)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose rf rank falls within the top n_features.
    in_top_ranks = pitcher_score['rf순위'].isin(list(range(1, n_features + 1)))
    features = pitcher_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = rf_model(X_tmp, y).rename(columns={'rf': f'rf{n_features}'})
    pitcher_rf = pd.concat([pitcher_rf, cv_scores], axis=1)

In [17]:
# Start from an empty frame; one column per feature-count setting is appended below.
pitcher_lgbm = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(pitcher)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose lgbm rank falls within the top n_features.
    in_top_ranks = pitcher_score['lgbm순위'].isin(list(range(1, n_features + 1)))
    features = pitcher_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = lgbm_model(X_tmp, y).rename(columns={'lgbm': f'lgbm{n_features}'})
    pitcher_lgbm = pd.concat([pitcher_lgbm, cv_scores], axis=1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27]	valid_0's rmse: 15162.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 14532.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[289]	valid_0's rmse: 14549.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[743]	valid_0's rmse: 13092.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[78]	valid_0's rmse: 11225.2


Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13570.2
Early stopping, best iteration is:
[1417]	valid_0's rmse: 13521.7


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's rmse: 12316.4


Training until validation scores don't improve for 100 rounds
Early

Early stopping, best iteration is:
[80]	valid_0's rmse: 13602.8


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	valid_0's rmse: 11142


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's rmse: 13932.5


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[126]	valid_0's rmse: 11476.6


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[428]	valid_0's rmse: 11798.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 12986.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[104]	valid_0's rmse: 11134.5


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's rmse: 13916.9


Training until validation scores don'

In [18]:
# Start from an empty frame; one column per feature-count setting is appended below.
pitcher_xgb = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(pitcher)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose xgb rank falls within the top n_features.
    in_top_ranks = pitcher_score['xgb순위'].isin(list(range(1, n_features + 1)))
    features = pitcher_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = xgb_model(X_tmp, y).rename(columns={'xgb': f'xgb{n_features}'})
    pitcher_xgb = pd.concat([pitcher_xgb, cv_scores], axis=1)

[0]	train-rmse:20271.81055	eval-rmse:23588.64453
[105]	train-rmse:1178.63098	eval-rmse:17843.50586
[0]	train-rmse:20026.11914	eval-rmse:24356.30078
[105]	train-rmse:1147.38757	eval-rmse:17678.06836
[0]	train-rmse:20851.26953	eval-rmse:21835.69531
[107]	train-rmse:1147.02087	eval-rmse:14475.16797
[0]	train-rmse:20978.04883	eval-rmse:20881.84961
[109]	train-rmse:1240.34570	eval-rmse:15263.26758
[0]	train-rmse:21335.24805	eval-rmse:19289.15430
[128]	train-rmse:880.02801	eval-rmse:13292.15137
[0]	train-rmse:20027.07812	eval-rmse:23421.13086
[127]	train-rmse:530.30536	eval-rmse:15347.20410
[0]	train-rmse:19875.26758	eval-rmse:24183.48438
[166]	train-rmse:338.39832	eval-rmse:14956.10938
[0]	train-rmse:20734.53906	eval-rmse:21247.09961
[113]	train-rmse:673.66895	eval-rmse:14405.57812
[0]	train-rmse:20798.08984	eval-rmse:21094.20898
[107]	train-rmse:755.73309	eval-rmse:15218.62402
[0]	train-rmse:20968.03711	eval-rmse:18888.59766
[130]	train-rmse:505.70123	eval-rmse:12277.59570
[0]	train-rmse:2

In [19]:
# Start from an empty frame; one column per feature-count setting is appended below.
pitcher_cb = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(pitcher)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose cb rank falls within the top n_features.
    in_top_ranks = pitcher_score['cb순위'].isin(list(range(1, n_features + 1)))
    features = pitcher_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = cb_model(X_tmp, y).rename(columns={'cb': f'cb{n_features}'})
    pitcher_cb = pd.concat([pitcher_cb, cv_scores], axis=1)

Learning rate set to 0.009605
0:	learn: 23594.6332823	test: 25876.9941388	best: 25876.9941388 (0)	total: 2.47ms	remaining: 49.5s
1000:	learn: 7744.3130674	test: 12087.7350343	best: 12087.6820178 (998)	total: 2.18s	remaining: 41.4s
2000:	learn: 5606.5858878	test: 11119.1690225	best: 11118.0308329 (1999)	total: 4.34s	remaining: 39s
3000:	learn: 4565.5616123	test: 10676.6621573	best: 10676.5400760 (2999)	total: 6.5s	remaining: 36.8s
4000:	learn: 3912.1553329	test: 10504.0769020	best: 10502.9356357 (3999)	total: 8.7s	remaining: 34.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10501.94862
bestIteration = 4116

Shrink model to first 4117 iterations.
Learning rate set to 0.009605
0:	learn: 23291.3438766	test: 26987.1293926	best: 26987.1293926 (0)	total: 12ms	remaining: 4m
1000:	learn: 7946.8158580	test: 11623.4481794	best: 11623.4481794 (1000)	total: 2.2s	remaining: 41.7s
2000:	learn: 5999.2660189	test: 10761.9292064	best: 10761.9138368 (1999)	total: 4.35s	remaining: 3

1000:	learn: 6108.8456588	test: 11904.7154493	best: 11904.4674548 (999)	total: 3.21s	remaining: 1m
2000:	learn: 4051.5661819	test: 11333.7909841	best: 11332.8645719 (1993)	total: 6.41s	remaining: 57.7s
3000:	learn: 2924.4406378	test: 11179.8846902	best: 11179.8846902 (3000)	total: 9.6s	remaining: 54.4s
4000:	learn: 2288.8709355	test: 11047.1501817	best: 11047.1280768 (3996)	total: 12.8s	remaining: 51.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11031.75053
bestIteration = 4424

Shrink model to first 4425 iterations.
Learning rate set to 0.009605
0:	learn: 24046.8839728	test: 24138.2732647	best: 24138.2732647 (0)	total: 17.8ms	remaining: 5m 55s
1000:	learn: 6384.7175808	test: 11902.8518445	best: 11898.4854141 (982)	total: 3.23s	remaining: 1m 1s
2000:	learn: 4055.7899979	test: 11262.8434504	best: 11261.8802921 (1998)	total: 6.43s	remaining: 57.8s
3000:	learn: 2888.3428836	test: 10997.4852086	best: 10997.4852086 (3000)	total: 9.66s	remaining: 54.7s
4000:	learn: 21

3000:	learn: 3129.5539545	test: 10826.7716112	best: 10826.7716112 (3000)	total: 11.2s	remaining: 1m 3s
4000:	learn: 2391.7114689	test: 10743.8438503	best: 10743.8438503 (4000)	total: 14.9s	remaining: 59.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10730.32303
bestIteration = 4516

Shrink model to first 4517 iterations.
Learning rate set to 0.009605
0:	learn: 24872.3819214	test: 20554.6365578	best: 20554.6365578 (0)	total: 32.2ms	remaining: 10m 44s
1000:	learn: 6135.1607175	test: 10395.7141659	best: 10395.4228992 (999)	total: 3.82s	remaining: 1m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10082.39203
bestIteration = 1859

Shrink model to first 1860 iterations.
Learning rate set to 0.009605
0:	learn: 23598.8545198	test: 25858.9592505	best: 25858.9592505 (0)	total: 7.94ms	remaining: 2m 38s
1000:	learn: 6141.5581111	test: 12451.7721204	best: 12451.1065604 (999)	total: 4.32s	remaining: 1m 21s
2000:	learn: 3946.1563518	test: 12142.1592981	b

Learning rate set to 0.009605
0:	learn: 24485.8410856	test: 22262.2599384	best: 22262.2599384 (0)	total: 28.9ms	remaining: 9m 38s
1000:	learn: 6336.8232171	test: 12247.2318567	best: 12247.2318567 (1000)	total: 4.89s	remaining: 1m 32s
2000:	learn: 4089.1411866	test: 11713.0664216	best: 11713.0664216 (2000)	total: 9.63s	remaining: 1m 26s
3000:	learn: 3066.2651025	test: 11553.9121199	best: 11553.9121199 (3000)	total: 14.4s	remaining: 1m 21s
4000:	learn: 2351.2747069	test: 11470.0478921	best: 11469.8740865 (3999)	total: 19.1s	remaining: 1m 16s
5000:	learn: 1858.0593616	test: 11425.6557520	best: 11425.6557520 (5000)	total: 24s	remaining: 1m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11421.67478
bestIteration = 5095

Shrink model to first 5096 iterations.
Learning rate set to 0.009605
0:	learn: 24870.5640046	test: 20552.9088980	best: 20552.9088980 (0)	total: 16.2ms	remaining: 5m 24s
1000:	learn: 6021.1971326	test: 10489.4698858	best: 10489.4698858 (1000)	total: 4.

2000:	learn: 4121.4628454	test: 12231.4403718	best: 12227.9708440 (1990)	total: 11.4s	remaining: 1m 42s
3000:	learn: 2968.5674935	test: 12078.6202062	best: 12077.8409943 (2997)	total: 17s	remaining: 1m 36s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12077.76176
bestIteration = 3010

Shrink model to first 3011 iterations.
Learning rate set to 0.009605
0:	learn: 24054.1827532	test: 24188.3020948	best: 24188.3020948 (0)	total: 15.1ms	remaining: 5m 1s
1000:	learn: 6696.1284376	test: 12943.2284658	best: 12943.2284658 (1000)	total: 5.71s	remaining: 1m 48s
2000:	learn: 4640.6042859	test: 12589.3930224	best: 12587.7820058 (1990)	total: 11.4s	remaining: 1m 42s
3000:	learn: 3198.7323241	test: 12312.0034909	best: 12312.0034909 (3000)	total: 17.1s	remaining: 1m 37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12260.03353
bestIteration = 3625

Shrink model to first 3626 iterations.
Learning rate set to 0.009605
0:	learn: 24517.7183162	test: 22293.9852856

1000:	learn: 6019.1634646	test: 12706.9214212	best: 12706.7484093 (998)	total: 6.15s	remaining: 1m 56s
2000:	learn: 3817.9249292	test: 12411.5403631	best: 12411.5403631 (2000)	total: 12.2s	remaining: 1m 50s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12386.91193
bestIteration = 2160

Shrink model to first 2161 iterations.
Learning rate set to 0.009605
0:	learn: 23287.4426146	test: 27012.8850838	best: 27012.8850838 (0)	total: 23.5ms	remaining: 7m 50s
1000:	learn: 6403.4697067	test: 13087.8669315	best: 13087.8669315 (1000)	total: 6.14s	remaining: 1m 56s
2000:	learn: 4259.1107788	test: 12551.8923441	best: 12549.0583104 (1987)	total: 12.2s	remaining: 1m 49s
3000:	learn: 3099.6265858	test: 12395.4166438	best: 12395.0089511 (2998)	total: 18.2s	remaining: 1m 43s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12354.6437
bestIteration = 3354

Shrink model to first 3355 iterations.
Learning rate set to 0.009605
0:	learn: 24049.6843337	test: 24169.666214

In [20]:
# Start from an empty frame; one column per feature-count setting is appended below.
pitcher_regression = pd.DataFrame()

# Separate the renamed predictors from the target.
col_dict, X, y = Xy_split(pitcher)

# Evaluate the model on the top 5, 10, 15, ... ranked features.
for n_features in range(5, len(col_dict) + 1, 5):

    # Variables whose regression rank falls within the top n_features.
    in_top_ranks = pitcher_score['regression순위'].isin(list(range(1, n_features + 1)))
    features = pitcher_score.loc[in_top_ranks, '변수명'].tolist()

    # Cross-validate on the reduced feature set and append the scores as a new column.
    X_tmp = X[features]
    cv_scores = regression_model(X_tmp, y).rename(columns={'regression': f'regression{n_features}'})
    pitcher_regression = pd.concat([pitcher_regression, cv_scores], axis=1)

# 체크

In [39]:
def checking_hitter(data, model_name):
    """Return the '평균' (mean) row of a hitter result table as a one-row frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Result table with a '평균' row and one column per feature-count
        setting, ordered 5, 10, 15, ... features.
    model_name : str
        Label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``model_name`` whose columns are '5개', '10개', ...
    """
    data2 = pd.DataFrame(data.loc['평균']).transpose()

    # Derive the labels from the actual column count instead of hard-coding 17
    # settings, so tables built from a different number of features also work.
    data2.columns = [f'{count}개' for count in range(5, 5 * data2.shape[1] + 1, 5)]
    data2.index = [model_name]

    return data2

def checking_pitcher(data, model_name):
    """Return the '평균' (mean) row of a pitcher result table as a one-row frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Result table with a '평균' row and one column per feature-count
        setting, ordered 5, 10, 15, ... features.
    model_name : str
        Label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``model_name`` whose columns are '5개', '10개', ...
    """
    data2 = pd.DataFrame(data.loc['평균']).transpose()

    # Derive the labels from the actual column count instead of hard-coding 16
    # settings, so tables built from a different number of features also work.
    data2.columns = [f'{count}개' for count in range(5, 5 * data2.shape[1] + 1, 5)]
    data2.index = [model_name]

    return data2

In [40]:
# Stack the mean-score row of every hitter model, then flip the table so that
# rows are feature counts and columns are models.
pd.concat(
    [
        checking_hitter(result, label)
        for label, result in (
            ('rf', hitter_rf),
            ('lgbm', hitter_lgbm),
            ('xgb', hitter_xgb),
            ('cb', hitter_cb),
            ('regression', hitter_regression),
        )
    ]
).transpose()

Unnamed: 0,rf,lgbm,xgb,cb,regression
5개,14495.850421,16869.849828,15278.624024,13805.610012,28604.155512
10개,14463.782532,16602.329789,15380.580274,12790.217912,28450.60391
15개,14255.345626,14642.120257,14385.336719,12855.151478,19640.908335
20개,14175.135453,14451.998523,14454.083984,13021.966256,18812.382601
25개,14190.197974,14576.049287,14417.206445,13004.232191,18752.849521
30개,14282.268496,14450.996273,14624.839648,12877.397005,18751.75783
35개,14428.450747,14798.549061,14175.166992,13063.763707,18686.448232
40개,14352.392406,14703.860622,14418.500586,13067.793489,18600.114958
45개,14528.980184,14598.3119,14615.41836,12933.883691,18573.646683
50개,14487.132437,14264.169141,14654.470117,12980.295399,18568.796642


In [41]:
# Stack the mean-score row of every pitcher model, then flip the table so that
# rows are feature counts and columns are models.
pd.concat(
    [
        checking_pitcher(result, label)
        for label, result in (
            ('rf', pitcher_rf),
            ('lgbm', pitcher_lgbm),
            ('xgb', pitcher_xgb),
            ('cb', pitcher_cb),
            ('regression', pitcher_regression),
        )
    ]
).transpose()

Unnamed: 0,rf,lgbm,xgb,cb,regression
5개,13273.352597,13712.714431,15126.199805,10849.785592,17223.073928
10개,12492.823083,11999.900123,14323.328711,10900.627393,17226.396128
15개,11695.131837,12443.123729,13487.203906,11116.530194,17247.311302
20개,11972.160557,12586.541024,13766.102539,10748.307786,17180.564756
25개,12316.14692,12239.498352,13096.075,10962.280336,17095.015323
30개,12507.706753,12136.688174,12316.707227,11017.67835,17105.434247
35개,12698.173862,12212.053199,12955.603516,11228.517575,16911.736201
40개,12790.824726,12001.69283,12691.025586,11477.882531,16819.325954
45개,12835.193614,12322.975539,13383.95625,11374.383746,16890.554572
50개,12902.616973,12265.745045,13262.266602,11533.149717,16873.549031
