# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE


n_splits = 5

# 모델 정의
 1. Random Forest
 2. Lightgbm
 3. Xgboost
 4. Catboost
 5. Knn
 6. Linear regression

In [2]:
def Xy_split(dataset):
    
    X = dataset.drop('연봉', axis=1)
    column_dict = {f'variable{idx+1}':col for idx, col in enumerate(X.columns)}
    X = X.rename(columns = {col:f'variable{idx+1}' for idx, col in enumerate(X.columns)})
    y = dataset['연봉']
    
    return column_dict, X, y

# train, test분할 후 모델링

In [3]:
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/타자(모델링용).csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver1/투수(모델링용).csv')

#### 1. 타자

In [4]:
# train, test 분할
col_dict, X, y = Xy_split(hitter)

# 변수의 순위도출
model = RandomForestRegressor(random_state = 42)
selector = RFE(model, n_features_to_select = 1, step = 1)
selector = selector.fit(X, y)

# 데이터프레임으로 생성
hitter_rf = pd.DataFrame({'변수명':list(col_dict.keys()),
                  '실제변수명':list(col_dict.values()),
                  'rf순위':list(selector.ranking_)})

In [5]:
# train, test 분할
col_dict, X, y = Xy_split(hitter)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
lgb_params = {"objective" : "rmse",
             "verbosity" : -1}

remove_cols = []

while True:

    # lgbm 모델링
    lgb_dtrain = lgb.Dataset(data = X, label = y) 
    lgb_dvalid = lgb.Dataset(data = X_test, label = y_test) 

    lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)

    # 가장 낮은 변수들 중 1개 삭제
    remove_col = [col for col, importance in zip(list(col_dict.keys()), lgb_model.feature_importance()) if importance == min(lgb_model.feature_importance())][-1]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
hitter_lgbm = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'lgbm순위':[i for i in range(len(col_dict_origin), 0, -1)]})

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 16069.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 

Early stopping, best iteration is:
[151]	valid_0's rmse: 16666.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[159]	valid_0's rmse: 16594.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[158]	valid_0's rmse: 16034
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[149]	valid_0's rmse: 16525.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[149]	valid_0's rmse: 16358
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[337]	valid_0's rmse: 16235.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[423]	valid_0's rmse: 16514.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[277]	valid_0's rmse: 16963
Training until validation scores don't improve for 1

In [6]:
# train, test 분할
col_dict, X, y = Xy_split(hitter)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
xgb_final_param = {
          "objective" : 'reg:squarederror',
          "random_state" : 42,
          "verbosity" : 0
          }

remove_cols = []

while True:
    
    print(X.shape[1])

    # xgb 모델링
    xgb_dtrain = xgb.DMatrix(data = X, label = y) 
    xgb_dvalid = xgb.DMatrix(data = X_test, label = y_test) 

    xgb_model = xgb.train(params = xgb_final_param, dtrain = xgb_dtrain, num_boost_round = 20000, early_stopping_rounds = 100, verbose_eval = 1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid,'eval')])

    # 가장 낮은 변수들 중 1개 삭제
    remove_col = sorted(xgb_model.get_score().items(),  key = lambda item: item[1])[0][0]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
hitter_xgb = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'xgb순위':[i for i in range(len(col_dict_origin), 0, -1)]})

85
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[116]	train-rmse:340.26849	eval-rmse:16826.00977
84
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[116]	train-rmse:341.34064	eval-rmse:16832.26562
83
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[141]	train-rmse:218.37823	eval-rmse:16831.61133
82
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[142]	train-rmse:214.87942	eval-rmse:16833.35938
81
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[233]	train-rmse:59.84266	eval-rmse:16661.29102
80
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[186]	train-rmse:103.28033	eval-rmse:16630.18945
79
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[205]	train-rmse:77.08006	eval-rmse:16638.30273
78
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[121]	train-rmse:327.47858	eval-rmse:16798.95312
77
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[143]	train-rmse:238.16538	eval-rmse:16591.86914
76
[0]	train-rmse:27527.58594	eval-rmse:28039.86719
[204]	train-rmse:82.33493	eval-rmse:16821

[111]	train-rmse:1776.58862	eval-rmse:15908.44141
3
[0]	train-rmse:28031.76562	eval-rmse:28224.58789
[128]	train-rmse:2119.26221	eval-rmse:14853.97461
2
[0]	train-rmse:28336.88477	eval-rmse:28727.57031
[110]	train-rmse:3841.89648	eval-rmse:19776.70703


In [7]:
# train, test 분할
col_dict, X, y = Xy_split(hitter)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
remove_cols = []

while True:
    
    print(X.shape[1])

    # cb 모델링
    cat_cols = []
    cb_dtrain = Pool(data=X, label=y, cat_features=cat_cols)
    cb_dvalid = Pool(data=X_test, label=y_test, cat_features=cat_cols)

    cb_model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)
    cb_model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
    
    # 가장 낮은 변수들 중 1개 삭제
    remove_col = [col for col, importance in zip(list(col_dict.keys()), cb_model.feature_importances_) if importance == min(cb_model.feature_importances_)][-1]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
hitter_cb = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'cb순위':[i for i in range(len(col_dict_origin), 0, -1)]})

85
Learning rate set to 0.009788
0:	learn: 33144.2942830	test: 32448.7293867	best: 32448.7293867 (0)	total: 162ms	remaining: 53m 57s
1000:	learn: 8200.1371800	test: 14417.8767489	best: 14417.8767489 (1000)	total: 7.08s	remaining: 2m 14s
2000:	learn: 5183.7493040	test: 13834.7590057	best: 13834.7590057 (2000)	total: 14s	remaining: 2m 5s
3000:	learn: 3680.4705635	test: 13603.3988834	best: 13603.3988834 (3000)	total: 21s	remaining: 1m 58s
4000:	learn: 2691.8876817	test: 13504.4838907	best: 13503.2596307 (3954)	total: 28.1s	remaining: 1m 52s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13503.25963
bestIteration = 3954

Shrink model to first 3955 iterations.
84
Learning rate set to 0.009788
0:	learn: 33104.5931387	test: 32357.9473044	best: 32357.9473044 (0)	total: 31.2ms	remaining: 10m 23s
1000:	learn: 8303.2371684	test: 14216.8675655	best: 14213.3078089 (995)	total: 7.07s	remaining: 2m 14s
2000:	learn: 5277.6373078	test: 13615.8258086	best: 13615.8258086 (2000)	total:

3000:	learn: 3629.9291062	test: 13685.5283350	best: 13685.5283350 (3000)	total: 23.9s	remaining: 2m 15s
4000:	learn: 2737.5750223	test: 13574.5060609	best: 13574.5060609 (4000)	total: 31.8s	remaining: 2m 7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13562.43166
bestIteration = 4291

Shrink model to first 4292 iterations.
74
Learning rate set to 0.009788
0:	learn: 33120.4250300	test: 32413.9475056	best: 32413.9475056 (0)	total: 38.7ms	remaining: 12m 54s
1000:	learn: 8253.3911501	test: 14387.4589338	best: 14387.4589338 (1000)	total: 7.6s	remaining: 2m 24s
2000:	learn: 5260.3092049	test: 13841.7248330	best: 13841.6440785 (1996)	total: 15.3s	remaining: 2m 17s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13672.58394
bestIteration = 2735

Shrink model to first 2736 iterations.
73
Learning rate set to 0.009788
0:	learn: 33121.0200567	test: 32407.0489957	best: 32407.0489957 (0)	total: 26.4ms	remaining: 8m 47s
1000:	learn: 8180.4309552	test: 14330.8

6000:	learn: 1721.0758407	test: 13430.5405758	best: 13429.5307862 (5982)	total: 40.9s	remaining: 1m 35s
7000:	learn: 1399.5444384	test: 13410.9651143	best: 13410.6304761 (6916)	total: 47.8s	remaining: 1m 28s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13410.63048
bestIteration = 6916

Shrink model to first 6917 iterations.
62
Learning rate set to 0.009788
0:	learn: 33119.8458259	test: 32408.6834095	best: 32408.6834095 (0)	total: 39.1ms	remaining: 13m 2s
1000:	learn: 8234.3961695	test: 14324.9865314	best: 14324.9865314 (1000)	total: 6.75s	remaining: 2m 8s
2000:	learn: 5220.6293922	test: 13644.3321593	best: 13644.3321593 (2000)	total: 13.5s	remaining: 2m 1s
3000:	learn: 3781.1488741	test: 13478.3113348	best: 13477.8809161 (2991)	total: 20.3s	remaining: 1m 55s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13419.99089
bestIteration = 3472

Shrink model to first 3473 iterations.
61
Learning rate set to 0.009788
0:	learn: 33119.6176326	test: 32375.

6000:	learn: 1724.0424759	test: 13452.2948145	best: 13451.7615642 (5974)	total: 37.4s	remaining: 1m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13445.46446
bestIteration = 6367

Shrink model to first 6368 iterations.
51
Learning rate set to 0.009788
0:	learn: 33113.8692349	test: 32389.0873500	best: 32389.0873500 (0)	total: 23.5ms	remaining: 7m 50s
1000:	learn: 8270.5232275	test: 14247.9127032	best: 14247.0657786 (998)	total: 6.19s	remaining: 1m 57s
2000:	learn: 5407.6204977	test: 13692.4945783	best: 13692.4630551 (1999)	total: 12.3s	remaining: 1m 50s
3000:	learn: 3865.9883099	test: 13449.6076186	best: 13448.6897008 (2997)	total: 18.4s	remaining: 1m 44s
4000:	learn: 2881.1802963	test: 13307.5511112	best: 13307.4394948 (3998)	total: 24.5s	remaining: 1m 38s
5000:	learn: 2223.1625426	test: 13262.7253206	best: 13262.7253206 (5000)	total: 30.7s	remaining: 1m 32s
6000:	learn: 1765.8255678	test: 13224.0698512	best: 13224.0698512 (6000)	total: 37s	remaining: 1m 26s
70

Stopped by overfitting detector  (100 iterations wait)

bestTest = 13457.74597
bestIteration = 3186

Shrink model to first 3187 iterations.
39
Learning rate set to 0.009788
0:	learn: 33123.5102869	test: 32373.0587367	best: 32373.0587367 (0)	total: 36.8ms	remaining: 12m 16s
1000:	learn: 8336.4823682	test: 14387.2688031	best: 14387.2688031 (1000)	total: 6.05s	remaining: 1m 54s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13891.19717
bestIteration = 1820

Shrink model to first 1821 iterations.
38
Learning rate set to 0.009788
0:	learn: 33117.6900621	test: 32385.9339716	best: 32385.9339716 (0)	total: 19.7ms	remaining: 6m 33s
1000:	learn: 8269.4668246	test: 14359.9037670	best: 14359.9037670 (1000)	total: 5.57s	remaining: 1m 45s
2000:	learn: 5361.2236416	test: 13952.2011247	best: 13949.8549768 (1999)	total: 12.1s	remaining: 1m 48s
3000:	learn: 3794.5536213	test: 13757.0346607	best: 13755.5238352 (2991)	total: 17.8s	remaining: 1m 41s
4000:	learn: 2815.0265069	test: 13643

3000:	learn: 3968.2094390	test: 13712.1650086	best: 13712.1650086 (3000)	total: 10.8s	remaining: 1m 1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13597.6221
bestIteration = 3899

Shrink model to first 3900 iterations.
26
Learning rate set to 0.009788
0:	learn: 33106.8102682	test: 32400.7748498	best: 32400.7748498 (0)	total: 12.9ms	remaining: 4m 18s
1000:	learn: 8494.6706541	test: 14426.5500655	best: 14426.0042286 (998)	total: 3.59s	remaining: 1m 8s
2000:	learn: 5633.6689524	test: 13821.5063574	best: 13819.4884841 (1987)	total: 7.09s	remaining: 1m 3s
3000:	learn: 4001.8402452	test: 13624.7286704	best: 13624.1851787 (2982)	total: 10.6s	remaining: 1m
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13550.25874
bestIteration = 3765

Shrink model to first 3766 iterations.
25
Learning rate set to 0.009788
0:	learn: 33103.0124444	test: 32389.8463059	best: 32389.8463059 (0)	total: 10.9ms	remaining: 3m 38s
1000:	learn: 8483.3276756	test: 14223.4393535	b

15
Learning rate set to 0.009788
0:	learn: 33119.4961771	test: 32407.0175185	best: 32407.0175185 (0)	total: 4.37ms	remaining: 1m 27s
1000:	learn: 8541.2868243	test: 14356.9097572	best: 14356.9097572 (1000)	total: 2.63s	remaining: 49.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13987.46061
bestIteration = 1724

Shrink model to first 1725 iterations.
14
Learning rate set to 0.009788
0:	learn: 33099.8893331	test: 32374.4104196	best: 32374.4104196 (0)	total: 29ms	remaining: 9m 39s
1000:	learn: 8556.6796497	test: 14450.1693312	best: 14450.1693312 (1000)	total: 2.64s	remaining: 50.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 14141.74277
bestIteration = 1785

Shrink model to first 1786 iterations.
13
Learning rate set to 0.009788
0:	learn: 33108.5253361	test: 32405.5849689	best: 32405.5849689 (0)	total: 15.9ms	remaining: 5m 17s
1000:	learn: 8624.9245166	test: 14423.3373215	best: 14422.1352890 (996)	total: 2.6s	remaining: 49.3s
Stopped by overf

In [9]:
# train, test 분할
col_dict, X, y = Xy_split(hitter)

# 변수의 순위도출
model = LinearRegression()
selector = RFE(model, n_features_to_select = 1, step = 1)
selector = selector.fit(X, y)

# 데이터프레임으로 생성
hitter_regression = pd.DataFrame({'변수명':list(col_dict.keys()),
                                   '실제변수명':list(col_dict.values()),
                                   'regression순위':list(selector.ranking_)})

#### 2. 투수

In [11]:
# train, test 분할
col_dict, X, y = Xy_split(pitcher)

# 변수의 순위도출
model = RandomForestRegressor(random_state = 42)
selector = RFE(model, n_features_to_select = 1, step = 1)
selector = selector.fit(X, y)

# 데이터프레임으로 생성
pitcher_rf = pd.DataFrame({'변수명':list(col_dict.keys()),
                  '실제변수명':list(col_dict.values()),
                  'rf순위':list(selector.ranking_)})

In [12]:
# train, test 분할
col_dict, X, y = Xy_split(pitcher)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
lgb_params = {"objective" : "rmse",
             "verbosity" : -1}

remove_cols = []

while True:

    # lgbm 모델링
    lgb_dtrain = lgb.Dataset(data = X, label = y) 
    lgb_dvalid = lgb.Dataset(data = X_test, label = y_test) 

    lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)

    # 가장 낮은 변수들 중 1개 삭제
    remove_col = [col for col, importance in zip(list(col_dict.keys()), lgb_model.feature_importance()) if importance == min(lgb_model.feature_importance())][-1]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
pitcher_lgbm = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'lgbm순위':[i for i in range(len(col_dict_origin), 0, -1)]})

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 14155.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 

Early stopping, best iteration is:
[61]	valid_0's rmse: 14621.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	valid_0's rmse: 14536.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[72]	valid_0's rmse: 14575.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[50]	valid_0's rmse: 15015.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's rmse: 14842.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[395]	valid_0's rmse: 14881.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[72]	valid_0's rmse: 14282.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[128]	valid_0's rmse: 14481.9
Training until validation scores don't improve for 1

In [13]:
# train, test 분할
col_dict, X, y = Xy_split(pitcher)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
xgb_final_param = {
          "objective" : 'reg:squarederror',
          "random_state" : 42,
          "verbosity" : 0
          }

remove_cols = []

while True:
    
    print(X.shape[1])

    # xgb 모델링
    xgb_dtrain = xgb.DMatrix(data = X, label = y) 
    xgb_dvalid = xgb.DMatrix(data = X_test, label = y_test) 

    xgb_model = xgb.train(params = xgb_final_param, dtrain = xgb_dtrain, num_boost_round = 20000, early_stopping_rounds = 100, verbose_eval = 1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid,'eval')])

    # 가장 낮은 변수들 중 1개 삭제
    remove_col = sorted(xgb_model.get_score().items(),  key = lambda item: item[1])[0][0]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
pitcher_xgb = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'xgb순위':[i for i in range(len(col_dict_origin), 0, -1)]})

80
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:285.52805	eval-rmse:15078.06543
79
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:275.94363	eval-rmse:15052.00879
78
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:264.18320	eval-rmse:15084.03711
77
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:260.17914	eval-rmse:15064.01660
76
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:297.44650	eval-rmse:15059.33301
75
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:286.38422	eval-rmse:15070.94043
74
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:286.39770	eval-rmse:15070.93750
73
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:247.07932	eval-rmse:15124.81445
72
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[109]	train-rmse:284.32089	eval-rmse:15124.25879
71
[0]	train-rmse:19822.49023	eval-rmse:22013.53125
[108]	train-rmse:282.48743	eval-rmse:15

In [14]:
# train, test 분할
col_dict, X, y = Xy_split(pitcher)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

col_dict_origin = col_dict.copy()

# RFE 적용
remove_cols = []

while True:
    
    print(X.shape[1])

    # cb 모델링
    cat_cols = []
    cb_dtrain = Pool(data=X, label=y, cat_features=cat_cols)
    cb_dvalid = Pool(data=X_test, label=y_test, cat_features=cat_cols)

    cb_model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)
    cb_model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
    
    # 가장 낮은 변수들 중 1개 삭제
    remove_col = [col for col, importance in zip(list(col_dict.keys()), cb_model.feature_importances_) if importance == min(cb_model.feature_importances_)][-1]
    remove_cols.append(remove_col)
    X.drop(remove_col, axis=1, inplace=True)
    X_test.drop(remove_col, axis=1, inplace=True)
    del col_dict[remove_col]
    
    # while문 종료조건
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])
        break

# 데이터프레임으로 생성
pitcher_cb = pd.DataFrame({'변수명':remove_cols,
                            '실제변수명':[col_dict_origin[key] for key in remove_cols],
                            'cb순위':[i for i in range(len(col_dict_origin), 0, -1)]})

80
Learning rate set to 0.009508
0:	learn: 23571.9276317	test: 25562.8093113	best: 25562.8093113 (0)	total: 11.5ms	remaining: 3m 49s
1000:	learn: 6012.0475898	test: 12300.1169826	best: 12300.1169826 (1000)	total: 6.57s	remaining: 2m 4s
2000:	learn: 3878.6744319	test: 12039.8890787	best: 12038.0696409 (1977)	total: 12.8s	remaining: 1m 55s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11996.6898
bestIteration = 2618

Shrink model to first 2619 iterations.
79
Learning rate set to 0.009508
0:	learn: 23563.5305629	test: 25525.9988497	best: 25525.9988497 (0)	total: 9.79ms	remaining: 3m 15s
1000:	learn: 6000.1740169	test: 12336.3241135	best: 12335.8910386 (999)	total: 6.25s	remaining: 1m 58s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12089.89134
bestIteration = 1491

Shrink model to first 1492 iterations.
78
Learning rate set to 0.009508
0:	learn: 23574.5241304	test: 25564.8189973	best: 25564.8189973 (0)	total: 7.99ms	remaining: 2m 39s
1000:	learn:

2000:	learn: 3734.6708434	test: 12074.5431616	best: 12074.5431616 (2000)	total: 11.7s	remaining: 1m 45s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12012.65297
bestIteration = 2472

Shrink model to first 2473 iterations.
60
Learning rate set to 0.009508
0:	learn: 23573.9067544	test: 25551.4881442	best: 25551.4881442 (0)	total: 10.2ms	remaining: 3m 23s
1000:	learn: 5942.7626160	test: 12251.6100127	best: 12251.6100127 (1000)	total: 5.79s	remaining: 1m 49s
2000:	learn: 3786.9247917	test: 11992.6363464	best: 11991.1368006 (1943)	total: 11.5s	remaining: 1m 43s
3000:	learn: 2685.5299443	test: 11923.3863775	best: 11918.2891956 (2905)	total: 17.2s	remaining: 1m 37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11918.2892
bestIteration = 2905

Shrink model to first 2906 iterations.
59
Learning rate set to 0.009508
0:	learn: 23573.2659852	test: 25562.4439295	best: 25562.4439295 (0)	total: 10.8ms	remaining: 3m 36s
1000:	learn: 5962.6779816	test: 12226.9

1000:	learn: 5949.0875450	test: 12114.8847297	best: 12114.6954867 (998)	total: 4.75s	remaining: 1m 30s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11924.99787
bestIteration = 1519

Shrink model to first 1520 iterations.
42
Learning rate set to 0.009508
0:	learn: 23567.9512573	test: 25538.9516728	best: 25538.9516728 (0)	total: 8.23ms	remaining: 2m 44s
1000:	learn: 5998.1911420	test: 12299.7098770	best: 12299.7098770 (1000)	total: 4.73s	remaining: 1m 29s
2000:	learn: 3732.7368504	test: 12006.5081011	best: 12006.4135354 (1999)	total: 9.43s	remaining: 1m 24s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11944.16322
bestIteration = 2832

Shrink model to first 2833 iterations.
41
Learning rate set to 0.009508
0:	learn: 23555.1860716	test: 25549.3116106	best: 25549.3116106 (0)	total: 7.14ms	remaining: 2m 22s
1000:	learn: 5883.8213769	test: 12288.7038302	best: 12288.7038302 (1000)	total: 4.74s	remaining: 1m 30s
2000:	learn: 3786.7987166	test: 12057.5

1000:	learn: 6161.0250266	test: 12099.2157354	best: 12099.0686062 (998)	total: 3.81s	remaining: 1m 12s
2000:	learn: 3868.6195268	test: 11824.5618161	best: 11821.6849891 (1968)	total: 7.36s	remaining: 1m 6s
3000:	learn: 2778.5727453	test: 11672.8697992	best: 11672.1955358 (2995)	total: 10.8s	remaining: 1m 1s
4000:	learn: 2104.9761565	test: 11592.5786216	best: 11591.9120440 (3990)	total: 14.2s	remaining: 56.6s
5000:	learn: 1676.1304501	test: 11541.9839138	best: 11541.9383617 (4999)	total: 17.5s	remaining: 52.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11523.72867
bestIteration = 5748

Shrink model to first 5749 iterations.
24
Learning rate set to 0.009508
0:	learn: 23557.6316350	test: 25553.8496906	best: 25553.8496906 (0)	total: 7.12ms	remaining: 2m 22s
1000:	learn: 6112.1461054	test: 12420.6438830	best: 12420.6438830 (1000)	total: 3.4s	remaining: 1m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12307.14885
bestIteration = 1236

Shrink mo

12
Learning rate set to 0.009508
0:	learn: 23569.7118973	test: 25556.7354947	best: 25556.7354947 (0)	total: 4.13ms	remaining: 1m 22s
1000:	learn: 5937.6141981	test: 12216.5970179	best: 12216.5970179 (1000)	total: 2.5s	remaining: 47.5s
2000:	learn: 3890.4827946	test: 11840.7488307	best: 11840.4059069 (1998)	total: 4.98s	remaining: 44.8s
3000:	learn: 2992.0819379	test: 11688.0039312	best: 11687.2485081 (2964)	total: 7.48s	remaining: 42.4s
4000:	learn: 2408.2353316	test: 11623.8294486	best: 11622.7492696 (3992)	total: 9.96s	remaining: 39.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11593.92222
bestIteration = 4889

Shrink model to first 4890 iterations.
11
Learning rate set to 0.009508
0:	learn: 23560.0207595	test: 25531.4307811	best: 25531.4307811 (0)	total: 3.8ms	remaining: 1m 15s
1000:	learn: 6027.8402966	test: 12289.8806722	best: 12289.4719701 (992)	total: 2.48s	remaining: 47.1s
2000:	learn: 4092.0535939	test: 12004.8364370	best: 12004.8364370 (2000)	total: 4.

In [15]:
# train, test 분할
col_dict, X, y = Xy_split(pitcher)

# 변수의 순위도출
model = LinearRegression()
selector = RFE(model, n_features_to_select = 1, step = 1)
selector = selector.fit(X, y)

# 데이터프레임으로 생성
pitcher_regression = pd.DataFrame({'변수명':list(col_dict.keys()),
                                   '실제변수명':list(col_dict.values()),
                                   'regression순위':list(selector.ranking_)})

# 데이터프레임 병합

In [22]:
hitter_final = pd.merge(hitter_rf, hitter_lgbm, on=['변수명','실제변수명'], how='left')
hitter_final = pd.merge(hitter_final, hitter_xgb, on=['변수명','실제변수명'], how='left')
hitter_final = pd.merge(hitter_final, hitter_cb, on=['변수명','실제변수명'], how='left')
hitter_final = pd.merge(hitter_final, hitter_regression, on=['변수명','실제변수명'], how='left')
hitter_final

Unnamed: 0,변수명,실제변수명,rf순위,lgbm순위,xgb순위,cb순위,regression순위
0,variable1,데뷔년도,5,11,3,5,53
1,variable2,연도,9,15,11,6,70
2,variable3,타율,10,30,4,32,9
3,variable4,경기,21,13,9,38,82
4,variable5,타석,61,42,13,46,42
...,...,...,...,...,...,...,...
80,variable81,포지션(수비)_우익수,78,81,66,77,57
81,variable82,포지션(수비)_유격수,79,82,84,76,51
82,variable83,포지션(수비)_좌익수,74,83,73,71,36
83,variable84,포지션(수비)_중견수,67,84,71,73,23


In [23]:
pitcher_final = pd.merge(pitcher_rf, pitcher_lgbm, on=['변수명','실제변수명'], how='left')
pitcher_final = pd.merge(pitcher_final, pitcher_xgb, on=['변수명','실제변수명'], how='left')
pitcher_final = pd.merge(pitcher_final, pitcher_cb, on=['변수명','실제변수명'], how='left')
pitcher_final = pd.merge(pitcher_final, pitcher_regression, on=['변수명','실제변수명'], how='left')
pitcher_final

Unnamed: 0,변수명,실제변수명,rf순위,lgbm순위,xgb순위,cb순위,regression순위
0,variable1,데뷔년도,15,3,4,12,32
1,variable2,연도,11,19,9,10,31
2,variable3,평균자책점,30,29,2,19,64
3,variable4,경기,37,39,10,37,27
4,variable5,승리,68,44,39,52,45
...,...,...,...,...,...,...,...
75,variable76,팀명_롯데,46,59,60,40,18
76,variable77,팀명_삼성,42,71,70,45,17
77,variable78,팀명_우리/히어로즈/넥센/키움,78,79,69,75,13
78,variable79,팀명_한화,53,64,62,50,60


In [24]:
hitter_final.to_csv('../변수중요도데이터/RFE_타자.csv', index=False, encoding='utf-8-sig')
pitcher_final.to_csv('../변수중요도데이터/RFE_투수.csv', index=False, encoding='utf-8-sig')