In [None]:
'''
Experiments with CatBoost and LightGBM only.

NOTE(review): the cells visible in this part of the notebook define and run
only the CatBoost model - confirm whether a LightGBM section exists elsewhere.
'''

# 패키지 불러오기

In [1]:
import math

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of K-fold cross-validation splits; read as a global by cb_model,
# which passes it to KFold.
n_splits = 5

# 모델 정의
 1. LightGBM
 2. CatBoost

In [2]:
def Xy_split(dataset):
    """Separate the '연봉' target from the features and anonymise feature names.

    Feature columns are renamed to ``variable1``, ``variable2``, ... following
    their existing left-to-right order in ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing a '연봉' column plus the feature columns.

    Returns
    -------
    tuple
        ``(column_dict, X, y)`` where ``column_dict`` maps each generated
        name back to the original column name, ``X`` is the renamed feature
        frame and ``y`` is the '연봉' column of ``dataset``.
    """
    features = dataset.drop(columns='연봉')
    original_names = list(features.columns)
    generated_names = [f'variable{pos}' for pos in range(1, len(original_names) + 1)]

    # generated -> original, so callers can translate results back.
    column_dict = dict(zip(generated_names, original_names))
    X = features.rename(columns=dict(zip(original_names, generated_names)))
    y = dataset['연봉']

    return column_dict, X, y

In [7]:
def cb_model(data_x, data_y, cat_cols):
    """Cross-validate a CatBoost regressor and report the RMSE of every fold.

    The target is expected on a log10 scale: both the predictions and the
    validation targets are mapped back with ``10 ** value`` before scoring,
    so the reported RMSE is on the un-logged target scale.

    The number of folds comes from the module-level ``n_splits``.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``iloc``.
    data_y : pandas.Series or array-like
        log10-transformed target, aligned row-for-row with ``data_x``.
    cat_cols : list
        Feature columns handed to CatBoost as ``cat_features``.

    Returns
    -------
    pandas.DataFrame
        One column ``'cb'`` indexed by ``cv1`` ... ``cvN`` (one row per fold)
        followed by a ``'평균'`` row holding the mean of the fold RMSEs.
    """
    # KFold yields *positional* indices. Working on a plain array keeps the
    # target selection positional even when data_y carries a non-default
    # index (label-based ``data_y[idx]`` would mis-select or raise there).
    target = np.asarray(data_y)

    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        X_train, y_train = data_x.iloc[tr_idx, :], target[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], target[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Named ``model`` so the local does not shadow this function's name.
        model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        # Undo the log10 transform so the error is measured on the raw scale.
        pred_valid = np.power(10.0, model.predict(X_valid))
        true_valid = np.power(10.0, y_valid)

        # sqrt(MSE) instead of ``squared=False``: that keyword was deprecated
        # and later removed in newer scikit-learn releases.
        fold_rmse.append(float(np.sqrt(mean_squared_error(true_valid, pred_valid))))

    # Build the row labels from the folds actually run, so the summary stays
    # valid when n_splits is changed from 5.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    scores = fold_rmse + [float(np.mean(fold_rmse))]

    output = pd.DataFrame({'cb': scores}, index=labels)
    return output

# train, test 분할 후 모델링

In [5]:
# Locations of the preprocessed training CSVs, relative to this notebook.
hitter_csv = '../선수데이터(전처리완료)/모델링용ver3/타자(모델링용)_train_ver3.csv'
pitcher_csv = '../선수데이터(전처리완료)/모델링용ver3/투수(모델링용)_train_ver3.csv'

hitter = pd.read_csv(hitter_csv)
pitcher = pd.read_csv(pitcher_csv)

#### 1. 타자

In [8]:
# Split the hitter table into anonymised features and the '연봉' target
# (a feature/target split; the folds themselves are made inside cb_model).
col_dict, X, y = Xy_split(hitter)

# Give the features a fresh 0..n-1 index and rebuild the target as a new
# Series of log10 values, which likewise gets a default index.
X = X.reset_index(drop=True)
y = pd.Series([math.log10(value) for value in y])

In [10]:
# Categorical feature columns, given by their generated names
# (col_dict maps each generated name back to the original column).
hitter_cat_cols = ['variable2', 'variable43']
cb_model(X, y, hitter_cat_cols)

Learning rate set to 0.008427
0:	learn: 0.4801442	test: 0.4948057	best: 0.4948057 (0)	total: 152ms	remaining: 50m 42s
1000:	learn: 0.1278861	test: 0.1601379	best: 0.1601379 (1000)	total: 23.5s	remaining: 7m 26s
2000:	learn: 0.1027864	test: 0.1552702	best: 0.1552702 (2000)	total: 46.4s	remaining: 6m 57s
3000:	learn: 0.0836364	test: 0.1530247	best: 0.1530186 (2991)	total: 1m 9s	remaining: 6m 33s
4000:	learn: 0.0699607	test: 0.1522297	best: 0.1522033 (3987)	total: 1m 33s	remaining: 6m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1521930788
bestIteration = 4071

Shrink model to first 4072 iterations.
Learning rate set to 0.008427
0:	learn: 0.4821386	test: 0.4871917	best: 0.4871917 (0)	total: 19.8ms	remaining: 6m 36s
1000:	learn: 0.1307104	test: 0.1595375	best: 0.1595375 (1000)	total: 23.2s	remaining: 7m 20s
2000:	learn: 0.1053635	test: 0.1534860	best: 0.1534740 (1996)	total: 45.8s	remaining: 6m 52s
3000:	learn: 0.0856436	test: 0.1502658	best: 0.1502658 (3000)	to

Unnamed: 0,cb
cv1,19251.891922
cv2,19842.788693
cv3,17762.924229
cv4,19847.017975
cv5,17019.165469
평균,18744.757658


#### 2. 투수

In [11]:
# Split the pitcher table into anonymised features and the '연봉' target
# (a feature/target split; the folds themselves are made inside cb_model).
col_dict, X, y = Xy_split(pitcher)

# Give the features a fresh 0..n-1 index and rebuild the target as a new
# Series of log10 values, which likewise gets a default index.
X = X.reset_index(drop=True)
y = pd.Series([math.log10(value) for value in y])

In [12]:
# Categorical feature column, given by its generated name
# (col_dict maps each generated name back to the original column).
pitcher_cat_cols = ['variable2']
cb_model(X, y, pitcher_cat_cols)

Learning rate set to 0.008028
0:	learn: 0.4237537	test: 0.3958130	best: 0.3958130 (0)	total: 18.8ms	remaining: 6m 16s
1000:	learn: 0.1421816	test: 0.1692855	best: 0.1692855 (1000)	total: 23.7s	remaining: 7m 29s
2000:	learn: 0.1090712	test: 0.1631510	best: 0.1631510 (2000)	total: 46.7s	remaining: 7m
3000:	learn: 0.0861192	test: 0.1614128	best: 0.1614128 (3000)	total: 1m 10s	remaining: 6m 39s
4000:	learn: 0.0701986	test: 0.1604173	best: 0.1603792 (3984)	total: 1m 35s	remaining: 6m 22s
5000:	learn: 0.0583103	test: 0.1597314	best: 0.1597314 (5000)	total: 1m 59s	remaining: 5m 57s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1593663185
bestIteration = 5365

Shrink model to first 5366 iterations.
Learning rate set to 0.008028
0:	learn: 0.4231758	test: 0.3983022	best: 0.3983022 (0)	total: 18.9ms	remaining: 6m 18s
1000:	learn: 0.1390900	test: 0.1809191	best: 0.1809191 (1000)	total: 23.3s	remaining: 7m 22s
2000:	learn: 0.1041703	test: 0.1758323	best: 0.1758323 (2000)	tota

Unnamed: 0,cb
cv1,15423.045443
cv2,13223.095731
cv3,19822.883367
cv4,15189.11818
cv5,19547.780851
평균,16641.184714
