In [None]:
'''
Catboost와 Lightgbm만 실험
'''

# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of cross-validation folds; read as a global by the KFold splitters
# inside lgbm_model and cb_model.
n_splits = 5

# 모델 정의
 1. Lightgbm
 2. Catboost

In [2]:
def Xy_split(dataset):
    """Separate the '연봉' target column from the features and rename the features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains a '연봉' column plus the feature columns.

    Returns
    -------
    tuple
        ``(column_dict, X, y)`` where ``column_dict`` maps each generated name
        ('variable1', 'variable2', ...) back to the original column label,
        ``X`` is ``dataset`` without '연봉' and with the generated column names,
        and ``y`` is ``dataset['연봉']``.
    """
    target = '연봉'
    features = dataset.drop(target, axis=1)

    # generated name -> original label, numbered from 1 in column order
    column_dict = {
        f'variable{position}': label
        for position, label in enumerate(features.columns, start=1)
    }

    # rename() needs the opposite direction: original label -> generated name
    to_generated = {label: alias for alias, label in column_dict.items()}
    X = features.rename(columns=to_generated)
    y = dataset[target]

    return column_dict, X, y

In [3]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and tabulate the validation RMSE per fold.

    The folds come from a shuffled ``KFold`` (``random_state=42``) whose size is
    the module-level ``n_splits``. Each fold trains for up to 20000 rounds with
    early stopping after 100 rounds without improvement on the validation fold.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected positionally.
    data_y : pandas.Series
        Target values, aligned row-for-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        One column ``'lgbm'`` with each fold's best validation RMSE
        (rows ``'cv1'`` .. ``'cvN'``) followed by their mean (row ``'평균'``).
    """

    # LightGBM settings
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    lgb_models = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        print(f'\n\n ============================ {step} ============================')

        # KFold yields row positions, so features and target are both sliced
        # with .iloc; label-based data_y[tr_idx] is only correct on a 0..n-1 index.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train)
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid)

        if hasattr(lgb, 'log_evaluation'):
            # Callback API: the early_stopping_rounds / verbose_eval keyword
            # arguments were removed from lgb.train in LightGBM 4.0.
            lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                                  callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)])
        else:
            # Older LightGBM releases without log_evaluation: keep the legacy call.
            lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                                  early_stopping_rounds=100, verbose_eval=1000)
        lgb_models.append(lgb_model)

    # Summarise over the folds actually trained, so the table follows n_splits
    # instead of assuming exactly five folds.
    fold_scores = [model.best_score['valid_0']['rmse'] for model in lgb_models]
    performance = fold_scores + [np.mean(fold_scores)]
    row_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)] + ['평균']

    output = pd.DataFrame({'lgbm':performance}, index=row_labels)
    return output

In [4]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and tabulate the validation RMSE per fold.

    The folds come from a shuffled ``KFold`` (``random_state=42``) whose size is
    the module-level ``n_splits``. Each fold fits up to 20000 iterations with
    early stopping after 100 rounds and keeps the best iteration.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature matrix; rows are selected positionally. No column is declared
        categorical (``cat_cols`` is empty).
    data_y : pandas.Series
        Target values, aligned row-for-row with ``data_x``.

    Returns
    -------
    pandas.DataFrame
        One column ``'cb'`` with each fold's best validation RMSE
        (rows ``'cv1'`` .. ``'cvN'``) followed by their mean (row ``'평균'``).
    """

    # CatBoost settings: no categorical feature columns are passed to the pools
    cat_cols = []

    cb_models = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so features and target are both sliced
        # with .iloc; label-based data_y[tr_idx] is only correct on a 0..n-1 index.
        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y.iloc[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Named `regressor` so the local does not shadow this function's own name.
        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)

        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        cb_models.append(regressor)

    # Summarise over the folds actually trained, so the table follows n_splits
    # instead of assuming exactly five folds.
    fold_scores = [model.best_score_['validation']['RMSE'] for model in cb_models]
    performance = fold_scores + [np.mean(fold_scores)]
    row_labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)] + ['평균']

    output = pd.DataFrame({'cb':performance}, index=row_labels)
    return output

# train, test 분할 후 모델링

In [8]:
# Read the hitter and pitcher training CSVs and drop the 'ID' / '선수명' columns,
# which are removed before the frames are handed to Xy_split.
hitter = (
    pd.read_csv('../선수데이터(전처리완료)/모델링용ver5/타자(모델링용_원핫인코딩)_train_ver5.csv')
    .drop(columns=['ID', '선수명'])
)
pitcher = (
    pd.read_csv('../선수데이터(전처리완료)/모델링용ver5/투수(모델링용_원핫인코딩)_train_ver5.csv')
    .drop(columns=['ID', '선수명'])
)

#### 1. 타자

In [9]:
# Split the hitter frame into renamed features X and the '연봉' target y,
# then give both a fresh 0..n-1 index.
col_dict, X, y = Xy_split(hitter)
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [10]:
# Cross-validated RMSE of both models on the hitter data, side by side
hitter_performance = pd.concat(
    [evaluate(X, y) for evaluate in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance.round(1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 11891.5


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's rmse: 12566.1


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 14578.5


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's rmse: 11259.1


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 15509.6
Learning rate set to 0.008484
0:	learn: 38425.6131657	test: 33933.8062890	best: 33933.8062890 (0)	total: 142ms	remaining: 47m 21s
1000:	learn: 7902.0155209	test: 12204.1522028	best: 12203.7443544 (998)	total: 5.4s	remaining: 1m 42s
2000:	learn: 5169.4797356	test: 11745.6975500	best: 11745.3104520 (1999)	total: 10.8s	remaining: 1m 37s
Stopped by overfitting d

Unnamed: 0,lgbm,cb
cv1,11891.5,11717.6
cv2,12566.1,10893.6
cv3,14578.5,13778.5
cv4,11259.1,10277.9
cv5,15509.6,13061.0
평균,13160.9,11945.7


#### 2. 투수

In [11]:
# Split the pitcher frame into renamed features X and the '연봉' target y,
# then give both a fresh 0..n-1 index.
col_dict, X, y = Xy_split(pitcher)
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [12]:
# Cross-validated RMSE of both models on the pitcher data, side by side
pitcher_performance = pd.concat(
    [evaluate(X, y) for evaluate in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance.round(1)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[389]	valid_0's rmse: 9860.68


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 9779.54


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[141]	valid_0's rmse: 10326.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[711]	valid_0's rmse: 10457.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[516]	valid_0's rmse: 10317.2
Learning rate set to 0.008079
0:	learn: 28271.6737333	test: 28725.4965323	best: 28725.4965323 (0)	total: 18.5ms	remaining: 6m 10s
1000:	learn: 5772.7679748	test: 11105.5084493	best: 11105.5084493 (1000)	total: 6.56s	remaining: 2m 4s
2000:	learn: 3850.9959901	test: 10572.0459866	best: 10571.7288768 (1999)	total: 12.9s	remaining: 1m 55s
3000:	learn: 2805.7

Unnamed: 0,lgbm,cb
cv1,9860.7,10267.3
cv2,9779.5,8898.5
cv3,10326.2,9403.4
cv4,10457.9,8743.6
cv5,10317.2,9535.2
평균,10148.3,9369.6
