# 패키지 불러오기

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler


n_splits = 5

# 모델 정의
 1. Random Forest
 2. Lightgbm
 3. Xgboost
 4. Catboost
 5. Knn
 6. Linear regression

In [2]:
def Xy_split(dataset):
    """Separate the salary target from the features and anonymise feature names.

    Args:
        dataset: DataFrame containing the target column '연봉' (salary)
            alongside the feature columns.

    Returns:
        A ``(column_dict, X, y)`` tuple where ``column_dict`` maps each
        generated name (``variable1``, ``variable2``, ...) back to the
        original column name, ``X`` is the feature frame with the generated
        names, and ``y`` is the '연봉' column.
    """
    target = dataset['연봉']
    features = dataset.drop(columns='연봉')

    # Generated names follow the left-to-right order of the feature columns.
    aliases = [f'variable{position}' for position in range(1, features.shape[1] + 1)]
    column_dict = dict(zip(aliases, features.columns))
    features = features.rename(columns=dict(zip(features.columns, aliases)))

    return column_dict, features, target

In [3]:
def rf_model(data_x, data_y):
    """Cross-validate a default RandomForestRegressor and report fold RMSE.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'rf'`` holding the validation RMSE of
        every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by their
        mean in the row ``'평균'``.
    """
    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):
        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

        model = RandomForestRegressor(random_state=42)
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) avoids the `squared=False` keyword, which newer
        # scikit-learn releases deprecate.
        fold_rmse.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    return pd.DataFrame({'rf': fold_rmse + [np.mean(fold_rmse)]}, index=labels)

In [4]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and report fold RMSE.

    Each fold trains for up to 20000 rounds with early stopping after 100
    rounds without improvement on the validation fold; the reported score is
    the booster's best validation RMSE.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'lgbm'`` holding the best validation
        RMSE of every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by
        their mean in the row ``'평균'``.
    """
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        print(f'\n\n ============================ {step} ============================')

        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
        lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

        # NOTE(review): newer LightGBM releases replace the
        # early_stopping_rounds / verbose_eval keywords with callbacks --
        # confirm the installed version before upgrading.
        booster = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)
        fold_rmse.append(booster.best_score['valid_0']['rmse'])

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    return pd.DataFrame({'lgbm': fold_rmse + [np.mean(fold_rmse)]}, index=labels)

In [5]:
def xgb_model(data_x, data_y):
    """Cross-validate an XGBoost regressor and report fold scores.

    Each fold trains for up to 20000 rounds with early stopping after 100
    rounds; the reported score is the booster's ``best_score``.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'xgb'`` holding the best score of
        every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by their
        mean in the row ``'평균'``.
    """
    xgb_final_param = {
          "objective" : 'reg:squarederror',
          "random_state" : 42,
          "verbosity" : 0
          }

    fold_scores = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        xgb_dtrain = xgb.DMatrix(data=X_train, label=y_train)
        xgb_dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

        # The validation set is listed last in `evals`; keep that order so
        # the monitored set stays the same as before.
        booster = xgb.train(params = xgb_final_param, dtrain = xgb_dtrain, num_boost_round = 20000, early_stopping_rounds = 100, verbose_eval = 1000, evals=[(xgb_dtrain, 'train'), (xgb_dvalid,'eval')])
        fold_scores.append(booster.best_score)

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_scores) + 1)] + ['평균']
    return pd.DataFrame({'xgb': fold_scores + [np.mean(fold_scores)]}, index=labels)

In [6]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and report fold RMSE.

    Each fold trains for up to 20000 iterations with early stopping after
    100 rounds and ``use_best_model=True``; the reported score is the model's
    best validation RMSE.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'cb'`` holding the best validation
        RMSE of every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by
        their mean in the row ``'평균'``.
    """
    # No feature is declared categorical.
    cat_cols = []

    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)
        model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        fold_rmse.append(model.best_score_['validation']['RMSE'])

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    return pd.DataFrame({'cb': fold_rmse + [np.mean(fold_rmse)]}, index=labels)

In [7]:
def knn_model(data_x, data_y):
    """Cross-validate a default KNeighborsRegressor and report fold RMSE.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'knn'`` holding the validation RMSE of
        every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by their
        mean in the row ``'평균'``.
    """
    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):
        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

        model = KNeighborsRegressor()
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) avoids the `squared=False` keyword, which newer
        # scikit-learn releases deprecate.
        fold_rmse.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    return pd.DataFrame({'knn': fold_rmse + [np.mean(fold_rmse)]}, index=labels)

In [8]:
def regression_model(data_x, data_y):
    """Cross-validate an ordinary LinearRegression and report fold RMSE.

    Args:
        data_x: Feature DataFrame; rows are selected positionally.
        data_y: Target Series aligned row-for-row with ``data_x``.

    Returns:
        A one-column DataFrame named ``'regression'`` holding the validation
        RMSE of every fold (``cv1`` .. ``cvK``, K = ``n_splits``) followed by
        their mean in the row ``'평균'``.
    """
    fold_rmse = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):
        # KFold yields positional indices, so use .iloc for the target as
        # well; label-based lookup only works with a default RangeIndex.
        X_train, y_train = data_x.iloc[tr_idx, :], data_y.iloc[tr_idx]
        X_valid, y_valid = data_x.iloc[val_idx, :], data_y.iloc[val_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)

        pred_valid = model.predict(X_valid)
        # sqrt(MSE) avoids the `squared=False` keyword, which newer
        # scikit-learn releases deprecate.
        fold_rmse.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))

    # Derive the row labels from the number of folds actually run instead of
    # assuming five.
    labels = [f'cv{fold}' for fold in range(1, len(fold_rmse) + 1)] + ['평균']
    return pd.DataFrame({'regression': fold_rmse + [np.mean(fold_rmse)]}, index=labels)

# train, test 분할 후 모델링

In [9]:
# Load the modelling CSVs: first path -> hitter, second path -> pitcher.
hitter, pitcher = (
    pd.read_csv(csv_path)
    for csv_path in ('../선수데이터(전처리완료)/타자(모델링용).csv',
                     '../선수데이터(전처리완료)/투수(모델링용).csv')
)

#### 1. 타자

In [10]:
# Split the hitter data into train and test sets (75% / 25%).
col_dict, X, y = Xy_split(hitter)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

# Fit the scaler on the training features only, then apply the same
# transform to the test features so both sets share one scale. Previously
# X_test was left unscaled.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = list(col_dict))
X_test = pd.DataFrame(scaler.transform(X_test), columns = list(col_dict))

# The rebuilt feature frames already carry a fresh 0..n-1 index; reset the
# targets so they line up with them row-for-row.
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [11]:
# Hitter data: run each model's cross-validation in turn and place the
# resulting per-fold score columns side by side.
hitter_performance = pd.concat(
    [evaluate(X, y) for evaluate in (rf_model,
                                     lgbm_model,
                                     xgb_model,
                                     cb_model,
                                     knn_model,
                                     regression_model)],
    axis=1,
)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[487]	valid_0's rmse: 18891.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 15045.3


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[12]	valid_0's rmse: 14796.4


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's rmse: 11647


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	valid_0's rmse: 14305.1
[0]	train-rmse:26942.34766	eval-rmse:31251.62695
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
Stopping. Best iteration:
[52]	train-rmse:1003.94617	eval-rmse:19264.77344

[0]	train-rmse:27199.75781	eval-rmse:30016.45312
Multiple eval metrics have been passed: 'e

#### 2. 투수

In [12]:
# Split the pitcher data into train and test sets (75% / 25%).
col_dict, X, y = Xy_split(pitcher)
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True, random_state = 42)

# Fit the scaler on the training features only, then apply the same
# transform to the test features so both sets share one scale. Previously
# X_test was left unscaled.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = list(col_dict))
X_test = pd.DataFrame(scaler.transform(X_test), columns = list(col_dict))

# The rebuilt feature frames already carry a fresh 0..n-1 index; reset the
# targets so they line up with them row-for-row.
y = y.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [13]:
# Pitcher data: run each model's cross-validation in turn and place the
# resulting per-fold score columns side by side.
pitcher_performance = pd.concat(
    [evaluate(X, y) for evaluate in (rf_model,
                                     lgbm_model,
                                     xgb_model,
                                     cb_model,
                                     knn_model,
                                     regression_model)],
    axis=1,
)



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's rmse: 10730.9


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[628]	valid_0's rmse: 12551.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[412]	valid_0's rmse: 13600.1


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's rmse: 12558.6


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[160]	valid_0's rmse: 10102.8
[0]	train-rmse:20657.09961	eval-rmse:18609.31836
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
Stopping. Best iteration:
[98]	train-rmse:277.40286	eval-rmse:11883.29590

[0]	train-rmse:19289.38281	eval-rmse:25022.93750
Multiple eval metrics have been passed

In [14]:
# Display the hitter cross-validation score table (one column per model).
hitter_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,19070.752968,18891.194408,19264.773438,17979.416372,24337.560073,19692.186652
cv2,14941.931704,15045.293502,14374.276367,14083.47336,23869.636846,20553.774759
cv3,14194.929702,14796.443613,12914.292969,13889.671732,18950.389048,16946.954584
cv4,13562.11204,11646.97464,13683.847656,11544.448027,21744.677592,18163.85316
cv5,15382.064431,14305.121554,16765.609375,15193.107115,23965.53707,19420.043727
평균,15430.358169,14937.005543,15400.559961,14538.023321,22573.560126,18955.362576


In [15]:
# Display the pitcher cross-validation score table (one column per model).
pitcher_performance

Unnamed: 0,rf,lgbm,xgb,cb,knn,regression
cv1,11750.868945,10730.933865,11883.295898,10205.594091,15398.243114,14432.35
cv2,14632.571644,12551.162745,14383.013672,14335.629934,22151.796596,18497.29
cv3,14644.404225,13600.09531,16255.125977,13841.024346,17778.868518,17642.95
cv4,12720.788436,12558.630105,10807.875977,12480.209187,19677.000611,18019.29
cv5,12153.584686,10102.819844,12778.478516,10587.513086,18446.218682,5297658000000.0
평균,13180.443587,11908.728374,13221.558008,12289.994129,18690.425504,1059532000000.0
