In [None]:
'''
Catboost와 Lightgbm만 실험
'''

# 패키지 불러오기

In [1]:
import math
import time

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


n_splits = 5

# 함수 정의

In [2]:
# 시간가중치 생성
def make_exponential_weight(length, a):
    return [a**(i-1) for i in range(length, 0, -1)]

##################################################################################################
##################################################################################################

# 단순 평균
def simple_creation(dataset, n_year, feature):

    final_dataset = pd.DataFrame()
    
    # ID별로 접근
    for ID in dataset['ID'].unique():

        tmp = dataset.loc[dataset['ID'] == ID].sort_values(by='연도').reset_index(drop=True)
        
        # 파생변수 생성
        for i in range(tmp.shape[0]):
            tmp.loc[i, f'{n_year}년평균{feature}'] = tmp.loc[i-n_year+1:i, feature].mean()
            
        final_dataset = pd.concat([final_dataset, tmp]).reset_index(drop=True)

    return final_dataset

##################################################################################################
##################################################################################################

# 가중시간 평균
def time_weighted_creation(dataset, n_year, feature, alpha):

    final_dataset = pd.DataFrame()
    
    # ID별로 접근
    for ID in dataset['ID'].unique():

        tmp = dataset[dataset['ID'] == ID].sort_values(by='연도').reset_index(drop=True)
        
        # 파생변수 생성
        for i in range(tmp.shape[0]):
            
            value = np.array(tmp.loc[i-n_year+1:i, feature])
            weight = np.array(make_exponential_weight(length = value.shape[0], 
                                                      a = alpha))
            tmp.loc[i, f'{n_year}년평균{feature}'] = np.matmul(value, weight) / np.sum(weight)
            
        final_dataset = pd.concat([final_dataset, tmp]).reset_index(drop=True)

    return final_dataset

##################################################################################################
##################################################################################################

# 가중타석(또는 이닝) 평균
def play_weighted_creation(dataset, n_year, feature, criteria):

    final_dataset = pd.DataFrame()
    
    # ID별로 접근
    for ID in dataset['ID'].unique():

        tmp = dataset[dataset['ID'] == ID].sort_values(by='연도').reset_index(drop=True)
        
        # 파생변수 생성
        for i in range(tmp.shape[0]):
            
            value = np.array(tmp.loc[i-n_year+1:i, feature])
            weight = np.array(tmp.loc[i-n_year+1:i, criteria])
            
            tmp.loc[i, f'{n_year}년평균{feature}'] = np.matmul(value, weight) / np.sum(weight)
            
        final_dataset = pd.concat([final_dataset, tmp]).reset_index(drop=True)

    return final_dataset

##################################################################################################
##################################################################################################

# 시간 + 타석(또는 이닝) 평균
def timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha):

    final_dataset = pd.DataFrame()
    
    # ID별로 접근
    for ID in dataset['ID'].unique():

        tmp = dataset[dataset['ID'] == ID].sort_values(by='연도').reset_index(drop=True)
        
        # 파생변수 생성
        for i in range(tmp.shape[0]):
            
            value = np.array(tmp.loc[i-n_year+1:i, feature])
            play_weight = np.array(tmp.loc[i-n_year+1:i, criteria])
            time_weight = np.array(make_exponential_weight(length = value.shape[0], 
                                                          a = alpha))
            
            tmp.loc[i, f'{n_year}년평균{feature}'] = np.matmul(np.multiply(value, play_weight), time_weight) / np.matmul(play_weight, time_weight)
                        
        final_dataset = pd.concat([final_dataset, tmp]).reset_index(drop=True)

    return final_dataset

##################################################################################################
##################################################################################################

# 종합 함수
def make_average_variable(dataset, n_year, feature, criteria, alpha, how):
    
    if how == 'simple':
        return simple_creation(dataset, n_year, feature)
    
    elif how == 'time_weighted':
        return time_weighted_creation(dataset, n_year, feature, alpha)
    
    elif how == 'play_weighted':
        return play_weighted_creation(dataset, n_year, feature, criteria)
    
    elif how == 'time_and_play_weighted':
        return timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha)

In [3]:
def Xy_split(dataset):
    
    X = dataset.drop('연봉', axis=1)
    column_dict = {f'variable{idx+1}':col for idx, col in enumerate(X.columns)}
    X = X.rename(columns = {col:f'variable{idx+1}' for idx, col in enumerate(X.columns)})
    y = dataset['연봉']
    
    return column_dict, X, y

In [4]:
def lgbm_model(data_x, data_y):

    # lgbm 모델링
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):  

        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train) 
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid) 

        lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)
        pred_valid = lgb_model.predict(X_valid)
        rmse = mean_squared_error(y_valid, pred_valid, squared=False)

        performance.append(rmse)

    # lgbm 성능종합
    performance.append(np.mean(performance))

    output = pd.DataFrame({'lgbm':performance}, index=['cv1','cv2','cv3','cv4','cv5','평균'])
    return output

In [5]:
def cb_model(data_x, data_y):

    # cb 모델링
    cat_cols = []
    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):  

        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        cb_model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)

        cb_model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        pred_valid = cb_model.predict(X_valid)
        rmse = mean_squared_error(y_valid, pred_valid, squared=False)

        performance.append(rmse)
        
    # cb 성능종합
    performance.append(np.mean(performance))

    output = pd.DataFrame({'cb':performance}, index=['cv1','cv2','cv3','cv4','cv5','평균'])
    return output

# 데이터 불러오기 및 파생변수를 생성할 변수 선정

In [2]:
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/타자(모델링용_원핫인코딩)_train_ver3.csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/투수(모델링용_원핫인코딩)_train_ver3.csv')

hitter.shape, pitcher.shape

((3033, 88), (2345, 83))

In [7]:
# 타자 변수들 중 누적변수를 생성할 변수 추리기
hitter_columns = list(hitter.columns)
hitter_remove_cols = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '누적타석', '출생연도', '출생월', '출생일', '나이', 
                  '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성',
                  '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대', '포지션(수비)_1루수', '포지션(수비)_2루수',
                  '포지션(수비)_3루수', '포지션(수비)_수비기록없음', '포지션(수비)_우익수', '포지션(수비)_유격수',
                  '포지션(수비)_좌익수', '포지션(수비)_중견수', '포지션(수비)_포수']

for hitter_remove_col in hitter_remove_cols:
    hitter_columns.remove(hitter_remove_col)
    
    

# 투수 변수들 중 누적변수를 생성할 변수 추리기
pitcher_columns = list(pitcher.columns)
pitcher_remove_cols = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '누적이닝', '출생연도', '출생월', '출생일', '나이',
                      '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성', 
                      '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대']

for pitcher_remove_col in pitcher_remove_cols:
    pitcher_columns.remove(pitcher_remove_col)
    

print(f'타자 파생변수 개수: {len(hitter_columns)}')
print(f'투수 파생변수 개수: {len(pitcher_columns)}')

타자 파생변수 개수: 58
투수 파생변수 개수: 62


# 파생변수 생성 (3년, 0.9)

In [8]:
YEAR = 3
ALPHA = 0.9

In [9]:
# simple 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    if idx == 0:
        hitter_simple = tmp.copy()
    else:
        hitter_simple = pd.concat([hitter_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    if idx == 0:
        pitcher_simple = tmp.copy()
    else:
        pitcher_simple = pd.concat([pitcher_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 59초
20/58개 생성 완료!! == 경과시간: 117초
30/58개 생성 완료!! == 경과시간: 175초
40/58개 생성 완료!! == 경과시간: 233초
50/58개 생성 완료!! == 경과시간: 291초
10/62개 생성 완료!! == 경과시간: 388초
20/62개 생성 완료!! == 경과시간: 439초
30/62개 생성 완료!! == 경과시간: 490초
40/62개 생성 완료!! == 경과시간: 540초
50/62개 생성 완료!! == 경과시간: 595초
60/62개 생성 완료!! == 경과시간: 653초


In [10]:
# 시간가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    if idx == 0:
        hitter_time_weighted = tmp.copy()
    else:
        hitter_time_weighted = pd.concat([hitter_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    if idx == 0:
        pitcher_time_weighted = tmp.copy()
    else:
        pitcher_time_weighted = pd.concat([pitcher_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 65초
20/58개 생성 완료!! == 경과시간: 131초
30/58개 생성 완료!! == 경과시간: 196초
40/58개 생성 완료!! == 경과시간: 258초
50/58개 생성 완료!! == 경과시간: 324초
10/62개 생성 완료!! == 경과시간: 431초
20/62개 생성 완료!! == 경과시간: 491초
30/62개 생성 완료!! == 경과시간: 558초
40/62개 생성 완료!! == 경과시간: 615초
50/62개 생성 완료!! == 경과시간: 669초
60/62개 생성 완료!! == 경과시간: 719초


In [11]:
# 타석(또는 이닝)가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    if idx == 0:
        hitter_play_weighted = tmp.copy()
    else:
        hitter_play_weighted = pd.concat([hitter_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    if idx == 0:
        pitcher_play_weighted = tmp.copy()
    else:
        pitcher_play_weighted = pd.concat([pitcher_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 67초
20/58개 생성 완료!! == 경과시간: 127초
30/58개 생성 완료!! == 경과시간: 188초
40/58개 생성 완료!! == 경과시간: 250초
50/58개 생성 완료!! == 경과시간: 309초
10/62개 생성 완료!! == 경과시간: 408초
20/62개 생성 완료!! == 경과시간: 457초
30/62개 생성 완료!! == 경과시간: 508초
40/62개 생성 완료!! == 경과시간: 558초
50/62개 생성 완료!! == 경과시간: 696초
60/62개 생성 완료!! == 경과시간: 852초


In [15]:
# 시간 + 타석(또는 이닝) 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        hitter_time_and_play_weighted = tmp.copy()
    else:
        hitter_time_and_play_weighted = pd.concat([hitter_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        pitcher_time_and_play_weighted = tmp.copy()
    else:
        pitcher_time_and_play_weighted = pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 185초
20/58개 생성 완료!! == 경과시간: 366초
30/58개 생성 완료!! == 경과시간: 551초
40/58개 생성 완료!! == 경과시간: 736초
50/58개 생성 완료!! == 경과시간: 921초
10/62개 생성 완료!! == 경과시간: 1223초
20/62개 생성 완료!! == 경과시간: 1306초
30/62개 생성 완료!! == 경과시간: 1355초
40/62개 생성 완료!! == 경과시간: 1404초
50/62개 생성 완료!! == 경과시간: 1453초
60/62개 생성 완료!! == 경과시간: 1502초


In [16]:
print(f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}')
print(f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}')
print(f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}')
print(f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}')

simple방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간 + 타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)


#### 1. simple 방식

In [17]:
# 타자
col_dict, X, y = Xy_split(hitter_simple.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[297]	valid_0's rmse: 11328.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11417.3
[2000]	valid_0's rmse: 11390.5
[3000]	valid_0's rmse: 11389.4
Early stopping, best iteration is:
[3863]	valid_0's rmse: 11389.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's rmse: 8200.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 11031.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 10274.3
Learning rate set to 0.008427
0:	learn: 37108.5237112	test: 41538.0870045	best: 41538.0870045 (0)	total: 159ms	remaining: 53m 3s
1000:	learn: 6310.3752160	test: 11824.5726422	best: 11824.5726422 (1000)	total: 11.5s	remaining: 3m 38s
2000:	learn: 3917.7568179	test: 11156.628

Unnamed: 0,lgbm,cb
cv1,11328.4,10843.0
cv2,11389.3,10777.2
cv3,8200.9,8142.9
cv4,11031.2,11235.9
cv5,10274.3,9463.2
평균,10444.9,10092.4


In [18]:
# 투수
col_dict, X, y = Xy_split(pitcher_simple.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[441]	valid_0's rmse: 8964.33
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	valid_0's rmse: 7044.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's rmse: 10988
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 9747.62
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[455]	valid_0's rmse: 9600.22
Learning rate set to 0.008028
0:	learn: 29673.9063383	test: 23850.5818249	best: 23850.5818249 (0)	total: 13.7ms	remaining: 4m 34s
1000:	learn: 3786.7658013	test: 9219.0281232	best: 9219.0281232 (1000)	total: 13.6s	remaining: 4m 17s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 8992.22603
bestIteration = 1797

Shrink model to first 1798 iteration

Unnamed: 0,lgbm,cb
cv1,8964.3,8992.2
cv2,7044.5,7127.7
cv3,10988.0,8642.5
cv4,9747.6,7825.3
cv5,9600.2,8916.9
평균,9268.9,8300.9


#### 2. time_weighted 방식

In [19]:
# 타자
col_dict, X, y = Xy_split(hitter_time_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[649]	valid_0's rmse: 11228.6
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 10727.5
[2000]	valid_0's rmse: 10680.9
[3000]	valid_0's rmse: 10678.1
[4000]	valid_0's rmse: 10678
[5000]	valid_0's rmse: 10678
[6000]	valid_0's rmse: 10678
[7000]	valid_0's rmse: 10678
[8000]	valid_0's rmse: 10678
[9000]	valid_0's rmse: 10678
Early stopping, best iteration is:
[9142]	valid_0's rmse: 10678
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 8422.67
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 9496.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 9677.92
Learning rate set to 0.008427
0:	learn: 37106.8885600	test: 41545.0382450	best: 41545.0382450 (0)	tota

Unnamed: 0,lgbm,cb
cv1,11228.6,10439.4
cv2,10678.0,10622.6
cv3,8422.7,7702.3
cv4,9496.3,10896.7
cv5,9677.9,8993.0
평균,9900.7,9730.8


In [20]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	valid_0's rmse: 8826.21
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 6507.61
[2000]	valid_0's rmse: 6503.46
[3000]	valid_0's rmse: 6503.24
[4000]	valid_0's rmse: 6503.24
[5000]	valid_0's rmse: 6503.24
[6000]	valid_0's rmse: 6503.24
Early stopping, best iteration is:
[6375]	valid_0's rmse: 6503.24
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[191]	valid_0's rmse: 9862.07
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 9641.89
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[261]	valid_0's rmse: 8827.01
Learning rate set to 0.008028
0:	learn: 29648.5388936	test: 23844.8975417	best: 23844.8975417 (0)	total: 15.2ms	remaining: 5m 3s
1000:	learn: 3586.4002191	test: 8865.7605107	best:

Unnamed: 0,lgbm,cb
cv1,8826.2,8664.5
cv2,6503.2,6809.1
cv3,9862.1,8793.5
cv4,9641.9,7582.0
cv5,8827.0,8527.9
평균,8732.1,8075.4


#### 3. play_weighted 방식

In [21]:
# 타자
col_dict, X, y = Xy_split(hitter_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[437]	valid_0's rmse: 11230.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11242.7
[2000]	valid_0's rmse: 11227.7
Early stopping, best iteration is:
[2514]	valid_0's rmse: 11227.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 9040.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[278]	valid_0's rmse: 11360.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 10437.8
Learning rate set to 0.008427
0:	learn: 37112.5824045	test: 41546.5651690	best: 41546.5651690 (0)	total: 17.1ms	remaining: 5m 42s
1000:	learn: 6299.6923399	test: 11982.9151386	best: 11982.9151386 (1000)	total: 16.3s	remaining: 5m 10s
2000:	learn: 4004.7080212	test: 11356.1004759	best: 11356.1004759 (200

Unnamed: 0,lgbm,cb
cv1,11230.7,10994.8
cv2,11227.4,11353.7
cv3,9040.9,8413.0
cv4,11360.9,11572.1
cv5,10437.8,10312.0
평균,10659.5,10529.1


In [22]:
# 투수
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[80]	valid_0's rmse: 11391.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7361.44
Early stopping, best iteration is:
[1367]	valid_0's rmse: 7359.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's rmse: 10164.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 10228.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[126]	valid_0's rmse: 10651.6
Learning rate set to 0.008028
0:	learn: 29674.6338149	test: 23854.9604508	best: 23854.9604508 (0)	total: 21.7ms	remaining: 7m 14s
1000:	learn: 3619.8123919	test: 10647.8073914	best: 10647.8073914 (1000)	total: 19.2s	remaining: 6m 3s
2000:	learn: 2269.0990466	test: 10453.4077424	best: 10451.1277378 (1964)	total: 35.9s	remaining: 5m 22

Unnamed: 0,lgbm,cb
cv1,11391.1,10443.3
cv2,7359.5,6882.4
cv3,10164.1,8491.0
cv4,10228.9,8347.6
cv5,10651.6,10793.1
평균,9959.0,8991.5


#### 4. time_and_play weighted 방식

In [23]:
# 타자
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[451]	valid_0's rmse: 10710.8
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11248.6
[2000]	valid_0's rmse: 11211.1
[3000]	valid_0's rmse: 11209.2
[4000]	valid_0's rmse: 11209
[5000]	valid_0's rmse: 11209
Early stopping, best iteration is:
[5365]	valid_0's rmse: 11209
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 8804.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's rmse: 10894.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's rmse: 9489.84
Learning rate set to 0.008427
0:	learn: 37109.4978450	test: 41548.8575575	best: 41548.8575575 (0)	total: 24.8ms	remaining: 8m 15s
1000:	learn: 6051.2445763	test: 11511.9273722	best: 11511.9273722 (1000)	total: 21s	rem

Unnamed: 0,lgbm,cb
cv1,10710.8,10725.4
cv2,11209.0,10471.4
cv3,8804.9,8064.4
cv4,10894.3,11178.1
cv5,9489.8,9358.1
평균,10221.8,9959.5


In [25]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[84]	valid_0's rmse: 11133.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[335]	valid_0's rmse: 7651.99
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[112]	valid_0's rmse: 10286.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[26]	valid_0's rmse: 9978.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[188]	valid_0's rmse: 10440.4
Learning rate set to 0.008028
0:	learn: 29673.3911983	test: 23844.3634666	best: 23844.3634666 (0)	total: 11.3ms	remaining: 3m 45s
1000:	learn: 3512.5566456	test: 10191.7151148	best: 10191.7151148 (1000)	total: 17.7s	remaining: 5m 36s
2000:	learn: 2160.6494569	test: 10038.2096524	best: 10035.2688060 (1976)	total: 36.5s	remaining: 5m 28s
3000:	learn: 1564.5204638	test

Unnamed: 0,lgbm,cb
cv1,11133.6,9995.6
cv2,7652.0,6742.3
cv3,10286.2,8314.3
cv4,9978.8,8188.5
cv5,10440.4,10149.2
평균,9898.2,8678.0


# 파생변수 생성 (4년, 0.9)

In [8]:
YEAR = 4
ALPHA = 0.9

In [9]:
# simple 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    if idx == 0:
        hitter_simple = tmp.copy()
    else:
        hitter_simple = pd.concat([hitter_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    if idx == 0:
        pitcher_simple = tmp.copy()
    else:
        pitcher_simple = pd.concat([pitcher_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 84초
20/58개 생성 완료!! == 경과시간: 158초
30/58개 생성 완료!! == 경과시간: 232초
40/58개 생성 완료!! == 경과시간: 310초
50/58개 생성 완료!! == 경과시간: 396초
10/62개 생성 완료!! == 경과시간: 525초
20/62개 생성 완료!! == 경과시간: 587초
30/62개 생성 완료!! == 경과시간: 651초
40/62개 생성 완료!! == 경과시간: 714초
50/62개 생성 완료!! == 경과시간: 777초
60/62개 생성 완료!! == 경과시간: 840초


In [10]:
# 시간가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    if idx == 0:
        hitter_time_weighted = tmp.copy()
    else:
        hitter_time_weighted = pd.concat([hitter_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    if idx == 0:
        pitcher_time_weighted = tmp.copy()
    else:
        pitcher_time_weighted = pd.concat([pitcher_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 72초
20/58개 생성 완료!! == 경과시간: 148초
30/58개 생성 완료!! == 경과시간: 236초
40/58개 생성 완료!! == 경과시간: 302초
50/58개 생성 완료!! == 경과시간: 457초
10/62개 생성 완료!! == 경과시간: 745초
20/62개 생성 완료!! == 경과시간: 894초
30/62개 생성 완료!! == 경과시간: 1044초
40/62개 생성 완료!! == 경과시간: 1105초
50/62개 생성 완료!! == 경과시간: 1152초
60/62개 생성 완료!! == 경과시간: 1199초


In [11]:
# 타석(또는 이닝)가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    if idx == 0:
        hitter_play_weighted = tmp.copy()
    else:
        hitter_play_weighted = pd.concat([hitter_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    if idx == 0:
        pitcher_play_weighted = tmp.copy()
    else:
        pitcher_play_weighted = pd.concat([pitcher_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 60초
20/58개 생성 완료!! == 경과시간: 119초
30/58개 생성 완료!! == 경과시간: 179초
40/58개 생성 완료!! == 경과시간: 238초
50/58개 생성 완료!! == 경과시간: 298초
10/62개 생성 완료!! == 경과시간: 396초
20/62개 생성 완료!! == 경과시간: 446초
30/62개 생성 완료!! == 경과시간: 496초
40/62개 생성 완료!! == 경과시간: 546초
50/62개 생성 완료!! == 경과시간: 596초
60/62개 생성 완료!! == 경과시간: 646초


In [12]:
# 시간 + 타석(또는 이닝) 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        hitter_time_and_play_weighted = tmp.copy()
    else:
        hitter_time_and_play_weighted = pd.concat([hitter_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        pitcher_time_and_play_weighted = tmp.copy()
    else:
        pitcher_time_and_play_weighted = pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 60초
20/58개 생성 완료!! == 경과시간: 120초
30/58개 생성 완료!! == 경과시간: 180초
40/58개 생성 완료!! == 경과시간: 240초
50/58개 생성 완료!! == 경과시간: 300초
10/62개 생성 완료!! == 경과시간: 400초
20/62개 생성 완료!! == 경과시간: 451초
30/62개 생성 완료!! == 경과시간: 502초
40/62개 생성 완료!! == 경과시간: 553초
50/62개 생성 완료!! == 경과시간: 604초
60/62개 생성 완료!! == 경과시간: 654초


In [13]:
print(f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}')
print(f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}')
print(f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}')
print(f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}')

simple방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간 + 타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)


#### 1. simple 방식

In [14]:
# 타자
col_dict, X, y = Xy_split(hitter_simple.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[357]	valid_0's rmse: 11864.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 12151.8
[2000]	valid_0's rmse: 12131.5
[3000]	valid_0's rmse: 12130.6
[4000]	valid_0's rmse: 12130.6
[5000]	valid_0's rmse: 12130.6
[6000]	valid_0's rmse: 12130.6
[7000]	valid_0's rmse: 12130.6
[8000]	valid_0's rmse: 12130.6
[9000]	valid_0's rmse: 12130.6
[10000]	valid_0's rmse: 12130.6
Early stopping, best iteration is:
[10729]	valid_0's rmse: 12130.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[67]	valid_0's rmse: 8297.22
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's rmse: 11895.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 10775
Learning rate set to 0.008427
0:	learn: 37120.6007117	tes

Unnamed: 0,lgbm,cb
cv1,11864.1,11858.7
cv2,12130.6,11891.5
cv3,8297.2,9378.2
cv4,11895.4,12481.9
cv5,10775.0,10786.3
평균,10992.5,11279.3


In [15]:
# 투수
col_dict, X, y = Xy_split(pitcher_simple.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[179]	valid_0's rmse: 9265.91
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[625]	valid_0's rmse: 9163.15
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's rmse: 10750
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[225]	valid_0's rmse: 9396.17
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[858]	valid_0's rmse: 9797.91
Learning rate set to 0.008028
0:	learn: 29649.7670976	test: 23846.3793283	best: 23846.3793283 (0)	total: 12.8ms	remaining: 4m 15s
1000:	learn: 4140.9788842	test: 9751.6673248	best: 9751.6673248 (1000)	total: 11.7s	remaining: 3m 42s
2000:	learn: 2558.1206848	test: 9493.0344928	best: 9492.7320121 (1991)	total: 23.1s	remaining: 3m 27s
Stopped by overfitting detector  (

Unnamed: 0,lgbm,cb
cv1,9265.9,9426.3
cv2,9163.1,7369.4
cv3,10750.0,9409.2
cv4,9396.2,8274.5
cv5,9797.9,10377.6
평균,9674.6,8971.4


#### 2. time_weighted 방식

In [16]:
# 타자
col_dict, X, y = Xy_split(hitter_time_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[799]	valid_0's rmse: 10993.6
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11439.6
[2000]	valid_0's rmse: 11419.3
[3000]	valid_0's rmse: 11418
[4000]	valid_0's rmse: 11418
[5000]	valid_0's rmse: 11417.9
[6000]	valid_0's rmse: 11417.9
[7000]	valid_0's rmse: 11417.9
[8000]	valid_0's rmse: 11417.9
[9000]	valid_0's rmse: 11417.9
[10000]	valid_0's rmse: 11417.9
Early stopping, best iteration is:
[10544]	valid_0's rmse: 11417.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[352]	valid_0's rmse: 8150.41
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 11897.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 10977.3
Learning rate set to 0.008427
0:	learn: 37109.1024066	test

Unnamed: 0,lgbm,cb
cv1,10993.6,11234.2
cv2,11417.9,11713.1
cv3,8150.4,8668.6
cv4,11897.1,11975.1
cv5,10977.3,10138.7
평균,10687.3,10745.9


In [17]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[675]	valid_0's rmse: 9667.17
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7823.25
[2000]	valid_0's rmse: 7818.21
Early stopping, best iteration is:
[2685]	valid_0's rmse: 7817.89
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[129]	valid_0's rmse: 10471.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[237]	valid_0's rmse: 9530.03
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[346]	valid_0's rmse: 9199.28
Learning rate set to 0.008028
0:	learn: 29670.8101355	test: 23844.5214581	best: 23844.5214581 (0)	total: 14.6ms	remaining: 4m 51s
1000:	learn: 3795.4852932	test: 8930.3166059	best: 8930.3166059 (1000)	total: 15s	remaining: 4m 44s
2000:	learn: 2284.2342209	test: 8613.4382945	best: 8613.3399980 (1996)	t

Unnamed: 0,lgbm,cb
cv1,9667.2,8528.8
cv2,7817.9,7111.4
cv3,10471.9,8512.6
cv4,9530.0,8116.9
cv5,9199.3,9257.5
평균,9337.3,8305.5


#### 3. play_weighted 방식

In [18]:
# 타자
col_dict, X, y = Xy_split(hitter_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[289]	valid_0's rmse: 12547.3
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 12668.8
[2000]	valid_0's rmse: 12642
[3000]	valid_0's rmse: 12640.9
[4000]	valid_0's rmse: 12640.9
[5000]	valid_0's rmse: 12640.9
[6000]	valid_0's rmse: 12640.9
[7000]	valid_0's rmse: 12640.9
[8000]	valid_0's rmse: 12640.9
Early stopping, best iteration is:
[8329]	valid_0's rmse: 12640.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's rmse: 10183.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 12570.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[58]	valid_0's rmse: 11735.4
Learning rate set to 0.008427
0:	learn: 37112.3459290	test: 41542.9974329	best: 41542.9974329 (0)	total: 17ms	remaining: 

Unnamed: 0,lgbm,cb
cv1,12547.3,11927.3
cv2,12640.9,11796.6
cv3,10183.3,9037.4
cv4,12570.4,12160.5
cv5,11735.4,10599.9
평균,11935.5,11104.3


In [19]:
# 투수
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[562]	valid_0's rmse: 9952.81
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[352]	valid_0's rmse: 9355.17
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[103]	valid_0's rmse: 11081.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 10522.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[611]	valid_0's rmse: 12443.3
Learning rate set to 0.008028
0:	learn: 29650.1241555	test: 23830.5660600	best: 23830.5660600 (0)	total: 17.2ms	remaining: 5m 44s
1000:	learn: 4127.7805896	test: 10677.4660656	best: 10677.4660656 (1000)	total: 14.5s	remaining: 4m 36s
2000:	learn: 2485.8464392	test: 10330.2151928	best: 10330.2151928 (2000)	total: 29.1s	remaining: 4m 22s
3000:	learn: 1753.9477110	te

Unnamed: 0,lgbm,cb
cv1,9952.8,10258.0
cv2,9355.2,7728.0
cv3,11081.4,9437.0
cv4,10522.4,8672.6
cv5,12443.3,11820.1
평균,10671.0,9583.1


#### 4. time_and_play weighted 방식

In [20]:
# 타자
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[386]	valid_0's rmse: 10872
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11895.9
[2000]	valid_0's rmse: 11885.4
Early stopping, best iteration is:
[2088]	valid_0's rmse: 11885.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 8802.53
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[141]	valid_0's rmse: 12118.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 10408.5
Learning rate set to 0.008427
0:	learn: 37111.3513930	test: 41546.4895562	best: 41546.4895562 (0)	total: 15.8ms	remaining: 5m 15s
1000:	learn: 6407.9914456	test: 12459.2869590	best: 12459.2869590 (1000)	total: 14.8s	remaining: 4m 41s
2000:	learn: 3943.0865645	test: 11737.2665763	best: 11737.1394713 (1999)

Unnamed: 0,lgbm,cb
cv1,10872.0,11289.7
cv2,11885.2,11441.1
cv3,8802.5,8635.8
cv4,12118.9,12140.2
cv5,10408.5,10222.8
평균,10817.4,10745.9


In [21]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 9666.15
[2000]	valid_0's rmse: 9661.7
Early stopping, best iteration is:
[2569]	valid_0's rmse: 9661.65
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7628.41
Early stopping, best iteration is:
[1023]	valid_0's rmse: 7627.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[68]	valid_0's rmse: 11029.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 10493.8
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 9838.8
Early stopping, best iteration is:
[914]	valid_0's rmse: 9837.95
Learning rate set to 0.008028
0:	learn: 29680.2023044	test: 23862.4217403	best: 23862.4217403 (0)	total: 16.4ms	remaining: 5m 27s
1000:	learn: 3832.0235422	test: 10454.0427984	best: 10454.0427984 (1000)	total: 15.1s	remaining: 4m 46s
Stopped b

Unnamed: 0,lgbm,cb
cv1,9661.6,10225.9
cv2,7627.9,7018.3
cv3,11029.5,9038.1
cv4,10493.8,8647.6
cv5,9837.9,10330.6
평균,9730.2,9052.1


# 파생변수 생성 (5년, 0.9)

In [22]:
YEAR = 5
ALPHA = 0.9

In [23]:
# simple 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    if idx == 0:
        hitter_simple = tmp.copy()
    else:
        hitter_simple = pd.concat([hitter_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    if idx == 0:
        pitcher_simple = tmp.copy()
    else:
        pitcher_simple = pd.concat([pitcher_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 59초
20/58개 생성 완료!! == 경과시간: 118초
30/58개 생성 완료!! == 경과시간: 176초
40/58개 생성 완료!! == 경과시간: 235초
50/58개 생성 완료!! == 경과시간: 293초
10/62개 생성 완료!! == 경과시간: 389초
20/62개 생성 완료!! == 경과시간: 439초
30/62개 생성 완료!! == 경과시간: 488초
40/62개 생성 완료!! == 경과시간: 537초
50/62개 생성 완료!! == 경과시간: 587초
60/62개 생성 완료!! == 경과시간: 636초


In [24]:
# 시간가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    if idx == 0:
        hitter_time_weighted = tmp.copy()
    else:
        hitter_time_weighted = pd.concat([hitter_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    if idx == 0:
        pitcher_time_weighted = tmp.copy()
    else:
        pitcher_time_weighted = pd.concat([pitcher_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 57초
20/58개 생성 완료!! == 경과시간: 114초
30/58개 생성 완료!! == 경과시간: 172초
40/58개 생성 완료!! == 경과시간: 229초
50/58개 생성 완료!! == 경과시간: 286초
10/62개 생성 완료!! == 경과시간: 380초
20/62개 생성 완료!! == 경과시간: 428초
30/62개 생성 완료!! == 경과시간: 477초
40/62개 생성 완료!! == 경과시간: 525초
50/62개 생성 완료!! == 경과시간: 573초
60/62개 생성 완료!! == 경과시간: 621초


In [25]:
# 타석(또는 이닝)가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    if idx == 0:
        hitter_play_weighted = tmp.copy()
    else:
        hitter_play_weighted = pd.concat([hitter_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    if idx == 0:
        pitcher_play_weighted = tmp.copy()
    else:
        pitcher_play_weighted = pd.concat([pitcher_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 61초
20/58개 생성 완료!! == 경과시간: 122초
30/58개 생성 완료!! == 경과시간: 183초
40/58개 생성 완료!! == 경과시간: 244초
50/58개 생성 완료!! == 경과시간: 305초
10/62개 생성 완료!! == 경과시간: 405초
20/62개 생성 완료!! == 경과시간: 457초
30/62개 생성 완료!! == 경과시간: 508초
40/62개 생성 완료!! == 경과시간: 560초
50/62개 생성 완료!! == 경과시간: 611초
60/62개 생성 완료!! == 경과시간: 662초


In [26]:
# 시간 + 타석(또는 이닝) 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        hitter_time_and_play_weighted = tmp.copy()
    else:
        hitter_time_and_play_weighted = pd.concat([hitter_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        pitcher_time_and_play_weighted = tmp.copy()
    else:
        pitcher_time_and_play_weighted = pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 61초
20/58개 생성 완료!! == 경과시간: 122초
30/58개 생성 완료!! == 경과시간: 183초
40/58개 생성 완료!! == 경과시간: 244초
50/58개 생성 완료!! == 경과시간: 305초
10/62개 생성 완료!! == 경과시간: 405초
20/62개 생성 완료!! == 경과시간: 457초
30/62개 생성 완료!! == 경과시간: 508초
40/62개 생성 완료!! == 경과시간: 559초
50/62개 생성 완료!! == 경과시간: 610초
60/62개 생성 완료!! == 경과시간: 661초


In [27]:
print(f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}')
print(f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}')
print(f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}')
print(f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}')

simple방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간 + 타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)


#### 1. simple 방식

In [28]:
# 타자
col_dict, X, y = Xy_split(hitter_simple.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[460]	valid_0's rmse: 12814.2
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11476.9
[2000]	valid_0's rmse: 11458.6
[3000]	valid_0's rmse: 11457.5
[4000]	valid_0's rmse: 11457.4
[5000]	valid_0's rmse: 11457.4
[6000]	valid_0's rmse: 11457.4
[7000]	valid_0's rmse: 11457.4
[8000]	valid_0's rmse: 11457.4
[9000]	valid_0's rmse: 11457.4
[10000]	valid_0's rmse: 11457.4
Early stopping, best iteration is:
[10141]	valid_0's rmse: 11457.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 9414.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 12738.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	valid_0's rmse: 10213.5
Learning rate set to 0.008427
0:	learn: 37135.5919714	te

Unnamed: 0,lgbm,cb
cv1,12814.2,12462.1
cv2,11457.4,12308.4
cv3,9414.4,9819.3
cv4,12738.6,12912.1
cv5,10213.5,10724.4
평균,11327.6,11645.2


In [29]:
# 투수
col_dict, X, y = Xy_split(pitcher_simple.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[882]	valid_0's rmse: 9594.92
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[300]	valid_0's rmse: 9297.18
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	valid_0's rmse: 10259.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[467]	valid_0's rmse: 9479.75
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[230]	valid_0's rmse: 11264
Learning rate set to 0.008028
0:	learn: 29654.9415968	test: 23841.8430864	best: 23841.8430864 (0)	total: 14.8ms	remaining: 4m 55s
1000:	learn: 4248.3875813	test: 9709.8825020	best: 9709.8825020 (1000)	total: 12s	remaining: 3m 47s
2000:	learn: 2563.0706549	test: 9246.8977916	best: 9246.8977916 (2000)	total: 24.2s	remaining: 3m 37s
3000:	learn: 1717.7763545	test: 917

Unnamed: 0,lgbm,cb
cv1,9594.9,9157.8
cv2,9297.2,7805.4
cv3,10259.8,9245.6
cv4,9479.7,8288.9
cv5,11264.0,10910.4
평균,9979.1,9081.6


#### 2. time_weighted 방식

In [30]:
# 타자
col_dict, X, y = Xy_split(hitter_time_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[487]	valid_0's rmse: 11984.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11824.2
[2000]	valid_0's rmse: 11799.6
[3000]	valid_0's rmse: 11798.3
[4000]	valid_0's rmse: 11798.2
[5000]	valid_0's rmse: 11798.2
[6000]	valid_0's rmse: 11798.2
[7000]	valid_0's rmse: 11798.2
[8000]	valid_0's rmse: 11798.2
[9000]	valid_0's rmse: 11798.2
Early stopping, best iteration is:
[9284]	valid_0's rmse: 11798.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's rmse: 9339
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[50]	valid_0's rmse: 11955.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[79]	valid_0's rmse: 10851.9
Learning rate set to 0.008427
0:	learn: 37135.8677848	test: 41587.0081833	best: 41587.00818

Unnamed: 0,lgbm,cb
cv1,11984.7,11600.8
cv2,11798.2,11788.5
cv3,9339.0,9185.6
cv4,11955.6,12460.8
cv5,10851.9,10313.4
평균,11185.9,11069.8


In [31]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[395]	valid_0's rmse: 8621.32
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	valid_0's rmse: 8594.34
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's rmse: 10495.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's rmse: 9400.59
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's rmse: 10955.3
Learning rate set to 0.008028
0:	learn: 29674.2578946	test: 23853.2122213	best: 23853.2122213 (0)	total: 16.2ms	remaining: 5m 23s
1000:	learn: 3932.5500734	test: 9275.8792825	best: 9275.8346091 (999)	total: 13.5s	remaining: 4m 16s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 9084.592766
bestIteration = 1625

Shrink model to first 1626 iterat

Unnamed: 0,lgbm,cb
cv1,8621.3,9084.6
cv2,8594.3,7485.8
cv3,10495.3,8780.6
cv4,9400.6,7968.2
cv5,10955.3,10000.6
평균,9613.4,8664.0


#### 3. play_weighted 방식

In [32]:
# 타자
col_dict, X, y = Xy_split(hitter_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[529]	valid_0's rmse: 12230.2
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 12357.1
[2000]	valid_0's rmse: 12332.5
[3000]	valid_0's rmse: 12331.3
[4000]	valid_0's rmse: 12331.2
[5000]	valid_0's rmse: 12331.2
[6000]	valid_0's rmse: 12331.2
Early stopping, best iteration is:
[6004]	valid_0's rmse: 12331.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 9933.37
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[144]	valid_0's rmse: 11683.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[62]	valid_0's rmse: 10933.2
Learning rate set to 0.008427
0:	learn: 37124.2862838	test: 41565.9431664	best: 41565.9431664 (0)	total: 18.5ms	remaining: 6m 9s
1000:	learn: 7021.7461227	test: 13553.5902358	best:

Unnamed: 0,lgbm,cb
cv1,12230.2,12190.5
cv2,12331.2,12359.5
cv3,9933.4,9870.0
cv4,11683.7,12322.6
cv5,10933.2,11115.0
평균,11422.3,11571.5


In [33]:
# 투수
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11472.2
Early stopping, best iteration is:
[905]	valid_0's rmse: 11471.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[139]	valid_0's rmse: 9213.06
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[153]	valid_0's rmse: 10930.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 10304.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 11399.3
Learning rate set to 0.008028
0:	learn: 29655.9505642	test: 23838.2808557	best: 23838.2808557 (0)	total: 14.6ms	remaining: 4m 51s
1000:	learn: 4166.2108779	test: 11131.2033118	best: 11131.2033118 (1000)	total: 13.8s	remaining: 4m 22s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10905.2971
bestIteration = 1861

S

Unnamed: 0,lgbm,cb
cv1,11471.1,10905.3
cv2,9213.1,8114.0
cv3,10930.1,9675.2
cv4,10304.3,8821.2
cv5,11399.3,10945.9
평균,10663.6,9692.3


#### 4. time_and_play weighted 방식

In [34]:
# 타자
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[337]	valid_0's rmse: 11579.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 12082.4
[2000]	valid_0's rmse: 12055.7
[3000]	valid_0's rmse: 12054.4
[4000]	valid_0's rmse: 12054.4
[5000]	valid_0's rmse: 12054.4
[6000]	valid_0's rmse: 12054.4
Early stopping, best iteration is:
[6449]	valid_0's rmse: 12054.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 9036.58
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's rmse: 12186.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 10730
Learning rate set to 0.008427
0:	learn: 37122.1597017	test: 41560.4317658	best: 41560.4317658 (0)	total: 17.5ms	remaining: 5m 49s
1000:	learn: 6670.7798463	test: 12947.1792879	best: 

Unnamed: 0,lgbm,cb
cv1,11579.4,11577.6
cv2,12054.4,11707.5
cv3,9036.6,9194.4
cv4,12186.5,11863.4
cv5,10730.0,10197.5
평균,11117.4,10908.1


In [35]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's rmse: 10320.6
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8253.07
[2000]	valid_0's rmse: 8249.74
[3000]	valid_0's rmse: 8249.63
[4000]	valid_0's rmse: 8249.63
Early stopping, best iteration is:
[3977]	valid_0's rmse: 8249.63
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[245]	valid_0's rmse: 11162.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 10433.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 11544.9
Learning rate set to 0.008028
0:	learn: 29649.8363969	test: 23829.6273323	best: 23829.6273323 (0)	total: 15.8ms	remaining: 5m 15s
1000:	learn: 4003.3104084	test: 11035.5513051	best: 11034.5365932 (997)	total: 14.8s	remaining: 4m 41s
2000:	le

Unnamed: 0,lgbm,cb
cv1,10320.6,10686.6
cv2,8249.6,7904.2
cv3,11162.1,8979.3
cv4,10433.6,8416.6
cv5,11544.9,10808.7
평균,10342.2,9359.1


# 파생변수 생성 (3년, 0.6)

In [36]:
YEAR = 3
ALPHA = 0.6

In [37]:
# simple 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    if idx == 0:
        hitter_simple = tmp.copy()
    else:
        hitter_simple = pd.concat([hitter_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    if idx == 0:
        pitcher_simple = tmp.copy()
    else:
        pitcher_simple = pd.concat([pitcher_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 60초
20/58개 생성 완료!! == 경과시간: 118초
30/58개 생성 완료!! == 경과시간: 176초
40/58개 생성 완료!! == 경과시간: 235초
50/58개 생성 완료!! == 경과시간: 293초
10/62개 생성 완료!! == 경과시간: 388초
20/62개 생성 완료!! == 경과시간: 438초
30/62개 생성 완료!! == 경과시간: 487초
40/62개 생성 완료!! == 경과시간: 536초
50/62개 생성 완료!! == 경과시간: 585초
60/62개 생성 완료!! == 경과시간: 634초


In [38]:
# 시간가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    if idx == 0:
        hitter_time_weighted = tmp.copy()
    else:
        hitter_time_weighted = pd.concat([hitter_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    if idx == 0:
        pitcher_time_weighted = tmp.copy()
    else:
        pitcher_time_weighted = pd.concat([pitcher_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 57초
20/58개 생성 완료!! == 경과시간: 113초
30/58개 생성 완료!! == 경과시간: 170초
40/58개 생성 완료!! == 경과시간: 226초
50/58개 생성 완료!! == 경과시간: 284초
10/62개 생성 완료!! == 경과시간: 377초
20/62개 생성 완료!! == 경과시간: 425초
30/62개 생성 완료!! == 경과시간: 473초
40/62개 생성 완료!! == 경과시간: 521초
50/62개 생성 완료!! == 경과시간: 569초
60/62개 생성 완료!! == 경과시간: 617초


In [39]:
# 타석(또는 이닝)가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    if idx == 0:
        hitter_play_weighted = tmp.copy()
    else:
        hitter_play_weighted = pd.concat([hitter_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    if idx == 0:
        pitcher_play_weighted = tmp.copy()
    else:
        pitcher_play_weighted = pd.concat([pitcher_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 60초
20/58개 생성 완료!! == 경과시간: 121초
30/58개 생성 완료!! == 경과시간: 181초
40/58개 생성 완료!! == 경과시간: 242초
50/58개 생성 완료!! == 경과시간: 303초
10/62개 생성 완료!! == 경과시간: 402초
20/62개 생성 완료!! == 경과시간: 453초
30/62개 생성 완료!! == 경과시간: 504초
40/62개 생성 완료!! == 경과시간: 555초
50/62개 생성 완료!! == 경과시간: 606초
60/62개 생성 완료!! == 경과시간: 657초


In [40]:
# 시간 + 타석(또는 이닝) 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        hitter_time_and_play_weighted = tmp.copy()
    else:
        hitter_time_and_play_weighted = pd.concat([hitter_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        pitcher_time_and_play_weighted = tmp.copy()
    else:
        pitcher_time_and_play_weighted = pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 61초
20/58개 생성 완료!! == 경과시간: 122초
30/58개 생성 완료!! == 경과시간: 183초
40/58개 생성 완료!! == 경과시간: 243초
50/58개 생성 완료!! == 경과시간: 304초
10/62개 생성 완료!! == 경과시간: 403초
20/62개 생성 완료!! == 경과시간: 454초
30/62개 생성 완료!! == 경과시간: 505초
40/62개 생성 완료!! == 경과시간: 556초
50/62개 생성 완료!! == 경과시간: 607초
60/62개 생성 완료!! == 경과시간: 658초


In [41]:
print(f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}')
print(f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}')
print(f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}')
print(f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}')

simple방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간 + 타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)


#### 1. simple 방식

In [42]:
# 타자
col_dict, X, y = Xy_split(hitter_simple.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[297]	valid_0's rmse: 11328.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11417.3
[2000]	valid_0's rmse: 11390.5
[3000]	valid_0's rmse: 11389.4
Early stopping, best iteration is:
[3863]	valid_0's rmse: 11389.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's rmse: 8200.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 11031.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 10274.3
Learning rate set to 0.008427
0:	learn: 37108.5237112	test: 41538.0870045	best: 41538.0870045 (0)	total: 15.4ms	remaining: 5m 7s
1000:	learn: 6310.3752160	test: 11824.5726422	best: 11824.5726422 (1000)	total: 10.9s	remaining: 3m 26s
2000:	learn: 3917.7568179	test: 11156.628

Unnamed: 0,lgbm,cb
cv1,11328.4,10843.0
cv2,11389.3,10777.2
cv3,8200.9,8142.9
cv4,11031.2,11235.9
cv5,10274.3,9463.2
평균,10444.9,10092.4


In [43]:
# 투수
col_dict, X, y = Xy_split(pitcher_simple.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[441]	valid_0's rmse: 8964.33
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	valid_0's rmse: 7044.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's rmse: 10988
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 9747.62
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[455]	valid_0's rmse: 9600.22
Learning rate set to 0.008028
0:	learn: 29673.9063383	test: 23850.5818249	best: 23850.5818249 (0)	total: 15.2ms	remaining: 5m 3s
1000:	learn: 3786.7658013	test: 9219.0281232	best: 9219.0281232 (1000)	total: 12.1s	remaining: 3m 49s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 8992.22603
bestIteration = 1797

Shrink model to first 1798 iterations

Unnamed: 0,lgbm,cb
cv1,8964.3,8992.2
cv2,7044.5,7127.7
cv3,10988.0,8642.5
cv4,9747.6,7825.3
cv5,9600.2,8916.9
평균,9268.9,8300.9


#### 2. time_weighted 방식

In [44]:
# 타자
col_dict, X, y = Xy_split(hitter_time_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[233]	valid_0's rmse: 9817.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8940.94
[2000]	valid_0's rmse: 8917.2
[3000]	valid_0's rmse: 8916.26
[4000]	valid_0's rmse: 8916.23
[5000]	valid_0's rmse: 8916.23
[6000]	valid_0's rmse: 8916.23
Early stopping, best iteration is:
[6671]	valid_0's rmse: 8916.23
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[89]	valid_0's rmse: 6072.08
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[114]	valid_0's rmse: 8625.17
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[80]	valid_0's rmse: 7775.1
Learning rate set to 0.008427
0:	learn: 37103.7948072	test: 41543.4108404	best: 41543.4108404 (0)	total: 17.3ms	remaining: 5m 46s
1000:	learn: 4685.9067720	test: 9651.7082449	best: 96

Unnamed: 0,lgbm,cb
cv1,9817.7,9052.7
cv2,8916.2,9273.8
cv3,6072.1,5951.8
cv4,8625.2,9089.6
cv5,7775.1,6864.9
평균,8241.3,8046.6


In [45]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 7021.99
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 4435.58
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[96]	valid_0's rmse: 7726.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's rmse: 8387.05
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[77]	valid_0's rmse: 6497.62
Learning rate set to 0.008028
0:	learn: 29642.7410737	test: 23829.2854802	best: 23829.2854802 (0)	total: 15.3ms	remaining: 5m 6s
1000:	learn: 3080.6272465	test: 6759.7919848	best: 6759.6868384 (998)	total: 14.4s	remaining: 4m 33s
2000:	learn: 1966.7783684	test: 6524.4238543	best: 6522.8303330 (1972)	total: 28.1s	remaining: 4m 13s
Stopped by overfitting detector  (100 i

Unnamed: 0,lgbm,cb
cv1,7022.0,6499.2
cv2,4435.6,4584.7
cv3,7726.9,6487.7
cv4,8387.1,7346.0
cv5,6497.6,6200.5
평균,6813.8,6223.6


#### 3. play_weighted 방식

In [46]:
# 타자
col_dict, X, y = Xy_split(hitter_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[437]	valid_0's rmse: 11230.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11242.7
[2000]	valid_0's rmse: 11227.7
Early stopping, best iteration is:
[2514]	valid_0's rmse: 11227.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 9040.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[278]	valid_0's rmse: 11360.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 10437.8
Learning rate set to 0.008427
0:	learn: 37112.5824045	test: 41546.5651690	best: 41546.5651690 (0)	total: 14.2ms	remaining: 4m 44s
1000:	learn: 6299.6923399	test: 11982.9151386	best: 11982.9151386 (1000)	total: 15s	remaining: 4m 44s
2000:	learn: 4004.7080212	test: 11356.1004759	best: 11356.1004759 (2000)

Unnamed: 0,lgbm,cb
cv1,11230.7,10994.8
cv2,11227.4,11353.7
cv3,9040.9,8413.0
cv4,11360.9,11572.1
cv5,10437.8,10312.0
평균,10659.5,10529.1


In [47]:
# 투수
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[80]	valid_0's rmse: 11391.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7361.44
Early stopping, best iteration is:
[1367]	valid_0's rmse: 7359.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's rmse: 10164.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 10228.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[126]	valid_0's rmse: 10651.6
Learning rate set to 0.008028
0:	learn: 29674.6338149	test: 23854.9604508	best: 23854.9604508 (0)	total: 14.3ms	remaining: 4m 45s
1000:	learn: 3619.8123919	test: 10647.8073914	best: 10647.8073914 (1000)	total: 14.6s	remaining: 4m 36s
2000:	learn: 2269.0990466	test: 10453.4077424	best: 10451.1277378 (1964)	total: 28.9s	remaining: 4m 2

Unnamed: 0,lgbm,cb
cv1,11391.1,10443.3
cv2,7359.5,6882.4
cv3,10164.1,8491.0
cv4,10228.9,8347.6
cv5,10651.6,10793.1
평균,9959.0,8991.5


#### 4. time_and_play weighted 방식

In [48]:
# 타자
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	valid_0's rmse: 9794.53
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 9035.1
[2000]	valid_0's rmse: 8986.56
[3000]	valid_0's rmse: 8984.77
[4000]	valid_0's rmse: 8984.69
[5000]	valid_0's rmse: 8984.68
Early stopping, best iteration is:
[5574]	valid_0's rmse: 8984.68
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's rmse: 6645.16
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 10888.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 8035.52
Learning rate set to 0.008427
0:	learn: 37095.6084163	test: 41520.8920675	best: 41520.8920675 (0)	total: 16.5ms	remaining: 5m 29s
1000:	learn: 5052.9824918	test: 9500.6508350	best: 9498.7622663 (993)	total: 15.5s	

Unnamed: 0,lgbm,cb
cv1,9794.5,8993.9
cv2,8984.7,9091.2
cv3,6645.2,6860.4
cv4,10888.2,9785.6
cv5,8035.5,7463.9
평균,8869.6,8439.0


In [49]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[175]	valid_0's rmse: 8560.61
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[560]	valid_0's rmse: 7154.25
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	valid_0's rmse: 9185.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 9229.03
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7448.55
Early stopping, best iteration is:
[1544]	valid_0's rmse: 7446.1
Learning rate set to 0.008028
0:	learn: 29670.4242181	test: 23844.7779402	best: 23844.7779402 (0)	total: 14.5ms	remaining: 4m 50s
1000:	learn: 2972.3695273	test: 8207.7468714	best: 8207.7468714 (1000)	total: 14.1s	remaining: 4m 27s
2000:	learn: 1703.8878784	test: 7993.6101331	best: 7992.8349165 (1997)	total: 29s	remaining: 4m 20s
Stopp

Unnamed: 0,lgbm,cb
cv1,8560.6,7990.3
cv2,7154.2,5227.7
cv3,9185.3,6602.7
cv4,9229.0,7566.1
cv5,7446.1,7401.0
평균,8315.1,6957.5


# 파생변수 생성 (3년, 0.3)

In [50]:
YEAR = 3
ALPHA = 0.3

In [51]:
# simple 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    if idx == 0:
        hitter_simple = tmp.copy()
    else:
        hitter_simple = pd.concat([hitter_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    if idx == 0:
        pitcher_simple = tmp.copy()
    else:
        pitcher_simple = pd.concat([pitcher_simple, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 60초
20/58개 생성 완료!! == 경과시간: 119초
30/58개 생성 완료!! == 경과시간: 177초
40/58개 생성 완료!! == 경과시간: 236초
50/58개 생성 완료!! == 경과시간: 294초
10/62개 생성 완료!! == 경과시간: 390초
20/62개 생성 완료!! == 경과시간: 439초
30/62개 생성 완료!! == 경과시간: 488초
40/62개 생성 완료!! == 경과시간: 538초
50/62개 생성 완료!! == 경과시간: 587초
60/62개 생성 완료!! == 경과시간: 636초


In [52]:
# 시간가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    if idx == 0:
        hitter_time_weighted = tmp.copy()
    else:
        hitter_time_weighted = pd.concat([hitter_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    if idx == 0:
        pitcher_time_weighted = tmp.copy()
    else:
        pitcher_time_weighted = pd.concat([pitcher_time_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 57초
20/58개 생성 완료!! == 경과시간: 115초
30/58개 생성 완료!! == 경과시간: 172초
40/58개 생성 완료!! == 경과시간: 229초
50/58개 생성 완료!! == 경과시간: 286초
10/62개 생성 완료!! == 경과시간: 381초
20/62개 생성 완료!! == 경과시간: 429초
30/62개 생성 완료!! == 경과시간: 477초
40/62개 생성 완료!! == 경과시간: 525초
50/62개 생성 완료!! == 경과시간: 573초
60/62개 생성 완료!! == 경과시간: 622초


In [53]:
# 타석(또는 이닝)가중 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    if idx == 0:
        hitter_play_weighted = tmp.copy()
    else:
        hitter_play_weighted = pd.concat([hitter_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    if idx == 0:
        pitcher_play_weighted = tmp.copy()
    else:
        pitcher_play_weighted = pd.concat([pitcher_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 61초
20/58개 생성 완료!! == 경과시간: 122초
30/58개 생성 완료!! == 경과시간: 182초
40/58개 생성 완료!! == 경과시간: 243초
50/58개 생성 완료!! == 경과시간: 304초
10/62개 생성 완료!! == 경과시간: 404초
20/62개 생성 완료!! == 경과시간: 455초
30/62개 생성 완료!! == 경과시간: 506초
40/62개 생성 완료!! == 경과시간: 557초
50/62개 생성 완료!! == 경과시간: 608초
60/62개 생성 완료!! == 경과시간: 660초


In [54]:
# 시간 + 타석(또는 이닝) 방식

t1 = time.time()

################## 타자 ##################
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        hitter_time_and_play_weighted = tmp.copy()
    else:
        hitter_time_and_play_weighted = pd.concat([hitter_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
        
################## 투수 ##################
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    if idx == 0:
        pitcher_time_and_play_weighted = tmp.copy()
    else:
        pitcher_time_and_play_weighted = pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:,[-1]]], axis=1)
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

10/58개 생성 완료!! == 경과시간: 61초
20/58개 생성 완료!! == 경과시간: 122초
30/58개 생성 완료!! == 경과시간: 183초
40/58개 생성 완료!! == 경과시간: 244초
50/58개 생성 완료!! == 경과시간: 304초
10/62개 생성 완료!! == 경과시간: 414초
20/62개 생성 완료!! == 경과시간: 467초
30/62개 생성 완료!! == 경과시간: 518초
40/62개 생성 완료!! == 경과시간: 569초
50/62개 생성 완료!! == 경과시간: 620초
60/62개 생성 완료!! == 경과시간: 671초


In [55]:
print(f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}')
print(f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}')
print(f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}')
print(f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}')

simple방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)
시간 + 타석(또는 이닝)가중방식 == 타자: (3033, 146) == 투수: (2345, 145)


#### 1. simple 방식

In [56]:
# 타자
col_dict, X, y = Xy_split(hitter_simple.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[297]	valid_0's rmse: 11328.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11417.3
[2000]	valid_0's rmse: 11390.5
[3000]	valid_0's rmse: 11389.4
Early stopping, best iteration is:
[3863]	valid_0's rmse: 11389.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's rmse: 8200.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 11031.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 10274.3
Learning rate set to 0.008427
0:	learn: 37108.5237112	test: 41538.0870045	best: 41538.0870045 (0)	total: 10.7ms	remaining: 3m 34s
1000:	learn: 6310.3752160	test: 11824.5726422	best: 11824.5726422 (1000)	total: 10.9s	remaining: 3m 27s
2000:	learn: 3917.7568179	test: 11156.62

Unnamed: 0,lgbm,cb
cv1,11328.4,10843.0
cv2,11389.3,10777.2
cv3,8200.9,8142.9
cv4,11031.2,11235.9
cv5,10274.3,9463.2
평균,10444.9,10092.4


In [57]:
# 투수
col_dict, X, y = Xy_split(pitcher_simple.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_simple.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[441]	valid_0's rmse: 8964.33
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	valid_0's rmse: 7044.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's rmse: 10988
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 9747.62
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[455]	valid_0's rmse: 9600.22
Learning rate set to 0.008028
0:	learn: 29673.9063383	test: 23850.5818249	best: 23850.5818249 (0)	total: 12.6ms	remaining: 4m 12s
1000:	learn: 3786.7658013	test: 9219.0281232	best: 9219.0281232 (1000)	total: 11.4s	remaining: 3m 36s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 8992.22603
bestIteration = 1797

Shrink model to first 1798 iteration

Unnamed: 0,lgbm,cb
cv1,8964.3,8992.2
cv2,7044.5,7127.7
cv3,10988.0,8642.5
cv4,9747.6,7825.3
cv5,9600.2,8916.9
평균,9268.9,8300.9


#### 2. time_weighted 방식

In [58]:
# 타자
col_dict, X, y = Xy_split(hitter_time_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 6943.57
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7206.59
[2000]	valid_0's rmse: 7169.05
[3000]	valid_0's rmse: 7167.06
[4000]	valid_0's rmse: 7166.97
[5000]	valid_0's rmse: 7166.97
Early stopping, best iteration is:
[5460]	valid_0's rmse: 7166.97
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	valid_0's rmse: 3978.15
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[188]	valid_0's rmse: 5665.84
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's rmse: 5204.24
Learning rate set to 0.008427
0:	learn: 37098.2381079	test: 41533.5221844	best: 41533.5221844 (0)	total: 16.8ms	remaining: 5m 35s
1000:	learn: 3007.6866496	test: 7135.8937203	best: 7135.8195799 (998)	total: 14.4

Unnamed: 0,lgbm,cb
cv1,6943.6,6783.3
cv2,7167.0,7230.2
cv3,3978.1,3598.1
cv4,5665.8,5795.7
cv5,5204.2,4498.4
평균,5791.8,5581.1


In [59]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 4416.89
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 2467.64
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[100]	valid_0's rmse: 5714.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 4451.33
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[102]	valid_0's rmse: 5061.32
Learning rate set to 0.008028
0:	learn: 29634.3561217	test: 23811.6950925	best: 23811.6950925 (0)	total: 19.6ms	remaining: 6m 31s
1000:	learn: 1933.3606394	test: 4354.4086080	best: 4354.3745639 (999)	total: 14.4s	remaining: 4m 32s
2000:	learn: 1106.8828292	test: 4219.1362959	best: 4219.1362959 (2000)	total: 28.1s	remaining: 4m 12s
Stopped by overfitting detector  (10

Unnamed: 0,lgbm,cb
cv1,4416.9,4215.1
cv2,2467.6,2571.5
cv3,5714.1,4383.4
cv4,4451.3,4444.9
cv5,5061.3,4334.8
평균,4422.3,3989.9


#### 3. play_weighted 방식

In [60]:
# 타자
col_dict, X, y = Xy_split(hitter_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[437]	valid_0's rmse: 11230.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11242.7
[2000]	valid_0's rmse: 11227.7
Early stopping, best iteration is:
[2514]	valid_0's rmse: 11227.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 9040.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[278]	valid_0's rmse: 11360.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 10437.8
Learning rate set to 0.008427
0:	learn: 37112.5824045	test: 41546.5651690	best: 41546.5651690 (0)	total: 15.8ms	remaining: 5m 16s
1000:	learn: 6299.6923399	test: 11982.9151386	best: 11982.9151386 (1000)	total: 15s	remaining: 4m 44s
2000:	learn: 4004.7080212	test: 11356.1004759	best: 11356.1004759 (2000)

Unnamed: 0,lgbm,cb
cv1,11230.7,10994.8
cv2,11227.4,11353.7
cv3,9040.9,8413.0
cv4,11360.9,11572.1
cv5,10437.8,10312.0
평균,10659.5,10529.1


In [61]:
# 투수
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[80]	valid_0's rmse: 11391.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7361.44
Early stopping, best iteration is:
[1367]	valid_0's rmse: 7359.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's rmse: 10164.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 10228.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[126]	valid_0's rmse: 10651.6
Learning rate set to 0.008028
0:	learn: 29674.6338149	test: 23854.9604508	best: 23854.9604508 (0)	total: 14.2ms	remaining: 4m 44s
1000:	learn: 3619.8123919	test: 10647.8073914	best: 10647.8073914 (1000)	total: 13.7s	remaining: 4m 19s
2000:	learn: 2269.0990466	test: 10453.4077424	best: 10451.1277378 (1964)	total: 27.7s	remaining: 4m 9

Unnamed: 0,lgbm,cb
cv1,11391.1,10443.3
cv2,7359.5,6882.4
cv3,10164.1,8491.0
cv4,10228.9,8347.6
cv5,10651.6,10793.1
평균,9959.0,8991.5


#### 4. time_and_play weighted 방식

In [62]:
# 타자
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance.to_csv(f'타자_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[177]	valid_0's rmse: 7540.13
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7311.99
[2000]	valid_0's rmse: 7260.37
Early stopping, best iteration is:
[2746]	valid_0's rmse: 7258.13
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 5000.65
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 7150.54
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[240]	valid_0's rmse: 5003.37
Learning rate set to 0.008427
0:	learn: 37106.2588655	test: 41540.5300977	best: 41540.5300977 (0)	total: 14.4ms	remaining: 4m 48s
1000:	learn: 3445.5262184	test: 7241.7265590	best: 7241.7265590 (1000)	total: 15.1s	remaining: 4m 47s
2000:	learn: 2113.1989907	test: 6945.3844572	best: 6945.3844572 (2000)	t

Unnamed: 0,lgbm,cb
cv1,7540.1,6813.5
cv2,7258.1,7281.8
cv3,5000.7,4567.6
cv4,7150.5,6758.4
cv5,5003.4,4765.7
평균,6390.6,6037.4


In [63]:
# 투수
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(['ID', '선수명'], axis=1))
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance.to_csv(f'투수_time_and_play_weighted.csv', index=False, encoding='cp949') # 나중에 구글폼에 복붙하려고 잠시 저장
pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[65]	valid_0's rmse: 6182.96
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 6750.45
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's rmse: 7939.84
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 5614.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[85]	valid_0's rmse: 6152.77
Learning rate set to 0.008028
0:	learn: 29669.3683643	test: 23831.1124081	best: 23831.1124081 (0)	total: 16.5ms	remaining: 5m 29s
1000:	learn: 2266.5858024	test: 5898.7706259	best: 5898.5530783 (999)	total: 14.4s	remaining: 4m 32s
2000:	learn: 1158.8909021	test: 5775.9318104	best: 5774.7964918 (1975)	total: 28.5s	remaining: 4m 16s
Stopped by overfitting detector  (100

Unnamed: 0,lgbm,cb
cv1,6183.0,5771.1
cv2,6750.4,3929.6
cv3,7939.8,5670.4
cv4,5614.9,5873.7
cv5,6152.8,4929.3
평균,6528.2,5234.8
