In [1]:
'''
Catboost와 Lightgbm만 실험
'''

'\nCatboost와 Lightgbm만 실험\n'

# 패키지 불러오기

In [2]:
import math
import time
import warnings

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings(action='ignore')

n_splits = 5

# 함수 정의

In [3]:
# Build time-decay weights
def make_exponential_weight(length, a):
    """Return ``length`` exponentially decaying weights, oldest first.

    The result is ``[a**(length-1), ..., a**1, a**0]``, so the last entry
    (the most recent observation) has exponent 0.

    Args:
        length: Number of weights to produce (an int; values <= 0 give ``[]``).
        a: Base of the exponential decay.

    Returns:
        A plain Python list of ``length`` weights.
    """
    return [a ** exponent for exponent in reversed(range(length))]

##################################################################################################
##################################################################################################

# Simple (unweighted) average
def simple_creation(dataset, n_year, feature):
    """Append an ``n_year``-row trailing mean of ``feature`` for every player.

    Rows are grouped by ``ID`` (in order of first appearance) and sorted by
    ``연도`` within each player. For ordinary features the window covers the
    current row and the ``n_year - 1`` rows before it. For ``'연봉'`` (the
    column later used as the model target) the window covers the ``n_year``
    rows *before* the current one, so a row never sees its own salary; a
    player's first row therefore gets NaN. NaN inputs are skipped by the mean.

    Args:
        dataset: DataFrame with at least ``ID``, ``연도`` and ``feature`` columns.
        n_year: Window length in rows (must be >= 1).
        feature: Name of the numeric column to average.

    Returns:
        A new DataFrame (input is not modified) with a fresh 0..N-1 index and
        the column ``f'{n_year}년평균{feature}'`` added as the last column.
        An empty DataFrame is returned when ``dataset`` has no groups.
    """
    new_col = f'{n_year}년평균{feature}'
    # Salary windows end at the previous row; everything else includes the current row.
    lag = 1 if feature == '연봉' else 0

    per_player = []
    for _, rows in dataset.groupby('ID', sort=False):
        player = rows.sort_values(by='연도').reset_index(drop=True)
        # min_periods=1 reproduces the shorter windows at the start of a career.
        player[new_col] = (player[feature]
                           .shift(lag)
                           .rolling(window=n_year, min_periods=1)
                           .mean())
        per_player.append(player)

    if not per_player:
        return pd.DataFrame()

    # One concat at the end instead of re-copying a growing frame per player.
    return pd.concat(per_player).reset_index(drop=True)

##################################################################################################
##################################################################################################

# Time-weighted average
def time_weighted_creation(dataset, n_year, feature, alpha):
    """Append an exponentially time-weighted trailing average of ``feature``.

    Rows are grouped by ``ID`` (order of first appearance) and sorted by
    ``연도``. Each window holds up to ``n_year`` rows and is weighted with
    ``make_exponential_weight`` so the newest row has exponent 0 and each
    older row one power of ``alpha`` more. For ``'연봉'`` the window ends at
    the previous row (the current salary is excluded); for any other feature
    it ends at the current row. A NaN inside a window makes that result NaN.

    Args:
        dataset: DataFrame with at least ``ID``, ``연도`` and ``feature`` columns.
        n_year: Maximum window length in rows.
        feature: Name of the numeric column to average.
        alpha: Base of the exponential decay.

    Returns:
        A new DataFrame with a fresh 0..N-1 index and the column
        ``f'{n_year}년평균{feature}'`` added last. Rows whose window is empty
        get NaN. An empty DataFrame is returned when there are no groups.
    """
    new_col = f'{n_year}년평균{feature}'
    lag = 1 if feature == '연봉' else 0

    # Weights for a shorter window are the tail of the full-length weights,
    # so compute them once instead of once per row.
    full_weights = np.array(make_exponential_weight(length=n_year, a=alpha))

    per_player = []
    for _, rows in dataset.groupby('ID', sort=False):
        player = rows.sort_values(by='연도').reset_index(drop=True)
        values = player[feature].to_numpy()

        averages = []
        for i in range(len(player)):
            stop = i + 1 - lag
            start = max(stop - n_year, 0)
            value = values[start:stop]
            if value.shape[0] == 0:
                # No history available (e.g. first salary row).
                averages.append(np.nan)
                continue
            weight = full_weights[full_weights.shape[0] - value.shape[0]:]
            averages.append(np.matmul(value, weight) / np.sum(weight))

        player[new_col] = averages
        per_player.append(player)

    if not per_player:
        return pd.DataFrame()

    # Single concat instead of growing the result frame inside the loop.
    return pd.concat(per_player).reset_index(drop=True)

##################################################################################################
##################################################################################################

# Playing-time-weighted average (plate appearances or innings)
def play_weighted_creation(dataset, n_year, feature, criteria):
    """Append a trailing average of ``feature`` weighted by ``criteria``.

    Rows are grouped by ``ID`` (order of first appearance) and sorted by
    ``연도``. Each window holds up to ``n_year`` rows; every row in it is
    weighted by its own value in the ``criteria`` column. For ``'연봉'`` the
    window ends at the previous row (the current salary is excluded); for
    any other feature it ends at the current row.

    Args:
        dataset: DataFrame with ``ID``, ``연도``, ``feature`` and ``criteria`` columns.
        n_year: Maximum window length in rows.
        feature: Name of the numeric column to average.
        criteria: Name of the numeric column supplying the weights.

    Returns:
        A new DataFrame with a fresh 0..N-1 index and the column
        ``f'{n_year}년평균{feature}'`` added last. Rows with an empty window
        get NaN; a window whose weights sum to zero follows NumPy division
        semantics (NaN/inf). An empty DataFrame is returned for no groups.
    """
    new_col = f'{n_year}년평균{feature}'
    lag = 1 if feature == '연봉' else 0

    per_player = []
    for _, rows in dataset.groupby('ID', sort=False):
        player = rows.sort_values(by='연도').reset_index(drop=True)
        values = player[feature].to_numpy()
        plays = player[criteria].to_numpy()

        averages = []
        for i in range(len(player)):
            stop = i + 1 - lag
            start = max(stop - n_year, 0)
            value = values[start:stop]
            weight = plays[start:stop]
            if value.shape[0] == 0:
                # No history available (e.g. first salary row).
                averages.append(np.nan)
            else:
                averages.append(np.matmul(value, weight) / np.sum(weight))

        player[new_col] = averages
        per_player.append(player)

    if not per_player:
        return pd.DataFrame()

    # Single concat instead of growing the result frame inside the loop.
    return pd.concat(per_player).reset_index(drop=True)

##################################################################################################
##################################################################################################

# Time- and playing-time-weighted average
def timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha):
    """Append a trailing average weighted by both recency and ``criteria``.

    Rows are grouped by ``ID`` (order of first appearance) and sorted by
    ``연도``. Within each window of up to ``n_year`` rows, every row's weight
    is its ``criteria`` value multiplied by an exponential time weight from
    ``make_exponential_weight`` (newest row has exponent 0). For ``'연봉'``
    the window ends at the previous row; otherwise at the current row.

    Args:
        dataset: DataFrame with ``ID``, ``연도``, ``feature`` and ``criteria`` columns.
        n_year: Maximum window length in rows.
        feature: Name of the numeric column to average.
        criteria: Name of the numeric column supplying playing-time weights.
        alpha: Base of the exponential time decay.

    Returns:
        A new DataFrame with a fresh 0..N-1 index and the column
        ``f'{n_year}년평균{feature}'`` added last. Rows with an empty window
        get NaN; a zero total weight follows NumPy division semantics.
        An empty DataFrame is returned when there are no groups.
    """
    new_col = f'{n_year}년평균{feature}'
    lag = 1 if feature == '연봉' else 0

    # Weights for a shorter window are the tail of the full-length weights.
    full_weights = np.array(make_exponential_weight(length=n_year, a=alpha))

    per_player = []
    for _, rows in dataset.groupby('ID', sort=False):
        player = rows.sort_values(by='연도').reset_index(drop=True)
        values = player[feature].to_numpy()
        plays = player[criteria].to_numpy()

        averages = []
        for i in range(len(player)):
            stop = i + 1 - lag
            start = max(stop - n_year, 0)
            value = values[start:stop]
            play_weight = plays[start:stop]
            if value.shape[0] == 0:
                # No history available (e.g. first salary row).
                averages.append(np.nan)
                continue
            time_weight = full_weights[full_weights.shape[0] - value.shape[0]:]
            numerator = np.matmul(np.multiply(value, play_weight), time_weight)
            denominator = np.matmul(play_weight, time_weight)
            averages.append(numerator / denominator)

        player[new_col] = averages
        per_player.append(player)

    if not per_player:
        return pd.DataFrame()

    # Single concat instead of growing the result frame inside the loop.
    return pd.concat(per_player).reset_index(drop=True)

##################################################################################################
##################################################################################################

# Dispatcher for the averaging strategies
def make_average_variable(dataset, n_year, feature, criteria, alpha, how):
    """Build one trailing-average column using the strategy named by ``how``.

    Args:
        dataset, n_year, feature: Forwarded to every strategy.
        criteria: Weight column; used by the play-weighted strategies only.
        alpha: Time-decay base; used by the time-weighted strategies only.
        how: One of ``'simple'``, ``'time_weighted'``, ``'play_weighted'``
            or ``'time_and_play_weighted'``.

    Returns:
        Whatever the selected ``*_creation`` function returns, or ``None``
        when ``how`` matches none of the known strategy names.
    """
    strategies = (
        ('simple',
         lambda: simple_creation(dataset, n_year, feature)),
        ('time_weighted',
         lambda: time_weighted_creation(dataset, n_year, feature, alpha)),
        ('play_weighted',
         lambda: play_weighted_creation(dataset, n_year, feature, criteria)),
        ('time_and_play_weighted',
         lambda: timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha)),
    )

    for strategy_name, build in strategies:
        if how == strategy_name:
            return build()

    return None

In [4]:
# Fill missing N-year average salary with that year's minimum salary

def data_fillna(dataframe, n_year):
    """Replace missing ``f'{n_year}년평균연봉'`` values with the minimum salary.

    For every row whose average-salary column is null, the value is looked up
    by the row's ``연도`` in the module-level ``minimum_money`` table
    (columns ``연도`` and ``최저연봉``). Any year present in that table is
    handled, rather than a hard-coded list of years; rows whose year is not
    in the table stay null.

    Args:
        dataframe: DataFrame with ``연도`` and ``f'{n_year}년평균연봉'`` columns.
            It is modified in place.
        n_year: Window length used in the average-salary column name.

    Returns:
        The same ``dataframe`` object, for call chaining.
    """
    target_col = f'{n_year}년평균연봉'
    missing = dataframe[target_col].isnull()

    if missing.any():
        wage_by_year = minimum_money.set_index('연도')['최저연봉'].astype(float)
        # to_numpy() assigns positionally, so the frame's own index labels
        # (possibly non-unique) never take part in alignment.
        dataframe.loc[missing, target_col] = (
            dataframe.loc[missing, '연도'].map(wage_by_year).to_numpy()
        )

    return dataframe

In [5]:
def Xy_split(dataset):
    """Split ``dataset`` into anonymised features and the ``연봉`` target.

    Feature columns are renamed to ``variable1``, ``variable2``, ... in their
    existing order.

    Returns:
        column_dict: Mapping from each generic name back to the original name.
        X: Feature frame (everything except ``연봉``) with generic names.
        y: The ``연봉`` column.
    """
    features = dataset.drop(columns='연봉')

    original_names = list(features.columns)
    generic_names = [f'variable{position}' for position in range(1, len(original_names) + 1)]

    column_dict = dict(zip(generic_names, original_names))
    X = features.rename(columns=dict(zip(original_names, generic_names)))
    y = dataset['연봉']

    return column_dict, X, y

In [6]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and report per-fold RMSE.

    Uses a shuffled ``KFold`` with the module-level ``n_splits`` and a fixed
    seed. Each fold trains for up to 20000 rounds with early stopping after
    100 rounds without improvement on that fold's validation split.

    Args:
        data_x: Feature DataFrame.
        data_y: Target Series, row-aligned with ``data_x`` by position.

    Returns:
        DataFrame with one column ``'lgbm'``: the RMSE of each fold
        (``cv1`` ... ``cvN``) followed by their mean (``평균``), unrounded.
    """
    # LightGBM parameters
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    # lgb.train no longer accepts early_stopping_rounds / verbose_eval in
    # LightGBM 4; callbacks work on old and new releases. log_evaluation
    # replaced print_evaluation in 3.3, so fall back when it is absent.
    log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields positions, so select with .iloc; plain [] on a Series
        # is label-based and only coincides for a default 0..N-1 index.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train)
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid)

        lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                              callbacks=[lgb.early_stopping(stopping_rounds=100),
                                         log_callback(period=1000)])
        pred_valid = lgb_model.predict(X_valid)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn dropped.
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        performance.append(rmse)

    # Label rows from the actual fold count so n_splits != 5 also works.
    index = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']

    # Aggregate: append the mean of the fold scores
    performance.append(np.mean(performance))

    output = pd.DataFrame({'lgbm':performance}, index=index)
    return output

In [7]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and report per-fold RMSE.

    Uses a shuffled ``KFold`` with the module-level ``n_splits`` and a fixed
    seed. Each fold trains for up to 20000 iterations with early stopping
    after 100 iterations without improvement on that fold's validation split,
    keeping the best model. No categorical features are declared.

    Args:
        data_x: Feature DataFrame.
        data_y: Target Series, row-aligned with ``data_x`` by position.

    Returns:
        DataFrame with one column ``'cb'``: the RMSE of each fold
        (``cv1`` ... ``cvN``) followed by their mean (``평균``), unrounded.
    """
    cat_cols = []
    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields positions, so select with .iloc; plain [] on a Series
        # is label-based and only coincides for a default 0..N-1 index.
        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y.iloc[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Named `regressor` so the local does not shadow this function's own name.
        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)

        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        pred_valid = regressor.predict(X_valid)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn dropped.
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        performance.append(rmse)

    # Label rows from the actual fold count so n_splits != 5 also works.
    index = [f'cv{fold}' for fold in range(1, len(performance) + 1)] + ['평균']

    # Aggregate: append the mean of the fold scores
    performance.append(np.mean(performance))

    output = pd.DataFrame({'cb':performance}, index=index)
    return output

# 데이터 불러오기 및 파생변수를 생성할 변수 선정

In [8]:
# Load the preprocessed, one-hot-encoded training tables for hitters and pitchers
hitter, pitcher = map(pd.read_csv, (
    '../선수데이터(전처리완료)/모델링용ver5/타자(모델링용_원핫인코딩)_train_ver5.csv',
    '../선수데이터(전처리완료)/모델링용ver5/투수(모델링용_원핫인코딩)_train_ver5.csv',
))

hitter.shape, pitcher.shape

((3142, 91), (2425, 85))

In [9]:
# Hitters: pick the columns that get a trailing-average feature.
# Start from every column and drop identifiers, dates, team/position dummies
# and FA flags. list.remove raises ValueError if a listed column is absent,
# which doubles as a schema check on the loaded CSV.
hitter_columns = list(hitter.columns)
hitter_remove_cols = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '뉴스개수(누적정규화)', '누적타석', '출생연도', '출생월', '출생일', '나이', 
                  '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성',
                  '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대', '포지션(수비)_1루수', '포지션(수비)_2루수',
                  '포지션(수비)_3루수', '포지션(수비)_수비기록없음', '포지션(수비)_우익수', '포지션(수비)_유격수',
                  '포지션(수비)_좌익수', '포지션(수비)_중견수', '포지션(수비)_포수', '1차FA여부', '2차FA여부', '3차FA여부']

for hitter_remove_col in hitter_remove_cols:
    hitter_columns.remove(hitter_remove_col)
    
    

# Pitchers: same selection, with the pitcher-specific exclusion list.
pitcher_columns = list(pitcher.columns)
pitcher_remove_cols = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '뉴스개수(누적정규화)', '누적이닝', '출생연도', '출생월', '출생일', '나이',
                      '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성', 
                      '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대', '1차FA여부', '2차FA여부']

for pitcher_remove_col in pitcher_remove_cols:
    pitcher_columns.remove(pitcher_remove_col)
    

# Report how many derived features will be generated per table.
print(f'타자 파생변수 개수: {len(hitter_columns)}')
print(f'투수 파생변수 개수: {len(pitcher_columns)}')

타자 파생변수 개수: 57
투수 파생변수 개수: 61


In [10]:
# Build the minimum-salary lookup table (`minimum_money`) read by data_fillna

### Minimum-salary data
money = pd.read_excel('../연봉데이터/연도별_최저연봉.xlsx')
# Salaries arrive as strings with thousands separators; strip commas and cast to int.
money['최저연봉'] = money['최저연봉'].apply(lambda x: int(x.replace(',', '')))

### Consumer price index
money_rate = pd.read_excel('../물가상승률데이터/소비자_물가지수(한국은행).xlsx')
# Keep the columns from '2002' onward and flip them into one row per column label.
money_rate = money_rate.loc[:, '2002':].transpose()
money_rate = money_rate.reset_index()
money_rate.columns = ['연도','소비자물가지수']
# Scale factor = CPI of the last row / CPI of this row.
# NOTE(review): the column name implies the last row is 2020 — confirm against the Excel file.
for idx in range(money_rate.shape[0]):
    money_rate.loc[idx, '2020년기준상수'] = money_rate.loc[money_rate.shape[0]-1, '소비자물가지수'] / money_rate.loc[idx, '소비자물가지수']
    
### Minimum salary adjusted by the consumer price index
# NOTE(review): the multiplication aligns `money` and `money_rate` by row label
# (0..N-1), not by year — assumes both tables list the same years in the same order.
minimum_money = pd.DataFrame({'연도':list(money['연도']),
                              '최저연봉':list(money['최저연봉'] * money_rate['2020년기준상수'])})
minimum_money

Unnamed: 0,연도,최저연봉
0,2002,2517.298049
1,2003,2431.83391
2,2004,2485.636814
3,2005,2687.777268
4,2006,2628.862123
5,2007,2563.871831
6,2008,2449.377897
7,2009,2383.665717
8,2010,2778.750371
9,2011,2671.199468


# 파생변수 생성 (3년, 0.9)

In [11]:
# Experiment settings: YEAR is the averaging window passed as n_year,
# ALPHA is the decay base passed as alpha to make_average_variable.
YEAR, ALPHA = 3, 0.9

In [12]:
# Simple (unweighted) averages

t1 = time.time()

################## Hitters ##################
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the ever-wider frame on every iteration.
hitter_parts = []
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')
    
    # First result keeps all original columns; later ones add only their new last column.
    hitter_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
hitter_simple = pd.concat(hitter_parts, axis=1)
        
################## Pitchers ##################
pitcher_parts = []
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')
    
    pitcher_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
pitcher_simple = pd.concat(pitcher_parts, axis=1)

# Fill missing average salaries with the minimum salary, then persist.
hitter_simple = data_fillna(hitter_simple, YEAR)
pitcher_simple = data_fillna(pitcher_simple, YEAR)
hitter_simple.to_csv(f'누적변수추가(ver5데이터)/타자_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_simple.to_csv(f'누적변수추가(ver5데이터)/투수_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 102초
20/57개 생성 완료!! == 경과시간: 201초
30/57개 생성 완료!! == 경과시간: 301초
40/57개 생성 완료!! == 경과시간: 401초
50/57개 생성 완료!! == 경과시간: 501초
10/61개 생성 완료!! == 경과시간: 656초
20/61개 생성 완료!! == 경과시간: 742초
30/61개 생성 완료!! == 경과시간: 827초
40/61개 생성 완료!! == 경과시간: 912초
50/61개 생성 완료!! == 경과시간: 998초
60/61개 생성 완료!! == 경과시간: 1084초


In [13]:
# Time-weighted averages

t1 = time.time()

################## Hitters ##################
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the ever-wider frame on every iteration.
hitter_parts = []
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')
    
    # First result keeps all original columns; later ones add only their new last column.
    hitter_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
hitter_time_weighted = pd.concat(hitter_parts, axis=1)
        
################## Pitchers ##################
pitcher_parts = []
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')
    
    pitcher_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
pitcher_time_weighted = pd.concat(pitcher_parts, axis=1)

# Fill missing average salaries with the minimum salary, then persist.
hitter_time_weighted = data_fillna(hitter_time_weighted, YEAR)
pitcher_time_weighted = data_fillna(pitcher_time_weighted, YEAR)
hitter_time_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 100초
20/57개 생성 완료!! == 경과시간: 198초
30/57개 생성 완료!! == 경과시간: 296초
40/57개 생성 완료!! == 경과시간: 394초
50/57개 생성 완료!! == 경과시간: 492초
10/61개 생성 완료!! == 경과시간: 645초
20/61개 생성 완료!! == 경과시간: 729초
30/61개 생성 완료!! == 경과시간: 813초
40/61개 생성 완료!! == 경과시간: 897초
50/61개 생성 완료!! == 경과시간: 981초
60/61개 생성 완료!! == 경과시간: 1066초


In [14]:
# Playing-time-weighted averages (plate appearances or innings)

t1 = time.time()

################## Hitters ##################
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the ever-wider frame on every iteration.
hitter_parts = []
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')
    
    # First result keeps all original columns; later ones add only their new last column.
    hitter_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
hitter_play_weighted = pd.concat(hitter_parts, axis=1)
        
################## Pitchers ##################
pitcher_parts = []
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')
    
    pitcher_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
pitcher_play_weighted = pd.concat(pitcher_parts, axis=1)

# Fill missing average salaries with the minimum salary, then persist.
hitter_play_weighted = data_fillna(hitter_play_weighted, YEAR)
pitcher_play_weighted = data_fillna(pitcher_play_weighted, YEAR)
hitter_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 111초
20/57개 생성 완료!! == 경과시간: 217초
30/57개 생성 완료!! == 경과시간: 325초
40/57개 생성 완료!! == 경과시간: 430초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 698초
20/61개 생성 완료!! == 경과시간: 787초
30/61개 생성 완료!! == 경과시간: 877초
40/61개 생성 완료!! == 경과시간: 967초
50/61개 생성 완료!! == 경과시간: 1056초
60/61개 생성 완료!! == 경과시간: 1146초


In [15]:
# Time- and playing-time-weighted averages

t1 = time.time()

################## Hitters ##################
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the ever-wider frame on every iteration.
hitter_parts = []
for idx, hitter_column in enumerate(hitter_columns):
    
    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')
    
    # First result keeps all original columns; later ones add only their new last column.
    hitter_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
hitter_time_and_play_weighted = pd.concat(hitter_parts, axis=1)
        
################## Pitchers ##################
pitcher_parts = []
for idx, pitcher_column in enumerate(pitcher_columns):
    
    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')
    
    pitcher_parts.append(tmp if idx == 0 else tmp.iloc[:,[-1]])
        
    if (idx+1) % 10 == 0:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')
        
pitcher_time_and_play_weighted = pd.concat(pitcher_parts, axis=1)

# Fill missing average salaries with the minimum salary, then persist.
hitter_time_and_play_weighted = data_fillna(hitter_time_and_play_weighted, YEAR)
pitcher_time_and_play_weighted = data_fillna(pitcher_time_and_play_weighted, YEAR)
hitter_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 108초
20/57개 생성 완료!! == 경과시간: 213초
30/57개 생성 완료!! == 경과시간: 317초
40/57개 생성 완료!! == 경과시간: 422초
50/57개 생성 완료!! == 경과시간: 527초
10/61개 생성 완료!! == 경과시간: 691초
20/61개 생성 완료!! == 경과시간: 781초
30/61개 생성 완료!! == 경과시간: 870초
40/61개 생성 완료!! == 경과시간: 960초
50/61개 생성 완료!! == 경과시간: 1050초
60/61개 생성 완료!! == 경과시간: 1140초


In [16]:
# Sanity check: print the shape of every generated table, one line per strategy.
print('\n'.join([
    f'simple방식 == 타자: {hitter_simple.shape} == 투수: {pitcher_simple.shape}',
    f'시간가중방식 == 타자: {hitter_time_weighted.shape} == 투수: {pitcher_time_weighted.shape}',
    f'타석(또는 이닝)가중방식 == 타자: {hitter_play_weighted.shape} == 투수: {pitcher_play_weighted.shape}',
    f'시간 + 타석(또는 이닝)가중방식 == 타자: {hitter_time_and_play_weighted.shape} == 투수: {pitcher_time_and_play_weighted.shape}',
]))

simple방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간 + 타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)


#### 1. simple 방식

In [17]:
# Hitters: cross-validated RMSE of LightGBM and CatBoost on the simple-average features
col_dict, X, y = Xy_split(hitter_simple.drop(columns=['ID', '선수명']))
hitter_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 12250.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14307.1
Early stopping, best iteration is:
[1345]	valid_0's rmse: 14299.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's rmse: 9841.96
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 11974.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's rmse: 11475.1
Learning rate set to 0.009465
0:	learn: 38799.6653689	test: 31882.5956309	best: 31882.5956309 (0)	total: 177ms	remaining: 58m 50s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11287.72156
bestIteration = 897

Shrink model to first 898 iterations.
Learning rate set to 0.009465
0:	learn: 36090.0439491	test: 42862.045

Unnamed: 0,lgbm,cb
cv1,12250.7,11287.7
cv2,14299.5,14954.8
cv3,9842.0,9430.0
cv4,11974.7,10984.1
cv5,11475.1,10035.8
평균,11968.4,11338.5


In [18]:
# Pitchers: cross-validated RMSE of LightGBM and CatBoost on the simple-average features
col_dict, X, y = Xy_split(pitcher_simple.drop(columns=['ID', '선수명']))
pitcher_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7083.11
[2000]	valid_0's rmse: 7074.35
Early stopping, best iteration is:
[2452]	valid_0's rmse: 7074.31
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	valid_0's rmse: 8931.24
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[283]	valid_0's rmse: 11474.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 11898.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[187]	valid_0's rmse: 10995.3
Learning rate set to 0.009088
0:	learn: 28904.8829476	test: 26047.6994511	best: 26047.6994511 (0)	total: 39.1ms	remaining: 13m 2s
1000:	learn: 4761.6497813	test: 7735.9444633	best: 7735.6145514 (999)	total: 11.5s	remaining: 3m 37s
2000:	learn: 3049.6012954	test: 7224.3328354	best: 7221.7033894 (1961)	

Unnamed: 0,lgbm,cb
cv1,7074.3,7221.7
cv2,8931.2,7834.6
cv3,11474.2,11605.1
cv4,11898.8,10542.4
cv5,10995.3,9401.4
평균,10074.8,9321.0


#### 2. time_weighted 방식

In [19]:
# Hitters: cross-validated RMSE of LightGBM and CatBoost on the time-weighted features
col_dict, X, y = Xy_split(hitter_time_weighted.drop(columns=['ID', '선수명']))
hitter_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 12646.5
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13887.9
[2000]	valid_0's rmse: 13846.7
[3000]	valid_0's rmse: 13844.7
[4000]	valid_0's rmse: 13844.7
[5000]	valid_0's rmse: 13844.7
[6000]	valid_0's rmse: 13844.7
[7000]	valid_0's rmse: 13844.7
Early stopping, best iteration is:
[7482]	valid_0's rmse: 13844.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 10089.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 11495.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[153]	valid_0's rmse: 10637.4
Learning rate set to 0.009465
0:	learn: 38803.5600692	test: 31873.8091295	best: 31873.8091295 (0)	total: 21.3ms	remaining: 7m 6s
Stopped by overfittin

Unnamed: 0,lgbm,cb
cv1,12646.5,11399.8
cv2,13844.7,14716.0
cv3,10089.1,9295.7
cv4,11495.9,10998.3
cv5,10637.4,10093.5
평균,11742.7,11300.7


In [20]:
# Pitchers: cross-validated RMSE of LightGBM and CatBoost on the time-weighted features
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(columns=['ID', '선수명']))
pitcher_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's rmse: 7558.04
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[124]	valid_0's rmse: 8612.24
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's rmse: 10773.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 12547
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[171]	valid_0's rmse: 11112.5
Learning rate set to 0.009088
0:	learn: 28895.9843455	test: 26048.3925603	best: 26048.3925603 (0)	total: 39.7ms	remaining: 13m 14s
1000:	learn: 4629.9981685	test: 7585.9270920	best: 7585.5948687 (999)	total: 13.2s	remaining: 4m 10s
2000:	learn: 2910.7699862	test: 7203.1551434	best: 7202.2845144 (1993)	total: 26.3s	remaining: 3m 56s
Stopped by overfitting detector  (

Unnamed: 0,lgbm,cb
cv1,7558.0,7108.6
cv2,8612.2,7727.9
cv3,10773.9,11214.3
cv4,12547.0,10308.7
cv5,11112.5,9316.8
평균,10120.7,9135.3


#### 3. play_weighted 방식

In [21]:
# Hitters: cross-validated RMSE of LightGBM and CatBoost on the play-weighted features
col_dict, X, y = Xy_split(hitter_play_weighted.drop(columns=['ID', '선수명']))
hitter_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 12687.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15142.3
[2000]	valid_0's rmse: 15121.1
Early stopping, best iteration is:
[2126]	valid_0's rmse: 15120.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 10523.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[219]	valid_0's rmse: 11148.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's rmse: 11559.9
Learning rate set to 0.009465
0:	learn: 38807.3684861	test: 31874.5749199	best: 31874.5749199 (0)	total: 19.4ms	remaining: 6m 28s
1000:	learn: 6024.2457495	test: 11373.2694299	best: 11370.7220132 (962)	total: 14.8s	remaining: 4m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1123

Unnamed: 0,lgbm,cb
cv1,12687.1,11234.8
cv2,15120.9,14311.9
cv3,10523.8,9890.3
cv4,11148.1,11110.1
cv5,11559.9,10382.1
평균,12208.0,11385.8


In [22]:
# Pitchers: cross-validated RMSE of LightGBM and CatBoost on the play-weighted features
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(columns=['ID', '선수명']))
pitcher_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8016.3
[2000]	valid_0's rmse: 7995.15
Early stopping, best iteration is:
[2613]	valid_0's rmse: 7994.52
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 8837.61
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[195]	valid_0's rmse: 11506.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's rmse: 11932.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's rmse: 10809.7
Learning rate set to 0.009088
0:	learn: 28876.9265062	test: 26024.8646158	best: 26024.8646158 (0)	total: 19.5ms	remaining: 6m 29s
1000:	learn: 4698.3777620	test: 7986.6573424	best: 7986.6573424 (1000)	total: 13.8s	remaining: 4m 22s
2000:	learn: 2966.5927501	test: 7572.9048009	best: 7572.9048009 (2000)	t

Unnamed: 0,lgbm,cb
cv1,7994.5,7420.1
cv2,8837.6,7973.8
cv3,11506.4,10965.4
cv4,11932.3,10147.1
cv5,10809.7,9178.5
평균,10216.1,9137.0


#### 4. time_and_play weighted 방식

In [23]:
# Hitters: cross-validated RMSE of LightGBM and CatBoost on the time+play-weighted features
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(columns=['ID', '선수명']))
hitter_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 12554.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[749]	valid_0's rmse: 14480.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 10714.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[195]	valid_0's rmse: 11919.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's rmse: 11755.3
Learning rate set to 0.009465
0:	learn: 38805.4552953	test: 31873.5175467	best: 31873.5175467 (0)	total: 18.8ms	remaining: 6m 15s
1000:	learn: 6031.8206141	test: 11162.4952700	best: 11162.4952700 (1000)	total: 14.7s	remaining: 4m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 10947.57358
bestIteration = 1659

Shrink model to first 1660 iterat

Unnamed: 0,lgbm,cb
cv1,12554.2,10947.6
cv2,14480.7,14728.9
cv3,10714.3,9673.5
cv4,11919.4,11210.8
cv5,11755.3,10776.2
평균,12284.8,11467.4


In [24]:
# Pitchers: cross-validated RMSE of LightGBM and CatBoost on the time+play-weighted features
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(columns=['ID', '선수명']))
pitcher_performance = pd.concat(
    [lgbm_model(X, y).round(1), cb_model(X, y).round(1)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[258]	valid_0's rmse: 8116.84
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[339]	valid_0's rmse: 9142.38
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	valid_0's rmse: 11577
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 12287.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[211]	valid_0's rmse: 10914.3
Learning rate set to 0.009088
0:	learn: 28879.2917132	test: 26038.4827048	best: 26038.4827048 (0)	total: 22ms	remaining: 7m 20s
1000:	learn: 4711.3152309	test: 7746.4142721	best: 7746.0666583 (995)	total: 13.8s	remaining: 4m 21s
2000:	learn: 2943.6572205	test: 7353.6485902	best: 7351.4138676 (1994)	total: 27.5s	remaining: 4m 6s
3000:	learn: 2116.7028005	test: 7275.4

Unnamed: 0,lgbm,cb
cv1,8116.8,7245.4
cv2,9142.4,7914.1
cv3,11577.0,10999.6
cv4,12287.9,10085.4
cv5,10914.3,9327.4
평균,10407.7,9114.4


# 파생변수 생성 (4년, 0.9)

In [25]:
# Settings for this round of derived features. Both values are passed to
# make_average_variable in the cells below and embedded in the CSV file names.
YEAR, ALPHA = 4, 0.9

In [26]:
# Simple (unweighted) averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'simple')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_simple = data_fillna(feature_frames['hitter'], YEAR)
pitcher_simple = data_fillna(feature_frames['pitcher'], YEAR)
hitter_simple.to_csv(f'누적변수추가(ver5데이터)/타자_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_simple.to_csv(f'누적변수추가(ver5데이터)/투수_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 100초
20/57개 생성 완료!! == 경과시간: 201초
30/57개 생성 완료!! == 경과시간: 302초
40/57개 생성 완료!! == 경과시간: 402초
50/57개 생성 완료!! == 경과시간: 503초
10/61개 생성 완료!! == 경과시간: 668초
20/61개 생성 완료!! == 경과시간: 758초
30/61개 생성 완료!! == 경과시간: 845초
40/61개 생성 완료!! == 경과시간: 931초
50/61개 생성 완료!! == 경과시간: 1018초
60/61개 생성 완료!! == 경과시간: 1105초


In [27]:
# Time-weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'time_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_time_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_time_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_time_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 101초
20/57개 생성 완료!! == 경과시간: 200초
30/57개 생성 완료!! == 경과시간: 299초
40/57개 생성 완료!! == 경과시간: 399초
50/57개 생성 완료!! == 경과시간: 498초
10/61개 생성 완료!! == 경과시간: 653초
20/61개 생성 완료!! == 경과시간: 738초
30/61개 생성 완료!! == 경과시간: 823초
40/61개 생성 완료!! == 경과시간: 908초
50/61개 생성 완료!! == 경과시간: 993초
60/61개 생성 완료!! == 경과시간: 1078초


In [28]:
# Plate-appearance (or innings) weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'play_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_play_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_play_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 110초
20/57개 생성 완료!! == 경과시간: 217초
30/57개 생성 완료!! == 경과시간: 324초
40/57개 생성 완료!! == 경과시간: 431초
50/57개 생성 완료!! == 경과시간: 537초
10/61개 생성 완료!! == 경과시간: 703초
20/61개 생성 완료!! == 경과시간: 794초
30/61개 생성 완료!! == 경과시간: 885초
40/61개 생성 완료!! == 경과시간: 976초
50/61개 생성 완료!! == 경과시간: 1067초
60/61개 생성 완료!! == 경과시간: 1158초


In [29]:
# Combined time + plate-appearance (or innings) weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'time_and_play_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_time_and_play_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_time_and_play_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 109초
20/57개 생성 완료!! == 경과시간: 216초
30/57개 생성 완료!! == 경과시간: 322초
40/57개 생성 완료!! == 경과시간: 428초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 700초
20/61개 생성 완료!! == 경과시간: 792초
30/61개 생성 완료!! == 경과시간: 882초
40/61개 생성 완료!! == 경과시간: 973초
50/61개 생성 완료!! == 경과시간: 1064초
60/61개 생성 완료!! == 경과시간: 1155초


In [30]:
# Report the shape of every derived dataset, one line per averaging method.
shape_report = (
    ('simple방식', hitter_simple, pitcher_simple),
    ('시간가중방식', hitter_time_weighted, pitcher_time_weighted),
    ('타석(또는 이닝)가중방식', hitter_play_weighted, pitcher_play_weighted),
    ('시간 + 타석(또는 이닝)가중방식', hitter_time_and_play_weighted, pitcher_time_and_play_weighted),
)
for method_label, hitter_frame, pitcher_frame in shape_report:
    print(f'{method_label} == 타자: {hitter_frame.shape} == 투수: {pitcher_frame.shape}')

simple방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간 + 타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)


#### 1. simple 방식

In [31]:
# Hitters (simple method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
hitter_inputs = hitter_simple.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's rmse: 11426.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[679]	valid_0's rmse: 14105.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[89]	valid_0's rmse: 10418.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[95]	valid_0's rmse: 11466.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 12125.9
Learning rate set to 0.009465
0:	learn: 38792.8957771	test: 31850.7506794	best: 31850.7506794 (0)	total: 44.2ms	remaining: 14m 44s
1000:	learn: 6063.8094929	test: 10990.8913112	best: 10988.4915632 (987)	total: 13.8s	remaining: 4m 22s
2000:	learn: 3636.4672972	test: 10732.6287608	best: 10728.4207942 (1973)	total: 27.7s	remaining: 4m 8s
3000:	learn: 2542.2231218	test: 

Unnamed: 0,lgbm,cb
cv1,11426.1,10660.8
cv2,14105.9,13741.0
cv3,10418.1,9474.2
cv4,11466.3,10426.8
cv5,12125.9,10149.8
평균,11908.5,10890.5


In [32]:
# Pitchers (simple method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
pitcher_inputs = pitcher_simple.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7905.13
[2000]	valid_0's rmse: 7883.98
[3000]	valid_0's rmse: 7882.8
[4000]	valid_0's rmse: 7882.75
[5000]	valid_0's rmse: 7882.75
[6000]	valid_0's rmse: 7882.75
[7000]	valid_0's rmse: 7882.75
[8000]	valid_0's rmse: 7882.75
Early stopping, best iteration is:
[7981]	valid_0's rmse: 7882.75
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[761]	valid_0's rmse: 8118.73
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[279]	valid_0's rmse: 11095.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's rmse: 11114.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	valid_0's rmse: 10492.5
Learning rate set to 0.009088
0:	learn: 28895.7877588	test: 26028.5187144	best: 26028.5187144 (0)	total: 35.8ms	remaini

Unnamed: 0,lgbm,cb
cv1,7882.7,7187.7
cv2,8118.7,7542.4
cv3,11095.3,11030.3
cv4,11114.3,9967.5
cv5,10492.5,9319.5
평균,9740.7,9009.5


#### 2. time_weighted 방식

In [33]:
# Hitters (time-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
hitter_inputs = hitter_time_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 12271
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13375.4
[2000]	valid_0's rmse: 13326.5
[3000]	valid_0's rmse: 13323.1
[4000]	valid_0's rmse: 13322.8
[5000]	valid_0's rmse: 13322.8
[6000]	valid_0's rmse: 13322.8
[7000]	valid_0's rmse: 13322.8
[8000]	valid_0's rmse: 13322.8
[9000]	valid_0's rmse: 13322.8
Early stopping, best iteration is:
[9696]	valid_0's rmse: 13322.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[70]	valid_0's rmse: 9852.77
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 11096.3
Early stopping, best iteration is:
[954]	valid_0's rmse: 11096.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 11497
Learning rate set to 0.009465
0:	learn: 38805.3845479	test: 3

Unnamed: 0,lgbm,cb
cv1,12271.0,10660.9
cv2,13322.8,13690.9
cv3,9852.8,9468.2
cv4,11096.1,10251.5
cv5,11497.0,10186.2
평균,11607.9,10851.5


In [34]:
# Pitchers (time-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
pitcher_inputs = pitcher_time_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7531.48
[2000]	valid_0's rmse: 7514.55
[3000]	valid_0's rmse: 7513.96
[4000]	valid_0's rmse: 7513.95
[5000]	valid_0's rmse: 7513.94
[6000]	valid_0's rmse: 7513.94
[7000]	valid_0's rmse: 7513.94
Early stopping, best iteration is:
[7046]	valid_0's rmse: 7513.94
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[681]	valid_0's rmse: 7746.43
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 11002
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[441]	valid_0's rmse: 11638.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[170]	valid_0's rmse: 10467.2
Learning rate set to 0.009088
0:	learn: 28902.2513687	test: 26032.4504475	best: 26032.4504475 (0)	total: 31.2ms	remaining: 10m 23s
1000:	learn: 4561.

Unnamed: 0,lgbm,cb
cv1,7513.9,7170.7
cv2,7746.4,7211.6
cv3,11002.0,11128.9
cv4,11638.4,10191.1
cv5,10467.2,9112.7
평균,9673.6,8963.0


#### 3. play_weighted 방식

In [35]:
# Hitters (play-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
hitter_inputs = hitter_play_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 11760.8
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14024.5
Early stopping, best iteration is:
[962]	valid_0's rmse: 14022.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 10598.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[163]	valid_0's rmse: 11185
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 12461
Learning rate set to 0.009465
0:	learn: 38801.0785917	test: 31881.7939028	best: 31881.7939028 (0)	total: 47.8ms	remaining: 15m 55s
1000:	learn: 6061.3644603	test: 11089.2409909	best: 11089.1895082 (994)	total: 15.1s	remaining: 4m 47s
2000:	learn: 3647.9031860	test: 10849.2251776	best: 10847.3444679 (1997)	total: 30s	remaining: 4m 29s
Stopp

Unnamed: 0,lgbm,cb
cv1,11760.8,10833.0
cv2,14022.7,13416.7
cv3,10598.4,9740.1
cv4,11185.0,10496.5
cv5,12461.0,10041.5
평균,12005.6,10905.6


In [36]:
# Pitchers (play-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
pitcher_inputs = pitcher_play_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7385.7
[2000]	valid_0's rmse: 7365.8
[3000]	valid_0's rmse: 7365.23
[4000]	valid_0's rmse: 7365.21
Early stopping, best iteration is:
[4270]	valid_0's rmse: 7365.21
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	valid_0's rmse: 8823.89
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[96]	valid_0's rmse: 11175.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[225]	valid_0's rmse: 10791.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's rmse: 9091.74
Learning rate set to 0.009088
0:	learn: 28881.6821311	test: 26027.8248705	best: 26027.8248705 (0)	total: 9.29ms	remaining: 3m 5s
1000:	learn: 4623.7758293	test: 8133.9762898	best: 8133.4596915 (999)	total: 13.8s	remaining: 4m 22s
2000:	learn: 2

Unnamed: 0,lgbm,cb
cv1,7365.2,7572.1
cv2,8823.9,7753.5
cv3,11175.7,10969.3
cv4,10791.4,9858.1
cv5,9091.7,9193.4
평균,9449.6,9069.3


#### 4. time_and_play weighted 방식

In [37]:
# Hitters (time + play weighted method): drop the identifier columns, split
# into features/target, then place the rounded LightGBM and CatBoost result
# tables side by side.
hitter_inputs = hitter_time_and_play_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 11679.5
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14093.2
[2000]	valid_0's rmse: 14074.4
Early stopping, best iteration is:
[2460]	valid_0's rmse: 14073.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	valid_0's rmse: 10282.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's rmse: 10635.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's rmse: 11656
Learning rate set to 0.009465
0:	learn: 38800.7592976	test: 31868.5111179	best: 31868.5111179 (0)	total: 24.2ms	remaining: 8m 3s
1000:	learn: 6019.0582117	test: 10932.5970668	best: 10930.8438715 (997)	total: 14.7s	remaining: 4m 38s
2000:	learn: 3634.3388315	test: 10695.0934274	best: 10694.5192635 (1998)	t

Unnamed: 0,lgbm,cb
cv1,11679.5,10691.7
cv2,14073.8,13253.3
cv3,10282.8,9402.7
cv4,10635.9,10307.2
cv5,11656.0,10169.8
평균,11665.6,10764.9


In [38]:
# Pitchers (time + play weighted method): drop the identifier columns, split
# into features/target, then place the rounded LightGBM and CatBoost result
# tables side by side.
pitcher_inputs = pitcher_time_and_play_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8547.25
[2000]	valid_0's rmse: 8529.35
[3000]	valid_0's rmse: 8529.06
Early stopping, best iteration is:
[3315]	valid_0's rmse: 8529.05
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[252]	valid_0's rmse: 8432.69
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[225]	valid_0's rmse: 11237.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[78]	valid_0's rmse: 11256.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[157]	valid_0's rmse: 10126.5
Learning rate set to 0.009088
0:	learn: 28881.8433291	test: 26027.1025283	best: 26027.1025283 (0)	total: 17.2ms	remaining: 5m 43s
1000:	learn: 4592.7360441	test: 8023.1427685	best: 8023.1427685 (1000)	total: 13.9s	remaining: 4m 23s
2000:	learn: 2990.2320683	test: 7701.428

Unnamed: 0,lgbm,cb
cv1,8529.1,7632.4
cv2,8432.7,7911.1
cv3,11237.7,10468.5
cv4,11256.9,10021.4
cv5,10126.5,9409.4
평균,9916.6,9088.5


# 파생변수 생성 (5년, 0.9)

In [39]:
# Settings for this round of derived features. Both values are passed to
# make_average_variable in the cells below and embedded in the CSV file names.
YEAR, ALPHA = 5, 0.9

In [40]:
# Simple (unweighted) averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'simple')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_simple = data_fillna(feature_frames['hitter'], YEAR)
pitcher_simple = data_fillna(feature_frames['pitcher'], YEAR)
hitter_simple.to_csv(f'누적변수추가(ver5데이터)/타자_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_simple.to_csv(f'누적변수추가(ver5데이터)/투수_단순누적({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 102초
20/57개 생성 완료!! == 경과시간: 203초
30/57개 생성 완료!! == 경과시간: 304초
40/57개 생성 완료!! == 경과시간: 406초
50/57개 생성 완료!! == 경과시간: 507초
10/61개 생성 완료!! == 경과시간: 664초
20/61개 생성 완료!! == 경과시간: 752초
30/61개 생성 완료!! == 경과시간: 838초
40/61개 생성 완료!! == 경과시간: 925초
50/61개 생성 완료!! == 경과시간: 1012초
60/61개 생성 완료!! == 경과시간: 1099초


In [41]:
# Time-weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'time_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_time_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_time_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_time_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 101초
20/57개 생성 완료!! == 경과시간: 201초
30/57개 생성 완료!! == 경과시간: 300초
40/57개 생성 완료!! == 경과시간: 399초
50/57개 생성 완료!! == 경과시간: 499초
10/61개 생성 완료!! == 경과시간: 654초
20/61개 생성 완료!! == 경과시간: 739초
30/61개 생성 완료!! == 경과시간: 825초
40/61개 생성 완료!! == 경과시간: 910초
50/61개 생성 완료!! == 경과시간: 995초
60/61개 생성 완료!! == 경과시간: 1082초


In [42]:
# Plate-appearance (or innings) weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'play_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_play_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_play_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 112초
20/57개 생성 완료!! == 경과시간: 223초
30/57개 생성 완료!! == 경과시간: 330초
40/57개 생성 완료!! == 경과시간: 437초
50/57개 생성 완료!! == 경과시간: 544초
10/61개 생성 완료!! == 경과시간: 710초
20/61개 생성 완료!! == 경과시간: 802초
30/61개 생성 완료!! == 경과시간: 893초
40/61개 생성 완료!! == 경과시간: 985초
50/61개 생성 완료!! == 경과시간: 1076초
60/61개 생성 완료!! == 경과시간: 1167초


In [43]:
# Combined time + plate-appearance (or innings) weighted averaging method.
#
# make_average_variable is called once per source column. The first result is
# kept whole and only the last column of every later result is collected (the
# derived feature is expected to be that last column). The pieces are joined
# with a single pd.concat per player group instead of re-concatenating the
# growing frame on every iteration.

t1 = time.time()  # one timer for both groups: pitcher lines show time since the cell started

feature_frames = {}
for group_name, source_frame, source_columns, play_column in (
    ('hitter', hitter, hitter_columns, '타석'),    # hitters are weighted by plate appearances
    ('pitcher', pitcher, pitcher_columns, '이닝'),  # pitchers are weighted by innings
):
    pieces = []
    for idx, source_column in enumerate(source_columns):
        averaged = make_average_variable(source_frame, YEAR, source_column, play_column, ALPHA, 'time_and_play_weighted')
        pieces.append(averaged if idx == 0 else averaged.iloc[:, [-1]])

        # Progress line after every 10th column.
        if (idx + 1) % 10 == 0:
            print(f'{idx+1}/{len(source_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

    feature_frames[group_name] = pd.concat(pieces, axis=1)

# Fill missing values, then persist both datasets.
hitter_time_and_play_weighted = data_fillna(feature_frames['hitter'], YEAR)
pitcher_time_and_play_weighted = data_fillna(feature_frames['pitcher'], YEAR)
hitter_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/타자_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')
pitcher_time_and_play_weighted.to_csv(f'누적변수추가(ver5데이터)/투수_시간경기가중({YEAR},{ALPHA}).csv', index=False, encoding='utf-8-sig')

10/57개 생성 완료!! == 경과시간: 110초
20/57개 생성 완료!! == 경과시간: 216초
30/57개 생성 완료!! == 경과시간: 323초
40/57개 생성 완료!! == 경과시간: 430초
50/57개 생성 완료!! == 경과시간: 537초
10/61개 생성 완료!! == 경과시간: 704초
20/61개 생성 완료!! == 경과시간: 795초
30/61개 생성 완료!! == 경과시간: 886초
40/61개 생성 완료!! == 경과시간: 978초
50/61개 생성 완료!! == 경과시간: 1069초
60/61개 생성 완료!! == 경과시간: 1160초


In [44]:
# Report the shape of every derived dataset, one line per averaging method.
shape_report = (
    ('simple방식', hitter_simple, pitcher_simple),
    ('시간가중방식', hitter_time_weighted, pitcher_time_weighted),
    ('타석(또는 이닝)가중방식', hitter_play_weighted, pitcher_play_weighted),
    ('시간 + 타석(또는 이닝)가중방식', hitter_time_and_play_weighted, pitcher_time_and_play_weighted),
)
for method_label, hitter_frame, pitcher_frame in shape_report:
    print(f'{method_label} == 타자: {hitter_frame.shape} == 투수: {pitcher_frame.shape}')

simple방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간 + 타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)


#### 1. simple 방식

In [45]:
# Hitters (simple method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
hitter_inputs = hitter_simple.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 12417.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 12973
[2000]	valid_0's rmse: 12926.1
[3000]	valid_0's rmse: 12923.2
[4000]	valid_0's rmse: 12923.1
[5000]	valid_0's rmse: 12923
[6000]	valid_0's rmse: 12923
[7000]	valid_0's rmse: 12923
[8000]	valid_0's rmse: 12923
[9000]	valid_0's rmse: 12923
Early stopping, best iteration is:
[9735]	valid_0's rmse: 12923
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's rmse: 10854.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[513]	valid_0's rmse: 10893.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's rmse: 11602
Learning rate set to 0.009465
0:	learn: 38819.5304069	test: 31875.7007715	best: 31875.7007715 (0)	total:

Unnamed: 0,lgbm,cb
cv1,12417.4,10807.6
cv2,12923.0,13284.7
cv3,10854.3,9736.9
cv4,10893.7,9937.3
cv5,11602.0,9808.6
평균,11738.1,10715.0


In [46]:
# Pitchers (simple method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
pitcher_inputs = pitcher_simple.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7972.4
[2000]	valid_0's rmse: 7951.98
[3000]	valid_0's rmse: 7951.39
[4000]	valid_0's rmse: 7951.37
[5000]	valid_0's rmse: 7951.36
[6000]	valid_0's rmse: 7951.36
[7000]	valid_0's rmse: 7951.36
[8000]	valid_0's rmse: 7951.36
[9000]	valid_0's rmse: 7951.36
[10000]	valid_0's rmse: 7951.36
[11000]	valid_0's rmse: 7951.36
[12000]	valid_0's rmse: 7951.36
Early stopping, best iteration is:
[12842]	valid_0's rmse: 7951.36
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[404]	valid_0's rmse: 8282.89
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[185]	valid_0's rmse: 10550.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[82]	valid_0's rmse: 10023.2
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 9374.77
Early stopping, best ite

Unnamed: 0,lgbm,cb
cv1,7951.4,7793.9
cv2,8282.9,7122.1
cv3,10550.4,11135.5
cv4,10023.2,9788.1
cv5,9374.5,9257.1
평균,9236.5,9019.3


#### 2. time_weighted 방식

In [47]:
# Hitters (time-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
hitter_inputs = hitter_time_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(hitter_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
hitter_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's rmse: 12049.2
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13545.4
[2000]	valid_0's rmse: 13508.7
[3000]	valid_0's rmse: 13506.4
[4000]	valid_0's rmse: 13506.2
[5000]	valid_0's rmse: 13506.2
Early stopping, best iteration is:
[5499]	valid_0's rmse: 13506.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[79]	valid_0's rmse: 10534.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[667]	valid_0's rmse: 10827.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 12095.4
Learning rate set to 0.009465
0:	learn: 38798.8504370	test: 31879.0386607	best: 31879.0386607 (0)	total: 30.8ms	remaining: 10m 16s
1000:	learn: 5915.2688299	test: 11174.4831125	best: 11174.4831125 (1000)	total: 1

Unnamed: 0,lgbm,cb
cv1,12049.2,10990.1
cv2,13506.2,13375.9
cv3,10534.8,9705.6
cv4,10827.5,9971.8
cv5,12095.4,9848.5
평균,11802.6,10778.4


In [48]:
# Pitchers (time-weighted method): drop the identifier columns, split into
# features/target, then place the rounded LightGBM and CatBoost result tables
# side by side.
pitcher_inputs = pitcher_time_weighted.drop(columns=['ID', '선수명'])
col_dict, X, y = Xy_split(pitcher_inputs)

lgbm_scores = round(lgbm_model(X, y), 1)
cb_scores = round(cb_model(X, y), 1)
pitcher_performance = pd.concat([lgbm_scores, cb_scores], axis=1)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8269.41
[2000]	valid_0's rmse: 8239.61
[3000]	valid_0's rmse: 8237.76
[4000]	valid_0's rmse: 8237.66
[5000]	valid_0's rmse: 8237.65
[6000]	valid_0's rmse: 8237.65
[7000]	valid_0's rmse: 8237.65
[8000]	valid_0's rmse: 8237.65
[9000]	valid_0's rmse: 8237.65
[10000]	valid_0's rmse: 8237.65
[11000]	valid_0's rmse: 8237.65
[12000]	valid_0's rmse: 8237.65
Early stopping, best iteration is:
[12893]	valid_0's rmse: 8237.65
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[743]	valid_0's rmse: 8239.15
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[199]	valid_0's rmse: 9872.75
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[155]	valid_0's rmse: 10646.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[299]	valid_0's rm

Unnamed: 0,lgbm,cb
cv1,8237.6,7631.7
cv2,8239.2,7513.8
cv3,9872.8,11153.5
cv4,10646.8,9941.1
cv5,9757.0,9024.1
평균,9350.7,9052.8


#### 3. play_weighted 방식

In [49]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_play_weighted.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 13109
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[270]	valid_0's rmse: 14598.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	valid_0's rmse: 9808.98
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[161]	valid_0's rmse: 10697.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[117]	valid_0's rmse: 11685.7
Learning rate set to 0.009465
0:	learn: 38805.0491022	test: 31878.1724797	best: 31878.1724797 (0)	total: 31.6ms	remaining: 10m 31s
1000:	learn: 6133.2686226	test: 11190.3478596	best: 11186.0998137 (990)	total: 14.6s	remaining: 4m 37s
2000:	learn: 3722.9927058	test: 10859.8145240	best: 10859.8145240 (2000)	total: 29.3s	remaining: 4m 23s
Stopped by overfitting detector

Unnamed: 0,lgbm,cb
cv1,13109.0,10813.5
cv2,14598.8,12866.0
cv3,9809.0,9590.5
cv4,10697.2,10051.8
cv5,11685.7,9683.3
평균,11979.9,10601.0


In [50]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_play_weighted.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8204.27
[2000]	valid_0's rmse: 8182.56
[3000]	valid_0's rmse: 8181.24
[4000]	valid_0's rmse: 8181.17
[5000]	valid_0's rmse: 8181.17
[6000]	valid_0's rmse: 8181.17
[7000]	valid_0's rmse: 8181.17
[8000]	valid_0's rmse: 8181.17
[9000]	valid_0's rmse: 8181.17
[10000]	valid_0's rmse: 8181.17
[11000]	valid_0's rmse: 8181.17
[12000]	valid_0's rmse: 8181.17
Early stopping, best iteration is:
[12885]	valid_0's rmse: 8181.17
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[664]	valid_0's rmse: 7464.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[138]	valid_0's rmse: 10286.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	valid_0's rmse: 9962.43
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	valid_0's rms

Unnamed: 0,lgbm,cb
cv1,8181.2,7648.9
cv2,7464.6,7226.6
cv3,10286.2,10749.8
cv4,9962.4,9646.7
cv5,9479.5,8979.8
평균,9074.8,8850.4


#### 4. time_and_play_weighted 방식

In [51]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_time_and_play_weighted.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 12456.9
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14042.9
[2000]	valid_0's rmse: 14003.2
[3000]	valid_0's rmse: 14001.6
[4000]	valid_0's rmse: 14001.5
[5000]	valid_0's rmse: 14001.5
[6000]	valid_0's rmse: 14001.5
[7000]	valid_0's rmse: 14001.5
[8000]	valid_0's rmse: 14001.5
[9000]	valid_0's rmse: 14001.5
[10000]	valid_0's rmse: 14001.5
Early stopping, best iteration is:
[10513]	valid_0's rmse: 14001.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's rmse: 10201.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's rmse: 11076
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 12289.1
Learning rate set to 0.009465
0:	learn: 38803.4326613	te

Unnamed: 0,lgbm,cb
cv1,12456.9,10903.0
cv2,14001.5,13102.2
cv3,10201.7,9497.3
cv4,11076.0,10257.4
cv5,12289.1,9930.8
평균,12005.1,10738.2


In [52]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_time_and_play_weighted.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7750.16
[2000]	valid_0's rmse: 7708.19
[3000]	valid_0's rmse: 7706.64
[4000]	valid_0's rmse: 7706.6
[5000]	valid_0's rmse: 7706.59
[6000]	valid_0's rmse: 7706.59
[7000]	valid_0's rmse: 7706.59
[8000]	valid_0's rmse: 7706.59
[9000]	valid_0's rmse: 7706.59
[10000]	valid_0's rmse: 7706.59
[11000]	valid_0's rmse: 7706.59
[12000]	valid_0's rmse: 7706.59
Early stopping, best iteration is:
[11961]	valid_0's rmse: 7706.59
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[156]	valid_0's rmse: 8205.32
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[407]	valid_0's rmse: 9843.86
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[213]	valid_0's rmse: 10048.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rms

Unnamed: 0,lgbm,cb
cv1,7706.6,7521.8
cv2,8205.3,7234.4
cv3,9843.9,10631.6
cv4,10048.6,9835.8
cv5,9155.2,8929.5
평균,8991.9,8830.6


# 파생변수 생성 (3년, 0.6)

In [53]:
# Settings for this round of derived-feature generation. Both values are passed
# to make_average_variable and embedded in the exported CSV file names.
# NOTE(review): YEAR looks like the averaging window in years and ALPHA the
# time-decay base — confirm against make_average_variable.
YEAR, ALPHA = 3, 0.6

In [54]:
# Simple-average method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_simple = (tmp.copy() if idx == 0
                     else pd.concat([hitter_simple, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')

    # Same assembly rule as for hitters
    pitcher_simple = (tmp.copy() if idx == 0
                      else pd.concat([pitcher_simple, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_simple = data_fillna(hitter_simple, YEAR)
pitcher_simple = data_fillna(pitcher_simple, YEAR)
hitter_simple.to_csv(
    f'누적변수추가(ver5데이터)/타자_단순누적({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_simple.to_csv(
    f'누적변수추가(ver5데이터)/투수_단순누적({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 101초
20/57개 생성 완료!! == 경과시간: 203초
30/57개 생성 완료!! == 경과시간: 304초
40/57개 생성 완료!! == 경과시간: 405초
50/57개 생성 완료!! == 경과시간: 507초
10/61개 생성 완료!! == 경과시간: 673초
20/61개 생성 완료!! == 경과시간: 761초
30/61개 생성 완료!! == 경과시간: 847초
40/61개 생성 완료!! == 경과시간: 934초
50/61개 생성 완료!! == 경과시간: 1021초
60/61개 생성 완료!! == 경과시간: 1109초


In [55]:
# Time-weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_time_weighted = (tmp.copy() if idx == 0
                            else pd.concat([hitter_time_weighted, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')

    # Same assembly rule as for hitters
    pitcher_time_weighted = (tmp.copy() if idx == 0
                             else pd.concat([pitcher_time_weighted, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_time_weighted = data_fillna(hitter_time_weighted, YEAR)
pitcher_time_weighted = data_fillna(pitcher_time_weighted, YEAR)
hitter_time_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_시간가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_time_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_시간가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 102초
20/57개 생성 완료!! == 경과시간: 201초
30/57개 생성 완료!! == 경과시간: 301초
40/57개 생성 완료!! == 경과시간: 400초
50/57개 생성 완료!! == 경과시간: 500초
10/61개 생성 완료!! == 경과시간: 656초
20/61개 생성 완료!! == 경과시간: 742초
30/61개 생성 완료!! == 경과시간: 828초
40/61개 생성 완료!! == 경과시간: 913초
50/61개 생성 완료!! == 경과시간: 998초
60/61개 생성 완료!! == 경과시간: 1084초


In [56]:
# Plate-appearance (or innings) weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_play_weighted = (tmp.copy() if idx == 0
                            else pd.concat([hitter_play_weighted, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')

    # Same assembly rule as for hitters
    pitcher_play_weighted = (tmp.copy() if idx == 0
                             else pd.concat([pitcher_play_weighted, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_play_weighted = data_fillna(hitter_play_weighted, YEAR)
pitcher_play_weighted = data_fillna(pitcher_play_weighted, YEAR)
hitter_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 109초
20/57개 생성 완료!! == 경과시간: 216초
30/57개 생성 완료!! == 경과시간: 322초
40/57개 생성 완료!! == 경과시간: 429초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 701초
20/61개 생성 완료!! == 경과시간: 792초
30/61개 생성 완료!! == 경과시간: 883초
40/61개 생성 완료!! == 경과시간: 975초
50/61개 생성 완료!! == 경과시간: 1065초
60/61개 생성 완료!! == 경과시간: 1156초


In [57]:
# Time + plate-appearance (or innings) weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_time_and_play_weighted = (
        tmp.copy() if idx == 0
        else pd.concat([hitter_time_and_play_weighted, tmp.iloc[:, [-1]]], axis=1)
    )

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')

    # Same assembly rule as for hitters
    pitcher_time_and_play_weighted = (
        tmp.copy() if idx == 0
        else pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:, [-1]]], axis=1)
    )

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_time_and_play_weighted = data_fillna(hitter_time_and_play_weighted, YEAR)
pitcher_time_and_play_weighted = data_fillna(pitcher_time_and_play_weighted, YEAR)
hitter_time_and_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_시간경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_time_and_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_시간경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 109초
20/57개 생성 완료!! == 경과시간: 216초
30/57개 생성 완료!! == 경과시간: 322초
40/57개 생성 완료!! == 경과시간: 428초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 700초
20/61개 생성 완료!! == 경과시간: 791초
30/61개 생성 완료!! == 경과시간: 881초
40/61개 생성 완료!! == 경과시간: 972초
50/61개 생성 완료!! == 경과시간: 1063초
60/61개 생성 완료!! == 경과시간: 1154초


In [58]:
# Print one line per averaging method with the (rows, columns) shape of the
# hitter and pitcher tables.
print(
    *(
        f'{label} == 타자: {hitter_table.shape} == 투수: {pitcher_table.shape}'
        for label, hitter_table, pitcher_table in (
            ('simple방식', hitter_simple, pitcher_simple),
            ('시간가중방식', hitter_time_weighted, pitcher_time_weighted),
            ('타석(또는 이닝)가중방식', hitter_play_weighted, pitcher_play_weighted),
            ('시간 + 타석(또는 이닝)가중방식', hitter_time_and_play_weighted, pitcher_time_and_play_weighted),
        )
    ),
    sep='\n',
)

simple방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간 + 타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)


#### 1. simple 방식

In [59]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_simple.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 12250.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14307.1
Early stopping, best iteration is:
[1345]	valid_0's rmse: 14299.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's rmse: 9841.96
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 11974.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's rmse: 11475.1
Learning rate set to 0.009465
0:	learn: 38799.6653689	test: 31882.5956309	best: 31882.5956309 (0)	total: 36.6ms	remaining: 12m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11287.72156
bestIteration = 897

Shrink model to first 898 iterations.
Learning rate set to 0.009465
0:	learn: 36090.0439491	test: 42862.04

Unnamed: 0,lgbm,cb
cv1,12250.7,11287.7
cv2,14299.5,14954.8
cv3,9842.0,9430.0
cv4,11974.7,10984.1
cv5,11475.1,10035.8
평균,11968.4,11338.5


In [60]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_simple.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7083.11
[2000]	valid_0's rmse: 7074.35
Early stopping, best iteration is:
[2452]	valid_0's rmse: 7074.31
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	valid_0's rmse: 8931.24
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[283]	valid_0's rmse: 11474.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 11898.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[187]	valid_0's rmse: 10995.3
Learning rate set to 0.009088
0:	learn: 28904.8829476	test: 26047.6994511	best: 26047.6994511 (0)	total: 40.6ms	remaining: 13m 30s
1000:	learn: 4761.6497813	test: 7735.9444633	best: 7735.6145514 (999)	total: 11.1s	remaining: 3m 31s
2000:	learn: 3049.6012954	test: 7224.3328354	best: 7221.7033894 (1961)

Unnamed: 0,lgbm,cb
cv1,7074.3,7221.7
cv2,8931.2,7834.6
cv3,11474.2,11605.1
cv4,11898.8,10542.4
cv5,10995.3,9401.4
평균,10074.8,9321.0


#### 2. time_weighted 방식

In [61]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_time_weighted.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's rmse: 12505.2
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13998.7
[2000]	valid_0's rmse: 13945.7
[3000]	valid_0's rmse: 13943.6
Early stopping, best iteration is:
[3227]	valid_0's rmse: 13943.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 10371.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[79]	valid_0's rmse: 12401.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 11236.3
Learning rate set to 0.009465
0:	learn: 38804.5852083	test: 31867.7523371	best: 31867.7523371 (0)	total: 28.8ms	remaining: 9m 36s
1000:	learn: 5921.9757026	test: 11537.1435401	best: 11536.1569650 (993)	total: 14.4s	remaining: 4m 32s
Stopped by overfitting detector  (100 iter

Unnamed: 0,lgbm,cb
cv1,12505.2,11524.1
cv2,13943.6,14905.3
cv3,10371.5,9097.3
cv4,12401.9,11357.4
cv5,11236.3,10470.3
평균,12091.7,11470.9


In [62]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_time_weighted.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[472]	valid_0's rmse: 8481.84
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[321]	valid_0's rmse: 8754.53
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[230]	valid_0's rmse: 12142.6
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's rmse: 12248
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[287]	valid_0's rmse: 10796.3
Learning rate set to 0.009088
0:	learn: 28898.9526558	test: 26045.8873316	best: 26045.8873316 (0)	total: 17.4ms	remaining: 5m 47s
1000:	learn: 4704.6574327	test: 7594.8211909	best: 7594.8211909 (1000)	total: 13.1s	remaining: 4m 8s
2000:	learn: 2946.1806600	test: 7095.8815654	best: 7095.8815654 (2000)	total: 26.2s	remaining: 3m 55s
Stopped by overfitting detector  (1

Unnamed: 0,lgbm,cb
cv1,8481.8,7003.7
cv2,8754.5,7660.9
cv3,12142.6,11223.0
cv4,12248.0,10224.9
cv5,10796.3,9475.7
평균,10484.7,9117.6


#### 3. play_weighted 방식

In [63]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_play_weighted.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 12687.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15142.3
[2000]	valid_0's rmse: 15121.1
Early stopping, best iteration is:
[2126]	valid_0's rmse: 15120.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 10523.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[219]	valid_0's rmse: 11148.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's rmse: 11559.9
Learning rate set to 0.009465
0:	learn: 38807.3684861	test: 31874.5749199	best: 31874.5749199 (0)	total: 24.8ms	remaining: 8m 15s
1000:	learn: 6024.2457495	test: 11373.2694299	best: 11370.7220132 (962)	total: 14.7s	remaining: 4m 38s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1123

Unnamed: 0,lgbm,cb
cv1,12687.1,11234.8
cv2,15120.9,14311.9
cv3,10523.8,9890.3
cv4,11148.1,11110.1
cv5,11559.9,10382.1
평균,12208.0,11385.8


In [64]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_play_weighted.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8016.3
[2000]	valid_0's rmse: 7995.15
Early stopping, best iteration is:
[2613]	valid_0's rmse: 7994.52
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 8837.61
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[195]	valid_0's rmse: 11506.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's rmse: 11932.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's rmse: 10809.7
Learning rate set to 0.009088
0:	learn: 28876.9265062	test: 26024.8646158	best: 26024.8646158 (0)	total: 46.8ms	remaining: 15m 35s
1000:	learn: 4698.3777620	test: 7986.6573424	best: 7986.6573424 (1000)	total: 13.9s	remaining: 4m 23s
2000:	learn: 2966.5927501	test: 7572.9048009	best: 7572.9048009 (2000)	

Unnamed: 0,lgbm,cb
cv1,7994.5,7420.1
cv2,8837.6,7973.8
cv3,11506.4,10965.4
cv4,11932.3,10147.1
cv5,10809.7,9178.5
평균,10216.1,9137.0


#### 4. time_and_play_weighted 방식

In [65]:
# Hitters
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    hitter_time_and_play_weighted.drop(columns=['ID', '선수명'])
)
hitter_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 12008.9
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 13912.7
[2000]	valid_0's rmse: 13859.2
[3000]	valid_0's rmse: 13855.5
[4000]	valid_0's rmse: 13855.2
[5000]	valid_0's rmse: 13855.2
Early stopping, best iteration is:
[5448]	valid_0's rmse: 13855.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[58]	valid_0's rmse: 10803.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	valid_0's rmse: 12174.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 11371.8
Learning rate set to 0.009465
0:	learn: 38806.6747537	test: 31877.3509630	best: 31877.3509630 (0)	total: 31.3ms	remaining: 10m 24s
1000:	learn: 6136.2989437	test: 11669.5698615	best: 11665.9564611 (967)	total: 14.

Unnamed: 0,lgbm,cb
cv1,12008.9,11652.7
cv2,13855.2,15201.4
cv3,10803.4,9390.5
cv4,12174.5,11241.2
cv5,11371.8,10667.2
평균,12042.8,11630.6


In [66]:
# Pitchers
# Strip the identifier columns, split the table with Xy_split, then place the
# rounded results of the LightGBM and CatBoost runs side by side (LightGBM first).
col_dict, X, y = Xy_split(
    pitcher_time_and_play_weighted.drop(columns=['ID', '선수명'])
)
pitcher_performance = pd.concat(
    [fit_and_score(X, y).round(1) for fit_and_score in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[785]	valid_0's rmse: 7887.08
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[249]	valid_0's rmse: 9072.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[183]	valid_0's rmse: 10474
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 12064.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's rmse: 10943.2
Learning rate set to 0.009088
0:	learn: 28894.1008531	test: 26041.8864897	best: 26041.8864897 (0)	total: 45.1ms	remaining: 15m 1s
1000:	learn: 4756.5908822	test: 7932.7965637	best: 7932.7965637 (1000)	total: 15.2s	remaining: 4m 48s
2000:	learn: 2963.2887431	test: 7572.1047420	best: 7572.1047420 (2000)	total: 29.7s	remaining: 4m 26s
3000:	learn: 2050.0588254	test: 7493

Unnamed: 0,lgbm,cb
cv1,7887.1,7459.2
cv2,9072.8,8068.8
cv3,10474.0,10908.5
cv4,12064.9,10080.6
cv5,10943.2,9598.3
평균,10088.4,9223.1


# 파생변수 생성 (3년, 0.3)

In [67]:
# Settings for this round of derived-feature generation. Both values are passed
# to make_average_variable and embedded in the exported CSV file names.
# NOTE(review): YEAR looks like the averaging window in years and ALPHA the
# time-decay base — confirm against make_average_variable.
YEAR, ALPHA = 3, 0.3

In [68]:
# Simple-average method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'simple')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_simple = (tmp.copy() if idx == 0
                     else pd.concat([hitter_simple, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'simple')

    # Same assembly rule as for hitters
    pitcher_simple = (tmp.copy() if idx == 0
                      else pd.concat([pitcher_simple, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_simple = data_fillna(hitter_simple, YEAR)
pitcher_simple = data_fillna(pitcher_simple, YEAR)
hitter_simple.to_csv(
    f'누적변수추가(ver5데이터)/타자_단순누적({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_simple.to_csv(
    f'누적변수추가(ver5데이터)/투수_단순누적({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 101초
20/57개 생성 완료!! == 경과시간: 203초
30/57개 생성 완료!! == 경과시간: 304초
40/57개 생성 완료!! == 경과시간: 406초
50/57개 생성 완료!! == 경과시간: 507초
10/61개 생성 완료!! == 경과시간: 665초
20/61개 생성 완료!! == 경과시간: 752초
30/61개 생성 완료!! == 경과시간: 838초
40/61개 생성 완료!! == 경과시간: 926초
50/61개 생성 완료!! == 경과시간: 1013초
60/61개 생성 완료!! == 경과시간: 1100초


In [69]:
# Time-weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_time_weighted = (tmp.copy() if idx == 0
                            else pd.concat([hitter_time_weighted, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_weighted')

    # Same assembly rule as for hitters
    pitcher_time_weighted = (tmp.copy() if idx == 0
                             else pd.concat([pitcher_time_weighted, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_time_weighted = data_fillna(hitter_time_weighted, YEAR)
pitcher_time_weighted = data_fillna(pitcher_time_weighted, YEAR)
hitter_time_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_시간가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_time_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_시간가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 103초
20/57개 생성 완료!! == 경과시간: 202초
30/57개 생성 완료!! == 경과시간: 302초
40/57개 생성 완료!! == 경과시간: 402초
50/57개 생성 완료!! == 경과시간: 503초
10/61개 생성 완료!! == 경과시간: 662초
20/61개 생성 완료!! == 경과시간: 748초
30/61개 생성 완료!! == 경과시간: 833초
40/61개 생성 완료!! == 경과시간: 919초
50/61개 생성 완료!! == 경과시간: 1004초
60/61개 생성 완료!! == 경과시간: 1090초


In [70]:
# Plate-appearance (or innings) weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'play_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_play_weighted = (tmp.copy() if idx == 0
                            else pd.concat([hitter_play_weighted, tmp.iloc[:, [-1]]], axis=1))

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'play_weighted')

    # Same assembly rule as for hitters
    pitcher_play_weighted = (tmp.copy() if idx == 0
                             else pd.concat([pitcher_play_weighted, tmp.iloc[:, [-1]]], axis=1))

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_play_weighted = data_fillna(hitter_play_weighted, YEAR)
pitcher_play_weighted = data_fillna(pitcher_play_weighted, YEAR)
hitter_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 109초
20/57개 생성 완료!! == 경과시간: 216초
30/57개 생성 완료!! == 경과시간: 322초
40/57개 생성 완료!! == 경과시간: 428초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 701초
20/61개 생성 완료!! == 경과시간: 792초
30/61개 생성 완료!! == 경과시간: 883초
40/61개 생성 완료!! == 경과시간: 973초
50/61개 생성 완료!! == 경과시간: 1064초
60/61개 생성 완료!! == 경과시간: 1156초


In [71]:
# Time + plate-appearance (or innings) weighted method
# Calls make_average_variable once per stat column for hitters and for pitchers,
# assembles the results column-wise, post-processes them with data_fillna and
# exports both tables to CSV.

# Set once: the pitcher loop's elapsed times keep counting from the hitter loop's start.
t1 = time.time()

################## Hitters ##################
for idx, hitter_column in enumerate(hitter_columns):

    tmp = make_average_variable(hitter, YEAR, hitter_column, '타석', ALPHA, 'time_and_play_weighted')

    # The first result seeds the table; each later result contributes only its last
    # column (presumably the newly derived feature — verify in make_average_variable).
    hitter_time_and_play_weighted = (
        tmp.copy() if idx == 0
        else pd.concat([hitter_time_and_play_weighted, tmp.iloc[:, [-1]]], axis=1)
    )

    # Progress line after every 10th column
    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(hitter_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')


################## Pitchers ##################
for idx, pitcher_column in enumerate(pitcher_columns):

    tmp = make_average_variable(pitcher, YEAR, pitcher_column, '이닝', ALPHA, 'time_and_play_weighted')

    # Same assembly rule as for hitters
    pitcher_time_and_play_weighted = (
        tmp.copy() if idx == 0
        else pd.concat([pitcher_time_and_play_weighted, tmp.iloc[:, [-1]]], axis=1)
    )

    if not (idx + 1) % 10:
        print(f'{idx+1}/{len(pitcher_columns)}개 생성 완료!! == 경과시간: {round(time.time()-t1)}초')

# Post-process both tables, then write them out (file names carry YEAR and ALPHA)
hitter_time_and_play_weighted = data_fillna(hitter_time_and_play_weighted, YEAR)
pitcher_time_and_play_weighted = data_fillna(pitcher_time_and_play_weighted, YEAR)
hitter_time_and_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/타자_시간경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)
pitcher_time_and_play_weighted.to_csv(
    f'누적변수추가(ver5데이터)/투수_시간경기가중({YEAR},{ALPHA}).csv',
    index=False,
    encoding='utf-8-sig',
)

10/57개 생성 완료!! == 경과시간: 108초
20/57개 생성 완료!! == 경과시간: 215초
30/57개 생성 완료!! == 경과시간: 322초
40/57개 생성 완료!! == 경과시간: 428초
50/57개 생성 완료!! == 경과시간: 535초
10/61개 생성 완료!! == 경과시간: 700초
20/61개 생성 완료!! == 경과시간: 791초
30/61개 생성 완료!! == 경과시간: 882초
40/61개 생성 완료!! == 경과시간: 973초
50/61개 생성 완료!! == 경과시간: 1064초
60/61개 생성 완료!! == 경과시간: 1155초


In [72]:
# Print one line per averaging method with the (rows, columns) shape of the
# hitter and pitcher tables.
print(
    *(
        f'{label} == 타자: {hitter_table.shape} == 투수: {pitcher_table.shape}'
        for label, hitter_table, pitcher_table in (
            ('simple방식', hitter_simple, pitcher_simple),
            ('시간가중방식', hitter_time_weighted, pitcher_time_weighted),
            ('타석(또는 이닝)가중방식', hitter_play_weighted, pitcher_play_weighted),
            ('시간 + 타석(또는 이닝)가중방식', hitter_time_and_play_weighted, pitcher_time_and_play_weighted),
        )
    ),
    sep='\n',
)

simple방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)
시간 + 타석(또는 이닝)가중방식 == 타자: (3142, 148) == 투수: (2425, 146)


#### 1. simple 방식

In [73]:
# Hitters, simple-average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(hitter_simple.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
hitter_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 12250.7
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14307.1
Early stopping, best iteration is:
[1345]	valid_0's rmse: 14299.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's rmse: 9841.96
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 11974.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's rmse: 11475.1
Learning rate set to 0.009465
0:	learn: 38799.6653689	test: 31882.5956309	best: 31882.5956309 (0)	total: 17.1ms	remaining: 5m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11287.72156
bestIteration = 897

Shrink model to first 898 iterations.
Learning rate set to 0.009465
0:	learn: 36090.0439491	test: 42862.045

Unnamed: 0,lgbm,cb
cv1,12250.7,11287.7
cv2,14299.5,14954.8
cv3,9842.0,9430.0
cv4,11974.7,10984.1
cv5,11475.1,10035.8
평균,11968.4,11338.5


In [74]:
# Pitchers, simple-average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(pitcher_simple.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
pitcher_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 7083.11
[2000]	valid_0's rmse: 7074.35
Early stopping, best iteration is:
[2452]	valid_0's rmse: 7074.31
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	valid_0's rmse: 8931.24
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[283]	valid_0's rmse: 11474.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 11898.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[187]	valid_0's rmse: 10995.3
Learning rate set to 0.009088
0:	learn: 28904.8829476	test: 26047.6994511	best: 26047.6994511 (0)	total: 14.6ms	remaining: 4m 52s
1000:	learn: 4761.6497813	test: 7735.9444633	best: 7735.6145514 (999)	total: 10.9s	remaining: 3m 26s
2000:	learn: 3049.6012954	test: 7224.3328354	best: 7221.7033894 (1961)	

Unnamed: 0,lgbm,cb
cv1,7074.3,7221.7
cv2,8931.2,7834.6
cv3,11474.2,11605.1
cv4,11898.8,10542.4
cv5,10995.3,9401.4
평균,10074.8,9321.0


#### 2. time_weighted 방식

In [75]:
# Hitters, time-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(hitter_time_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
hitter_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 11817.9
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15151.6
Early stopping, best iteration is:
[1714]	valid_0's rmse: 15117.7
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 10442.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[72]	valid_0's rmse: 12850.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's rmse: 11053.8
Learning rate set to 0.009465
0:	learn: 38813.7964338	test: 31874.2045167	best: 31874.2045167 (0)	total: 30.5ms	remaining: 10m 9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11432.61586
bestIteration = 742

Shrink model to first 743 iterations.
Learning rate set to 0.009465
0:	learn: 36092.4710945	test: 42868.799

Unnamed: 0,lgbm,cb
cv1,11817.9,11432.6
cv2,15117.7,15820.0
cv3,10442.3,9172.2
cv4,12850.8,11692.5
cv5,11053.8,10912.1
평균,12256.5,11805.9


In [76]:
# Pitchers, time-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(pitcher_time_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
pitcher_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8166.75
Early stopping, best iteration is:
[998]	valid_0's rmse: 8166.65
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's rmse: 8596.97
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[247]	valid_0's rmse: 10964.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 12079
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[184]	valid_0's rmse: 10917.2
Learning rate set to 0.009088
0:	learn: 28901.2363149	test: 26044.3227168	best: 26044.3227168 (0)	total: 39.7ms	remaining: 13m 14s
1000:	learn: 4709.0348342	test: 7887.0696046	best: 7887.0696046 (1000)	total: 13.2s	remaining: 4m 10s
2000:	learn: 2979.7951684	test: 7548.5371950	best: 7548.5371950 (2000)	total: 26.3s	remaining: 3m 56s
30

Unnamed: 0,lgbm,cb
cv1,8166.7,7425.8
cv2,8597.0,7935.3
cv3,10964.4,11218.1
cv4,12079.0,10568.7
cv5,10917.2,9777.6
평균,10144.8,9385.1


#### 3. play_weighted 방식

In [77]:
# Hitters, play-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(hitter_play_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
hitter_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 12687.1
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 15142.3
[2000]	valid_0's rmse: 15121.1
Early stopping, best iteration is:
[2126]	valid_0's rmse: 15120.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 10523.8
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[219]	valid_0's rmse: 11148.1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's rmse: 11559.9
Learning rate set to 0.009465
0:	learn: 38807.3684861	test: 31874.5749199	best: 31874.5749199 (0)	total: 20.4ms	remaining: 6m 48s
1000:	learn: 6024.2457495	test: 11373.2694299	best: 11370.7220132 (962)	total: 14.6s	remaining: 4m 37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1123

Unnamed: 0,lgbm,cb
cv1,12687.1,11234.8
cv2,15120.9,14311.9
cv3,10523.8,9890.3
cv4,11148.1,11110.1
cv5,11559.9,10382.1
평균,12208.0,11385.8


In [78]:
# Pitchers, play-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(pitcher_play_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
pitcher_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8016.3
[2000]	valid_0's rmse: 7995.15
Early stopping, best iteration is:
[2613]	valid_0's rmse: 7994.52
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's rmse: 8837.61
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[195]	valid_0's rmse: 11506.4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's rmse: 11932.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's rmse: 10809.7
Learning rate set to 0.009088
0:	learn: 28876.9265062	test: 26024.8646158	best: 26024.8646158 (0)	total: 47.2ms	remaining: 15m 43s
1000:	learn: 4698.3777620	test: 7986.6573424	best: 7986.6573424 (1000)	total: 13.9s	remaining: 4m 22s
2000:	learn: 2966.5927501	test: 7572.9048009	best: 7572.9048009 (2000)	

Unnamed: 0,lgbm,cb
cv1,7994.5,7420.1
cv2,8837.6,7973.8
cv3,11506.4,10965.4
cv4,11932.3,10147.1
cv5,10809.7,9178.5
평균,10216.1,9137.0


#### 4. time_and_play weighted 방식

In [79]:
# Hitters, time-and-play-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(hitter_time_and_play_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
hitter_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

hitter_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[70]	valid_0's rmse: 11807.4
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 14001.1
Early stopping, best iteration is:
[1599]	valid_0's rmse: 13975.9
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 9686.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's rmse: 12610
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 10664.2
Learning rate set to 0.009465
0:	learn: 38805.8942002	test: 31870.6134213	best: 31870.6134213 (0)	total: 35.8ms	remaining: 11m 55s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11145.54172
bestIteration = 895

Shrink model to first 896 iterations.
Learning rate set to 0.009465
0:	learn: 36085.8037815	test: 42839.55826

Unnamed: 0,lgbm,cb
cv1,11807.4,11145.5
cv2,13975.9,15654.6
cv3,9686.2,9159.1
cv4,12610.0,12035.4
cv5,10664.2,10891.0
평균,11748.7,11777.1


In [80]:
# Pitchers, time-and-play-weighted average features.
# The 'ID' and '선수명' columns are dropped before the feature/target split.
col_dict, X, y = Xy_split(pitcher_time_and_play_weighted.drop(columns=['ID', '선수명']))

# Evaluate LightGBM first, then CatBoost, on the same X/y; scores are rounded to
# one decimal and placed side by side, one column block per model.
pitcher_performance = pd.concat(
    [model(X, y).round(1) for model in (lgbm_model, cb_model)],
    axis=1,
)

pitcher_performance

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[298]	valid_0's rmse: 8679.22
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's rmse: 9012.48
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[348]	valid_0's rmse: 11363.5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's rmse: 12306.3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[256]	valid_0's rmse: 10816.3
Learning rate set to 0.009088
0:	learn: 28881.8129741	test: 26031.8137314	best: 26031.8137314 (0)	total: 36.1ms	remaining: 12m 2s
1000:	learn: 4741.2127217	test: 8351.0118850	best: 8351.0118850 (1000)	total: 14s	remaining: 4m 25s
2000:	learn: 2989.8489030	test: 8025.9739140	best: 8025.0076211 (1984)	total: 27.8s	remaining: 4m 9s
3000:	learn: 2095.5945923	test: 794

Unnamed: 0,lgbm,cb
cv1,8679.2,7939.6
cv2,9012.5,8348.7
cv3,11363.5,10991.7
cv4,12306.3,10573.5
cv5,10816.3,9914.9
평균,10435.6,9553.7
