In [None]:
'''
Catboost와 Lightgbm만 실험
'''

# 패키지 불러오기

In [1]:
import math

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


# Number of K-fold splits; read as a module-level global by lgbm_model and cb_model.
n_splits = 5

In [5]:
# Load the one-hot-encoded training tables for hitters and pitchers.
# The paths are relative, so they resolve against the notebook's working directory.
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/타자(모델링용_원핫인코딩)_train_ver3.csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/투수(모델링용_원핫인코딩)_train_ver3.csv')

In [3]:
# 시간가중치 생성
def make_exponential_weight(length, a):
    """Return `length` exponential weights, highest exponent first.

    The result is ``[a**(length-1), ..., a**1, a**0]``, so the last element
    (weight 1) pairs with the last value of a window.

    Args:
        length: Number of weights to produce; 0 or less yields an empty list.
        a: Base of the exponential.

    Returns:
        list: The weights as a plain Python list.
    """
    weights = []
    for exponent in reversed(range(length)):
        weights.append(a ** exponent)
    return weights

##################################################################################################
##################################################################################################

# 단순 평균
def simple_creation(dataset, n_year, feature):
    """Add a trailing simple average of `feature` for every ID.

    For each ID the rows are ordered by '연도'. For every row, the mean of
    `feature` over that row and up to ``n_year - 1`` preceding rows of the same
    ID is stored in a new column named ``f'{n_year}년평균{feature}'``. Rows with
    fewer than `n_year` predecessors average over what is available.

    Args:
        dataset: DataFrame with at least the columns 'ID', '연도' and `feature`.
        n_year: Window length in rows (rows on record, not calendar years).
        feature: Name of the numeric column to average.

    Returns:
        A new DataFrame (the input is not modified): rows grouped by ID in
        order of first appearance, sorted by '연도' within each ID, with a
        fresh 0..N-1 index and the extra average column. An empty DataFrame is
        returned when there are no rows to process.
    """
    new_col = f'{n_year}년평균{feature}'
    pieces = []

    # sort=False keeps IDs in order of first appearance, like dataset['ID'].unique().
    for _, player_rows in dataset.groupby('ID', sort=False):

        player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
        values = player_rows[feature]

        # Positional window [i - n_year + 1, i], clipped at the ID's first row.
        player_rows[new_col] = [
            values.iloc[max(0, i - n_year + 1):i + 1].mean()
            for i in range(len(player_rows))
        ]
        pieces.append(player_rows)

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # accumulated rows on every iteration.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)

##################################################################################################
##################################################################################################

# 가중시간 평균
def time_weighted_creation(dataset, n_year, feature, alpha):
    """Add a trailing, exponentially time-weighted average of `feature` per ID.

    For each ID the rows are ordered by '연도'. Every row gets the weighted
    mean of `feature` over that row and up to ``n_year - 1`` preceding rows of
    the same ID. Weights come from ``make_exponential_weight``: the current row
    has weight 1, the row before it `alpha`, then ``alpha**2`` and so on. The
    result is stored in a column named ``f'{n_year}년평균{feature}'``.

    Args:
        dataset: DataFrame with at least the columns 'ID', '연도' and `feature`.
        n_year: Window length in rows.
        feature: Name of the numeric column to average.
        alpha: Base of the exponential weights.

    Returns:
        A new DataFrame (the input is not modified): rows grouped by ID in
        order of first appearance, sorted by '연도' within each ID, with a
        fresh 0..N-1 index and the extra average column. An empty DataFrame is
        returned when there are no rows to process.
    """
    new_col = f'{n_year}년평균{feature}'
    pieces = []

    # sort=False keeps IDs in order of first appearance, like dataset['ID'].unique().
    for _, player_rows in dataset.groupby('ID', sort=False):

        player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
        values = np.array(player_rows[feature])

        averages = []
        for i in range(len(player_rows)):
            # Positional window [i - n_year + 1, i], clipped at the ID's first row.
            window = values[max(0, i - n_year + 1):i + 1]
            weight = np.array(make_exponential_weight(length=window.shape[0], a=alpha))
            averages.append(np.matmul(window, weight) / np.sum(weight))

        player_rows[new_col] = averages
        pieces.append(player_rows)

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # accumulated rows on every iteration.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)

##################################################################################################
##################################################################################################

# 가중타석(또는 이닝) 평균
def play_weighted_creation(dataset, n_year, feature, criteria):
    """Add a trailing average of `feature` weighted by the `criteria` column.

    For each ID the rows are ordered by '연도'. Every row gets
    ``sum(feature * criteria) / sum(criteria)`` over that row and up to
    ``n_year - 1`` preceding rows of the same ID, stored in a column named
    ``f'{n_year}년평균{feature}'``. If the weights in a window sum to zero the
    division follows NumPy semantics (NaN or inf with a RuntimeWarning).

    Args:
        dataset: DataFrame with the columns 'ID', '연도', `feature`, `criteria`.
        n_year: Window length in rows.
        feature: Name of the numeric column to average.
        criteria: Name of the numeric column used as per-row weight.

    Returns:
        A new DataFrame (the input is not modified): rows grouped by ID in
        order of first appearance, sorted by '연도' within each ID, with a
        fresh 0..N-1 index and the extra average column. An empty DataFrame is
        returned when there are no rows to process.
    """
    new_col = f'{n_year}년평균{feature}'
    pieces = []

    # sort=False keeps IDs in order of first appearance, like dataset['ID'].unique().
    for _, player_rows in dataset.groupby('ID', sort=False):

        player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
        values = np.array(player_rows[feature])
        weights = np.array(player_rows[criteria])

        averages = []
        for i in range(len(player_rows)):
            # Positional window [i - n_year + 1, i], clipped at the ID's first row.
            start = max(0, i - n_year + 1)
            value = values[start:i + 1]
            weight = weights[start:i + 1]
            averages.append(np.matmul(value, weight) / np.sum(weight))

        player_rows[new_col] = averages
        pieces.append(player_rows)

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # accumulated rows on every iteration.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)

##################################################################################################
##################################################################################################

# 시간 + 타석(또는 이닝) 평균
def timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha):
    """Add a trailing average of `feature` weighted by both recency and `criteria`.

    For each ID the rows are ordered by '연도'. Every row gets
    ``sum(feature * criteria * time_w) / sum(criteria * time_w)`` over that row
    and up to ``n_year - 1`` preceding rows of the same ID. ``time_w`` comes
    from ``make_exponential_weight`` (current row 1, previous row `alpha`,
    then ``alpha**2`` ...). The result is stored in a column named
    ``f'{n_year}년평균{feature}'``. A zero denominator follows NumPy division
    semantics (NaN or inf with a RuntimeWarning).

    Args:
        dataset: DataFrame with the columns 'ID', '연도', `feature`, `criteria`.
        n_year: Window length in rows.
        feature: Name of the numeric column to average.
        criteria: Name of the numeric column used as per-row weight.
        alpha: Base of the exponential time weights.

    Returns:
        A new DataFrame (the input is not modified): rows grouped by ID in
        order of first appearance, sorted by '연도' within each ID, with a
        fresh 0..N-1 index and the extra average column. An empty DataFrame is
        returned when there are no rows to process.
    """
    new_col = f'{n_year}년평균{feature}'
    pieces = []

    # sort=False keeps IDs in order of first appearance, like dataset['ID'].unique().
    for _, player_rows in dataset.groupby('ID', sort=False):

        player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
        values = np.array(player_rows[feature])
        play_weights = np.array(player_rows[criteria])

        averages = []
        for i in range(len(player_rows)):
            # Positional window [i - n_year + 1, i], clipped at the ID's first row.
            start = max(0, i - n_year + 1)
            value = values[start:i + 1]
            play_weight = play_weights[start:i + 1]
            time_weight = np.array(make_exponential_weight(length=value.shape[0], a=alpha))

            numerator = np.matmul(np.multiply(value, play_weight), time_weight)
            denominator = np.matmul(play_weight, time_weight)
            averages.append(numerator / denominator)

        player_rows[new_col] = averages
        pieces.append(player_rows)

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # accumulated rows on every iteration.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)

##################################################################################################
##################################################################################################

# 종합 함수
def make_average_variable(dataset, n_year, feature, criteria, alpha, how):
    """Dispatch to one of the trailing-average feature builders.

    Args:
        dataset: DataFrame handed through to the selected builder.
        n_year: Window length in rows.
        feature: Name of the column to average.
        criteria: Weight column; only used by the 'play_weighted' and
            'time_and_play_weighted' methods.
        alpha: Exponential base; only used by the 'time_weighted' and
            'time_and_play_weighted' methods.
        how: One of 'simple', 'time_weighted', 'play_weighted',
            'time_and_play_weighted'.

    Returns:
        Whatever the selected builder returns.

    Raises:
        ValueError: If `how` is not one of the supported method names
            (previously this case silently returned None).
    """
    if how == 'simple':
        return simple_creation(dataset, n_year, feature)

    if how == 'time_weighted':
        return time_weighted_creation(dataset, n_year, feature, alpha)

    if how == 'play_weighted':
        return play_weighted_creation(dataset, n_year, feature, criteria)

    if how == 'time_and_play_weighted':
        # The builder is named timeandplay_weighted_creation (no underscores
        # between "time", "and", "play"); the former call used a name that is
        # not defined and raised NameError.
        return timeandplay_weighted_creation(dataset, n_year, feature, criteria, alpha)

    raise ValueError(
        f"unknown how={how!r}; expected 'simple', 'time_weighted', "
        "'play_weighted' or 'time_and_play_weighted'"
    )

In [4]:
# Sanity check: 3-row simple average of hits ('안타') per hitter, showing only the relevant columns.
# The '타석' and 0.9 arguments are not used by the 'simple' method.
make_average_variable(hitter, 3, '안타', '타석', 0.9, 'simple')[['선수명','연도','타석','안타','3년평균안타']]

Unnamed: 0,선수명,연도,타석,안타,3년평균안타
0,고지행,2003,289,71,71.000000
1,고지행,2004,2,0,35.500000
2,오선우,2019,57,8,8.000000
3,채종국,2002,224,42,42.000000
4,채종국,2003,164,33,37.500000
...,...,...,...,...,...
3028,한상훈,2014,203,50,59.666667
3029,김대륙,2015,63,8,8.000000
3030,김대륙,2016,59,7,7.500000
3031,강로한,2019,306,69,69.000000


In [7]:
# Inspect the available hitter columns.
# NOTE(review): an assignment follows, so on a fresh run this bare expression is not the cell's displayed result.
hitter.columns
# Hitter columns that do not need averaging
# (identifiers, dates, FA flag, cumulative plate appearances, age, one-hot team/position flags).
# NOTE(review): this list is not referenced by any other cell visible here.
hitter_remove_col = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '누적타석', '출생연도', '출생월', '출생일', '나이', 
                  '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성',
                  '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대', '포지션(수비)_1루수', '포지션(수비)_2루수',
                  '포지션(수비)_3루수', '포지션(수비)_수비기록없음', '포지션(수비)_우익수', '포지션(수비)_유격수',
                  '포지션(수비)_좌익수', '포지션(수비)_중견수', '포지션(수비)_포수']

Index(['ID', '선수명', '데뷔년도', '연도', '타율', '경기', '타석', '타수', '득점', '안타', '2루타',
       '3루타', '홈런', '루타', '타점', '희생번트', '희생플라이', '볼넷', '고의사구', '사구', '삼진',
       '병살타', '장타율', '출루율', 'OPS', '멀티히트', '득점권타율', '대타타율', '장타', '땅볼', '뜬공',
       '결승타', '투구수/타석', '순수장타율', '추정득점', 'GPA', '경기(주루)', '도루시도(주루)',
       '도루허용(주루)', '도루실패(주루)', '도루성공률(주루)', '주루사(주루)', '견제사(주루)', '경기(수비)',
       '선발경기(수비)', '수비이닝(수비)', '실책(수비)', '견제사(수비)', '자살(수비)', '보살(수비)',
       '병살(수비)', '포일(수비)', '도루허용(수비)', '도루실패(수비)', '수비율(수비)', '도루저지율(수비)',
       'WAR', 'FA여부', '뉴스개수(연도별정규화)', '뉴스개수(누적정규화)', '수상횟수', '누적타석',
       '포지션 엔트로피', '출생연도', '출생월', '출생일', '나이', '연봉', '팀명_KIA', '팀명_KT',
       '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성',
       '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대', '포지션(수비)_1루수', '포지션(수비)_2루수',
       '포지션(수비)_3루수', '포지션(수비)_수비기록없음', '포지션(수비)_우익수', '포지션(수비)_유격수',
       '포지션(수비)_좌익수', '포지션(수비)_중견수', '포지션(수비)_포수'],
      dtype='object')

In [10]:
# Inspect the available pitcher columns.
# NOTE(review): an assignment follows, so on a fresh run this bare expression is not the cell's displayed result.
pitcher.columns
# Pitcher columns listed for exclusion (identifiers, dates, FA flag, cumulative innings, age, one-hot team flags).
# NOTE(review): presumably the pitcher counterpart of hitter_remove_col; it is not referenced by any other cell visible here.
pitcher_remove_col = ['ID', '선수명', '데뷔년도', '연도', 'FA여부', '누적이닝', '출생연도', '출생월', '출생일', '나이',
                      '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK', '팀명_두산', '팀명_롯데', '팀명_삼성', 
                      '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대']

Index(['ID', '선수명', '데뷔년도', '연도', '평균자책점', '경기', '승리', '패배', '세이브', '홀드', '승률',
       '이닝', '피안타', '홈런', '볼넷', '사구', '삼진', '실점', '자책점', '이닝당_출루허용률', '완투',
       '완봉', '퀄리티스타트', '블론세이브', '타자수', '투구수', '피안타율', '2루타', '3루타', '희생번트',
       '희생플라이', '고의4구', '폭투', '보크', '선발', '선발승', '구원승', '종료', '세이브기회', '터프세이브',
       '병살타', '땅볼', '뜬공', '인플레이타구타율', '투구수/경기', '투구수/이닝', '9이닝당 삼진', '9이닝당 볼넷',
       '피출루율', '피장타율', '피OPS', '경기(수비)', '선발경기(수비)', '수비이닝(수비)', '실책(수비)',
       '견제사(수비)', '자살(수비)', '보살(수비)', '병살(수비)', '수비율(수비)', 'WAR', 'FA여부',
       '뉴스개수(연도별정규화)', '뉴스개수(누적정규화)', '수상횟수', '누적이닝', '포지션 엔트로피', '출생연도',
       '출생월', '출생일', '나이', '연봉', '팀명_KIA', '팀명_KT', '팀명_LG', '팀명_NC', '팀명_SK',
       '팀명_두산', '팀명_롯데', '팀명_삼성', '팀명_우리/히어로즈/넥센/키움', '팀명_한화', '팀명_현대'],
      dtype='object')

# 모델 정의
 1. Lightgbm
 2. Catboost

In [None]:
def Xy_split(dataset):
    """Split a modelling table into renamed features and the salary target.

    Every column except '연봉' becomes a feature and is renamed to
    'variable1', 'variable2', ... in column order.

    Args:
        dataset: DataFrame containing a '연봉' column.

    Returns:
        tuple: ``(column_dict, X, y)`` where ``column_dict`` maps each
        generated name back to the original column name, ``X`` is the feature
        frame with the generated names, and ``y`` is ``dataset['연봉']``.
    """
    target = '연봉'
    features = dataset.drop(columns=target)

    original_names = list(features.columns)
    generic_names = [f'variable{position}' for position in range(1, len(original_names) + 1)]

    column_dict = dict(zip(generic_names, original_names))
    X = features.rename(columns=dict(zip(original_names, generic_names)))
    y = dataset[target]

    return column_dict, X, y

In [None]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and report the RMSE of every fold.

    Uses a shuffled KFold (``random_state=42``) whose number of splits is the
    module-level ``n_splits``. Each fold trains for up to 20000 rounds with
    early stopping (100 rounds) monitored on the validation fold.

    NOTE(review): the fold used for early stopping is the same fold the RMSE
    is computed on, so the reported scores are likely optimistic.

    Args:
        data_x: Feature DataFrame.
        data_y: Target Series aligned row-by-row with `data_x`.

    Returns:
        DataFrame with a single column 'lgbm': one row per fold
        ('cv1', 'cv2', ...) followed by the row '평균' holding their mean.
    """
    # LightGBM settings
    lgb_params = {"objective" : "rmse",
                 "verbosity" : -1}

    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so both X and y are indexed positionally.
        # data_y[tr_idx] would be a label lookup and only works on a default 0..N-1 index.
        X_train = data_x.iloc[tr_idx, :].values
        y_train = data_y.iloc[tr_idx].values

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = data_y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train)
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid)

        # NOTE(review): LightGBM 4.0 removed the early_stopping_rounds / verbose_eval
        # arguments of lgb.train in favour of callbacks (lgb.early_stopping,
        # lgb.log_evaluation); this call targets the LightGBM 3.x API.
        lgb_model = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000)
        pred_valid = lgb_model.predict(X_valid)
        # Root of the MSE instead of squared=False, which newer scikit-learn no longer accepts.
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        performance.append(rmse)

    # Summarise: label the folds from the actual fold count so n_splits != 5 also works.
    fold_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)]
    performance.append(np.mean(performance))

    output = pd.DataFrame({'lgbm':performance}, index=fold_labels + ['평균'])
    return output

In [None]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and report the RMSE of every fold.

    Uses a shuffled KFold (``random_state=42``) whose number of splits is the
    module-level ``n_splits``. Each fold trains for up to 20000 iterations with
    early stopping (100 rounds) monitored on the validation fold, and keeps
    the best iteration.

    NOTE(review): the fold used for early stopping is the same fold the RMSE
    is computed on, so the reported scores are likely optimistic.

    Args:
        data_x: Feature DataFrame.
        data_y: Target Series aligned row-by-row with `data_x`.

    Returns:
        DataFrame with a single column 'cb': one row per fold
        ('cv1', 'cv2', ...) followed by the row '평균' holding their mean.
    """
    # CatBoost settings: no column is treated as categorical
    cat_cols = []
    performance = []

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        # KFold yields row positions, so both X and y are indexed positionally.
        # data_y[tr_idx] would be a label lookup and only works on a default 0..N-1 index.
        X_train = data_x.iloc[tr_idx, :]
        y_train = data_y.iloc[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = data_y.iloc[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        # Local name differs from the enclosing function so cb_model is not shadowed.
        model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose = 0)

        model.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)
        pred_valid = model.predict(X_valid)
        # Root of the MSE instead of squared=False, which newer scikit-learn no longer accepts.
        rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))

        performance.append(rmse)

    # Summarise: label the folds from the actual fold count so n_splits != 5 also works.
    fold_labels = [f'cv{fold}' for fold in range(1, len(performance) + 1)]
    performance.append(np.mean(performance))

    output = pd.DataFrame({'cb':performance}, index=fold_labels + ['평균'])
    return output

# train, test 분할 후 모델링

In [None]:
# Re-read the hitter and pitcher training tables so the modelling section starts from the raw files.
# NOTE(review): these are the same two paths already loaded near the top of the notebook.
hitter = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/타자(모델링용_원핫인코딩)_train_ver3.csv')
pitcher = pd.read_csv('../선수데이터(전처리완료)/모델링용ver3/투수(모델링용_원핫인코딩)_train_ver3.csv')

In [None]:
# Build the previous-season salary feature ('작년연봉') for every hitter.
hitter_pieces = []

# sort=False keeps the players in order of first appearance.
for _, player_rows in hitter.groupby('ID', sort=False):

    # Order the seasons explicitly so the lag does not depend on the CSV row order.
    player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
    # shift(1) takes the salary of the preceding (earlier) season.
    # NOTE(review): the previous shift(-1) on unsorted rows took the *following* row,
    # which is next season's salary whenever the file is in ascending year order.
    player_rows['작년연봉'] = player_rows['연봉'].shift(1)

    hitter_pieces.append(player_rows)

# Concatenate once instead of re-copying the growing frame on every iteration.
hitter_final = pd.concat(hitter_pieces, ignore_index=True)

# dropna() removes every row with a missing value in any column; that includes
# each player's first season, which has no previous salary.
hitter_final = hitter_final.dropna().reset_index(drop=True)
hitter.shape, hitter_final.shape

In [None]:
# Build the previous-season salary feature ('작년연봉') for every pitcher.
pitcher_pieces = []

# sort=False keeps the players in order of first appearance.
for _, player_rows in pitcher.groupby('ID', sort=False):

    # Order the seasons explicitly so the lag does not depend on the CSV row order.
    player_rows = player_rows.sort_values(by='연도').reset_index(drop=True)
    # shift(1) takes the salary of the preceding (earlier) season.
    # NOTE(review): the previous shift(-1) on unsorted rows took the *following* row,
    # which is next season's salary whenever the file is in ascending year order.
    player_rows['작년연봉'] = player_rows['연봉'].shift(1)

    pitcher_pieces.append(player_rows)

# Concatenate once instead of re-copying the growing frame on every iteration.
pitcher_final = pd.concat(pitcher_pieces, ignore_index=True)

# dropna() removes every row with a missing value in any column; that includes
# each player's first season, which has no previous salary.
pitcher_final = pitcher_final.dropna().reset_index(drop=True)
pitcher.shape, pitcher_final.shape

#### 1. 타자

In [None]:
# train, test 분할
hitter_final = hitter_final.drop(['ID', '선수명'], axis=1)
col_dict, X, y = Xy_split(hitter_final)

In [None]:
# 타자데이터 도출
hitter_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

hitter_performance

#### 2. 투수

In [None]:
# train, test 분할
pitcher_final = pitcher_final.drop(['ID', '선수명'], axis=1)
col_dict, X, y = Xy_split(pitcher_final)

In [None]:
# 투수데이터 도출
pitcher_performance = pd.concat([round(lgbm_model(X, y), 1),
                                 round(cb_model(X, y), 1)], axis=1)

pitcher_performance