# 패키지 불러오기

In [1]:
import math
import pandas as pd
import numpy as np

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Number of folds passed to every KFold cross-validation in this file.
n_splits = 5

# 데이터 불러오기

In [2]:
def _read_train_csv(csv_path):
    """Read one cp949-encoded training CSV and drop the 'ID' and '선수명' columns."""
    frame = pd.read_csv(csv_path, encoding='cp949')
    return frame.drop(['ID','선수명'], axis=1)


# Simple weighting (단순가중)
hitter_simple_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/단순가중/타자_단순가중_fa_train.csv')
hitter_simple_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/단순가중/타자_단순가중_nonfa_train.csv')
pitcher_simple_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/단순가중/투수_단순가중_fa_train.csv')
pitcher_simple_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/단순가중/투수_단순가중_nonfa_train.csv')

# Time weighting (시간가중)
hitter_time_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간가중/타자_시간가중_fa_train.csv')
hitter_time_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간가중/타자_시간가중_nonfa_train.csv')
pitcher_time_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간가중/투수_시간가중_fa_train.csv')
pitcher_time_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간가중/투수_시간가중_nonfa_train.csv')

# Play weighting (경기가중)
hitter_play_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/경기가중/타자_경기가중_fa_train.csv')
hitter_play_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/경기가중/타자_경기가중_nonfa_train.csv')
pitcher_play_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/경기가중/투수_경기가중_fa_train.csv')
pitcher_play_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/경기가중/투수_경기가중_nonfa_train.csv')

# Time-and-play weighting (시간경기가중)
hitter_time_and_play_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간경기가중/타자_시간경기가중_fa_train.csv')
hitter_time_and_play_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간경기가중/타자_시간경기가중_nonfa_train.csv')
pitcher_time_and_play_weighted_fa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간경기가중/투수_시간경기가중_fa_train.csv')
pitcher_time_and_play_weighted_nonfa = _read_train_csv('../선수데이터(전처리완료)/모델링용ver8/시간경기가중/투수_시간경기가중_nonfa_train.csv')

# 모델 정의

In [3]:
def Xy_split(dataset):
    """Split a modeling table into its features and the '연봉' (salary) target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing a '연봉' column.

    Returns
    -------
    tuple
        ``(features, target)``: ``dataset`` without the '연봉' column, and the
        '연봉' column itself. ``dataset`` is not modified.
    """
    target_name = '연봉'
    features = dataset.drop(columns=target_name)
    target = dataset[target_name]
    return features, target

In [4]:
def lgbm_model(data_x, data_y):
    """Cross-validate a LightGBM regressor and return the mean validation RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table (rows are selected by position with ``iloc``).
    data_y : array-like
        Target values aligned positionally with ``data_x``.

    Returns
    -------
    float
        Mean over the ``n_splits`` folds of the validation RMSE at each
        fold's best iteration, rounded to one decimal place.
    """
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}

    # KFold yields positional indices, so index the target by position too.
    # Plain ``data_y[idx]`` on a Series is label-based and silently picks the
    # wrong rows (or raises) when the index is not 0..n-1.
    y_all = np.asarray(data_y)

    fold_scores = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        print(f'\n\n ============================ {step} ============================')

        X_train = data_x.iloc[tr_idx, :].values
        y_train = y_all[tr_idx]

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = y_all[val_idx]

        lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
        lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments were removed from lgb.train in LightGBM 4.0; move to
        # callbacks when the installed version is upgraded.
        booster = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                            early_stopping_rounds=100, verbose_eval=1000)

        # Collect one score per fold actually run instead of assuming 5 folds.
        fold_scores.append(booster.best_score['valid_0']['rmse'])

    return round(np.mean(fold_scores), 1)

In [5]:
def cb_model(data_x, data_y):
    """Cross-validate a CatBoost regressor and return the mean validation RMSE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table (rows are selected by position with ``iloc``).
    data_y : array-like
        Target values aligned positionally with ``data_x``.

    Returns
    -------
    float
        Mean over the ``n_splits`` folds of each fold's best validation RMSE,
        rounded to one decimal place.
    """
    cat_cols = []  # no feature is treated as categorical

    # KFold yields positional indices, so index the target by position too.
    # Plain ``data_y[idx]`` on a Series is label-based and breaks when the
    # index is not 0..n-1.
    y_all = np.asarray(data_y)

    fold_scores = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        X_train = data_x.iloc[tr_idx, :]
        y_train = y_all[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = y_all[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        # Collect one score per fold actually run instead of assuming 5 folds.
        fold_scores.append(regressor.best_score_['validation']['RMSE'])

    return round(np.mean(fold_scores), 1)

In [6]:
def lgbm_log_model(data_x, data_y):
    """Cross-validate LightGBM on a log10 target; report RMSE on the original scale.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table (rows are selected by position with ``iloc``).
    data_y : array-like
        Target values aligned positionally with ``data_x``. Predictions and
        targets are raised as powers of 10 before scoring, so the target is
        expected to be log10-transformed.

    Returns
    -------
    float
        Mean over the ``n_splits`` folds of the back-transformed validation
        RMSE, rounded to one decimal place.
    """
    lgb_params = {"objective": "rmse",
                  "verbosity": -1}

    # KFold yields positional indices, so index the target by position too
    # (label-based ``data_y[idx]`` breaks when the index is not 0..n-1).
    y_all = np.asarray(data_y)

    performance = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        X_train = data_x.iloc[tr_idx, :].values
        y_train = y_all[tr_idx]

        X_valid = data_x.iloc[val_idx, :].values
        y_valid = y_all[val_idx]

        lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
        lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments were removed from lgb.train in LightGBM 4.0; move to
        # callbacks when the installed version is upgraded.
        booster = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                            early_stopping_rounds=100, verbose_eval=1000)

        # Undo the log10 transform in one vectorized step.
        pred_original = np.power(10.0, booster.predict(X_valid))
        true_original = np.power(10.0, y_valid)

        # sqrt(MSE) instead of squared=False, which newer scikit-learn
        # releases deprecate and then drop.
        rmse = float(np.sqrt(mean_squared_error(true_original, pred_original)))
        performance.append(rmse)

    return round(np.mean(performance), 1)

In [7]:
def cb_log_model(data_x, data_y):
    """Cross-validate CatBoost on a log10 target; report RMSE on the original scale.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table (rows are selected by position with ``iloc``).
    data_y : array-like
        Target values aligned positionally with ``data_x``. Predictions and
        targets are raised as powers of 10 before scoring, so the target is
        expected to be log10-transformed.

    Returns
    -------
    float
        Mean over the ``n_splits`` folds of the back-transformed validation
        RMSE, rounded to one decimal place.
    """
    cat_cols = []  # no feature is treated as categorical

    # KFold yields positional indices, so index the target by position too
    # (label-based ``data_y[idx]`` breaks when the index is not 0..n-1).
    y_all = np.asarray(data_y)

    performance = []
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_idx, val_idx in cv.split(data_x):

        X_train = data_x.iloc[tr_idx, :]
        y_train = y_all[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = y_all[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        # Undo the log10 transform in one vectorized step.
        pred_original = np.power(10.0, regressor.predict(X_valid))
        true_original = np.power(10.0, y_valid)

        # sqrt(MSE) instead of squared=False, which newer scikit-learn
        # releases deprecate and then drop.
        rmse = float(np.sqrt(mean_squared_error(true_original, pred_original)))
        performance.append(rmse)

    return round(np.mean(performance), 1)

# 변수선택법 방법론 정의

In [8]:
def get_lgbm_RFE_rank(X, y):
    """Rank features by recursive feature elimination driven by LightGBM.

    The data is split 75/25. A LightGBM model is then trained repeatedly, and
    after each fit the feature with the lowest importance is removed (the
    last such column when several tie). Features removed earlier get worse
    (larger) ranks; the last surviving feature gets rank 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns '변수명' (feature name) and 'RFE순위' (rank, 1 = kept longest),
        ordered from first removed to last remaining.
    """
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

    X = X.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y = y.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    original_feature_num = X.shape[1]

    lgb_params = {"objective": "rmse",
                  "verbosity": -1}

    remove_cols = []

    # Looping on "more than one feature left" also handles a single-feature
    # input, which previously never met the exit condition.
    while X.shape[1] > 1:

        column_names = list(X.columns)
        print(f'남은변수개수: {len(column_names)}')

        lgb_dtrain = lgb.Dataset(data=X, label=y)
        lgb_dvalid = lgb.Dataset(data=X_test, label=y_test)

        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments were removed from lgb.train in LightGBM 4.0; move to
        # callbacks when the installed version is upgraded.
        booster = lgb.train(lgb_params, lgb_dtrain, 20000, valid_sets=[lgb_dvalid],
                            early_stopping_rounds=100, verbose_eval=1000)

        # Read the importances once; the minimum was previously recomputed
        # for every column.
        importances = booster.feature_importance()
        lowest = min(importances)

        # Among the least important features, drop the last one.
        remove_col = [col for col, importance in zip(column_names, importances)
                      if importance == lowest][-1]
        remove_cols.append(remove_col)
        X = X.drop(columns=remove_col)
        X_test = X_test.drop(columns=remove_col)

    # The single surviving feature (if any) is the best-ranked one.
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])

    result = pd.DataFrame({'변수명': remove_cols,
                           'RFE순위': list(range(original_feature_num, 0, -1))})

    return result

In [9]:
def get_cb_RFE_rank(X, y):
    """Rank features by recursive feature elimination driven by CatBoost.

    The data is split 75/25. A CatBoost regressor is then trained repeatedly,
    and after each fit the feature with the lowest importance is removed (the
    last such column when several tie). Features removed earlier get worse
    (larger) ranks; the last surviving feature gets rank 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns '변수명' (feature name) and 'RFE순위' (rank, 1 = kept longest),
        ordered from first removed to last remaining.
    """
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

    X = X.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y = y.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    original_feature_num = X.shape[1]

    cat_cols = []  # no feature is treated as categorical
    remove_cols = []

    # Looping on "more than one feature left" also handles a single-feature
    # input, which previously never met the exit condition.
    while X.shape[1] > 1:

        column_names = list(X.columns)
        print(f'남은변수개수: {len(column_names)}')

        cb_dtrain = Pool(data=X, label=y, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_test, label=y_test, cat_features=cat_cols)

        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        # Read the importances once; the minimum was previously recomputed
        # for every column.
        importances = regressor.feature_importances_
        lowest = min(importances)

        # Among the least important features, drop the last one.
        remove_col = [col for col, importance in zip(column_names, importances)
                      if importance == lowest][-1]
        remove_cols.append(remove_col)
        X = X.drop(columns=remove_col)
        X_test = X_test.drop(columns=remove_col)

    # The single surviving feature (if any) is the best-ranked one.
    if X.shape[1] == 1:
        remove_cols.append(X.columns[0])

    result = pd.DataFrame({'변수명': remove_cols,
                           'RFE순위': list(range(original_feature_num, 0, -1))})

    return result

In [10]:
def lgbm_RFE_modeling(data_x, data_y, score_dataframe, log_transformation):
    """Score LightGBM on the top-k RFE-ranked features for k = 5, 10, 15, ...

    Parameters
    ----------
    data_x : pandas.DataFrame
        Full feature table.
    data_y : array-like
        Target passed through to the scoring function.
    score_dataframe : pandas.DataFrame
        Ranking with columns '변수명' (feature name) and 'RFE순위' (rank).
    log_transformation : bool
        When equal to True, score with ``lgbm_log_model``; otherwise with
        ``lgbm_model``.

    Returns
    -------
    pandas.DataFrame
        One 'RMSE' column, indexed by the number of features used.
    """
    def _score(subset):
        # Resolve the scorer lazily, only when a subset is actually evaluated.
        if log_transformation == True:
            return lgbm_log_model(subset, data_y)
        return lgbm_model(subset, data_y)

    # Feature counts to evaluate: every multiple of 5 up to the column count.
    feature_counts = list(range(5, data_x.shape[1] + 1, 5))

    rmse_values = []
    for top_k in feature_counts:
        wanted_ranks = list(range(1, top_k + 1))
        is_selected = score_dataframe['RFE순위'].isin(wanted_ranks)
        chosen = list(score_dataframe.loc[is_selected, '변수명'])
        rmse_values.append(_score(data_x[chosen]))

    return pd.DataFrame({'RMSE': rmse_values}, index=feature_counts)

In [11]:
def cb_RFE_modeling(data_x, data_y, score_dataframe, log_transformation):
    """Score CatBoost on the top-k RFE-ranked features for k = 5, 10, 15, ...

    Parameters
    ----------
    data_x : pandas.DataFrame
        Full feature table.
    data_y : array-like
        Target passed through to the scoring function.
    score_dataframe : pandas.DataFrame
        Ranking with columns '변수명' (feature name) and 'RFE순위' (rank).
    log_transformation : bool
        When equal to True, score with ``cb_log_model``; otherwise with
        ``cb_model``.

    Returns
    -------
    pandas.DataFrame
        One 'RMSE' column, indexed by the number of features used.
    """
    def _score(subset):
        # Resolve the scorer lazily, only when a subset is actually evaluated.
        if log_transformation == True:
            return cb_log_model(subset, data_y)
        return cb_model(subset, data_y)

    # Feature counts to evaluate: every multiple of 5 up to the column count.
    feature_counts = list(range(5, data_x.shape[1] + 1, 5))

    rmse_values = []
    for top_k in feature_counts:
        wanted_ranks = list(range(1, top_k + 1))
        is_selected = score_dataframe['RFE순위'].isin(wanted_ranks)
        chosen = list(score_dataframe.loc[is_selected, '변수명'])
        rmse_values.append(_score(data_x[chosen]))

    return pd.DataFrame({'RMSE': rmse_values}, index=feature_counts)

In [12]:
def get_mi_rank(X, y):
    """Rank features by their mutual information with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns '변수명' (feature name, in ``X`` column order) and 'mi순위'
        (rank, 1 = highest mutual-information score).
    """
    # NOTE(review): no random_state is passed to mutual_info_regression, so
    # the scores (and therefore the ranks) may differ between runs.
    selector = SelectKBest(mutual_info_regression, k=1).fit(X, y)

    # method='first' gives every feature a distinct whole-number rank. The
    # default 'average' assigns fractional ranks to tied scores (e.g. several
    # features scoring 0), and callers that select by whole-number rank then
    # silently skip those features.
    mutual_info_importance = pd.Series(selector.scores_).rank(ascending=False, method='first')

    mi_score = pd.DataFrame({'변수명': list(X.columns),
                             'mi순위': list(mutual_info_importance)})

    return mi_score

In [13]:
def lgbm_mi_modeling(data_x, data_y, score_dataframe, log_transformation):
    """Score LightGBM on the top-k mutual-information features for k = 5, 10, 15, ...

    Parameters
    ----------
    data_x : pandas.DataFrame
        Full feature table.
    data_y : array-like
        Target passed through to the scoring function.
    score_dataframe : pandas.DataFrame
        Ranking with columns '변수명' (feature name) and 'mi순위' (rank).
    log_transformation : bool
        When equal to True, score with ``lgbm_log_model``; otherwise with
        ``lgbm_model``.

    Returns
    -------
    pandas.DataFrame
        One 'RMSE' column, indexed by the number of features used.
    """
    def _score(subset):
        # Resolve the scorer lazily, only when a subset is actually evaluated.
        if log_transformation == True:
            return lgbm_log_model(subset, data_y)
        return lgbm_model(subset, data_y)

    # Feature counts to evaluate: every multiple of 5 up to the column count.
    feature_counts = list(range(5, data_x.shape[1] + 1, 5))

    rmse_values = []
    for top_k in feature_counts:
        wanted_ranks = list(range(1, top_k + 1))
        is_selected = score_dataframe['mi순위'].isin(wanted_ranks)
        chosen = list(score_dataframe.loc[is_selected, '변수명'])
        rmse_values.append(_score(data_x[chosen]))

    return pd.DataFrame({'RMSE': rmse_values}, index=feature_counts)

In [14]:
def cb_mi_modeling(data_x, data_y, score_dataframe, log_transformation):
    """Score CatBoost on the top-k mutual-information features for k = 5, 10, 15, ...

    Parameters
    ----------
    data_x : pandas.DataFrame
        Full feature table.
    data_y : array-like
        Target passed through to the scoring function.
    score_dataframe : pandas.DataFrame
        Ranking with columns '변수명' (feature name) and 'mi순위' (rank).
    log_transformation : bool
        When equal to True, score with ``cb_log_model``; otherwise with
        ``cb_model``.

    Returns
    -------
    pandas.DataFrame
        One 'RMSE' column, indexed by the number of features used.
    """
    def _score(subset):
        # Resolve the scorer lazily, only when a subset is actually evaluated.
        if log_transformation == True:
            return cb_log_model(subset, data_y)
        return cb_model(subset, data_y)

    # Feature counts to evaluate: every multiple of 5 up to the column count.
    feature_counts = list(range(5, data_x.shape[1] + 1, 5))

    rmse_values = []
    for top_k in feature_counts:
        wanted_ranks = list(range(1, top_k + 1))
        is_selected = score_dataframe['mi순위'].isin(wanted_ranks)
        chosen = list(score_dataframe.loc[is_selected, '변수명'])
        rmse_values.append(_score(data_x[chosen]))

    return pd.DataFrame({'RMSE': rmse_values}, index=feature_counts)

In [15]:
def feature_seleciton_modeling(model_type, selection_method, log_transformation,
                               data_x=None, data_y=None):
    """Rank features with the chosen method and score the model on top-k subsets.

    The earlier version read undefined names (``dataset``, ``data_x``,
    ``data_y``) and called the ranking helpers with a single argument, so it
    could not run. The data is now passed in explicitly.

    Parameters
    ----------
    model_type : str
        'lightgbm' (the alias 'lgbm' is also accepted) or 'catboost'.
    selection_method : str
        'RFE' or 'mi'.
    log_transformation : bool
        Forwarded to the ``*_modeling`` helper to choose the scoring function.
    data_x : pandas.DataFrame
        Feature table. Required.
    data_y : pandas.Series
        Target values aligned with ``data_x``. Required.

    Returns
    -------
    pandas.DataFrame
        The performance table produced by the matching ``*_modeling`` helper.

    Raises
    ------
    ValueError
        If ``data_x`` / ``data_y`` are missing, or the model/method
        combination is not supported.
    """
    if data_x is None or data_y is None:
        raise ValueError("data_x and data_y must both be provided")

    # The original branches spelled LightGBM both ways; accept either.
    normalized_model = 'lightgbm' if model_type == 'lgbm' else model_type

    if normalized_model not in ('lightgbm', 'catboost') or selection_method not in ('RFE', 'mi'):
        raise ValueError(
            f"unsupported combination: model_type={model_type!r}, "
            f"selection_method={selection_method!r}"
        )

    if selection_method == 'RFE':
        if normalized_model == 'lightgbm':
            score = get_lgbm_RFE_rank(data_x, data_y)
            return lgbm_RFE_modeling(data_x, data_y, score, log_transformation)
        score = get_cb_RFE_rank(data_x, data_y)
        return cb_RFE_modeling(data_x, data_y, score, log_transformation)

    # selection_method == 'mi'
    score = get_mi_rank(data_x, data_y)
    if normalized_model == 'lightgbm':
        return lgbm_mi_modeling(data_x, data_y, score, log_transformation)
    return cb_mi_modeling(data_x, data_y, score, log_transformation)

# util 정의

In [16]:
def load_dataset(player_type, weight_type, fa_type):
    """Return the preloaded training frame for one player/weighting/FA combination.

    Parameters
    ----------
    player_type : str
        '타자' (hitter) or '투수' (pitcher).
    weight_type : str
        'simple', 'time', 'play' or 'time_and_play'.
    fa_type : str
        'fa' or 'nonfa'.

    Returns
    -------
    pandas.DataFrame or None
        The matching module-level frame (e.g. ``hitter_time_weighted_fa``),
        or None when any argument is not one of the values listed above.
    """
    # Each table maps an argument value to the fragment it contributes to the
    # name of the module-level frame: <player>_<weighting>_<fa>.
    player_part = {'타자': 'hitter', '투수': 'pitcher'}
    weight_part = {
        'simple': 'simple',
        'time': 'time_weighted',
        'play': 'play_weighted',
        'time_and_play': 'time_and_play_weighted',
    }
    fa_part = {'fa': 'fa', 'nonfa': 'nonfa'}

    if player_type not in player_part:
        return None
    if weight_type not in weight_part:
        return None
    if fa_type not in fa_part:
        return None

    frame_name = '_'.join(
        [player_part[player_type], weight_part[weight_type], fa_part[fa_type]]
    )
    return globals()[frame_name]

In [17]:
def scaler_transformation(dataset, scaler_type):
    """Scale every column of ``dataset`` with the requested scikit-learn scaler.

    Parameters
    ----------
    dataset : pandas.DataFrame or array-like
        Numeric feature table.
    scaler_type : str
        'standardized_scaler' (``StandardScaler``) or 'minmax_scaler'
        (``MinMaxScaler``).

    Returns
    -------
    pandas.DataFrame
        The scaled values wrapped in a new DataFrame. The frame is built from
        the scaler output alone, so it does not carry over the original
        column names or index (with scikit-learn's default array output the
        columns are labelled 0..n-1).

    Raises
    ------
    ValueError
        If ``scaler_type`` is not one of the two supported names. This used
        to surface as an UnboundLocalError.
    """
    if scaler_type == 'standardized_scaler':
        scaler = StandardScaler()
    elif scaler_type == 'minmax_scaler':
        scaler = MinMaxScaler()
    else:
        raise ValueError(
            f"unsupported scaler_type {scaler_type!r}; "
            "expected 'standardized_scaler' or 'minmax_scaler'"
        )

    return pd.DataFrame(scaler.fit_transform(dataset))

In [18]:
def log_transformation(dataset):
    """Return the base-10 logarithm of every value as a new Series.

    Parameters
    ----------
    dataset : iterable of numbers
        Values to transform; each must be accepted by ``math.log10``.

    Returns
    -------
    pandas.Series
        The log10 values in input order, with a fresh default index (the
        input's index is not carried over).
    """
    log_values = [math.log10(value) for value in dataset]
    return pd.Series(log_values)

# 종합함수

In [19]:
def modeling(data_x, data_y, model_type, selection_method, log_transformation):
    """Dispatch to the modeling routine for a model / feature-selection setting.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table.
    data_y : array-like
        Target values aligned with ``data_x``.
    model_type : str
        'lightgbm' or 'catboost'.
    selection_method : str
        'RFE', 'mi' or 'no_use'.
    log_transformation : bool
        Whether the target is log-transformed; selects the scoring function.

    Returns
    -------
    tuple, float or None
        ``(score, performance)`` for 'RFE' / 'mi'; the cross-validated
        performance alone for 'no_use'; None when no combination matches.
    """
    if selection_method == 'RFE':
        if model_type == 'lightgbm':
            score = get_lgbm_RFE_rank(data_x, data_y)
            return score, lgbm_RFE_modeling(data_x, data_y, score, log_transformation)
        if model_type == 'catboost':
            score = get_cb_RFE_rank(data_x, data_y)
            return score, cb_RFE_modeling(data_x, data_y, score, log_transformation)
        return None

    if selection_method == 'mi':
        if model_type == 'lightgbm':
            score = get_mi_rank(data_x, data_y)
            return score, lgbm_mi_modeling(data_x, data_y, score, log_transformation)
        if model_type == 'catboost':
            score = get_mi_rank(data_x, data_y)
            return score, cb_mi_modeling(data_x, data_y, score, log_transformation)
        return None

    if selection_method == 'no_use':
        # The flag is matched against True and False explicitly; any other
        # value falls through to the implicit None.
        if model_type == 'lightgbm':
            if log_transformation == True:
                return lgbm_log_model(data_x, data_y)
            if log_transformation == False:
                return lgbm_model(data_x, data_y)
        elif model_type == 'catboost':
            if log_transformation == True:
                return cb_log_model(data_x, data_y)
            if log_transformation == False:
                return cb_model(data_x, data_y)

    return None

In [20]:
def get_performance(player_type, weight_type, fa_type, scaler_type, log_transform, model_type, selection_method):
    """Run one experiment end to end and report its performance.

    Loads the dataset, splits features from the target, optionally scales the
    features and log-transforms the target, calls ``modeling`` and prints a
    summary.

    Parameters
    ----------
    player_type, weight_type, fa_type : str
        Forwarded to ``load_dataset``.
    scaler_type : str
        'standardized_scaler' or 'minmax_scaler' to scale the features; any
        other value skips scaling.
    log_transform : bool
        When equal to True, the target is passed through ``log_transformation``.
    model_type : str
        Forwarded to ``modeling``.
    selection_method : str
        'no_use', 'RFE' or 'mi'.

    Returns
    -------
    float or tuple or None
        The performance for 'no_use'; ``(score_df, performance)`` for 'RFE'
        and 'mi'; None for any other ``selection_method``.
    """
    with_selection = selection_method in ['RFE', 'mi']
    without_selection = selection_method == 'no_use'

    # Load the table and separate features from the target.
    X_data, y_data = Xy_split(load_dataset(player_type, weight_type, fa_type))

    # Optional feature scaling.
    if scaler_type in ['standardized_scaler', 'minmax_scaler']:
        X_data = scaler_transformation(X_data, scaler_type)

    # Optional log transform of the target.
    if log_transform == True:
        y_data = log_transformation(y_data)

    if without_selection:
        performance = modeling(X_data, y_data, model_type, selection_method, log_transform)
    elif with_selection:
        score_df, performance = modeling(X_data, y_data, model_type, selection_method, log_transform)

    print(f'\n\n===== {model_type} === {player_type} {fa_type} =====')
    print(f'===== {weight_type} == 스케일링: {scaler_type} == log변환: {log_transform} == 변수선택법: {selection_method} =====')

    if without_selection:
        print(f'===== RMSE: {performance} =====')
        return performance

    if with_selection:
        # display is available in the notebook this code was exported from.
        display(performance)
        return score_df, performance

# 최종실험

In [21]:
def cb_final_model(data_x, data_y):
    """Train one CatBoost regressor per CV fold and collect validation R^2 scores.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature table (rows are selected by position with ``iloc``).
    data_y : array-like
        Target values aligned positionally with ``data_x``.

    Returns
    -------
    tuple of dict
        ``(cb_models, r2_scores)``, both keyed by fold number starting at 1:
        the fitted regressor of each fold and its R^2 on that fold's
        validation rows.
    """
    cat_cols = []  # no feature is treated as categorical

    # KFold yields positional indices, so index the target by position too.
    # Plain ``data_y[idx]`` on a Series is label-based and breaks when the
    # index is not 0..n-1.
    y_all = np.asarray(data_y)

    cb_models = {}
    r2_scores = {}

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for step, (tr_idx, val_idx) in enumerate(cv.split(data_x), start=1):

        X_train = data_x.iloc[tr_idx, :]
        y_train = y_all[tr_idx]

        X_valid = data_x.iloc[val_idx, :]
        y_valid = y_all[val_idx]

        cb_dtrain = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        cb_dvalid = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        regressor = CatBoostRegressor(iterations=20000, eval_metric='RMSE', loss_function='RMSE', verbose=0)
        regressor.fit(cb_dtrain, eval_set=cb_dvalid, early_stopping_rounds=100, verbose_eval=1000, use_best_model=True)

        cb_models[step] = regressor
        r2_scores[step] = r2_score(y_valid, regressor.predict(X_valid))

    return cb_models, r2_scores

#### (1) 투수 FA
 - 시간경기가중 // Z-scoring + Feature selection(RFE)

In [25]:
# Load the pitcher / FA training table (time-and-play weighting) and drop the identifier columns
train = pd.read_csv(
    '../선수데이터(전처리완료)/모델링용ver8/시간경기가중/투수_시간경기가중_fa_train.csv',
    encoding='cp949',
)
train = train.drop(columns=['ID', '선수명'])

# Split features from the target, then show the column names at the chosen positions
# (presumably the RFE-selected features for this setting -- confirm against the experiment log)
train_x, train_y = Xy_split(train)
train_x.columns[[116, 88, 14, 58, 63, 94, 1, 138, 65, 61, 60, 142, 0, 139, 80]]

Index(['5년평균세이브기회', '5년평균승률', '삼진', 'WAR', '누적이닝', '5년평균삼진', '연도', '5년평균WAR',
       '출생연도', '뉴스개수(누적정규화)', '뉴스개수(연도별정규화)', '5년평균연봉', '데뷔년도',
       '5년평균뉴스개수(연도별정규화)', '1차FA여부'],
      dtype='object')

#### (2) 타자 nonFA
 - 시간경기가중 // Z-scoring + Feature selection(RFE)

In [26]:
# Load the hitter / non-FA training table (time-and-play weighting) and drop the identifier columns
train = pd.read_csv(
    '../선수데이터(전처리완료)/모델링용ver8/시간경기가중/타자_시간경기가중_nonfa_train.csv',
    encoding='cp949',
)
train = train.drop(columns=['ID', '선수명'])

# Split features from the target, then show the column names at the chosen positions
# (presumably the RFE-selected features for this setting -- confirm against the experiment log)
train_x, train_y = Xy_split(train)
train_x.columns[[59, 64, 140, 96, 101, 7, 134, 118, 57, 144]]

Index(['누적타석', '나이', '5년평균WAR', '5년평균홈런', '5년평균볼넷', '안타', '5년평균병살(수비)',
       '5년평균추정득점', '뉴스개수(누적정규화)', '5년평균연봉'],
      dtype='object')

#### (3) 투수 nonFA
 - 단순가중 // Feature selection(RFE)

In [27]:
# Load the pitcher / non-FA training table (simple weighting) and drop the identifier columns
train = pd.read_csv(
    '../선수데이터(전처리완료)/모델링용ver8/단순가중/투수_단순가중_nonfa_train.csv',
    encoding='cp949',
)
train = train.drop(columns=['ID', '선수명'])

# Split features from the target, then show the column names at the chosen positions
# (presumably the RFE-selected features for this setting -- confirm against the experiment log)
train_x, train_y = Xy_split(train)
train_x.columns[[14, 68, 60, 87, 65, 58, 139, 0, 138, 142]]

Index(['삼진', '나이', '뉴스개수(연도별정규화)', '5년평균홀드', '출생연도', 'WAR', '5년평균뉴스개수(연도별정규화)',
       '데뷔년도', '5년평균WAR', '5년평균연봉'],
      dtype='object')