# 패키지 불러오기

In [34]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression


# NOTE(review): presumably the fold count for the KFold imported above;
# nothing in the visible cells reads it -- confirm before removing.
n_splits = 5

# 모델 정의
 1. Random Forest
 2. LightGBM
 3. XGBoost
 4. CatBoost
 5. KNN
 6. Linear Regression

In [2]:
def Xy_split(dataset, target='연봉'):
    """Split a modelling table into renamed features and the target column.

    Every column except ``target`` is renamed to ``variable1``,
    ``variable2``, ... following its left-to-right position in ``dataset``.
    The input frame itself is not modified.

    Args:
        dataset: pandas DataFrame that contains the ``target`` column.
        target: Name of the column to predict. Defaults to ``'연봉'``.

    Returns:
        A tuple ``(column_dict, X, y)``: ``column_dict`` maps each generated
        name to the original column name, ``X`` is the feature frame with
        the generated names, and ``y`` is ``dataset[target]``.

    Raises:
        KeyError: If ``target`` is not a column of ``dataset``.
    """
    X = dataset.drop(target, axis=1)

    generated = [f'variable{idx}' for idx in range(1, X.shape[1] + 1)]
    column_dict = dict(zip(generated, X.columns))

    # Assign the new names by position instead of by label: a label-based
    # rename gives duplicated source labels the same generated name, which
    # would leave X out of sync with column_dict.
    X.columns = generated

    y = dataset[target]

    return column_dict, X, y

# train, test 분할 후 모델링

In [3]:
# Read the preprocessed modelling tables in one pass: the first path feeds
# `hitter` (타자), the second feeds `pitcher` (투수).
hitter, pitcher = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../선수데이터(전처리완료)/모델링용ver1/타자(모델링용).csv',
        '../선수데이터(전처리완료)/모델링용ver1/투수(모델링용).csv',
    )
)

#### 1. 타자

In [40]:
# Separate the hitter table into renamed features (X) and the salary
# target (y). No train/test split happens in this cell.
col_dict, X, y = Xy_split(hitter)

# --- Filter-method scores ---

# F-score: SelectKBest is used only to compute and hold `scores_`; the
# selection itself (k=1) is never applied.
fscore = SelectKBest(f_regression, k=1).fit(X, y)
# Rank 1 = highest score; tied scores share the average rank.
fscore_importance = pd.Series(fscore.scores_).rank(ascending=False)


# Mutual information: the estimator adds random noise to continuous
# features, so the seed is fixed to keep the ranking (and the CSV written
# from it) reproducible across runs.
mutual_info = SelectKBest(
    lambda features, target: mutual_info_regression(
        features, target, random_state=0
    ),
    k=1,
).fit(X, y)
mutual_info_importance = pd.Series(mutual_info.scores_).rank(ascending=False)


# One row per feature: generated name, original column name, and both ranks.
hitter_score = pd.DataFrame({'변수명':list(col_dict.keys()),
                             '실제변수명':list(col_dict.values()),
                             'fscore순위':list(fscore_importance),
                             'mutual_info순위':list(mutual_info_importance)})

hitter_score

Unnamed: 0,변수명,실제변수명,fscore순위,mutual_info순위
0,variable1,데뷔년도,42.0,3.0
1,variable2,연도,43.0,23.0
2,variable3,타율,40.0,32.0
3,variable4,경기,26.5,19.0
4,variable5,타석,14.0,6.0
...,...,...,...,...
80,variable81,포지션(수비)_우익수,82.0,72.0
81,variable82,포지션(수비)_유격수,65.0,73.0
82,variable83,포지션(수비)_좌익수,80.0,83.0
83,variable84,포지션(수비)_중견수,74.0,79.0


#### 2. 투수

In [43]:
# Separate the pitcher table into renamed features (X) and the salary
# target (y). No train/test split happens in this cell.
col_dict, X, y = Xy_split(pitcher)

# --- Filter-method scores ---

# F-score: SelectKBest is used only to compute and hold `scores_`; the
# selection itself (k=1) is never applied.
fscore = SelectKBest(f_regression, k=1).fit(X, y)
# Rank 1 = highest score; tied scores share the average rank.
fscore_importance = pd.Series(fscore.scores_).rank(ascending=False)


# Mutual information: the estimator adds random noise to continuous
# features, so the seed is fixed to keep the ranking (and the CSV written
# from it) reproducible across runs.
mutual_info = SelectKBest(
    lambda features, target: mutual_info_regression(
        features, target, random_state=0
    ),
    k=1,
).fit(X, y)
mutual_info_importance = pd.Series(mutual_info.scores_).rank(ascending=False)


# One row per feature: generated name, original column name, and both ranks.
pitcher_score = pd.DataFrame({'변수명':list(col_dict.keys()),
                             '실제변수명':list(col_dict.values()),
                             'fscore순위':list(fscore_importance),
                             'mutual_info순위':list(mutual_info_importance)})

pitcher_score

Unnamed: 0,변수명,실제변수명,fscore순위,mutual_info순위
0,variable1,데뷔년도,36.0,3.0
1,variable2,연도,47.0,5.0
2,variable3,평균자책점,59.0,48.0
3,variable4,경기,39.0,10.0
4,variable5,승리,6.0,15.0
...,...,...,...,...
75,variable76,팀명_롯데,73.0,74.0
76,variable77,팀명_삼성,57.0,75.0
77,variable78,팀명_우리/히어로즈/넥센/키움,67.0,67.0
78,variable79,팀명_한화,76.0,64.0


# 저장

In [45]:
# Write both ranking tables without the row index. 'utf-8-sig' prepends a
# byte-order mark to the UTF-8 output.
hitter_score.to_csv(
    '../변수중요도데이터/filter_타자.csv',
    encoding='utf-8-sig',
    index=False,
)
pitcher_score.to_csv(
    '../변수중요도데이터/filter_투수.csv',
    encoding='utf-8-sig',
    index=False,
)