In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import joblib
import warnings
warnings.filterwarnings(action='ignore')  

# Read the training set, the test set and the submission template from ./data.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/FIFA_train.csv',
                     './data/FIFA_test.csv',
                     './data/submission.csv')
)

## 간단한 전처리

##### Convert the contract_until variable to int
##### Keep only the contract year
def func(string):
    """Return the last four characters of ``string`` as an ``int``.

    For values such as 'Dec 31, 2018' or '2021' this is the year.
    """
    return int(string[-4:])

# Replace contract_until with the integer year in both frames.
train['contract_until'] = train['contract_until'].map(func)
test['contract_until'] = test['contract_until'].map(func)

# Columns used as model inputs; 'value' is the regression target.
feature = [
    'age',
    'continent',
    'contract_until',
    'position',
    'prefer_foot',
    'reputation',
    'stat_overall',
    'stat_potential',
    'stat_skill_moves',
]
x_train, y_train = train[feature], train['value']
x_test = test[feature]

- 아래 셀들은 log 변환(`np.log1p`)을 사용하는 전처리입니다. 원본(로그 미적용) 버전으로 실행하려면 위의 전처리만 코드로 실행하고, 아래 전처리 셀들은 모두 마크다운으로 바꿔 실행되지 않게 하세요.

In [None]:
def con_period(x):
    """Map a known contract-end date string to a (fractional) year string.

    The ten date strings listed below are translated to the year value given
    in the table (e.g. 'Jun 30, 2020' -> '2020.5'). Any other value is
    returned unchanged.
    """
    date_to_year = {
        'Dec 31, 2018': '2019',
        'Jun 30, 2020': '2020.5',
        'Jun 30, 2019': '2019.5',
        'May 31, 2020': '2020.3333',
        'May 31, 2019': '2019.3333',
        'Jan 31, 2019': '2019.0833',
        'Jan 1, 2019': '2019',
        'Jan 12, 2019': '2019.034',
        'Dec 31, 2019': '2020',
        'Jun 1, 2019': '2019.416',
    }
    return date_to_year.get(x, x)

In [None]:
# Translate contract_until with con_period and store it as float64 in both frames.
train['contract_until'] = train['contract_until'].apply(con_period).astype('float64')
test['contract_until'] = test['contract_until'].apply(con_period).astype('float64')

In [None]:
# Integer-encode the categorical columns.
# The category set is taken from the training frame and reused for the test
# frame, so a given label always gets the same code in both. Encoding each
# frame on its own would shift the codes whenever the two frames do not
# contain exactly the same labels. A test label that never occurs in train
# is encoded as -1.
for _col in ['continent', 'position', 'prefer_foot']:
    _train_cat = train[_col].astype('category')
    test[_col] = test[_col].astype(_train_cat.dtype).cat.codes
    train[_col] = _train_cat.cat.codes

In [None]:
# Log-transform the target with log1p. Not idempotent: run this cell only once.
train['value'] = np.log1p(train['value'])

In [None]:
# One-hot encode the three categorical columns in both frames.
_onehot_cols = ['continent', 'position', 'prefer_foot']
train = pd.get_dummies(train, columns=_onehot_cols)
test = pd.get_dummies(test, columns=_onehot_cols)

In [None]:
# Apply log1p to three numeric features in both frames.
_log_cols = ['age', 'reputation', 'stat_potential']
train[_log_cols] = np.log1p(train[_log_cols])
test[_log_cols] = np.log1p(test[_log_cols])

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Standardize the numeric features.
# The scaler is fitted on the training data only, and the same mean/scale is
# then applied to the test data. Re-fitting on the test frame would scale it
# with different statistics from the ones the models are trained on.
scaler = StandardScaler()

std_feature = ['age', 'contract_until', 'reputation', 'stat_overall',
       'stat_potential', 'stat_skill_moves']

# Pass index= explicitly so the scaled values line up with the frame's rows
# even when the frame does not carry a default RangeIndex.
train[std_feature] = pd.DataFrame(scaler.fit_transform(train[std_feature]),
                                  columns=std_feature, index=train.index)
test[std_feature] = pd.DataFrame(scaler.transform(test[std_feature]),
                                 columns=std_feature, index=test.index)

In [None]:
# Model inputs after preprocessing: the scaled numeric columns followed by
# the one-hot columns (5 continents, 4 positions, 2 preferred feet).
_numeric_cols = ['age', 'contract_until', 'reputation', 'stat_overall',
                 'stat_potential', 'stat_skill_moves']
_dummy_cols = (
    [f'continent_{i}' for i in range(5)]
    + [f'position_{i}' for i in range(4)]
    + [f'prefer_foot_{i}' for i in range(2)]
)
feature = _numeric_cols + _dummy_cols

x_train, y_train = train[feature], train['value']
x_test = test[feature]

## 모델링

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


n_folds = 3

def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under shuffled K-fold CV.

    Uses the module-level ``x_train`` / ``y_train`` and ``n_folds``.

    The KFold splitter itself is passed as ``cv``. Calling
    ``.get_n_splits()`` on it would hand ``cross_val_score`` only the integer
    fold count, silently dropping ``shuffle=True`` and ``random_state=42``.

    Returns:
        numpy array with one RMSE value per fold.
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, x_train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE; flip the sign before the square root.
    return np.sqrt(-neg_mse)

- **RandomForestRegressor 튜닝 후**

In [None]:
# Random forest regressor (parameters chosen after tuning, per the heading).
# max_features=1.0 considers every feature at each split, which is what the
# former 'auto' setting meant for regressors; 'auto' is deprecated and later
# removed in newer scikit-learn releases.
model_rf = RandomForestRegressor(n_estimators=100,
                                 min_samples_leaf=1,
                                 max_features=1.0,
                                 bootstrap=False,
                                 random_state=22)

In [None]:
# Mean cross-validated RMSE of the random forest.
score = rmse_cv(model_rf)
rf_score = np.mean(score)
rf_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_rf = np.expm1(model_rf.fit(x_train, y_train).predict(x_test))
pred_rf

In [None]:
# Store the random forest predictions in the 'rf' column of the submission frame.
submission['rf'] = pred_rf

- **GradientBoostingRegressor 튜닝 후**

In [None]:
# Gradient boosting regressor (parameters chosen after tuning, per the heading).
model_gb = GradientBoostingRegressor(
    random_state=22,
    n_estimators=900,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
)

In [None]:
# Mean cross-validated RMSE of the gradient boosting model.
score = rmse_cv(model_gb)
gb_score = np.mean(score)
gb_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_gb = np.expm1(model_gb.fit(x_train, y_train).predict(x_test))
pred_gb

In [None]:
# Store the gradient boosting predictions in the 'gb' column of the submission frame.
submission['gb'] = pred_gb

- **LGBMRegressor 튜닝 후**

In [None]:
# LightGBM regressor (parameters chosen after tuning, per the heading).
# NOTE(review): min_samples_split, min_samples_leaf, max_features and
# max_depth=None follow scikit-learn's tree-estimator naming rather than
# LGBMRegressor's own documented arguments — confirm LightGBM honors them
# as intended.
model_lgb = LGBMRegressor(
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=None,
    random_state=22,
)

In [None]:
# Mean cross-validated RMSE of the LightGBM model.
score = rmse_cv(model_lgb)
lgb_score = np.mean(score)
lgb_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_lgb = np.expm1(model_lgb.fit(x_train, y_train).predict(x_test))
pred_lgb

In [None]:
# Store the LightGBM predictions in the 'lgb' column of the submission frame.
submission['lgb'] = pred_lgb

- **XGBRegressor 튜닝 후**

In [None]:
# XGBoost regressor (parameters chosen after tuning, per the heading).
model_xgb = XGBRegressor(
    random_state=22,
    max_depth=5,
    n_estimators=1000,
)

In [None]:
# Mean cross-validated RMSE of the XGBoost model.
score = rmse_cv(model_xgb)
xgb_score = np.mean(score)
xgb_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_xgb = np.expm1(model_xgb.fit(x_train, y_train).predict(x_test))
pred_xgb

In [None]:
# Store the XGBoost predictions in the 'xgb' column of the submission frame.
submission['xgb'] = pred_xgb

- **ExtraTreesRegressor 튜닝 후**

In [None]:
# Extra-trees regressor (parameters chosen after tuning, per the heading).
model_extree = ExtraTreesRegressor(
    random_state=22,
    n_estimators=700,
)

In [None]:
# Mean cross-validated RMSE of the extra-trees model.
score = rmse_cv(model_extree)
extree_score = np.mean(score)
extree_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_extree = np.expm1(model_extree.fit(x_train, y_train).predict(x_test))
pred_extree

In [None]:
# Store the extra-trees predictions in the 'extree' column of the submission frame.
submission['extree'] = pred_extree

- **CatBoostRegressor 튜닝 후**

In [None]:
# CatBoost regressor (parameters chosen after tuning, per the heading);
# verbose=0 silences the training log.
model_cat = CatBoostRegressor(
    depth=2,
    learning_rate=0.1,
    eval_metric='RMSE',
    verbose=0,
    random_seed=22,
)

In [None]:
# Mean cross-validated RMSE of the CatBoost model.
score = rmse_cv(model_cat)
cat_score = np.mean(score)
cat_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_cat = np.expm1(model_cat.fit(x_train, y_train).predict(x_test))
pred_cat

In [None]:
# Store the CatBoost predictions in the 'cat' column of the submission frame.
submission['cat'] = pred_cat

- **StackingCVRegressor**

In [None]:
!pip install mlxtend

In [None]:
from mlxtend.regressor import StackingCVRegressor

In [None]:
# Stacked ensemble: the six base regressors feed a CatBoost meta-regressor,
# which also receives the original features (use_features_in_secondary=True).
# random_state is fixed to 22, the seed used by every other model here, so
# the internal CV shuffling gives the same result on every run.
model_stack = StackingCVRegressor(
    regressors=(model_rf, model_extree, model_lgb, model_xgb, model_gb, model_cat),
    meta_regressor=model_cat,
    use_features_in_secondary=True,
    random_state=22,
    n_jobs=-1,
)

In [None]:
# Mean cross-validated RMSE of the stacked ensemble.
score = rmse_cv(model_stack)
stk_score = np.mean(score)
stk_score

In [None]:
# Fit the stack on plain arrays (the frames are converted with np.array),
# predict the test set, and map the predictions back with expm1.
_stack_fitted = model_stack.fit(np.array(x_train), y_train)
pred_stk = np.expm1(_stack_fitted.predict(np.array(x_test)))
pred_stk

In [None]:
# Store the stacked-ensemble predictions in the 'stk' column of the submission frame.
submission['stk'] = pred_stk

- **VotingRegressor**

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Voting ensemble over the six base regressors.
model_vot = VotingRegressor([
    ('cat', model_cat),
    ('rf', model_rf),
    ('xgb', model_xgb),
    ('lgb', model_lgb),
    ('extree', model_extree),
    ('gb', model_gb),
])

In [None]:
# Mean cross-validated RMSE of the voting ensemble.
score = rmse_cv(model_vot)
vot_score = np.mean(score)
vot_score

In [None]:
# Fit on the full training set, predict the test set, and map the
# predictions back with expm1 (the inverse of log1p).
pred_vot = np.expm1(model_vot.fit(x_train, y_train).predict(x_test))
pred_vot

In [None]:
# Store the voting-ensemble predictions in the 'vot' column of the submission frame.
submission['vot'] = pred_vot

- **Blending**

In [None]:
# Equal-weight blend: the plain average of all eight prediction vectors.
_all_preds = [pred_cat, pred_extree, pred_xgb, pred_lgb,
              pred_gb, pred_rf, pred_stk, pred_vot]
b_all_model = sum(_all_preds) / len(_all_preds)

In [None]:
# Store the blended predictions in the 'blending' column of the submission frame.
submission['blending'] = b_all_model

In [None]:
# Display the submission frame with the prediction columns added above.
submission

- **Outlier**

In [None]:
# Plot the blended predictions against the row index to spot outliers.
submission['blending'].plot()

인덱스 0번만 과하게 높은 것을 확인, 낮추는 방향으로 진행

In [None]:
# Tail thresholds of the blended predictions (0.42% and 99% quantiles).
q1 = submission['blending'].quantile(0.0042)
q2 = submission['blending'].quantile(0.99)


def _rescale_tails(x):
    """Scale a prediction by 0.77 if it is not above q1, by 1.1 if it is not below q2."""
    if not x > q1:
        return x * 0.77
    if not x < q2:
        return x * 1.1
    return x


# Both tail adjustments are applied in a single pass. Computing them as two
# separate assignments from submission['blending'] would let the second
# overwrite the first and discard the lower-tail adjustment.
# NOTE(review): the markdown above says the overly high value at index 0
# should be lowered, yet the upper-tail factor 1.1 raises it — confirm the
# intended factor.
sub  = pd.read_csv('./data/submission.csv')
sub['value'] = submission['blending'].apply(_rescale_tails)

In [None]:
# Write the final submission to CSV without the index column.
sub.to_csv("submission_fifa_regression1.csv", index=False)