In [1]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys

print(sys.version)
for i in [pd, np, sklearn, scipy, mlxtend, statsmodels]:
    print(i.__name__, i.__version__)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1


## 문제 6

**Kaggle 형** train_prob.csv로 문제 target을 예측하는 모델을 만들고, 

test_prob.csv에 대한 target 예측하여 다음과 같은 형식의 answer6.csv를 만들어라.

id, target

0, 6.9

5, 7.8

...


**평가지표**

$RMSE(Y, \hat{Y}) = \sqrt{\frac{1}{n}\sum^{n}_{i=1}(y_i-\hat{y_i})^2}$


In [2]:
df_train = pd.read_csv('train_prob.csv', index_col='id')
df_test = pd.read_csv('test_prob.csv', index_col='id')

In [3]:
# 반복문을 구성하여 처리해 봅니다.

# 처리 내용을 정의합니다, (대상 변수명, 치환할 내용, 치환후 수준별 카운트)
repl_list = [
    ('cat3', {'B': 'C'}, [83634, 147361, 9005]), 
    ('cat4', {'A': 'B', 'D': 'B'}, [239397, 603]),
    ('cat6', {'D': 'A', 'E': 'B', 'G': 'C', 'H': 'B', 'I': 'A'}, [234203, 5145, 652]),
    ('cat7', {'A': 'B', 'C': 'B', 'F': 'D', 'I': 'B'}, [4606, 19784, 214027, 1583]),
    ('cat8', {'B': 'G', 'F': 'E'}, [30338, 96743, 2953, 76085, 33881]),
    ('cat9', {'C': 'H', 'D': 'B', 'E': 'L'}, [10678, 2846, 85944, 8320, 19987, 40070, 5501, 16743, 33793, 7819, 3331, 4968])
]

for c, d, cnt in repl_list:
    print(c, d, cnt)
    s_repl = df_train[c].replace(d) # 치환을 합니다. (아직 반영은 하지 않습니다.)
    if not (s_repl.value_counts().sort_index() == cnt).all(): # 치환후 카운트를 체크합니다.
        print("Error", c, d, cnt, s_repl.value_counts().sort_index()) # 에러 내용을 출력합니다.
        break
    df_train[c] = s_repl # 치환한 결과를 반영합니다.
    df_test[c] = df_test[c].replace(d) # 테스트에 대해서도 반영합니다.

cat3 {'B': 'C'} [83634, 147361, 9005]
cat4 {'A': 'B', 'D': 'B'} [239397, 603]
cat6 {'D': 'A', 'E': 'B', 'G': 'C', 'H': 'B', 'I': 'A'} [234203, 5145, 652]
cat7 {'A': 'B', 'C': 'B', 'F': 'D', 'I': 'B'} [4606, 19784, 214027, 1583]
cat8 {'B': 'G', 'F': 'E'} [30338, 96743, 2953, 76085, 33881]
cat9 {'C': 'H', 'D': 'B', 'E': 'L'} [10678, 2846, 85944, 8320, 19987, 40070, 5501, 16743, 33793, 7819, 3331, 4968]


In [4]:
# 문제 4번을 활용하기 위해 반듭니다.
df_train['targetA'] = df_train['target'] <= 7.45

In [5]:
q = [i for i in np.arange(0, 1.01, 0.01)]
# 나머지 변수에 대해서도 해당 파생 변수를 만들어 줍니다.
for i in range(0, 14):
    col = 'cont{}'.format(i)
    qt = df_train[col].quantile(q)
    qt.iloc[[0, -1]] = [-np.inf, np.inf]
    q_cut = pd.cut(df_train[col], bins=qt)
    q_mean = df_train.groupby(q_cut)['target'].mean()
    df_train[col + '_q'] = q_cut.map(q_mean).astype('float')
    df_test[col + '_q'] = pd.cut(df_test[col], bins=qt).map(q_mean).astype('float')

In [6]:
from scipy.stats import norm


mu_A, s_A = 6.769, 0.616
mu_B, s_B = 8.123, 0.527

df_train_clf = df_train.assign(
    # 귀무가설 : target은 A입니다, 대립가설: target은 B입니다.
    prob_A = 1 - norm.cdf(df_train['target'], loc=mu_A, scale=s_A),
    # 귀무가설 : target은 B입니다, 대립가설: target은 A입니다.
    prob_B = norm.cdf(df_train['target'], loc=mu_B, scale=s_B)
)
df_train_clf = df_train_clf.query('prob_B < 0.01 or prob_A < 0.01').copy()

In [7]:
# 공통
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold 

cv = KFold(n_splits=5)
neg_rmse = make_scorer(lambda y, y_hat: -mean_squared_error(y, y_hat) ** 0.5)
df_ans = pd.read_csv('test_prob_ans.csv', index_col='id')

In [8]:
# 문제 4번 모델을 만듭니다. targetA일 확률을 활용할 예정입니다.
import xgboost as xgb

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)])
])
X_xgb = ['cont{}'.format(i) for i in range(14)] + ['cat{}'.format(i) for i in range(10)]
clf_xgb = make_pipeline(
    ct,
    xgb.XGBClassifier(
        max_depth = 2, # 트리의 최대 깊이 2
        reg_alpha = 0.1, # L1 규제 0.1
        reg_lambda = 0.1, # L2 규제 0.1
        colsample_bytree=0.25, # 트리 당 컬럼 샘플링 비율 0.25
        n_estimators=500, # 트리의 수 500
        random_state=123, # random_state 123
    )
)

X_xgb = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)]
clf_xgb.fit(df_train_clf[X_xgb], df_train_clf['targetA'])

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ohe',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['cat0', 'cat1', 'cat2',
                                     

In [9]:
df_train['targetA_prob'] = clf_xgb.predict_proba(df_train[X_xgb])[:, 1]
df_test['targetA_prob'] = clf_xgb.predict_proba(df_test[X_xgb])[:, 1]

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 가용한 모든 변수를 사용하라고 합니다.
# Baseline 모델로 삼으라고 합니다.

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['cat{}'.format(i) for i in range(10)]),# 선형 모델 OHE는 drop='first' 잊지 마세요
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)])
])

X_lr = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)]
reg_lr_1 = make_pipeline(
    ct, 
    LinearRegression()
)
score_ = cross_val_score(reg_lr_1, df_train[X_lr], df_train['target'], scoring=neg_rmse, cv=cv)
np.mean(score_)

-0.8632456423386845

In [24]:
# Baselie 제출파일입니다.
pd.DataFrame(
    reg_lr_1.predict(df_test[X_lr]),
    index=df_test.index, columns=['target']
).to_csv('answer6.csv')

In [11]:
reg_lr_1.fit(df_train[X_lr], df_train['target'])
mean_squared_error(df_ans['target'], reg_lr_1.predict(df_test[X_lr])) ** 0.5

0.8657267201878256

In [12]:
# 위에서 도출한 targetA_prob를 추가해봅니다.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)] + ['targetA_prob'])
])

X_lr = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)] + ['targetA_prob']
reg_lr_2 = make_pipeline(
    ct, 
    LinearRegression()
)
score_ = cross_val_score(reg_lr_1, df_train[X_lr], df_train['target'], scoring=neg_rmse, cv=cv)
np.mean(score_)

-0.8632456423386845

In [13]:
reg_lr_2.fit(df_train[X_lr], df_train['target'])
mean_squared_error(df_ans['target'], reg_lr_2.predict(df_test[X_lr])) ** 0.5

0.8493184210459027

In [14]:
# quantile -> mean 파생변수의 Leak을 없앤 버젼의 교차 검증을 들고 왔습니다.
from sklearn.linear_model import Ridge

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}_q'.format(i) for i in range(14)] + ['targetA_prob'])
])
X_rd = ['cat{}'.format(i) for i in range(10)] + ['cont{}_q'.format(i) for i in range(14)] + ['targetA_prob']
reg_rd= make_pipeline(
    ct, 
    Ridge(alpha=0.1)
)
q = [i for i in np.arange(0, 1.01, 0.01)]
score_ = []
for train_idx, test_idx in cv.split(df_train):
    df_cv_train, df_cv_test = df_train.iloc[train_idx].copy(), df_train.iloc[test_idx].copy()
    # 검증셋에서 train으로 파생변수를 만들고
    # 검증셋의 test(겹외셋)에 검증셋의 train으로 만든 통계값(mean)을 반영합니다.
    for i in range(0, 14):
        col = 'cont{}'.format(i)
        qt = df_cv_train[col].quantile(q)
        qt.iloc[[0, -1]] = [-np.inf, np.inf]
        q_cut = pd.cut(df_cv_train[col], bins=qt)
        q_mean = df_cv_train.groupby(q_cut)['target'].mean()
        df_cv_train[col + '_q'] = q_cut.map(q_mean).astype('float')
        df_cv_test[col + '_q'] = pd.cut(df_cv_test[col], bins=qt).map(q_mean).astype('float')
    reg_rd.fit(df_cv_train[X_rd], df_cv_train['target'])
    score_.append(-(mean_squared_error(df_cv_test['target'], reg_rd.predict(df_cv_test[X_rd])) ** 0.5))
np.mean(score_), score_

(-0.8436393559001433,
 [-0.8405098833949913,
  -0.8462240557462019,
  -0.841351783962399,
  -0.8421568739266677,
  -0.847954182470457])

In [17]:
reg_rd.fit(df_train[X_rd], df_train['target'])
mean_squared_error(df_ans['target'], reg_rd.predict(df_test[X_rd])) ** 0.5

0.847877050449364

In [18]:
# 문제5에서 최적의 모델을 가져옵니다.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)] + ['targetA_prob'])
])
X_xgb = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)] + ['targetA_prob']
reg_xgb = make_pipeline(
    ct,
    xgb.XGBRegressor(
        colsample_bytree=0.25, 
        n_estimators=500,
        max_depth=2, # 트리의 최대 깊이는 2, 
        random_state=123 # 랜덤 시드는 123
    )
)
score_ = cross_val_score(reg_xgb, df_train[X_xgb], df_train['target'], scoring=neg_rmse, cv=cv)
np.mean(score_)

-0.8429296162391553

In [19]:
reg_xgb.fit(df_train[X_xgb], df_train['target'])
mean_squared_error(df_ans['target'], reg_xgb.predict(df_test[X_xgb])) ** 0.5

0.8473356006330207

In [20]:
# 앙상블할 모델들을 모아 왔습니다.
from sklearn.linear_model import Ridge, LinearRegression

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)] + ['targetA_prob'])
])

X_lr = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)] + ['targetA_prob']
reg_lr_2 = make_pipeline(
    ct, 
    LinearRegression()
)

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}_q'.format(i) for i in range(14)] + ['targetA_prob'])
])
X_rd = ['cat{}'.format(i) for i in range(10)] + ['cont{}_q'.format(i) for i in range(14)] + ['targetA_prob']
reg_rd= make_pipeline(
    ct, 
    Ridge(alpha=0.1)
)

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['cat{}'.format(i) for i in range(10)]),
    ('pt', 'passthrough', ['cont{}'.format(i) for i in range(14)] + ['targetA_prob'])
])
X_xgb = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)] + ['targetA_prob']
reg_xgb = make_pipeline(
    ct,
    xgb.XGBRegressor(
        colsample_bytree=0.25, # 트리 생성시 컬럼샘플링 비율: 0.25
        n_estimators=500, # 트리의 수: 500
        max_depth=2, # 트리의 최대 깊이는 2, 
        random_state=123 # 랜덤 시드는 123
    )
)

In [21]:
# Voting 앙상블 기법을 활용하여 앙상블 모델을 만듭니다.
from sklearn.ensemble import VotingRegressor

reg_vt = VotingRegressor([
    ('lr_1', reg_lr_2),
    ('lr_2', reg_rd),
    ('xgb', reg_xgb),
])
X_vt = ['cat{}'.format(i) for i in range(10)] + ['cont{}'.format(i) for i in range(14)] \
    + ['cont{}_q'.format(i) for i in range(14)] + ['targetA_prob']
score_ = cross_val_score(reg_vt, df_train[X_vt], df_train['target'], scoring=neg_rmse, cv=cv)
np.mean(score_), score_

(-0.8422019061048704,
 array([-0.83891828, -0.84479408, -0.8398083 , -0.84088908, -0.8465998 ]))

In [25]:
# 최종 제출파일입니다.
pd.DataFrame(
    reg_vt.predict(df_test[X_vt]),
    index=df_test.index, columns=['target']
).to_csv('answer6.csv')

In [23]:
reg_vt.fit(df_train[X_vt], df_train['target'])
mean_squared_error(df_ans['target'], reg_vt.predict(df_test[X_vt])) ** 0.5

0.8475964306877575

**총 8일간의 강의 들으시느라 고생 많으셨습니다. 꼭 좋은 결과가 있기를 바라며, 궁금한 사항 있으시면 연락주세요!** 

**멀티캠퍼스 강선구(sunku0316.kang@multicampus.com) 올림**