# 문제 6

[Kaggle 형] train_prob.csv로 failure를 예측하는 모델을 만들고,

test_prob.csv의 각 행에 대해 failure가 1일 확률을 예측하여 다음과 같은 형식의 answer6.csv를 만들어라.

측정 지표는 AUC(area under the ROC curve)이다. id는 테스트 케이스의 id이고, failure는 failure가 1이 될 확률이다.

id,failure

16115, 0.1

16116, 0.2


**강사: 멀티캠퍼스 강선구(sunku0316.kang@multicampus.com, sun9sun9@gmail.com)**

In [67]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys
import xgboost as xgb

# Report the interpreter version, then the name and version of each library.
print(sys.version)
for module in (pd, np, sklearn, scipy, mlxtend, statsmodels, xgb):
    print(module.__name__, module.__version__)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1
xgboost 0.80


In [68]:
# Load the three CSV files, each indexed by its 'id' column.
# test_prob_ans.csv is not available in the real exam; it is only used here
# for self-scoring.
df_train, df_test, df_ans = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train_prob.csv', 'test_prob.csv', 'test_prob_ans.csv')
)

In [69]:
# Flag, in both frames, the rows where measurement_3 (na_1) and
# measurement_5 (na_2) are missing.
for frame in (df_train, df_test):
    for flag_col, source_col in (('na_1', 'measurement_3'), ('na_2', 'measurement_5')):
        frame[flag_col] = frame[source_col].isna()

In [70]:
# Row count per product_code level in the test set.
df_test.loc[:, 'product_code'].value_counts()

D    5112
Name: product_code, dtype: int64

In [71]:
# Row count per product_code level in the training set.
df_train.loc[:, 'product_code'].value_counts()

C    5765
E    5343
B    5250
A    5100
Name: product_code, dtype: int64

In [72]:
# Preprocessing step: fill missing values with an IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# One shared imputer object; fit_transform() refits it on every call, so each
# product_code group of each frame gets its own fit.
imp = IterativeImputer(
    estimator=LinearRegression(fit_intercept=True),
    random_state=123
)

# Columns imputed together: measurement_3 .. measurement_9 and measurement_17.
X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']


def _impute_group(group):
    """Refit ``imp`` on one product_code group and return the imputed values.

    The returned frame keeps the group's index and columns so that it can be
    assigned back to the source frame by label.
    """
    return pd.DataFrame(imp.fit_transform(group[X_imp]), index=group.index, columns=group.columns)


# group_keys=False keeps the original 'id' index on the combined result.
# Older pandas dropped the group key automatically for index-preserving
# results; pandas 2.0+ prepends it unless told otherwise, which would break
# the label-aligned assignment below.
df_train[X_imp] = df_train.groupby('product_code', group_keys=False)[X_imp].apply(_impute_group)
df_test[X_imp] = df_test.groupby('product_code', group_keys=False)[X_imp].apply(_impute_group)

In [73]:
# Implementation using method 1 (for reference).

from sklearn.experimental import enable_iterative_imputer  # activates the experimental IterativeImputer module
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']
# The test set contains a product_code level that never appears in train, so
# the imputation models are fitted on train and test stacked together.
# s_imp is a Series of fitted imputers indexed by product_code.
s_imp = pd.concat([
        df_train[X_imp + ['product_code']],
        df_test[X_imp + ['product_code']]
], axis=0).groupby('product_code')\
.apply(
    lambda x: IterativeImputer(estimator=LinearRegression(),random_state=123).fit(x[X_imp])
)


def _apply_fitted_imputer(group):
    """Impute one product_code group with the imputer fitted for that code.

    ``group.name`` is the product_code of the group being processed; the
    result keeps the group's index so it can be assigned back by label.
    """
    fitted = s_imp.loc[group.name]
    return pd.DataFrame(fitted.transform(group[X_imp]), index=group.index, columns=X_imp)


# group_keys=False keeps the original 'id' index on the combined result.
# pandas 2.0+ would otherwise prepend the product_code to the index, which
# would break the label-aligned assignments below.
# Apply to train.
df_train[X_imp] = df_train[X_imp + ['product_code']]\
            .groupby('product_code', group_keys=False)\
            .apply(_apply_fitted_imputer)
# Apply to test.
df_test[X_imp] = df_test[X_imp + ['product_code']]\
            .groupby('product_code', group_keys=False)\
            .apply(_apply_fitted_imputer)

In [74]:
# measurement_10 .. measurement_16 are filled with a simple per-group mean.
X_mean = ['measurement_{}'.format(i) for i in range(10, 17)]


def _fill_na_with_own_mean(column):
    """Replace the missing entries of one column by that column's mean."""
    return column.fillna(column.mean())


# transform() calls the helper once per column and per product_code group,
# separately for the train and the test frame.
for frame in (df_train, df_test):
    frame[X_mean] = frame.groupby('product_code')[X_mean].transform(_fill_na_with_own_mean)

In [75]:
# Compute the means on the combined data instead (for reference).
X_mean = ['measurement_{}'.format(i) for i in range(10, 17)]
# As before, train and test are stacked so that a product_code level missing
# from train still gets a mean. df_mean has one row of means per product_code.
df_mean = pd.concat([
            df_train[['product_code'] + X_mean],
            df_test[['product_code'] + X_mean]
        ]).groupby('product_code')[X_mean].agg('mean')


def _fill_with_pooled_mean(group):
    """Fill NaNs of one product_code group with that code's pooled column means.

    ``group.name`` is the product_code of the group; the row of ``df_mean``
    for that code supplies one fill value per column.
    """
    return pd.DataFrame(group.fillna(df_mean.loc[group.name]), index=group.index, columns=group.columns)


# group_keys=False keeps the original 'id' index on the combined result.
# pandas 2.0+ would otherwise prepend the product_code to the index, which
# would break the label-aligned assignments below.
df_train[X_mean] = df_train.groupby('product_code', group_keys=False)[X_mean]\
            .apply(_fill_with_pooled_mean)
df_test[X_mean] = df_test.groupby('product_code', group_keys=False)[X_mean]\
            .apply(_fill_with_pooled_mean)

In [76]:
# Fill missing 'loading' values with the mean 'loading' of the training set.
# The test frame is filled with the training mean too, taken after the
# training column has itself been filled.
train_loading = df_train['loading']
df_train['loading'] = train_loading.fillna(train_loading.mean())
df_test['loading'] = df_test['loading'].fillna(df_train['loading'].mean())

# Kaggle형 풀이 단계

Step 1: 검증 방법을 정하고, 검증 루틴을 만듭니다.

Step 2: Baseline 모델을 만듭니다

Step 3: 모델 선택 루틴을 만듭니다.

Step 4: 모델 개선 작업을 합니다.

In [188]:
# History of evaluation results; print_result() appends one entry per call.
s_hist = []

In [189]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# All columns of the test frame are passed to the models as candidate inputs.
X_all = list(df_test.columns)

# Step 1: choose the validation scheme -- 4-fold grouped cross-validation.
gcv = GroupKFold(n_splits=4)

# Demonstration of how GroupKFold splits the data. Not required for the
# solution; kept for reference. Each validation fold is made of product_code
# values that do not appear in the corresponding training folds.
fold_splits = gcv.split(df_train[X_all], df_train['failure'], groups=df_train['product_code'])
for fit_rows, valid_rows in fold_splits:
    fit_codes = df_train.iloc[fit_rows]['product_code'].unique()
    valid_codes = df_train.iloc[valid_rows]['product_code'].unique()
    print(fit_codes, valid_codes)


def eval_model(model):
    """Cross-validate ``model`` on the training data and return the scores.

    The folds come from ``gcv`` with ``product_code`` as the grouping key,
    the metric is ROC AUC, and training-fold scores are returned alongside
    the validation scores.

    Returns the dictionary produced by ``cross_validate``.
    """
    features = df_train[X_all]
    target = df_train['failure']
    return cross_validate(
        model, features, target,
        groups=df_train['product_code'], cv=gcv,
        scoring='roc_auc', return_train_score=True,
    )

def print_result(model_name, result):
    """Print a one-line score summary and record it in ``s_hist``.

    ``result`` must provide 'test_score' and 'train_score' sequences. Their
    mean and standard deviation are formatted to five decimals, printed, and
    appended to ``s_hist`` as a Series with the entries 'model name' and
    'result'.
    """
    valid_scores = result['test_score']
    train_scores = result['train_score']
    output = 'Valid: {:.5f}±{:.5f},  V.Train: {:.5f}±{:.5f}'.format(
        np.mean(valid_scores), np.std(valid_scores),
        np.mean(train_scores), np.std(train_scores),
    )
    print(output)
    entry = pd.Series({'model name': model_name, 'result': output})
    s_hist.append(entry)

# Step 3: 모델 선택 루틴입니다.
def select_model(model):
    """Fit ``model`` on all training rows, predict the test set, write answer6.csv.

    Returns the array of predicted values that is written to the 'failure'
    column of the submission file.
    """
    # Train on the complete training set.
    model.fit(df_train[X_all], df_train['failure'])
    # Column 1 of predict_proba (the failure == 1 class, assuming 0/1 labels).
    prd = model.predict_proba(df_test[X_all])[:, 1]
    # Build the submission table and write it without a row index.
    submission = pd.DataFrame({'id': df_test.index, 'failure': prd})
    submission.to_csv('answer6.csv', index=None)
    return prd

['A' 'B' 'E'] ['C']
['A' 'B' 'C'] ['E']
['A' 'C' 'E'] ['B']
['B' 'C' 'E'] ['A']


In [190]:
# Step 2: build and validate the baseline model.
# Baseline: LR - SFS - ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17', 'na_1']
# (logistic regression on a fixed feature subset; SFS presumably stands for
# sequential feature selection -- confirm against the course material)
from sklearn.linear_model import LogisticRegression

# Numeric inputs are standardized; the boolean na_1 flag is passed through.
baseline_preprocess = ColumnTransformer([
    ('std', StandardScaler(), ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']),
    ('pt', 'passthrough', ['na_1']),
])
clf_lr = make_pipeline(baseline_preprocess, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr)
print_result('baseline', result)

Valid: 0.58937±0.00379,  V.Train: 0.59191±0.00146


In [191]:
# Refit the baseline on all training data, write answer6.csv, then score the
# predictions against the held-back answer file.
prd = select_model(clf_lr)
baseline_auc = roc_auc_score(df_ans['failure'], prd)
print("Baseline 채점 결과:", baseline_auc)

Baseline 채점 결과: 0.5883988309352517


In [192]:
# lr2: logistic regression with the measurements compressed by PCA (n_components = 7)
from sklearn.decomposition import PCA

# measurement_0 .. measurement_17 are standardized and then reduced to 7 components.
measurement_cols = [f'measurement_{i}' for i in range(18)]
scaled_pca = make_pipeline(StandardScaler(), PCA(n_components=7))

ct = ColumnTransformer([
    ('std_pca', scaled_pca, measurement_cols),
    ('std', StandardScaler(), ['loading']),
    ('pt', 'passthrough', ['na_1']),
])
clf_lr2 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr2)
print_result('lr2', result)

Valid: 0.58857±0.00294,  V.Train: 0.59163±0.00102


In [193]:
# lr3: baseline + np.log(loading)
from sklearn.preprocessing import FunctionTransformer

# 'loading' is log-transformed before standardization; the other baseline
# features are only standardized.
log_then_scale = make_pipeline(
    FunctionTransformer(func=np.log, validate=False),
    StandardScaler(),
)
ct = ColumnTransformer([
    ('std', StandardScaler(), ['measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']),
    ('log_std', log_then_scale, ['loading']),
    ('pt', 'passthrough', ['na_1']),
])
clf_lr3 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr3)
print_result('lr3', result)

Valid: 0.58935±0.00389,  V.Train: 0.59181±0.00145


In [194]:
# LDA
# TODO: also try feature selection here.
# LinearDiscriminantAnalysis has to be imported in this cell: the only other
# import of it in this notebook sits inside a disabled (string-literal)
# template, so without this line a fresh run raises NameError.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf_lda = make_pipeline(
    ColumnTransformer([
        # All 18 measurements plus 'loading' are standardized.
        ('std', StandardScaler(), ['measurement_{}'.format(i) for i in range(18)] + ['loading']),
        # Alternative kept for the TODO above: the baseline feature subset.
        #('std', StandardScaler(), ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']),
        ('pt', 'passthrough', ['na_1'])
    ]), 
    LinearDiscriminantAnalysis()
)
result = eval_model(clf_lda)
print_result('lda', result)

Valid: 0.58716±0.00329,  V.Train: 0.59336±0.00160


In [195]:
# Refit the LDA pipeline on all training data, write answer6.csv, then score
# the predictions against the held-back answer file.
prd = select_model(clf_lda)
lda_auc = roc_auc_score(df_ans['failure'], prd)
print("LDA 채점 결과:", lda_auc)

LDA 채점 결과: 0.5894689748201439


In [196]:
# RF: RandomForestClassifier, tuned: {'max_depth': 7, 'min_samples_split': 512, 'n_estimators': 15}
# TODO: tune the hyperparameters during class.
# The template below is wrapped in a string literal, so it is not executed.
# The `?` placeholders must be replaced with real values before it can run.
"""
from sklearn.ensemble import RandomForestClassifier
ct = ColumnTransformer([
    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)])
])
clf_rf = make_pipeline(ct, RandomForestClassifier(
    n_estimators=?, max_depth=?, min_samples_split=?, random_state=123, 
))
result = eval_model(clf_rf)
print_result('rf', result)
"""

"\nfrom sklearn.ensemble import RandomForestClassifier\nct = ColumnTransformer([\n    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)])\n])\nclf_rf = make_pipeline(ct, RandomForestClassifier(\n    n_estimators=?, max_depth=?, min_samples_split=?, random_state=123, \n))\nresult = eval_model(clf_rf)\nprint_result('rf', result)\n"

In [197]:
# RF: RandomForestClassifier + LinearDiscriminantAnalysis
# TODO: build the routine that combines LDA with the random forest.
# The template below is wrapped in a string literal, so it is not executed
# (including its imports). The `?` placeholders must be replaced before it can run.
"""
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

ct = ColumnTransformer([
    ('std_lda', ? , ['measurement_{}'.format(i) for i in range(18)]),
    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'])# + ['measurement_{}'.format(i) for i in range(18)])
])
clf_rf2 = make_pipeline(ct, RandomForestClassifier(
    n_estimators=?, max_depth=?, min_samples_split=?, random_state=123,
))
result = eval_model(clf_rf2)
print_result('rf2', result)
"""

"\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.ensemble import RandomForestClassifier\n\nct = ColumnTransformer([\n    ('std_lda', ? , ['measurement_{}'.format(i) for i in range(18)]),\n    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'])# + ['measurement_{}'.format(i) for i in range(18)])\n])\nclf_rf2 = make_pipeline(ct, RandomForestClassifier(\n    n_estimators=?, max_depth=?, min_samples_split=?, random_state=123,\n))\nresult = eval_model(clf_rf2)\nprint_result('rf2', result)\n"

In [198]:
# XGB: XGBoost
# TODO: tune the hyperparameters together in class.
# The template below is wrapped in a string literal, so it is not executed.
# The `?` placeholders must be replaced with real values before it can run.
"""
import xgboost as xgb
ct = ColumnTransformer([
    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)])
])
clf_xgb = xgb.XGBClassifier(
    learning_rate=?, n_estimators=?, subsample=?, colsample_bytree=?, max_depth=?, random_state=123
)
clf_xgb = make_pipeline(ct, clf_xgb)
result = eval_model(clf_xgb)
print_result('xgb', result)
"""

"\nimport xgboost as xgb\nct = ColumnTransformer([\n    ('pt', 'passthrough', ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)])\n])\nclf_xgb = xgb.XGBClassifier(\n    learning_rate=?, n_estimators=?, subsample=?, colsample_bytree=?, max_depth=?, random_state=123\n)\nclf_xgb = make_pipeline(ct, clf_xgb)\nresult = eval_model(clf_xgb)\nprint_result('xgb', result)\n"

In [199]:
# Show the most recently recorded result for each model name.
history = pd.DataFrame(s_hist)
history.groupby('model name').last()

Unnamed: 0_level_0,result
model name,Unnamed: 1_level_1
baseline,"Valid: 0.58937±0.00379, V.Train: 0.59191±0.00146"
lda,"Valid: 0.58716±0.00329, V.Train: 0.59336±0.00160"
lr2,"Valid: 0.58857±0.00294, V.Train: 0.59163±0.00102"
lr3,"Valid: 0.58935±0.00389, V.Train: 0.59181±0.00145"


In [200]:
# Soft-voting ensemble of the models above.
# TODO: build the voting model together in class.
from sklearn.ensemble import VotingClassifier
# The template below is wrapped in a string literal, so it is not executed.
# It refers to clf_rf, clf_rf2 and clf_xgb, which are only defined inside the
# disabled templates earlier in this notebook; those must be completed first.
"""
clf_vt = VotingClassifier([
    ('baseline', clf_lr), # LR + SFS
    ('lr2', clf_lr2), # LR.2: LR + feature PCA
    ('lr3', clf_lr3), # Basline + np.log (loading)
    ('lda', clf_lda), # LDA
    ('rf', clf_rf), # RF 튜닝
    ('rf2', clf_rf2), # RF + LDA
    ('xgb', clf_xgb), # XGB
], voting='soft')
result = eval_model(clf_vt)
result,  np.mean(result['test_score']), np.mean(result['train_score'])
"""

"\nclf_vt = VotingClassifier([\n    ('baseline', clf_lr), # LR + SFS\n    ('lr2', clf_lr2), # LR.2: LR + feature PCA\n    ('lr3', clf_lr3), # Basline + np.log (loading)\n    ('lda', clf_lda), # LDA\n    ('rf', clf_rf), # RF 튜닝\n    ('rf2', clf_rf2), # RF + LDA\n    ('xgb', clf_xgb), # XGB\n], voting='soft')\nresult = eval_model(clf_vt)\nresult,  np.mean(result['test_score']), np.mean(result['train_score'])\n"