# 문제 6

[Kaggle 형] train_prob.csv로 failure를 예측하는 모델을 만들고,

test_prob.csv에 대한 failure가 1일 확률을 예측하여 다음과 같은 형식의 answer6.csv를 만들어라.

측정 지표는 AUC(area under the ROC curve)이다. id는 테스트 케이스의 id이고, failure는 failure가 1이 될 확률이다.

id,failure

16115, 0.1

16116, 0.2


In [1]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys

# Report the interpreter version, then the name and version of each library.
print(sys.version)
checked_modules = (pd, np, sklearn, scipy, mlxtend, statsmodels)
for module in checked_modules:
    name, version = module.__name__, module.__version__
    print(name, version)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1


In [2]:
# Load the train and test tables, using the `id` column as the row index of each.
df_train, df_test = [
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train_prob.csv', 'test_prob.csv')
]

In [3]:
# Before any imputation, record which rows were missing measurement_3 and
# measurement_5. Per the original author's note, only these two missingness
# indicators were judged useful for predicting failure, so only they are kept.
na_source_cols = ['measurement_3', 'measurement_5']
na_flag_cols = ['na_1', 'na_2']
for frame in (df_train, df_test):
    frame[na_flag_cols] = frame[na_source_cols].isna()

In [4]:
# Count the rows per product_code level in the training frame (shown as the cell output).
df_train['product_code'].value_counts()

C    5765
E    5343
B    5250
A    5100
Name: product_code, dtype: int64

In [5]:
# Same count for the test frame; the recorded output lists a single level, D,
# which is not among the levels counted for train.
df_test['product_code'].value_counts()

D    5112
Name: product_code, dtype: int64

In [6]:
# Model-based imputation of selected measurement columns, fitted separately
# for every product_code level.
from sklearn.experimental import enable_iterative_imputer  # importing this enables the still-experimental IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score  # NOTE(review): not referenced anywhere in this cell

# Columns imputed here: measurement_3 .. measurement_9 plus measurement_17.
X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']
# Test contains a product_code level that never appears in train, so the
# imputers are fitted on train and test stacked together.
# `fit` returns the estimator itself, so s_imp ends up holding one fitted
# IterativeImputer per product_code (looked up below via `s_imp.loc[code]`).
# NOTE(review): fitting on test rows is a deliberate choice here -- confirm it
# is acceptable for the evaluation setting.
s_imp = pd.concat([
        df_train[X_imp + ['product_code']],
        df_test[X_imp + ['product_code']]
], axis=0).groupby('product_code')\
.apply(
    lambda x: IterativeImputer(estimator=LinearRegression(),random_state=123).fit(x[X_imp])
)
# Apply to train: each group is transformed by the imputer fitted for its
# product_code (`x.name`), and the result is re-wrapped with the original row
# index so it can be assigned back in place.
df_train[X_imp] = df_train[X_imp + ['product_code']]\
            .groupby('product_code')\
            .apply(
                lambda x: pd.DataFrame(s_imp.loc[x.name].transform(x[X_imp]), index=x.index, columns=X_imp)
            )
# Apply to test in exactly the same way.
df_test[X_imp] = df_test[X_imp + ['product_code']]\
            .groupby('product_code')\
            .apply(
                lambda x: pd.DataFrame(s_imp.loc[x.name].transform(x[X_imp]), index=x.index, columns=X_imp)
            )

In [7]:
# Columns whose gaps are filled with the mean of their product_code group:
# measurement_10 .. measurement_16.
X_mean = ['measurement_{}'.format(i) for i in range(10, 17)]

# As with the model-based imputation, train and test are stacked before the
# group means are taken, because test holds a product_code absent from train.
_stacked = pd.concat([
    df_train[['product_code'] + X_mean],
    df_test[['product_code'] + X_mean]
])
df_mean = _stacked.groupby('product_code')[X_mean].agg('mean')


def _fill_with_code_mean(grp):
    """Fill NaNs of one product_code group with that code's row of `df_mean`."""
    filled = grp.fillna(df_mean.loc[grp.name])
    return pd.DataFrame(filled, index=grp.index, columns=grp.columns)


for _frame in (df_train, df_test):
    _frame[X_mean] = _frame.groupby('product_code')[X_mean].apply(_fill_with_code_mean)

In [8]:
# Impute `loading` in both frames with the mean of the training values.
# The mean is taken once, before anything is filled, so train and test receive
# exactly the same statistic computed from observed values only
# (Series.mean skips NaN by default) and it is not recomputed on filled data.
loading_mean = df_train['loading'].mean()
df_train['loading'] = df_train['loading'].fillna(loading_mean)
df_test['loading'] = df_test['loading'].fillna(loading_mean)
# Sanity check displayed as the cell output: total number of NaNs left in each frame.
df_train.isna().sum().sum(), df_test.isna().sum().sum()

(0, 0)

In [17]:
# 공통으로 사용할 만한 요소입니다.
from sklearn.model_selection import StratifiedKFold, cross_validate, GroupKFold
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Splitters shared by every evaluation below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)  # shuffled, stratified 5-fold
gcv = GroupKFold(n_splits=4)  # 4 folds; the grouping key is supplied at split time
# Every column of the test frame is handed to the models; each pipeline's
# ColumnTransformer then selects the columns it actually uses.
X = list(df_test.columns)

# Evaluate with stratified cross-validation.
def eval_model_cv(model):
    """Cross-validate `model` on the training frame with the stratified splitter `cv`.

    Scores are ROC AUC; train-fold scores are returned alongside test-fold scores.
    Returns the dict produced by `cross_validate`.
    """
    features, target = df_train[X], df_train['failure']
    return cross_validate(
        model, features, target,
        scoring='roc_auc', cv=cv, return_train_score=True
    )
# Evaluate with group cross-validation.
def eval_model_gcv(model):
    """Cross-validate `model` with the group splitter `gcv`, grouping rows by product_code.

    Scores are ROC AUC; train-fold scores are returned alongside test-fold scores.
    Returns the dict produced by `cross_validate`.
    """
    features, target = df_train[X], df_train['failure']
    fold_groups = df_train['product_code']
    return cross_validate(
        model, features, target,
        groups=fold_groups, scoring='roc_auc', cv=gcv, return_train_score=True
    )

# Pick a model: fit it on the whole training set and write the submission for test.
def choose_model(model, output_path='answer6.csv'):
    """Fit `model` on all training rows and write test-set probabilities to a CSV.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict_proba(X)`.
    output_path : where the submission is written; defaults to 'answer6.csv',
        the file name used throughout this notebook.

    Returns
    -------
    The second column of `predict_proba` for the test rows, i.e. the
    probability of the second class in the estimator's class order
    (assumed to be failure == 1 -- TODO confirm the labels are 0/1).

    Side effects
    ------------
    Fits `model` in place and writes a CSV whose index is `df_test.index`
    and whose single data column is `failure`.
    """
    model.fit(df_train[X], df_train['failure'])
    prd = model.predict_proba(df_test[X])[:, 1]
    submission = pd.DataFrame({'failure': prd}, index=df_test.index)
    submission.to_csv(output_path)
    return prd

# Labels for the test ids (presumably the answer key -- confirm); its `failure`
# column is passed as y_true to roc_auc_score in later cells.
df_ans = pd.read_csv('test_prob_ans.csv', index_col='id')

In [18]:
# Show which product codes fall on the fitting side and on the held-out side
# of each group fold.
fold_groups = df_train['product_code']
for fit_rows, held_rows in gcv.split(df_train[X], df_train['failure'], groups=fold_groups):
    fit_codes = df_train.iloc[fit_rows]['product_code'].unique()
    held_codes = df_train.iloc[held_rows]['product_code'].unique()
    print(fit_codes, held_codes)

['A' 'B' 'E'] ['C']
['A' 'B' 'C'] ['E']
['A' 'C' 'E'] ['B']
['B' 'C' 'E'] ['A']


In [19]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression on five standardized columns plus the na_1 flag,
# scored with the stratified splitter.
scaled_cols = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']
ct = ColumnTransformer([
    ('std', StandardScaler(), scaled_cols),
    ('pt', 'passthrough', ['na_1']),
])
clf_lr = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model_cv(clf_lr)
# Display the raw per-fold results and the mean test-fold AUC.
result, np.mean(result['test_score'])

({'fit_time': array([0.02523446, 0.01967311, 0.01565123, 0.01565051, 0.01565766]),
  'score_time': array([0.00299335, 0.0020256 , 0.01559186, 0.01559186, 0.01558542]),
  'test_score': array([0.58840017, 0.5909264 , 0.57725538, 0.61231432, 0.58375591]),
  'train_score': array([0.59221214, 0.59124045, 0.59530826, 0.58642301, 0.59368373])},
 0.590530435171204)

In [20]:
# Fit the baseline on all of train, write the submission file, and keep the
# predicted test probabilities for scoring.
prd = choose_model(clf_lr)

In [21]:
# ROC AUC of the baseline's test probabilities against the `failure` column of df_ans.
roc_auc_score(df_ans['failure'], prd)

0.5883988309352517

In [22]:
# With folds that mimic the train/test layout (whole product codes held out),
# the cross-validated score lands closer to the actual test score than the
# stratified cross-validation did.
result = eval_model_gcv(clf_lr)
# Display the raw per-fold results and the mean test-fold AUC.
result, np.mean(result['test_score'])

({'fit_time': array([0.01579094, 0.02072024, 0.01763535, 0.03124261]),
  'score_time': array([0.        , 0.00398898, 0.        , 0.        ]),
  'test_score': array([0.58822089, 0.58492694, 0.58894173, 0.59538985]),
  'train_score': array([0.59262299, 0.59350682, 0.59192443, 0.58956962])},
 0.5893698519267131)

In [23]:
# Logistic regression on: standardized `loading`, 7 principal components of the
# standardized measurement_0 .. measurement_17 columns, and both NA flags.
pca_cols = ['measurement_{}'.format(i) for i in range(18)]
pca_branch = make_pipeline(StandardScaler(), PCA(n_components=7))
ct = ColumnTransformer([
    ('std', StandardScaler(), ['loading']),
    ('std_pca', pca_branch, pca_cols),
    ('pt', 'passthrough', ['na_1', 'na_2']),
])
# Column list kept for reference; the pipeline below does not read it.
X_lr = ['loading'] + pca_cols + ['na_1', 'na_2']
clf_lr_2 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model_cv(clf_lr_2)
# Display the raw per-fold results and the mean test-fold AUC.
result, np.mean(result['test_score'])

({'fit_time': array([0.06183243, 0.06248498, 0.07810616, 0.06586289, 0.0599103 ]),
  'score_time': array([0.      , 0.      , 0.      , 0.004987, 0.      ]),
  'test_score': array([0.58192954, 0.5891719 , 0.57115526, 0.61208137, 0.58374581]),
  'train_score': array([0.5926737 , 0.59170395, 0.59499   , 0.58692047, 0.59377472])},
 0.5876167763141719)

In [24]:
# RandomForestClassifier 모델도 만들어 봅니다.  (좀더 튜닝했습니다.)
# PCA도 넣어 봅니다.
from sklearn.ensemble import RandomForestClassifier
# Random forest fed by 7 principal components plus the raw loading / NA-flag columns,
# scored with the group splitter.
# NOTE(review): the PCA branch here receives all of X_rf (loading and the NA
# flags included) and the forest uses n_estimators=150, whereas the later
# re-declaration of clf_rf feeds PCA only the measurement columns and uses
# n_estimators=100 -- confirm which recipe was intended.
raw_cols = ['loading', 'na_1', 'na_2']
X_rf = raw_cols + ['measurement_{}'.format(i) for i in range(18)]
forest = RandomForestClassifier(
    n_estimators=150, max_depth=7, min_samples_split=512, random_state=123, n_jobs=4
)
ct = ColumnTransformer([
    ('std_pca', make_pipeline(StandardScaler(), PCA(n_components=7)), X_rf),
    ('pt', 'passthrough', list(raw_cols)),
])
clf_rf = make_pipeline(ct, forest)
result = eval_model_gcv(clf_rf)
# Display the raw per-fold results and the mean test-fold AUC.
result,  np.mean(result['test_score'])

({'fit_time': array([0.751055  , 0.78743219, 0.78190827, 0.78105474]),
  'score_time': array([0.1249702 , 0.12497139, 0.12497663, 0.12498283]),
  'test_score': array([0.5827436 , 0.58031524, 0.58424684, 0.58467245]),
  'train_score': array([0.63213667, 0.63584284, 0.63679302, 0.6299596 ])},
 0.5829945338197152)

In [25]:
# XGBoost로 해봅니다.
import xgboost as xgb
# XGBoost on loading, both NA flags and measurement_0 .. measurement_17,
# passed through untransformed; scored with the group splitter.
X_xgb = ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)]
ct = ColumnTransformer([('pt', 'passthrough', X_xgb)])
booster = xgb.XGBClassifier(
    n_estimators=300, max_depth=2, learning_rate=0.01, random_state=123
)
clf_xgb = make_pipeline(ct, booster)
result = eval_model_gcv(clf_xgb)
# Display the raw per-fold results and the mean test-fold AUC.
result,  np.mean(result['test_score'])

({'fit_time': array([2.28295064, 2.31192613, 2.28074241, 2.29630136]),
  'score_time': array([0.03124142, 0.0312438 , 0.01562238, 0.01562309]),
  'test_score': array([0.58401089, 0.58209848, 0.58241511, 0.59154288]),
  'train_score': array([0.60348145, 0.60425951, 0.60456887, 0.60260103])},
 0.5850168400222092)

In [26]:
# GradientBoosting 모델을 만들어 봅니다.
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting on loading, both NA flags and measurement_0 .. measurement_17,
# passed through untransformed; scored with the group splitter.
X_gb = ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)]
ct = ColumnTransformer([('pt', 'passthrough', X_gb)])
gb_model = GradientBoostingClassifier(
    learning_rate=0.01, n_estimators=100, max_depth=2, random_state=123
)
clf_gb = make_pipeline(ct, gb_model)
result = eval_model_gcv(clf_gb)
# Display the raw per-fold results and the mean test-fold AUC.
result,  np.mean(result['test_score'])

({'fit_time': array([0.96959281, 0.97944617, 0.98414373, 0.96852136]),
  'score_time': array([0.01562309, 0.01562214, 0.01562238, 0.01562214]),
  'test_score': array([0.58394407, 0.58068345, 0.5846679 , 0.59357819]),
  'train_score': array([0.59490185, 0.59741658, 0.59562136, 0.59252278])},
 0.5857184027349147)

In [27]:
# Multi-Layer-Perceptron으로 해봅니다.
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with one hidden layer of 4 units, on the same inputs
# as the baseline (five standardized columns plus the na_1 flag);
# scored with the group splitter.
mlp_scaled_cols = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']
ct = ColumnTransformer([
    ('std', StandardScaler(), mlp_scaled_cols),
    ('pt', 'passthrough', ['na_1']),
])
mlp_model = MLPClassifier(hidden_layer_sizes=[4], random_state=123)
clf_mlp = make_pipeline(ct, mlp_model)
result = eval_model_gcv(clf_mlp)
# Display the raw per-fold results and the mean test-fold AUC.
result,  np.mean(result['test_score'])

({'fit_time': array([0.58997488, 0.61275887, 0.56079626, 0.59363747]),
  'score_time': array([0.        , 0.00444078, 0.        , 0.        ]),
  'test_score': array([0.58419268, 0.58659698, 0.58761482, 0.58167056]),
  'train_score': array([0.59506176, 0.59357187, 0.59179506, 0.59154855])},
 0.5850187607866695)

In [28]:
# Declares each candidate pipeline that the soft-voting ensemble combines.

# Column groups shared by the declarations in this cell.
measurement_cols = ['measurement_{}'.format(i) for i in range(18)]
picked_cols = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']
flag_cols = ['na_1', 'na_2']


def _picked_columns_ct():
    """Build a fresh transformer: standardize the picked columns, pass na_1 through."""
    return ColumnTransformer([
        ('std', StandardScaler(), list(picked_cols)),
        ('pt', 'passthrough', ['na_1']),
    ])


def _pca_branch():
    """Build a fresh StandardScaler -> PCA(n_components=7) sub-pipeline."""
    return make_pipeline(StandardScaler(), PCA(n_components=7))


# Baseline: logistic regression on the picked columns.
clf_lr_1 = make_pipeline(_picked_columns_ct(), LogisticRegression(solver='lbfgs'))

# PCA + logistic regression.
clf_lr_2 = make_pipeline(
    ColumnTransformer([
        ('std', StandardScaler(), ['loading']),
        ('std_pca', _pca_branch(), list(measurement_cols)),
        ('pt', 'passthrough', list(flag_cols)),
    ]),
    LogisticRegression(solver='lbfgs'),
)
# Column list kept for reference; no pipeline in this cell reads it.
X_lr = ['loading'] + measurement_cols + flag_cols

# Gradient boosting on the untransformed columns.
X_gb = ['loading'] + flag_cols + measurement_cols
clf_gb = make_pipeline(
    ColumnTransformer([('pt', 'passthrough', X_gb)]),
    GradientBoostingClassifier(
        learning_rate=0.01, n_estimators=100, max_depth=2, random_state=123
    ),
)

# Random forest: PCA over the measurement columns, loading and NA flags passed through.
clf_rf = make_pipeline(
    ColumnTransformer([
        ('std_pca', _pca_branch(), list(measurement_cols)),
        ('pt', 'passthrough', ['loading'] + flag_cols),
    ]),
    RandomForestClassifier(
        n_estimators=100, max_depth=7, min_samples_split=512, random_state=123
    ),
)

# XGBoost on the untransformed columns.
X_xgb = ['loading'] + flag_cols + measurement_cols
clf_xgb = make_pipeline(
    ColumnTransformer([('pt', 'passthrough', X_xgb)]),
    xgb.XGBClassifier(
        learning_rate=0.01, n_estimators=300, max_depth=2, random_state=123
    ),
)

# MLP on the picked columns; `ct` is left bound to this pipeline's transformer.
ct = _picked_columns_ct()
clf_mlp = make_pipeline(ct, MLPClassifier(hidden_layer_sizes=[4], random_state=123))

In [29]:
from sklearn.ensemble import VotingClassifier
# Column list kept for reference; the ensemble below does not read it.
X_vt = ['loading', 'na_1', 'na_2'] + ['measurement_{}'.format(i) for i in range(18)]

# Ensemble of every candidate. The metric (ROC AUC) works on probabilities,
# so the members' predicted probabilities are averaged via soft voting.
ensemble_members = [
    ('lr', clf_lr_1),     # baseline logistic regression
    ('lr_2', clf_lr_2),   # PCA + logistic regression
    ('gb', clf_gb),       # gradient boosting
    ('rf', clf_rf),       # random forest
    ('xgb', clf_xgb),     # XGBoost
    ('mlp', clf_mlp),     # multi-layer perceptron
]
clf_vt = VotingClassifier(ensemble_members, voting='soft')
result = eval_model_gcv(clf_vt)
# Display the raw per-fold results and the mean test-fold AUC.
result,  np.mean(result['test_score'])

({'fit_time': array([5.28160286, 5.42056918, 5.65805578, 5.48310256]),
  'score_time': array([0.07813454, 0.07810807, 0.07813406, 0.07810521]),
  'test_score': array([0.58839276, 0.58575449, 0.58929202, 0.59346193]),
  'train_score': array([0.60512637, 0.60647891, 0.60552289, 0.60304054])},
 0.5892253004317329)

In [30]:
# Fit the voting ensemble on all of train and write the submission file
# (this overwrites the file written earlier for the baseline, since
# choose_model always writes to the same path).
prd = choose_model(clf_vt)
# ROC AUC of the ensemble's test probabilities against the `failure` column of df_ans.
roc_auc_score(df_ans['failure'], prd)

0.5923370053956835