# 문제 6

[Kaggle 형] train_prob.csv로 failure를 예측하는 모델을 만들고, 

test_prob.csv에 대해 failure가 1일 확률을 예측하여 다음과 같은 형식의 answer6.csv를 만들어라. 

측정 지표는 AUC(area under the ROC curve)이다. id는 테스트 케이스의 id이고, failure는 failure가 1이 될 확률이다.

id,failure

16115, 0.1

16116, 0.2


**강사: 멀티캠퍼스 강선구(sunku0316.kang@multicampus.com, sun9sun9@gmail.com)**

In [1]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys
import xgboost as xgb

# Interpreter version first, then one "<name> <version>" line per library.
print(sys.version)
print(
    *('{} {}'.format(module.__name__, module.__version__)
      for module in (pd, np, sklearn, scipy, mlxtend, statsmodels, xgb)),
    sep='\n'
)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1
xgboost 0.80


In [2]:
# All three CSV files are indexed by their `id` column.
# test_prob_ans.csv holds the true labels of the test rows.
df_train, df_test, df_ans = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train_prob.csv', 'test_prob.csv', 'test_prob_ans.csv')
)

In [3]:
# Before any imputation, record which rows were missing measurement_3 and
# measurement_5. Only these two indicators are kept, as they were identified
# as potentially useful for predicting failure.
for frame in (df_train, df_test):
    frame[['na_1', 'na_2']] = frame[['measurement_3', 'measurement_5']].isna()

In [4]:
# Number of training rows per product_code level.
df_train.loc[:, 'product_code'].value_counts()

C    5765
E    5343
B    5250
A    5100
Name: product_code, dtype: int64

In [5]:
# Number of test rows per product_code level.
df_test.loc[:, 'product_code'].value_counts()

D    5112
Name: product_code, dtype: int64

In [6]:
from sklearn.experimental import enable_iterative_imputer# 구문을 사용하여 실험 단계인 모듈을 활성화하고, 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']

# The test set contains a product_code level that never occurs in train, so one
# imputer per product_code is fitted on train and test stacked together.
# The result is a Series of fitted IterativeImputer objects indexed by product_code.
s_imp = pd.concat([
        df_train[X_imp + ['product_code']],
        df_test[X_imp + ['product_code']]
], axis=0).groupby('product_code')\
.apply(
    lambda x: IterativeImputer(estimator=LinearRegression(), random_state=123).fit(x[X_imp])
)


def _impute_by_product(df):
    """Return df[X_imp] with gaps filled by the imputer fitted for each row's product_code.

    The returned frame keeps the row index of `df`, so it can be assigned back
    with `df[X_imp] = ...` and aligns row by row.
    """
    # group_keys=False keeps the original `id` index on the result. Without it,
    # pandas >= 2.0 prepends product_code to the index of this like-indexed
    # result and the assignment back into the frame no longer aligns.
    return df[X_imp + ['product_code']]\
        .groupby('product_code', group_keys=False)\
        .apply(
            lambda x: pd.DataFrame(s_imp.loc[x.name].transform(x[X_imp]), index=x.index, columns=X_imp)
        )


# Apply the fitted imputers to train, then to test.
df_train[X_imp] = _impute_by_product(df_train)
df_test[X_imp] = _impute_by_product(df_test)

In [7]:
X_mean = ['measurement_{}'.format(i) for i in range(10, 17)]

# Column means per product_code, computed over train and test stacked together
# because the test set has a product_code level that train does not contain.
df_mean = pd.concat([
            df_train[['product_code'] + X_mean],
            df_test[['product_code'] + X_mean]
        ]).groupby('product_code')[X_mean].agg('mean')

# Fill each group's gaps with that group's column means.
# group_keys=False keeps the original `id` index on the result. Without it,
# pandas >= 2.0 prepends product_code to the index of this like-indexed result
# and the assignment back into the frame no longer aligns.
df_train[X_mean] = df_train.groupby('product_code', group_keys=False)[X_mean]\
            .apply(lambda x: x.fillna(df_mean.loc[x.name]))
df_test[X_mean] = df_test.groupby('product_code', group_keys=False)[X_mean]\
            .apply(lambda x: x.fillna(df_mean.loc[x.name]))

In [8]:
# Fill the train gaps in `loading` with the train mean.
train_loading = df_train['loading']
df_train['loading'] = train_loading.fillna(train_loading.mean())

# Test gaps are filled with the train mean as well (taken from the filled
# train column), never with a statistic of the test set.
train_fill_value = df_train['loading'].mean()
df_test['loading'] = df_test['loading'].fillna(train_fill_value)

# Total number of remaining missing cells in (train, test).
df_train.isna().sum().sum(), df_test.isna().sum().sum()

(0, 0)

In [14]:
# 공통으로 사용할 만한 요소입니다.
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA

# Every column of the test frame; each pipeline selects what it needs from these.
X_all = list(df_test.columns)
# measurement_0 ... measurement_17
X_num = [f'measurement_{idx}' for idx in range(18)]
# Four folds; the groups are supplied at split time.
gcv = GroupKFold(n_splits=4)

def eval_model(model):
    """Cross-validate `model` on the training frame with product_code-grouped folds.

    Uses the shared `gcv` splitter with `product_code` as the group label and
    scores each fold with ROC AUC.

    Returns the dict produced by `cross_validate`, including train scores.
    """
    features = df_train[X_all]
    target = df_train['failure']
    product_groups = df_train['product_code']
    return cross_validate(
        model, features, target,
        groups=product_groups, cv=gcv,
        scoring='roc_auc', return_train_score=True,
    )

def choose_model(model, path='answer6.csv'):
    """Fit `model` on the full training frame and write the test-set submission.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    path : str, default 'answer6.csv'
        Where the submission CSV is written.

    Returns
    -------
    The predicted probabilities for the rows of `df_test`, taken from column 1
    of ``predict_proba``.

    The CSV has the two columns ``id`` and ``failure`` and no index column,
    matching the required ``id,failure`` header.
    """
    model.fit(df_train[X_all], df_train['failure'])
    prd = model.predict_proba(df_test[X_all])[:, 1]
    submission = pd.DataFrame(
        {
            'id': df_test.index.values,
            # Was misspelled 'failue', which produced a wrong CSV header.
            'failure': prd
        }
    )
    submission.to_csv(path, index=False)
    return prd

In [15]:
# GroupKFold splits by group: print, for every fold, the product_code levels
# on the fitting side and on the held-out side to confirm they do not overlap.
cv_splits = gcv.split(df_train[X_all], df_train['failure'], groups=df_train['product_code'])
for train_idx, test_idx in cv_splits:
    df_cv_train, df_cv_test = df_train.iloc[train_idx], df_train.iloc[test_idx]
    print(df_cv_train['product_code'].unique(), df_cv_test['product_code'].unique())

['A' 'B' 'E'] ['C']
['A' 'B' 'C'] ['E']
['A' 'C' 'E'] ['B']
['B' 'C' 'E'] ['A']


In [16]:
# Baseline: LogisticRegression + Feature Selection 결과
from sklearn.linear_model import LogisticRegression

# Baseline: five standardised numeric columns plus the raw na_1 flag.
baseline_scaled_cols = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']
ct = ColumnTransformer([
    ('std', StandardScaler(), baseline_scaled_cols),
    ('pt', 'passthrough', ['na_1']),
])

clf_lr = make_pipeline(ct, LogisticRegression(solver='lbfgs'))

result = eval_model(clf_lr)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([0.02954531, 0.02253437, 0.02034092, 0.02004647]),
  'score_time': array([0.        , 0.00499153, 0.        , 0.00994658]),
  'test_score': array([0.58822089, 0.58492694, 0.58894173, 0.59538985]),
  'train_score': array([0.59262299, 0.59350682, 0.59192443, 0.58956962])},
 0.5893698519267131,
 0.5919059643763757)

In [17]:
# Refit the baseline on all training rows, write the submission file, and
# score the predictions against the known test labels.
prd = choose_model(clf_lr)
baseline_auc = roc_auc_score(df_ans['failure'], prd)
print("Baseline:", baseline_auc)

Baseline: 0.5883988309352517


In [18]:
# LR2: LogisticRegression + PCA + loading_log

from sklearn.decomposition import PCA

# Branch 1: standardise all measurement columns, then keep 7 principal components.
pca_branch = make_pipeline(StandardScaler(), PCA(n_components=7))
# Branch 2: natural log of loading, then standardise.
log_loading_branch = make_pipeline(FunctionTransformer(np.log, validate=False), StandardScaler())

ct = ColumnTransformer([
    ('std_pca', pca_branch, X_num),
    ('std', log_loading_branch, ['loading']),
    ('pt', 'passthrough', ['na_1']),
])

clf_lr2 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))

result = eval_model(clf_lr2)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([0.08953071, 0.07006931, 0.07983232, 0.07975936]),
  'score_time': array([0.        , 0.01004124, 0.01043367, 0.0100956 ]),
  'test_score': array([0.58698931, 0.58494743, 0.58870126, 0.59365481]),
  'train_score': array([0.59247495, 0.59206317, 0.59131198, 0.58981003])},
 0.5885731997270945,
 0.5914150310507083)

In [19]:
# MLPClassifier: X_num + ['na_1', 'na_2']

from sklearn.neural_network import MLPClassifier
# Standardise loading and every measurement column; pass both NA flags through.
mlp_scaled_cols = ['loading'] + X_num
ct = ColumnTransformer([
    ('std', StandardScaler(), mlp_scaled_cols),
    ('pt', 'passthrough', ['na_1', 'na_2']),
])

# Two hidden layers of 4 units each.
mlp = MLPClassifier(hidden_layer_sizes=[4, 4], alpha=0.001, random_state=123)
clf_mlp = make_pipeline(ct, mlp)

result = eval_model(clf_mlp)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([0.96342087, 1.02677727, 1.72272348, 1.40716934]),
  'score_time': array([0.00660253, 0.00638103, 0.01019931, 0.01017404]),
  'test_score': array([0.58102148, 0.57867625, 0.57843643, 0.58456933]),
  'train_score': array([0.60183525, 0.60205625, 0.60462231, 0.60053106])},
 0.5806758736037123,
 0.6022612171962629)

In [20]:
# RF: RandomForestClassifier
# PCA도 넣어 봅니다.
from sklearn.ensemble import RandomForestClassifier
X_rf = ['loading', 'na_1', 'na_2'] + X_num

# The PCA branch sees every X_rf column after standardisation; loading and the
# two NA flags are additionally passed through unchanged.
rf_pca_branch = make_pipeline(StandardScaler(), PCA(n_components=7))
ct = ColumnTransformer([
    ('std_pca', rf_pca_branch, X_rf),
    ('pt', 'passthrough', ['loading', 'na_1', 'na_2']),
])

forest = RandomForestClassifier(
    n_estimators=150,
    max_depth=7,
    min_samples_split=512,
    random_state=123,
    n_jobs=4,
)
clf_rf = make_pipeline(ct, forest)

result = eval_model(clf_rf)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([1.46019888, 0.91251802, 0.90078855, 0.92212367]),
  'score_time': array([0.13073111, 0.13086629, 0.11725378, 0.13141513]),
  'test_score': array([0.582639  , 0.58183483, 0.58455389, 0.58543893]),
  'train_score': array([0.62905209, 0.63594675, 0.63614013, 0.6324617 ])},
 0.58361666379284,
 0.6334001676204187)

In [77]:
# XGB: XGBoost
import xgboost as xgb
X_xgb = ['loading', 'na_1', 'na_2'] + X_num

# No scaling here: the transformer only selects the X_xgb columns.
ct = ColumnTransformer([('pt', 'passthrough', X_xgb)])

xgb_params = dict(
    learning_rate=0.01,
    n_estimators=300,
    subsample=0.5,
    colsample_bytree=0.9,
    max_depth=2,
    random_state=123,
)
clf_xgb = make_pipeline(ct, xgb.XGBClassifier(**xgb_params))

result = eval_model(clf_xgb)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([2.43056989, 2.47152638, 2.49979544, 2.50026822]),
  'score_time': array([0.04686403, 0.03048968, 0.03047991, 0.0304811 ]),
  'test_score': array([0.58536186, 0.58262546, 0.58334564, 0.59225923]),
  'train_score': array([0.60852592, 0.60910272, 0.6080427 , 0.60689529])},
 0.5858980476811373,
 0.6081416590998701)

In [78]:
# GB: GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
X_gb = ['loading', 'na_1', 'na_2'] + X_num

# No scaling here: the transformer only selects the X_gb columns.
ct = ColumnTransformer([('pt', 'passthrough', X_gb)])

booster = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=2,
    learning_rate=0.01,
    random_state=123,
)
clf_gb = make_pipeline(ct, booster)

result = eval_model(clf_gb)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([1.12517786, 1.13749313, 1.15024376, 1.11028147]),
  'score_time': array([0.0156312 , 0.01226044, 0.01022434, 0.01019835]),
  'test_score': array([0.58394407, 0.58068345, 0.5846679 , 0.59357819]),
  'train_score': array([0.59490185, 0.59741658, 0.59562136, 0.59252278])},
 0.5857184027349147,
 0.5951156410437852)

In [88]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble. Only the baseline logistic regression and the gradient
# boosting pipeline take part; clf_lr2, clf_mlp, clf_rf and clf_xgb are
# disabled here.
voting_members = [
    ('baseline', clf_lr),
    ('gb', clf_gb),
]
clf_vt = VotingClassifier(voting_members, voting='soft')

result = eval_model(clf_vt)
mean_test_auc = np.mean(result['test_score'])
mean_train_auc = np.mean(result['train_score'])
result, mean_test_auc, mean_train_auc

({'fit_time': array([1.1723752 , 1.18235707, 1.17683125, 1.12888598]),
  'score_time': array([0.02259493, 0.01722622, 0.01216984, 0.02025962]),
  'test_score': array([0.58930567, 0.58573059, 0.58939867, 0.59653137]),
  'train_score': array([0.59567237, 0.59693881, 0.59580892, 0.59247848])},
 0.5902415739208746,
 0.5952246451419261)

In [90]:
# Refit the voting ensemble on all training rows, write the submission file,
# and score the predictions against the known test labels.
prd = choose_model(clf_vt)
voting_auc = roc_auc_score(df_ans['failure'], prd)
print("Voting:", voting_auc)

Voting: 0.5927039118705035
