# 문제 6

[Kaggle 형] train_prob.csv로 failure를 예측하는 모델을 만들고, 

test_prob.csv에 대한 failure가 1일 확률을 예측하여 다음과 같은 형식의 answer6.csv를 만들어라. 

측정 지표는 AUC(area under the ROC curve)이다. id는 테스트 케이스의 id이고, failure는 failure가 1이 될 확률이다.

id,failure

16115, 0.1

16116, 0.2


**강사: 멀티캠퍼스 강선구(sunku0316.kang@multicampus.com, sun9sun9@gmail.com)**

In [1]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys
import xgboost as xgb

# Report the interpreter version, followed by the name and version of each
# library imported above, so that results can be tied to an environment.
print(sys.version)
for mod in (pd, np, sklearn, scipy, mlxtend, statsmodels, xgb):
    print(mod.__name__, mod.__version__)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1
xgboost 0.80


In [2]:
# Load the three CSV files, using the `id` column as the row index of each frame.
# df_train: rows used for fitting; df_test: rows to be predicted.
df_train = pd.read_csv('train_prob.csv', index_col=['id'])
df_test = pd.read_csv('test_prob.csv', index_col=['id'])
# df_ans is read later as df_ans['failure'] to self-score the test predictions.
df_ans = pd.read_csv('test_prob_ans.csv', index_col='id')

In [3]:
# Add boolean indicator columns marking where measurement_3 (na_1) and
# measurement_5 (na_2) are missing. This runs before the imputation cells
# below overwrite those gaps, so the information is preserved as a feature.
df_train = df_train.assign(
    na_1=df_train['measurement_3'].isna(),
    na_2=df_train['measurement_5'].isna(),
)
df_test = df_test.assign(
    na_1=df_test['measurement_3'].isna(),
    na_2=df_test['measurement_5'].isna(),
)

In [4]:
# Show how many training rows belong to each product_code level.
df_train['product_code'].value_counts()

C    5765
E    5343
B    5250
A    5100
Name: product_code, dtype: int64

In [5]:
# Show how many test rows belong to each product_code level
# (compare with the training levels to see which codes overlap).
df_test['product_code'].value_counts()

D    5112
Name: product_code, dtype: int64

In [6]:
# Method 2: run IterativeImputer separately inside each product_code group
# and write the imputed values straight back into the frame.
from sklearn.experimental import enable_iterative_imputer# importing this name switches on the still-experimental IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Columns imputed jointly: measurement_3 .. measurement_9 plus measurement_17.
X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']

# A single imputer object is shared; fit_transform is called on it once per
# group below, so every group is imputed from a fit on that group's own rows.
imp = IterativeImputer(
    estimator = LinearRegression(),
    random_state=123
)

# Train: impute each product_code group; the result is rebuilt as a DataFrame
# carrying the group's original index and column labels before assignment.
df_train[X_imp] =  df_train.groupby('product_code')[X_imp].apply(
    lambda x: pd.DataFrame(imp.fit_transform(x), index=x.index, columns=x.columns)
)

# Test: same procedure, fitted on the test groups themselves (not on train).
df_test[X_imp] =  df_test.groupby('product_code')[X_imp].apply(
    lambda x: pd.DataFrame(imp.fit_transform(x), index=x.index, columns=x.columns)
)

In [7]:
# Implementation of method 1 (kept for reference): fit one imputer per
# product_code first, then apply the fitted imputers to train and test.
# NOTE(review): if the method-2 cell above has already run, X_imp presumably
# holds no missing values any more, so this cell would change nothing - confirm.

from sklearn.experimental import enable_iterative_imputer# importing this name switches on the still-experimental IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
# NOTE(review): r2_score is not referenced anywhere in the visible code.
from sklearn.metrics import r2_score

# Columns imputed jointly: measurement_3 .. measurement_9 plus measurement_17.
X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']
# Some product_code levels never appear in train, so train and test are stacked
# and one imputer is fitted per level; s_imp is looked up by product_code below.
s_imp = pd.concat([
        df_train[X_imp + ['product_code']],
        df_test[X_imp + ['product_code']]
], axis=0).groupby('product_code')\
.apply(
    lambda x: IterativeImputer(estimator=LinearRegression(),random_state=123).fit(x[X_imp])
)
# Apply to train: x.name is the group's product_code, used to pick its imputer.
df_train[X_imp] = df_train[X_imp + ['product_code']]\
            .groupby('product_code')\
            .apply(
                lambda x: pd.DataFrame(s_imp.loc[x.name].transform(x[X_imp]), index=x.index, columns=X_imp)
            )
# Apply to test in exactly the same way.
df_test[X_imp] = df_test[X_imp + ['product_code']]\
            .groupby('product_code')\
            .apply(
                lambda x: pd.DataFrame(s_imp.loc[x.name].transform(x[X_imp]), index=x.index, columns=X_imp)
            )

In [8]:
# Method 1: groupby + transform.
# transform hands each column of each product_code group to the lambda and
# returns a frame aligned one-to-one with the input rows, so every missing
# cell is replaced by the mean of that column within its own group.
X_mean = [f'measurement_{i}' for i in range(10, 17)]
df_train[X_mean] = (
    df_train.groupby('product_code')[X_mean]
    .transform(lambda col: col.fillna(col.mean()))
)

df_test[X_mean] = (
    df_test.groupby('product_code')[X_mean]
    .transform(lambda col: col.fillna(col.mean()))
)

In [9]:
# Pooled variant (for reference): compute the per-group means once over
# train and test together, then fill both frames from that table.
X_mean = [f'measurement_{i}' for i in range(10, 17)]
# Train and test are stacked so that product_code levels absent from train
# still receive a mean.
df_mean = (
    pd.concat([
        df_train[['product_code'] + X_mean],
        df_test[['product_code'] + X_mean],
    ])
    .groupby('product_code')[X_mean]
    .agg('mean')
)

# grp.name is the product_code of the group; its row of df_mean supplies the
# per-column fill values.
df_train[X_mean] = df_train.groupby('product_code')[X_mean].apply(
    lambda grp: pd.DataFrame(grp.fillna(df_mean.loc[grp.name]), index=grp.index, columns=grp.columns)
)
df_test[X_mean] = df_test.groupby('product_code')[X_mean].apply(
    lambda grp: pd.DataFrame(grp.fillna(df_mean.loc[grp.name]), index=grp.index, columns=grp.columns)
)

In [10]:
# Fill missing `loading` values with the mean of `loading` in the same frame
# (train uses the train mean, test uses the test mean).
for frame in (df_train, df_test):
    frame['loading'] = frame['loading'].fillna(frame['loading'].mean())

In [11]:
# Sanity check: total count of missing cells remaining in train and in test.
df_train.isna().sum().sum(), df_test.isna().sum().sum()

(0, 0)

# Kaggle형 풀이 단계

Step 1: 검증 방법을 정하고, 검증 루틴을 만듭니다.

Step 2: Baseline 모델을 만듭니다

Step 3: 모델 선택 루틴을 만듭니다.

Step 4: 모델 개선 작업을 합니다.

In [12]:
# 공통 사용 요소 입니다.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Every column except the target is offered to the pipelines; each pipeline's
# ColumnTransformer then picks the subset it actually uses.
X_all = [col for col in df_train.columns if col != 'failure']
# Four-fold splitter that keeps all rows of a group in the same fold.
gcv = GroupKFold(n_splits=4)

def eval_model(model):
    """Cross-validate ``model`` on the training frame, grouped by product_code.

    Parameters
    ----------
    model : estimator
        Anything accepted by ``cross_validate``; it receives ``df_train[X_all]``
        as features and ``df_train['failure']`` as target.

    Returns
    -------
    dict
        The ``cross_validate`` result, scored with ``'roc_auc'`` and including
        the training scores.
    """
    features = df_train[X_all]
    target = df_train['failure']
    return cross_validate(
        model,
        features,
        target,
        groups=df_train['product_code'],
        cv=gcv,
        scoring='roc_auc',
        return_train_score=True,
    )

def choose_model(model, path='answer6.csv'):
    """Fit ``model`` on all training rows and write the submission file.

    Parameters
    ----------
    model : estimator
        Must provide ``fit`` and ``predict_proba``. It is fitted in place on
        ``df_train[X_all]`` against ``df_train['failure']``.
    path : str, optional
        Destination of the CSV file. Defaults to ``'answer6.csv'``, the file
        name this function has always written.

    Returns
    -------
    array
        Column 1 of ``model.predict_proba(df_test[X_all])`` - the positive
        class when the labels are 0/1 - in the row order of ``df_test``.
    """
    model.fit(df_train[X_all], df_train['failure'])
    prd = model.predict_proba(df_test[X_all])[:, 1]
    submission = pd.DataFrame({
        'id': df_test.index.values,
        'failure': prd,
    })
    # `index` is documented as a bool; the ids are already an explicit column,
    # so the positional index must not be written.
    submission.to_csv(path, index=False)
    return prd

In [13]:
# Demonstrates GroupKFold: for every split, print the product_code values in
# the fitting part and in the held-out part; the held-out groups do not appear
# in the fitting part.
for fit_rows, held_out_rows in gcv.split(
    df_train[X_all], df_train['failure'], groups=df_train['product_code']
):
    print(
        df_train['product_code'].iloc[fit_rows].unique(),
        df_train['product_code'].iloc[held_out_rows].unique(),
    )

['A' 'B' 'E'] ['C']
['A' 'B' 'C'] ['E']
['A' 'C' 'E'] ['B']
['B' 'C' 'E'] ['A']


In [14]:
# Baseline: LogisticRegression + feature selection
from sklearn.linear_model import LogisticRegression

# Hand-picked features: five standardized numeric columns plus the raw
# missing-value flag for measurement_3.
ct = ColumnTransformer(
    transformers=[
        (
            'std',
            StandardScaler(),
            ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17'],
        ),
        ('pt', 'passthrough', ['na_1']),
    ]
)
clf_lr = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr)
# Displayed as: mean validation AUC, std of validation AUC, mean training AUC.
(
    result['test_score'].mean(),
    result['test_score'].std(),
    result['train_score'].mean(),
)

(0.5893698519267131, 0.0037908863862941063, 0.5919059643763757)

In [15]:
# Select the model: refit it on the full training set and write the submission.
prd = choose_model(clf_lr)
# Self-score against the answer file.
# NOTE(review): assumes df_ans rows are in the same order as df_test - confirm.
roc_auc_score(y_true=df_ans['failure'], y_score=prd)

0.5883705035971223

In [16]:
# Baseline + feature PCA
from sklearn.decomposition import PCA

# measurement_0 .. measurement_17 are standardized and reduced to 7 principal
# components; loading is standardized on its own; na_1 is passed through.
ct = ColumnTransformer(
    transformers=[
        (
            'std_pca',
            make_pipeline(StandardScaler(), PCA(n_components=7)),
            [f'measurement_{i}' for i in range(18)],
        ),
        ('std', StandardScaler(), ['loading']),
        ('pt', 'passthrough', ['na_1']),
    ]
)
clf_lr_2 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr_2)
# Displayed as: mean validation AUC, std of validation AUC, mean training AUC.
(
    result['test_score'].mean(),
    result['test_score'].std(),
    result['train_score'].mean(),
)

(0.5885708721211668, 0.002936880606326784, 0.5916331911156281)

In [17]:
# Baseline + np.log (loading)
from sklearn.preprocessing import FunctionTransformer

# Same feature set as the baseline, except that loading goes through np.log
# before being standardized.
ct = ColumnTransformer(
    transformers=[
        (
            'std',
            StandardScaler(),
            ['measurement_1', 'measurement_4', 'measurement_14', 'measurement_17'],
        ),
        (
            'log_std',
            make_pipeline(
                FunctionTransformer(func=np.log, validate=False),
                StandardScaler(),
            ),
            ['loading'],
        ),
        ('pt', 'passthrough', ['na_1']),
    ]
)
clf_lr_3 = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
result = eval_model(clf_lr_3)
# Displayed as: mean validation AUC, std of validation AUC, mean training AUC.
(
    result['test_score'].mean(),
    result['test_score'].std(),
    result['train_score'].mean(),
)

(0.5893511045085978, 0.0038853447385856898, 0.5918119259522319)

In [18]:
# RF: RandomForestClassifier + LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.ensemble import RandomForestClassifier
# The 18 measurement columns are standardized and fed through
# LinearDiscriminantAnalysis used as a transformer; loading and both
# missing-value flags are passed through unchanged. The raw measurement
# columns themselves are not passed to the forest.
ct = ColumnTransformer(
    transformers=[
        (
            'std_lda',
            make_pipeline(StandardScaler(), LinearDiscriminantAnalysis()),
            [f'measurement_{i}' for i in range(18)],
        ),
        ('pt', 'passthrough', ['loading', 'na_1', 'na_2']),
    ]
)
clf_rf = make_pipeline(
    ct,
    RandomForestClassifier(
        n_estimators=150,
        max_depth=5,
        min_samples_split=512,
        random_state=123,
    ),
)
result = eval_model(clf_rf)
# Displayed as: mean validation AUC, std of validation AUC, mean training AUC.
(
    result['test_score'].mean(),
    result['test_score'].std(),
    result['train_score'].mean(),
)

(0.5870176835777384, 0.001962859562434625, 0.6143993333495047)

In [19]:
# XGB: XGBoost
import xgboost as xgb
# The boosted trees receive loading, both missing-value flags and all 18
# measurement columns without any scaling.
ct = ColumnTransformer(
    transformers=[
        (
            'pt',
            'passthrough',
            ['loading', 'na_1', 'na_2'] + [f'measurement_{i}' for i in range(18)],
        ),
    ]
)
clf_xgb = make_pipeline(
    ct,
    xgb.XGBClassifier(
        learning_rate=0.01,
        n_estimators=200,
        subsample=0.25,
        colsample_bytree=0.9,
        max_depth=2,
        random_state=123,
    ),
)
result = eval_model(clf_xgb)
# Displayed as: mean validation AUC, std of validation AUC, mean training AUC.
(
    result['test_score'].mean(),
    result['test_score'].std(),
    result['train_score'].mean(),
)

(0.5859260509703783, 0.004071350389365754, 0.6037708850584805)

In [20]:
# GB: GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
# Feature list for gradient boosting: loading, both missing-value flags and
# all 18 measurement columns, passed through without scaling.
X_gb = ['loading', 'na_1', 'na_2'] + [f'measurement_{i}' for i in range(18)]
ct = ColumnTransformer(transformers=[('pt', 'passthrough', X_gb)])
clf_gb = make_pipeline(
    ct,
    GradientBoostingClassifier(
        n_estimators=200,
        max_depth=2,
        learning_rate=0.007,
        random_state=123,
    ),
)
result = eval_model(clf_gb)
# Displayed as: full cross_validate result, mean validation AUC, mean training AUC.
(
    result,
    result['test_score'].mean(),
    result['train_score'].mean(),
)

({'fit_time': array([1.89585948, 1.85885024, 1.88291907, 1.82760525]),
  'score_time': array([0.01562428, 0.01562381, 0.01561451, 0.01562262]),
  'test_score': array([0.58331404, 0.5815396 , 0.58292017, 0.59404156]),
  'train_score': array([0.59819038, 0.59880856, 0.59920214, 0.5964019 ])},
 0.5854538391204598,
 0.5981507433353104)

In [21]:
# Voting을 합니다.
from sklearn.ensemble import VotingClassifier

# Soft voting over four of the pipelines built above. The plain baseline
# (clf_lr) and the XGBoost pipeline (clf_xgb) are not part of the ensemble.
clf_vt = VotingClassifier(
    estimators=[
        ('lr_2', clf_lr_2),
        ('lr_3', clf_lr_3),
        ('rf', clf_rf),
        ('gb', clf_gb),
    ],
    voting='soft',
)
result = eval_model(clf_vt)
# Displayed as: full cross_validate result, mean validation AUC, mean training AUC.
(
    result,
    result['test_score'].mean(),
    result['train_score'].mean(),
)

({'fit_time': array([2.85138106, 2.85726476, 2.85609913, 2.82746625]),
  'score_time': array([0.07810593, 0.07808304, 0.07801247, 0.07810545]),
  'test_score': array([0.58914805, 0.58721561, 0.58907306, 0.59560419]),
  'train_score': array([0.60176033, 0.60239064, 0.60134444, 0.59818017])},
 0.5902602263616302,
 0.6009188951819)

In [22]:
# Select the voting ensemble: refit it on the full training set and write the
# submission file.
prd = choose_model(clf_vt)
# Self-score against the answer file.
# NOTE(review): assumes df_ans rows are in the same order as df_test - confirm.
roc_auc_score(y_true=df_ans['failure'], y_score=prd)

0.5922951888489209