# 문제 6

[Kaggle 형] train_prob.csv로 failure를 예측하는 모델을 만들고,

test_prob.csv에 대해 failure가 1일 확률을 예측하여 다음과 같은 형식의 answer6.csv를 만들어라.

측정 지표는 AUC(area under the ROC curve)이다. id는 테스트 케이스의 id이고, failure는 failure가 1이 될 확률이다.

id,failure

16115, 0.1

16116, 0.2


**강사: 멀티캠퍼스 강선구(sunku0316.kang@multicampus.com, sun9sun9@gmail.com)**

In [1]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys
import xgboost as xgb

# Report the interpreter version, then the name and version of each library
# this notebook depends on, so results can be tied to an environment.
print(sys.version)
for module in (pd, np, sklearn, scipy, mlxtend, statsmodels, xgb):
    print(module.__name__, module.__version__)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1
xgboost 0.80


In [2]:
# Load the training set, the test set and the test-set answers; every file
# is indexed by its 'id' column.
df_train, df_test, df_ans = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train_prob.csv', 'test_prob.csv', 'test_prob_ans.csv')
)

In [3]:
# Record, before any imputation happens, whether measurement_3 / measurement_5
# were missing: na_1 flags measurement_3 and na_2 flags measurement_5.
na_source_cols = ['measurement_{}'.format(i) for i in [3, 5]]
for frame in (df_train, df_test):
    missing_flags = frame[na_source_cols].isna()
    for flag_col, source_col in zip(['na_1', 'na_2'], na_source_cols):
        frame[flag_col] = missing_flags[source_col]

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Regression-based iterative imputer used for measurement_3..9 and
# measurement_17. It is re-fitted for every product_code group below.
imp = IterativeImputer(
    estimator=LinearRegression(fit_intercept=True),
    random_state=123
)
X_imp = ['measurement_{}'.format(i) for i in range(3, 10)] + ['measurement_17']


def _impute_by_product(df):
    """Return the X_imp columns of ``df`` imputed separately per product_code.

    The imputer is fitted on each product_code group on its own and the
    imputed values are returned as a DataFrame carrying the same row index
    and column names as the input rows.

    ``group_keys=False`` keeps the original ``id`` index on the result.
    Without it, recent pandas versions prepend the group key to the index of
    an ``apply`` result, and the assignment back into ``df`` would no longer
    align on ``id``.
    """
    return df.groupby('product_code', group_keys=False)[X_imp].apply(
        lambda x: pd.DataFrame(imp.fit_transform(x), index=x.index, columns=x.columns)
    )


# Train and test are imputed independently of each other: each frame's
# groups are fitted on that frame's own rows.
df_train[X_imp] = _impute_by_product(df_train)

df_test[X_imp] = _impute_by_product(df_test)

In [5]:
# measurement_10..16: fill each missing value with the mean of the same
# column within the row's product_code group.
X_mean = ['measurement_{}'.format(idx) for idx in range(10, 17)]


def _fill_with_group_mean(col):
    """Replace the missing entries of one group's column with that column's mean."""
    return col.fillna(col.mean())


for frame in (df_train, df_test):
    frame[X_mean] = frame.groupby('product_code')[X_mean].transform(_fill_with_group_mean)

In [6]:
# Fill missing 'loading' values with the mean of the training 'loading'
# column. df_train is processed first, so the test set is filled with the
# training mean as well (not with its own mean).
for frame in (df_train, df_test):
    frame['loading'] = frame['loading'].fillna(df_train['loading'].mean())

In [7]:
# Idea: derive a loading_log feature by applying the natural log to loading.
# LogisticRegression X_sfs_best = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17', 'na_1'] 0.5838326230092876
# LinearDiscriminantAnalysis() measurement_0 ~ 17 ?
# PCA() measurement_0 ~ 17 , n_components=7 0.581757510516433
# RandomForestClassifier(random_state=123) {'n_estimators': 15, 'max_depth': 7, 'min_samples_split': 512} 0.5745226991354744

In [8]:
# Number of training rows for each product_code value.
df_train['product_code'].value_counts()

C    5765
E    5343
B    5250
A    5100
Name: product_code, dtype: int64

In [9]:
# Number of test rows for each product_code value, for comparison with the
# training-set codes counted in the previous cell.
df_test['product_code'].value_counts()

D    5112
Name: product_code, dtype: int64

In [10]:
from sklearn.model_selection import GroupKFold

# Split by product_code with 4 folds and print which codes land on the
# training side and which on the validation side of every fold.
gcv = GroupKFold(4)

product_codes = df_train['product_code']
for train_idx, valid_idx in gcv.split(df_train, groups=product_codes):
    train_codes = df_train.iloc[train_idx]['product_code'].unique()
    valid_codes = df_train.iloc[valid_idx]['product_code'].unique()
    print(train_codes, valid_codes)

['A' 'B' 'E'] ['C']
['A' 'B' 'C'] ['E']
['A' 'C' 'E'] ['B']
['B' 'C' 'E'] ['A']


# Kaggle형 풀이 단계

Step 1: 검증 방법을 정하고, 검증 루틴을 만듭니다.

Step 2: Baseline 모델을 만듭니다.

Step 3: 모델 선택 루틴을 만듭니다.

Step 4: 모델 개선 작업을 합니다.

# Step1

In [26]:
# History of evaluation summaries; eval_model appends one entry per call.
s_hist = []

from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# 4-fold cross-validation splitter that keeps each group in a single fold.
gcv = GroupKFold(n_splits=4)
# Candidate feature columns: every column present in the test frame.
X_all = list(df_test.columns)

def eval_model(model_name, clf):
    """Cross-validate ``clf`` on the training data and record an AUC summary.

    The estimator is scored with ROC AUC using the ``gcv`` splitter, grouped
    by ``product_code``. The mean and standard deviation of the validation
    and training scores are printed, appended to ``s_hist`` under
    ``model_name``, and the latest entry per model name is displayed.

    Parameters
    ----------
    model_name : str
        Label under which the result is stored in the history table.
    clf : estimator
        Estimator (or pipeline) accepted by ``cross_validate``.
    """
    cv_scores = cross_validate(
        clf,
        df_train[X_all],
        df_train['failure'],
        groups=df_train['product_code'],
        cv=gcv,
        scoring='roc_auc',
        return_train_score=True,
    )
    valid_auc = cv_scores['test_score']
    train_auc = cv_scores['train_score']

    summary = 'Valid: {:.5f}±{:.5f},  V.Train: {:.5f}±{:.5f}'.format(
        np.mean(valid_auc), np.std(valid_auc),
        np.mean(train_auc), np.std(train_auc),
    )
    print(summary)

    s_hist.append(pd.Series({'model': model_name, 'result': summary}))
    # Only the most recent run of each model name is shown.
    history = pd.DataFrame(s_hist)
    display(history.groupby('model').last())
    
def select_model(clf, output_path='answer6.csv'):
    """Fit ``clf`` on the full training set and write the submission file.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``.
    output_path : str, default 'answer6.csv'
        Destination of the submission CSV. The default keeps the file name
        required by the problem statement.

    Returns
    -------
    array-like
        The second column of ``clf.predict_proba`` for the rows of
        ``df_test``, in ``df_test`` row order.
    """
    clf.fit(df_train[X_all], df_train['failure'])
    # Column 1 of predict_proba is used as the probability that failure == 1
    # (assumes the classifier's classes are ordered [0, 1]).
    prd = clf.predict_proba(df_test[X_all])[:, 1]
    submission = pd.DataFrame({
        'id': df_test.index,
        'failure': prd
    })
    # 'id' is already a regular column, so the positional index is not written.
    submission.to_csv(output_path, index=False)
    return prd

# Step2

In [21]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression on a small set of standardized numeric
# features plus the na_1 missing-value flag passed through unchanged.
baseline_scaled_cols = ['loading', 'measurement_1', 'measurement_4', 'measurement_14', 'measurement_17']
baseline_flag_cols = ['na_1']
ct = ColumnTransformer([
    ('std', StandardScaler(), baseline_scaled_cols),
    ('pt', 'passthrough', baseline_flag_cols),
])
reg_lr = make_pipeline(ct, LogisticRegression(solver='lbfgs'))
eval_model('baseline', reg_lr)

Valid: 0.58937±0.00379,  V.Train: 0.59191±0.00146


Unnamed: 0_level_0,result
model,Unnamed: 1_level_1
baseline,"Valid: 0.58937±0.00379, V.Train: 0.59191±0.00146"


# Step3

In [27]:
# Refit the chosen model on all training data and write the submission file.
prd = select_model(reg_lr)
# prd follows the row order of df_test, so look the true labels up by the
# test ids instead of relying on df_ans being stored in the same order.
y_true = df_ans.loc[df_test.index, 'failure']
print("자가채점: {}".format(roc_auc_score(y_true, prd)))

자가채점: 0.5883988309352517


# Step4