# 데이콘_시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회

### 팀명: 끙정의 아이들

### Public Score: 0.83775 / 53th(12.6%)

### Private Score: 0.83557 / 42th(10.0%)

### Summary: Catb, LGBM, GBC 를 활용한 10-fold OOF 앙상블

# Library & Data Import

In [110]:
import warnings
warnings.filterwarnings(action='ignore')

import os, sys
import time
import datetime

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve, recall_score, precision_score

import lightgbm
from lightgbm import LGBMClassifier
import catboost
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold

In [113]:
print(np.__version__)
print(pd.__version__)
print(sklearn.__version__)
print(lightgbm.__version__)
print(catboost.__version__)

1.19.3
1.0.5
0.23.2
3.0.0
0.24.1


In [114]:
TRAIN_P_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_problem_data.csv'
TRAIN_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_quality_data.csv'
TRAIN_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/train_err_data.csv'
TEST_Q_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_quality_data.csv'
TEST_E_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/test_err_data.csv'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37/Data/systemError/sample_submission.csv'

In [154]:
%%time
train_p = pd.read_csv(TRAIN_P_PATH)
train_q = pd.read_csv(TRAIN_Q_PATH)
train_e = pd.read_csv(TRAIN_E_PATH)
test_q = pd.read_csv(TEST_Q_PATH)
test_e = pd.read_csv(TEST_E_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

Wall time: 14.7 s


In [155]:
train_p.shape, train_q.shape, train_e.shape, test_q.shape, test_e.shape, submission.shape

((5429, 2),
 (828624, 16),
 (16554663, 6),
 (747972, 16),
 (16532648, 6),
 (14999, 2))

# Preprocessing

In [117]:
def preprocessing_problem(df, object_='binary'):
    """
    definition:
    train_problem 테이블을 받아서 target 값으로 변환
    1. {0, 1}의 binary로 변환
    2. {0 ~ n}의 multiclass로 변환
    """
    
    # 10001부터 24999까지의 index를 만들어줍니다.
    user_id_idx = np.array(range(10000, 25000, 1))
    
    # train_new_p라는 새로운 df를 만들고 index는 위에서 만든 user_id_idx 로 지정해줍니다.
    new_p = pd.DataFrame(index = user_id_idx)
    new_p['target'] = 0
    
    if object_ == 'binary':
        new_p.iloc[df.user_id.unique()-10000] = 1
        new_p = new_p.reset_index()
        new_p.rename({'index':'user_id'}, axis=1, inplace=True)
        
    elif object_ == 'multi':
        # multi는 count()로 집계를 해줍니다.
        new_p['target'] = df.groupby('user_id')['time'].count()
        new_p = new_p.fillna(0)
        new_p = new_p.reset_index()
        new_p.rename({'index':'user_id'}, axis=1, inplace=True)
        
    return new_p

In [118]:
train_b_p = preprocessing_problem(train_p, 'binary')
train_m_p = preprocessing_problem(train_p, 'multi')

In [156]:
def preprocessing_quality(df):
    """
    definition:
    EDA를 통해 알아낸 정보로 train_q, test_q를 정리해서 내뿜어줍니다.
    1. qaulity_3, quality_4 를 drop 합니다.(단일 value)
    2. qaulity_k 변수들을 정수로 encoding 합니다.
    """
    # 먼저 3, 4번을 drop 합니다.
    df.drop(['quality_3', 'quality_4'], axis=1, inplace=True)
    
    # qual 변수만 할당해주고, 정수로 형변환 해줍니다.
    columns = train_q.columns[train_q.columns.str.contains('quality')]
    # for문을 통해 각 column을 반복 작업해줍니다.
    for col in columns:
        if df[col].dtype == 'float64': # 기존에 float은 패스
            df[col] = df[col].fillna(-2)
        elif df[col].dtype == 'int64': # 기존에 int도 패스
            df[col] = df[col].fillna(-2)
            #print(col)
        else:
            # nan값이 있으면 float으로 갈 수 없으니 '-2' 으로 채워줍니다.
            df[col] = df[col].fillna('-2')
            df[col] = df[col].apply(lambda x: str(x).replace(',' , ''))
            df[col] = df[col].astype(np.float64)
        
    # fwver 에서 null 값이 꽤 있습니다. missing으로 채우겠습니다.
    df.fwver = df.fwver.fillna('missing')
        
    return df

In [157]:
train_q = preprocessing_quality(train_q)
test_q = preprocessing_quality(test_q)

In [160]:
def preprocessing_fwver(df):
    """
    definition:
    별건 아니고, e-set에 fwver 변수에서 '10' 이라는 값이 있는데, 이게 errtype이랑 겹쳐요.
    그래서 10을 -> 8.5.2 으로 바꿔줄 겁니다.
    굳이 이렇게 바꾸는 이유는, 해당 fw가 8.5.3버전과 같은 model_nm을 공유하기 때문입니다.
    """
    df.fwver = df.fwver.replace('10', '8.5.2')
        
    return df

In [161]:
train_e = preprocessing_fwver(train_e)
test_e = preprocessing_fwver(test_e)

In [162]:
def make_datetime(df):
    """
    definition:
    'time' column이 str로 되어 있으니, datetime으로 바꿔주는 함수입니다.
    다만 'time'양식이 pandas함수에 적용이 안되니, 강제로 슬라이싱해서 만들어줘야 합니다.
    
    """
    
    df['year'] = df['time'].apply(lambda x: str(x)[:4])
    df['month'] = df['time'].apply(lambda x: str(x)[4:6])
    df['day'] = df['time'].apply(lambda x: str(x)[6:8])
    df['hour'] = df['time'].apply(lambda x: str(x)[8:10])
    df['minute'] = '00' # minute을 넣어주지 않으면 datetime이 완성이 안되니, 00으로 넣어줍니다.
    
    df['time'] = pd.to_datetime(df.year + df.month + df.day + df.hour + df.minute)
    
    return df

In [163]:
%%time
train_p = make_datetime(train_p)
train_q = make_datetime(train_q)
test_q = make_datetime(test_q)
train_e = make_datetime(train_e)
test_e = make_datetime(test_e)

Wall time: 57.9 s


# Feature Engineering

## from Error_log

### 사용한 model의 개수

In [166]:
train_model_count = train_e[['user_id', 'model_nm']].drop_duplicates().groupby('user_id').count()
test_model_count = test_e[['user_id', 'model_nm']].drop_duplicates().groupby('user_id').count()

### 사용한 fwver 의 개수

In [167]:
train_fwver_count = train_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()
test_fwver_count = test_e[['user_id', 'fwver']].drop_duplicates().groupby('user_id').count()

### 경험한 error 의 총 개수

In [168]:
train_err_count = train_e.groupby('user_id')['errcode'].count()
test_err_count = test_e.groupby('user_id')['errcode'].count()

### 경험한 각 errtype의 value별 개수

In [169]:
# 23호
train_errcode_23 = train_e[train_e.errtype == 23][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_23 = test_e[test_e.errtype == 23][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

train_errcode_23.columns = ['UNKNOWN', 'ACTIVE', 'connLMP', 'connESTA', 'connTO', 'connLOCAL', 'STANDBY', 'TERMINATE']
test_errcode_23.columns = ['UNKNOWN', 'ACTIVE', 'connLMP', 'connESTA', 'connTO', 'connLOCAL', 'STANDBY', 'TERMINATE']

In [170]:
# 31호
train_errcode_31 = train_e[train_e.errtype == 31][['user_id', 'errcode', 'time']].\
                            groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_31 = test_e[test_e.errtype == 31][['user_id', 'errcode', 'time']].\
                            groupby(['user_id', 'errcode']).count().unstack().fillna(0)
train_errcode_31.columns = ['err_31_0', 'err_31_1']
test_errcode_31.columns =['err_31_0', 'err_31_1']

In [171]:
# 33호
train_errcode_33 = train_e[train_e.errtype == 33][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_33 = test_e[test_e.errtype == 33][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

train_errcode_33.columns = ['err_33_1', 'err_33_2', 'err_33_3']
test_errcode_33.columns = ['err_33_1', 'err_33_2', 'err_33_3']

In [172]:
# 34호
train_errcode_34 = train_e[train_e.errtype == 34][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)
test_errcode_34 = test_e[test_e.errtype == 34][['user_id', 'errcode', 'hour']].groupby(['user_id', 'errcode']).count().unstack().fillna(0)

train_errcode_34.columns = ['err_34_1', 'err_34_2', 'err_34_3', 'err_34_4', 'err_34_5', 'err_34_6']
test_errcode_34.columns = ['err_34_1', 'err_34_2', 'err_34_3', 'err_34_4', 'err_34_5', 'err_34_6']

### day max를 mean으로 나눈 것

In [173]:
%%time
train_e['day_2'] = train_e['time'].apply(lambda x: str(x)[:10])

train_meanDay = (train_e
                 .groupby(['user_id','day_2'])['day_2']
                 .count()
                 .unstack()
                 .fillna(0)
                 .loc[:, '2020-11-01':'2020-11-30']
                 .mean(axis=1))

train_maxDay = (train_e
                .groupby(['user_id','day_2'])['day_2']
                .count()
                .unstack()
                .fillna(0)
                .loc[:, '2020-11-01':'2020-11-30']
                .max(axis=1))

train_maxBymean = pd.Series(data = np.array(train_maxDay) / np.array(train_meanDay),
                            index = train_e.user_id.unique(),
                            name = 'mbm')

Wall time: 1min 19s


In [174]:
%%time
test_e['day_2'] = test_e['time'].apply(lambda x: str(x)[:10])

test_meanDay = (test_e
                .groupby(['user_id','day_2'])['day_2']
                .count()
                .unstack()
                .fillna(0)
                .loc[:, '2020-11-01':'2020-11-30']
                .mean(axis=1))

test_maxDay = (test_e
               .groupby(['user_id','day_2'])['day_2']
               .count()
               .unstack()
               .fillna(0)
               .loc[:, '2020-11-01':'2020-11-30']
               .max(axis=1))

test_maxBymean = pd.Series(data = np.array(test_maxDay) / np.array(test_meanDay),
                           index = test_e.user_id.unique(),
                           name = 'mbm')

Wall time: 1min 18s


## from Quality_log

### 각 quality의 표준편차

In [176]:
train_qual_std = train_q.groupby(['user_id']).std()
test_qual_std = test_q.groupby(['user_id']).std()

new_columns = ['q_std_0', 'q_std_1', 'q_std_2', 'q_std_5', 'q_std_6', 'q_std_7', 'q_std_8', 'q_std_9', 'q_std_10','q_std_11', 'q_std_12']

train_qual_std.columns = new_columns
test_qual_std.columns = new_columns

### 기록한 quality log의 개수(12개당 1번)

In [177]:
train_qual_log = train_q.groupby('user_id')['time'].count()/12
test_qual_log = test_q.groupby('user_id')['time'].count()/12

### quality당 순수 개수

In [178]:
# 값을 만들기 위해 새로운 df를 받아옵니다.
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

# 0 값을 전부 nan 값으로 바꿔줍니다.
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    train_q_temp.loc[train_q_temp['quality_{}'.format(i)] == 0, 'quality_{}'.format(i)] = np.nan
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    test_q_temp.loc[test_q_temp['quality_{}'.format(i)] == 0, 'quality_{}'.format(i)] = np.nan

# 필요없는 변수들을 버려줍니다.
train_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)
test_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)

# 그룹바이 카운트 해줍니다.
train_qual_counts = train_q_temp.groupby('user_id').count()
test_qual_counts = test_q_temp.groupby('user_id').count()

# 겹치는 컬럼명을 바꿔줍니다.
train_qual_counts.columns = ['q_c_0', 'q_c_1', 'q_c_2', 'q_c_5', 'q_c_6',
                             'q_c_7', 'q_c_8', 'q_c_9', 'q_c_10', 'q_c_11', 'q_c_12']
test_qual_counts.columns = ['q_c_0', 'q_c_1', 'q_c_2', 'q_c_5', 'q_c_6',
                             'q_c_7', 'q_c_8', 'q_c_9', 'q_c_10', 'q_c_11', 'q_c_12']

### quality당 음수, 0에 대한 count

In [179]:
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

In [180]:
train_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)
test_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)

In [181]:
# 0 값만 count를 위해서 음수와 양수를 전부 nan으로 만들겠습니다.

for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    train_q_temp.loc[train_q_temp['quality_{}'.format(i)] < 0, 'quality_{}'.format(i)] = np.nan
    
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    test_q_temp.loc[test_q_temp['quality_{}'.format(i)] < 0, 'quality_{}'.format(i)] = np.nan
    
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    train_q_temp.loc[train_q_temp['quality_{}'.format(i)] > 0, 'quality_{}'.format(i)] = np.nan
    
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    test_q_temp.loc[test_q_temp['quality_{}'.format(i)] > 0, 'quality_{}'.format(i)] = np.nan

In [182]:
train_qual_zeroCount = train_q_temp.groupby('user_id').count()
test_qual_zeroCount = test_q_temp.groupby('user_id').count()

In [183]:
new_columns = ['q_z_c_0', 'q_z_c_1', 'q_z_c_2', 'q_z_c_5', 'q_z_c_6', 'q_z_c_7', 'q_z_c_8', 'q_z_c_9', 'q_z_c_10','q_z_c_11', 'q_z_c_12']

train_qual_zeroCount.columns = new_columns
test_qual_zeroCount.columns = new_columns

In [184]:
train_q_temp = train_q.copy()
test_q_temp = test_q.copy()

In [185]:
train_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)
test_q_temp.drop(['time', 'fwver', 'year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=True)

In [186]:
# 음수 값만 count를 위해서 음수와 0을 전부 nan으로 만들겠습니다.

for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    train_q_temp.loc[train_q_temp['quality_{}'.format(i)] >= 0, 'quality_{}'.format(i)] = np.nan
    
for i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]:
    test_q_temp.loc[test_q_temp['quality_{}'.format(i)] >= 0, 'quality_{}'.format(i)] = np.nan

In [187]:
train_qual_negaCount = train_q_temp.groupby('user_id').count()
test_qual_negaCount = test_q_temp.groupby('user_id').count()

In [188]:
new_columns = ['q_n_c_0', 'q_n_c_1', 'q_n_c_2', 'q_n_c_5', 'q_n_c_6', 'q_n_c_7', 'q_n_c_8', 'q_n_c_9', 'q_n_c_10','q_n_c_11', 'q_n_c_12']

train_qual_negaCount.columns = new_columns
test_qual_negaCount.columns = new_columns

### quality를 sum으로 groupby

In [189]:
train_each_quality_sum = train_q.groupby('user_id').sum().loc[:, 'quality_0':'quality_12']
test_each_quality_sum = test_q.groupby('user_id').sum().loc[:, 'quality_0':'quality_12']

quality_sum_colnms = ['quality_0_sum', 'quality_1_sum', 'quality_2_sum', 'quality_5_sum', 'quality_6_sum', 
                      'quality_7_sum', 'quality_8_sum', 'quality_9_sum','quality_10_sum', 'quality_11_sum', 
                      'quality_12_sum']

train_each_quality_sum.columns = quality_sum_colnms
test_each_quality_sum.columns = quality_sum_colnms

## from both

### time에 대한 유저별 표준편차

In [190]:
def time_to_seconds(x):
    return time.mktime(x.timetuple())

In [191]:
%%time
train_e['time_sec'] = train_e.time.apply(lambda x: time_to_seconds(x))
test_e['time_sec'] = test_e.time.apply(lambda x: time_to_seconds(x))
train_q['time_sec'] = train_q.time.apply(lambda x: time_to_seconds(x))
test_q['time_sec'] = test_q.time.apply(lambda x: time_to_seconds(x))

Wall time: 2min 15s


In [192]:
%%time
train_err_timestd = train_e.groupby(['user_id'])['time_sec'].std()
test_err_timestd = test_e.groupby(['user_id'])['time_sec'].std()
train_err_timestd = train_err_timestd.rename(level = 0, index = 't_e_std') 
test_err_timestd = test_err_timestd.rename(level = 0, index = 't_e_std') 

Wall time: 480 ms


In [193]:
%%time
train_qual_timestd = (train_q[['user_id', 'time_sec']].drop_duplicates()).groupby(['user_id']).std()
test_qual_timestd = (test_q[['user_id', 'time_sec']].drop_duplicates()).groupby(['user_id']).std()
train_qual_timestd.columns = ['t_q_std']
test_qual_timestd.columns = ['t_q_std']

Wall time: 119 ms


# Train & Predict

### Base set

In [82]:
# errtype을 유저별로 카운트 해줍니다.

X = train_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)
y = test_e.groupby(['user_id', 'errtype'])['errcode'].count().unstack().fillna(0)

X.columns = ['et_1', 'et_2', 'et_3', 'et_4', 'et_5', 'et_6', 'et_7', 'et_8', 'et_9', 'et_10', 'et_11', 'et_12', 'et_13',
 'et_14', 'et_15', 'et_16', 'et_17', 'et_18', 'et_19', 'et_20', 'et_21', 'et_22', 'et_23', 'et_24', 'et_25',
 'et_26', 'et_27', 'et_28', 'et_30', 'et_31', 'et_32', 'et_33', 'et_34', 'et_35', 'et_36', 'et_37', 'et_38',
 'et_39', 'et_40', 'et_41', 'et_42']
y.columns = ['et_1', 'et_2', 'et_3', 'et_4', 'et_5', 'et_6', 'et_7', 'et_8', 'et_9', 'et_10', 'et_11', 'et_12', 'et_13',
 'et_14', 'et_15', 'et_16', 'et_17', 'et_18', 'et_19', 'et_20', 'et_21', 'et_22', 'et_23', 'et_24', 'et_25',
 'et_26', 'et_27', 'et_28', 'et_30', 'et_31', 'et_32', 'et_33', 'et_34', 'et_35', 'et_36', 'et_37', 'et_38',
 'et_39', 'et_40', 'et_41', 'et_42']

In [83]:
# 하나 사라진 유저를 채워주는 코드입니다.
y = y.reindex(pd.RangeIndex(y.index.max() + 1)).ffill(0)[30000:]

In [84]:
X.shape, y.shape

((15000, 41), (14999, 41))

### Add Features

In [85]:
X = pd.concat([X,
               train_err_count, # 유저가 기록한 총 err수
               train_fwver_count, # 유저가 사용한 fw수
               train_model_count, # 유저가 사용한 model 수
               train_qual_std, # 각 퀄리티에 대한 유저별 편차
               train_qual_log, # 유저별 퀄리티 로그의 수
               train_errcode_23, # 23호 에러타입의 밸류별 개수
               train_errcode_33, # 33호 상동
               train_errcode_34, # 34호 상동
               train_qual_counts, # 각 퀄리티에서 0을 제외한 카운트
               train_qual_negaCount, # 각 퀄리티에 대해 음수만 카운트
               train_qual_zeroCount, # 각 퀄리티에 대해 0.만 카운트
               train_err_timestd,
               train_qual_timestd,
               train_each_quality_sum,
               train_errcode_31, # 31호 상동
               train_maxBymean # time
              ], axis=1).fillna(0)
X.shape

(15000, 122)

In [86]:
y = pd.concat([y,
               test_err_count, # 유저가 기록한 총 err수
               test_fwver_count, # 유저가 사용한 fw수
               test_model_count, # 유저가 사용한 model 수
               test_qual_std, # 각 퀄리티에 대한 유저별 편차
               test_qual_log, # 유저별 퀄리티 로그의 수
               test_errcode_23, # 23호 에러타입의 밸류별 개수
               test_errcode_33, # 33호 상동
               test_errcode_34, # 34호 상동
               test_qual_counts, # 각 퀄리티에서 0을 제외한 카운트
               test_qual_negaCount, # 각 퀄리티에 대해 음수만 카운트
               test_qual_zeroCount, # 각 퀄리티에 대해 0.만 카운트
               test_err_timestd,
               test_qual_timestd,
               test_each_quality_sum,
               test_errcode_31, # 31호 상동
               test_maxBymean, # time
              ], axis=1).fillna(0)
y.shape

(14999, 122)

In [87]:
# 일부 다중공선성을 띄는 변수들을 제거해줍니다.
X.drop(['et_20', 'et_36'], axis=1, inplace = True)
y.drop(['et_20', 'et_36'], axis=1, inplace = True)

X.shape, y.shape

((15000, 120), (14999, 120))

## Model

### Tuned Catb

In [89]:
def catb_fold_train_pred(train_x, train_y):

    # Train
    models     = []
    recalls    = []
    precisions = []
    auc_scores   = []
    threshold = 0.5
    # 파라미터 설정
    params =      {
                    'nan_mode': 'Min',
                    'eval_metric': 'Logloss',
                    'iterations': 1000,
                    'sampling_frequency': 'PerTree',
                    'leaf_estimation_method': 'Newton',
                    'grow_policy': 'SymmetricTree',
                    'penalties_coefficient': 1,
                    'boosting_type': 'Plain',
                    'model_shrink_mode': 'Constant',
                    'feature_border_type': 'GreedyLogSum',
                    'l2_leaf_reg': 3,
                    'random_strength': 1,
                    'rsm': 1,
                    'boost_from_average': False,
                    'model_size_reg': 0.5,
                    'subsample': 0.800000011920929,
                    'use_best_model': False,
                    'class_names': [0, 1],
                    'random_seed': 2584,
                    'depth': 6,
                    'posterior_sampling': False,
                    'border_count': 254,
                    'classes_count': 0,
                    'auto_class_weights': 'None',
                    'sparse_features_conflict_fraction': 0,
                    'leaf_estimation_backtracking': 'AnyImprovement',
                    'best_model_min_trees': 1,
                    'model_shrink_rate': 0,
                    'min_data_in_leaf': 1,
                    'loss_function': 'Logloss',
                    'learning_rate': 0.028116999194025993,
                    'score_function': 'Cosine',
                    'task_type': 'CPU',
                    'leaf_estimation_iterations': 10,
                    'bootstrap_type': 'MVS',
                    'max_leaves': 64
                    }
    #-------------------------------------------------------------------------------------
    # 5 Kfold cross validation
    s_fold = StratifiedKFold(n_splits=5, shuffle=True ,random_state=42)    

    for train_idx, val_idx in s_fold.split(train_x, train_y):

        # split train, validation set
        X = train_x.iloc[train_idx]
        y = train_y.iloc[train_idx]
        valid_x = train_x.iloc[val_idx]
        valid_y = train_y.iloc[val_idx]

        #run traning
        model = CatBoostClassifier(**params, verbose=0)
        model.fit(X, y)

        # cal valid prediction
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        # cal scores
        recall    = recall_score(    valid_y, valid_pred)
        precision = precision_score( valid_y, valid_pred)
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        recalls.append(recall)
        precisions.append(precision)
        auc_scores.append(auc_score)
        
    return models, auc_scores, recalls, precisions

In [90]:
# loss 비교를 위해 지우지 않습니다.
# 최고점 모델입니다.
models, auc_scores, _, _ = catb_fold_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

0.7301499999999999


In [91]:
pred_y_list = []
for model in models:
    pred_y = model.predict_proba(y)
    pred_y_list.append(pred_y[:, 1].reshape(-1,1))
    
pred_ensemble_catb = np.mean(pred_y_list, axis = 0)

In [92]:
pred_ensemble_catb

array([[0.93218016],
       [0.22254721],
       [0.48239791],
       ...,
       [0.70990508],
       [0.8953148 ],
       [0.43733988]])

### Tuned LGBM

In [94]:
def lgbm_fold_train_pred(train_x, train_y):

    # Train
    models     = []
    recalls    = []
    precisions = []
    auc_scores   = []
    threshold = 0.5
    # 파라미터 설정
    params =      {
                    'boosting_type':'gbdt', 
                    'class_weight':None,
                    'colsample_bytree':1.0,
                    'importance_type':'split',
                    'learning_rate':0.1,
                    'max_depth':-1,
                    'min_child_samples':20,
                    'min_child_weight':0.001,
                    'min_split_gain':0.0,
                    'n_estimators':100,
                    'n_jobs':-1,
                    'num_leaves':31,
                    'objective':None,
                    'random_state':2584,
                    'reg_alpha':0.0,
                    'reg_lambda':0.0,
                    'silent':True,
                    'subsample':1.0,
                    'subsample_for_bin':200000,
                    'subsample_freq':0
                    }
    #-------------------------------------------------------------------------------------
    # 5 Kfold cross validation
    s_fold = StratifiedKFold(n_splits=5, shuffle=True ,random_state=42)    

    for train_idx, val_idx in s_fold.split(train_x, train_y):

        # split train, validation set
        X = train_x.iloc[train_idx]
        y = train_y.iloc[train_idx]
        valid_x = train_x.iloc[val_idx]
        valid_y = train_y.iloc[val_idx]

        #run traning
        model = LGBMClassifier(**params)
        model.fit(X, y)

        # cal valid prediction
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        # cal scores
        recall    = recall_score(    valid_y, valid_pred)
        precision = precision_score( valid_y, valid_pred)
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        recalls.append(recall)
        precisions.append(precision)
        auc_scores.append(auc_score)
        
    return models, auc_scores, recalls, precisions

In [97]:
models, auc_scores, _, _ = lgbm_fold_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

0.73295


In [98]:
pred_y_list = []
for model in models:
    pred_y = model.predict_proba(y)
    pred_y_list.append(pred_y[:, 1].reshape(-1,1))
    
pred_ensemble_lgbm = np.mean(pred_y_list, axis = 0)

In [99]:
pred_ensemble_lgbm

array([[0.94799909],
       [0.22247434],
       [0.48895694],
       ...,
       [0.74520416],
       [0.87992041],
       [0.34860365]])

### Tuned GBC

In [101]:
gbc = GradientBoostingClassifier(
                                ccp_alpha=0.0,
                                criterion='friedman_mse',
                                init=None,
                                learning_rate=0.1,
                                loss='deviance',
                                max_depth=3,
                                max_features=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                min_weight_fraction_leaf=0.0,
                                n_estimators=100,
                                n_iter_no_change=None,
                                presort='deprecated',
                                random_state=2584,
                                subsample=1.0,
                                tol=0.0001,
                                validation_fraction=0.1,
                                verbose=0,
                                warm_start=False
                            )

In [102]:
def gbc_fold_train_pred(train_x, train_y, N_SPLIT=5):

    # Train
    models     = []
    recalls    = []
    precisions = []
    auc_scores   = []
    threshold = 0.5

    k_fold = StratifiedKFold(n_splits=N_SPLIT, shuffle=True, random_state=42)
    for train_idx, val_idx in k_fold.split(train_x, train_y):

        # split train, validation set
        X = train_x.iloc[train_idx]
        y = train_y.iloc[train_idx]
        valid_x = train_x.iloc[val_idx]
        valid_y = train_y.iloc[val_idx]

        #run traning
        model = gbc.fit(X, y)

        # cal valid prediction
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > threshold, 1, 0)

        # cal scores
        recall    = recall_score(    valid_y, valid_pred)
        precision = precision_score( valid_y, valid_pred)
        auc_score = roc_auc_score(   valid_y, valid_prob)
        
        # append scores
        models.append(model)
        recalls.append(recall)
        precisions.append(precision)
        auc_scores.append(auc_score)
    
    return models, auc_scores, recalls, precisions

In [103]:
%%time
models, auc_scores, _, _ = gbc_fold_train_pred(X, train_b_p.target)
print(np.mean(auc_scores))

0.72455
Wall time: 36.9 s


In [104]:
pred_y_list = []
for model in models:
    pred_y = model.predict_proba(y)
    pred_y_list.append(pred_y[:, 1].reshape(-1,1))
    
pred_ensemble_gbc = np.mean(pred_y_list, axis = 0)

In [105]:
pred_ensemble_gbc

array([[0.94729912],
       [0.20899576],
       [0.48470505],
       ...,
       [0.75559137],
       [0.87481054],
       [0.34822194]])

## Ensemble

In [106]:
pred_ensemble_catb

array([[0.93218016],
       [0.22254721],
       [0.48239791],
       ...,
       [0.70990508],
       [0.8953148 ],
       [0.43733988]])

In [107]:
pred_ensemble_lgbm

array([[0.94799909],
       [0.22247434],
       [0.48895694],
       ...,
       [0.74520416],
       [0.87992041],
       [0.34860365]])

In [108]:
pred_ensemble_gbc

array([[0.94729912],
       [0.20899576],
       [0.48470505],
       ...,
       [0.75559137],
       [0.87481054],
       [0.34822194]])

In [109]:
pred_ensemble_best = (
                            pred_ensemble_catb +
                            pred_ensemble_lgbm  +
                            pred_ensemble_gbc
                        ) / 3

submission.problem = pred_ensemble_best
submission

Unnamed: 0,user_id,problem
0,30000,0.942493
1,30001,0.218006
2,30002,0.485353
3,30003,0.791206
4,30004,0.898756
...,...,...
14994,44994,0.206501
14995,44995,0.288811
14996,44996,0.736900
14997,44997,0.883349


끝