In [3]:
from pycaret.classification import *
import collections
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import pickle as pkl
warnings.filterwarnings(action='ignore') 

## 데이터 삽입

In [2]:
train_err  = pd.read_csv('data/train_err_data.csv')
# 에러 데이터
# 유저 아이디, 시간, 모델, 펌웨어, 에러타입, 에러코드
train_qual = pd.read_csv('data/train_quality_data.csv')
# 지표 데이터
# 문제 발생 시점으로부터 10분 간격으로 2시간치 수집
# err 데이터와 별개
train_prob = pd.read_csv('data/train_problem_data.csv')
# 라벨 데이터
# 불만콜 여부, 시간

test_err  = pd.read_csv('data/test_err_data.csv')
test_qual = pd.read_csv('data/test_quality_data.csv')

missing_value = pd.Series([43262, train_err.time.mode()[0], train_err.model_nm.mode()[0], train_err.fwver.mode()[0], train_err.errtype.mode()[0], train_err.errcode.mode()[0]], index=test_err.columns)
test_err = test_err.append(missing_value, ignore_index=True)
# 값이 하나도 없는 고객이 있어서 훈련 데이터의 최빈값을 삽입

## Feature

### 고객별 각 에러 발생 횟수

* 각 에러 코드를 피쳐로, 횟수를 값으로 가짐

In [47]:
def count_errcode(data):
    idx_cnt = len(data['user_id'].unique())
    errcode_cnt = 42
    error = np.zeros((idx_cnt, errcode_cnt))
    min_id = min(data['user_id'])
    
    idx_error = data[['user_id','errtype']].values
    for person_idx, err in tqdm(idx_error):
        error[person_idx - min_id, err - 1] += 1
        
    result = pd.DataFrame(data=error, columns = ['err_' + str(i+1) + '_cnt' for i in range(42)]) 
    
    return result

빈도가 낮은 에러타입을 제외해봤으나 성능이 매우 떨어짐 : 전부 써야함

In [5]:
# err_list = list(train_err['errtype'].value_counts(normalize = True)[train_err['errtype'].value_counts(normalize = True).values >= 0.001].index)

In [6]:
# def count_errcode2(data):
#     idx_cnt = len(data['user_id'].unique())
#     errcode_cnt = 42
#     error = np.zeros((idx_cnt, errcode_cnt))
#     min_id = min(data['user_id'])
    
#     idx_error = data[['user_id','errtype']].values
#     for person_idx, err in tqdm(idx_error):
#         if err in err_list:
#             error[person_idx - min_id, err - 1] += 1
        
#     result = pd.DataFrame(data=error, columns = ['err_' + str(i+1) + '_cnt' for i in range(42)]) 
    
#     for i in result.columns:
#         if sum(result[i]) == 0:
#             result.drop(i, axis=1, inplace=True)
    
#     return result

### 3일간 최대 많이 발생한 횟수

In [7]:
# def scope_errtype(data):
#     scope = 3
#     for i in tqdm(range(len(data['user_id'].unique()))):
#         user_id = i + min(data['user_id'])
#         user_errtype_dict = {i : 0 for i in range(1, 43)}
#         for day in range(20201101, 20201130 - scope):
#             target = train_err2[train_err2['user_id'] == user_id]
#             target = target[target['date'].between(day, day + scope - 1, inclusive = True)]
#             scope_dict = dict(target.errtype.value_counts())

#             for errtype in scope_dict:
#                 user_errtype_dict[errtype] = max(user_errtype_dict[errtype], scope_dict[errtype])

In [55]:
def scope_errtype(data, scope = 3, option = 'max'):
    tqdm.pandas()
    data2 = data.copy()
    data2['date'] = data['time'].apply(lambda x: int(str(x)[:8]))
    
    errtype_array = np.zeros((len(data2['user_id'].unique()), 30, 42)) # user_id, date, errtype 
    user_id_min = min(data2['user_id'])
    
    def myfunc(x):
        for day in range(x['date'] % 100, min(31, x['date'] % 100 + scope)):
            errtype_array[x['user_id'] - user_id_min, day-1, x['errtype']-1] += 1
    
    data2.progress_apply(myfunc, axis=1)
    
    np.save('custom_data/scope_{}.npy'.format(scope), errtype_array)
    errtype_array_max = errtype_array.max(axis=1)
    errtype_array_min = errtype_array.min(axis=1)
    errtype_array_std = errtype_array.std(axis=1)
    
    if option == 'max': # 최대값만
        return pd.DataFrame(data=errtype_array_max, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_max' for i in range(42)])
    elif option == 'min': # 최소값만
        return pd.DataFrame(data=errtype_array_min, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_min' for i in range(42)])
    elif option == 'std' : # 표준편차
        return pd.DataFrame(data=errtype_array_std, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_std' for i in range(42)])
    elif option == 'all': # 최대값, 최소값, 표준편차
        return pd.concat([pd.DataFrame(data=errtype_array_max, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_max' for i in range(42)]), pd.DataFrame(data=errtype_array_min, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_min' for i in range(42)]), pd.DataFrame(data=errtype_array_std, columns = ['err_' + str(i+1) + '_scope_' + str(scope) + '_std' for i in range(42)])], axis=1)

### 시간당 최대 발생 횟수

In [258]:
def max_errtype_hour(data):
    tqdm.pandas()
    data2 = data.copy()
    
    user_id_min = min(data2['user_id'])
    errtype_array = np.zeros((len(data2['user_id'].unique()),42))
    
    data2['hour'] = data['time'].apply(lambda x: int(str(x)[:10]))
    data2 = data2.groupby(['user_id', 'hour', 'errtype']).size().reset_index()
    data2 = data2.groupby(['user_id', 'errtype']).max().reset_index()[['user_id', 'errtype', 0]]
    data2.columns = ['user_id', 'errtype', 'max_count']
    
    for idx, x in data2.iterrows():
        errtype_array[x['user_id'] - user_id_min, x['errtype']-1] = x['max_count']
        
    return pd.DataFrame(data = errtype_array, columns = ['err_{}_hour_max'.format(i+1) for i in range(42)])
    

### 일별 상위 퍼센트

In [304]:
# def avg_errtype_rank(data):
#     tqdm.pandas()
#     data2 = data.copy()

#     user_id_min = min(data2['user_id'])
#     errtype_array = np.zeros((len(data2['user_id'].unique()), 42)) # user_id, date, errtype
    
#     data2['date'] = data['time'].apply(lambda x: int(str(x)[:8]))
#     data2 = data2.groupby(['user_id', 'date', 'errtype']).size().reset_index()
#     data2.columns = ['user_id', 'date', 'errtype', 'count']
#     data2['rank'] = data2.groupby(['errtype', 'date'])['count'].rank(method = 'min', ascending=True)
#     data2 = data2.groupby(['user_id', 'errtype'])['rank'].max().reset_index()
    
#     for idx, x in tqdm(data2.iterrows()):
#         x = x.astype(int)
#         errtype_array[x['user_id'] - user_id_min, x['errtype']-1] = x['rank']
    
#     return pd.DataFrame(data = errtype_array, columns = ['err_{}_rank_avg'.format(i+1) for i in range(42)])

In [367]:
def sum_errtype_rank(data):
    tqdm.pandas()
    data2 = data.copy()

    user_id_min = min(data2['user_id'])
    errtype_array = np.zeros((len(data2['user_id'].unique()), 42)) # user_id, date, errtype
    
    data2['date'] = data['time'].apply(lambda x: int(str(x)[:8]))
    data2 = data2.groupby(['user_id', 'date', 'errtype']).size().reset_index()
    data2.columns = ['user_id', 'date', 'errtype', 'count']
    data2['rank'] = data2.groupby(['errtype', 'date'])['count'].rank(method = 'min', ascending=True)
    
    for day in tqdm(range(20201101, 20201131)):
        for errtype in range(1, 43):
            data2.loc[(data2['errtype'] == errtype) & (data2['date'] == day), 'rank'] += (15000 - len(data2[(data2['errtype'] == errtype) & (data2['date'] == day)]))

    data2 = data2.groupby(['user_id', 'errtype'])['rank'].sum().reset_index()
    
    for idx, x in tqdm(data2.iterrows()):
        x = x.astype(int)
        errtype_array[x['user_id'] - user_id_min, x['errtype']-1] = x['rank']/15000
    
    return pd.DataFrame(data = errtype_array, columns = ['err_{}_rank_avg'.format(i+1) for i in range(42)])

### 합치기

In [None]:
count_err = count_errcode(train_err)

In [None]:
scope_3 = scope_errtype(train_err, scope = 3, option = 'all')

In [None]:
scope_7 = scope_errtype(train_err, scope = 7, option = 'all')

In [None]:
err_count_hour = max_errtype_hour(train_err)

In [368]:
err_rank_sum = sum_errtype_rank(train_err)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:41<00:00,  1.37s/it]
231039it [00:29, 7958.18it/s]


----

## 모델링 및 평가

In [359]:
set_config('seed', 77)

In [360]:
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1

In [361]:
train = pd.concat([count_err, scope_3, scope_7, err_count_hour, err_rank], axis=1)
train['problem'] = problem

In [362]:
for i in train.columns:
    if 'min' in i :
        train.drop(i, axis='columns', inplace=True)
    if 'diff' in i :
        train.drop(i, axis='columns', inplace=True)

In [363]:
clf = setup(data = train, target = 'problem') 

Unnamed: 0,Description,Value
0,session_id,7398
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 295)"
5,Missing Values,False
6,Numeric Features,294
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [364]:
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7873,0.8212,0.5128,0.7585,0.6115,0.4727,0.49,11.02
lightgbm,Light Gradient Boosting Machine,0.7864,0.8168,0.5282,0.7442,0.6177,0.4754,0.4889,0.571
gbc,Gradient Boosting Classifier,0.7832,0.8157,0.4837,0.7678,0.5931,0.4557,0.4784,1.773
et,Extra Trees Classifier,0.7865,0.8125,0.5032,0.7634,0.6062,0.4683,0.4876,0.446
rf,Random Forest Classifier,0.7795,0.8089,0.4784,0.7579,0.5862,0.4464,0.4685,0.508
xgboost,Extreme Gradient Boosting,0.7793,0.8038,0.5405,0.7152,0.6155,0.465,0.474,1.843
ada,Ada Boost Classifier,0.7773,0.7992,0.4889,0.7424,0.589,0.4452,0.4638,0.366
lda,Linear Discriminant Analysis,0.7748,0.7921,0.4368,0.7768,0.5587,0.4233,0.4551,0.164
lr,Logistic Regression,0.7504,0.7319,0.3838,0.7233,0.5008,0.3549,0.3868,2.264
nb,Naive Bayes,0.7165,0.7299,0.2606,0.6718,0.3746,0.2355,0.2799,0.019


In [365]:
blended = blend_models(estimator_list = best_5, fold = 10, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7733,0.815,0.4855,0.7325,0.5839,0.4368,0.4543
1,0.8057,0.8499,0.5378,0.8043,0.6446,0.5181,0.5379
2,0.781,0.8097,0.4767,0.7664,0.5878,0.4495,0.4729
3,0.7952,0.8267,0.5291,0.7745,0.6287,0.4941,0.5112
4,0.7914,0.8225,0.5015,0.7818,0.611,0.4777,0.4996
5,0.7695,0.8105,0.4461,0.7463,0.5584,0.4156,0.4408
6,0.7867,0.8123,0.4898,0.7742,0.6,0.4644,0.487
7,0.7933,0.8209,0.5102,0.7812,0.6173,0.4841,0.5048
8,0.8067,0.8552,0.5423,0.8017,0.647,0.5206,0.5394
9,0.7865,0.8175,0.4985,0.7668,0.6042,0.4669,0.4872


In [366]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7865,0.834,0.5172,0.798,0.6277,0.4874,0.5099


| name | min | new_feature | cat | blended_auc |
|:---:|:-------:|:---:|---|:---:|
| train | O | O | 0.8246 | 0.8269 |
| train2 | X | X | 0.812 | 0.829 |
| train3 | O | X | 0.8109 | 0.8229 |
| train4 | X | O | 0.8175 | 0.832 |

| err_rank | cat | blended_auc |
|:---:|---|:---:|
| O | 0.8222 | 0.834 |
| X | 0.8175 | 0.832 |

----

In [281]:
train_scope_3 = scope_errtype(train_err, scope = 3, option = 'all')
train_scope_7 = scope_errtype(train_err, scope = 7, option = 'all')
train_err_count_hour = max_errtype_hour(train_err)
train_err_rank_sum = sum_errtype_rank(train_err)
train_data = pd.concat([train_scope_3, train_scope_7, train_err_count_hour, train_err_rank_sum], axis=1)
train_data.to_csv("custom_data/kio_train_0201.csv", index=False)

100%|███████████████████████████████████████████████████████████████████| 16554663/16554663 [13:21<00:00, 20665.58it/s]
100%|███████████████████████████████████████████████████████████████████| 16554663/16554663 [23:32<00:00, 11721.66it/s]


In [282]:
test_scope_3 = scope_errtype(test_err, scope = 3, option = 'all')
test_scope_7 = scope_errtype(test_err, scope = 7, option = 'all')
test_err_count_hour = max_errtype_hour(test_err)
test_err_rank_sum = sum_errtype_rank(test_err)
test_data = pd.concat([test_scope_3, test_scope_7, test_err_count_hour, test_err_rank_sum], axis=1)
test_data.to_csv("custom_data/kio_test_0201.csv", index=False)

100%|███████████████████████████████████████████████████████████████████| 16532649/16532649 [13:14<00:00, 20815.17it/s]
100%|███████████████████████████████████████████████████████████████████| 16532649/16532649 [23:47<00:00, 11583.39it/s]
