In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.metrics import roc_auc_score  
from sklearn.model_selection import KFold,StratifiedKFold,RepeatedStratifiedKFold      
from bayes_opt import BayesianOptimization    
from functools import partial               
import lightgbm as lgb                      
import warnings
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore") 

pd.set_option('display.max_columns', 1500)

In [None]:
# datetime 함수

def make_datetime(x):
    """Convert a ``YYYYMMDDHH...`` timestamp into an hour-resolution datetime.

    Only the first ten characters of ``str(x)`` are read; anything after the
    hour (minutes, seconds) is ignored.
    """
    stamp = str(x)
    fields = (stamp[:4], stamp[4:6], stamp[6:8], stamp[8:10])
    return dt.datetime(*(int(field) for field in fields))

In [None]:
# Load the training data: quality log, error log and problem reports.

train_qual = pd.read_csv('./data/train_quality_data.csv')
train_err = pd.read_csv('./data/train_err_data.csv')
train_problem = pd.read_csv('./data/train_problem_data.csv')

In [None]:
# Load the test data: error log, quality log and the submission template.

test_err = pd.read_csv('./data/test_err_data.csv')
test_qual = pd.read_csv('./data/test_quality_data.csv')
test_submission = pd.read_csv('./data/sample_submission.csv')

In [None]:
# Parse the raw timestamps, then store weekend = 1 / weekday = 0.
# `dt.day_name()` returns English names regardless of the process locale
# (unlike strftime('%A')), so the English lookup table below can never
# silently produce NaN on a machine with a non-English locale.

d1 = {'Monday':0, 'Tuesday':0, 'Wednesday':0, 'Thursday':0,
      'Friday':0, 'Saturday':1, 'Sunday':1}

for err_frame in (train_err, test_err):
    err_frame['datetime'] = err_frame['time'].apply(make_datetime)
    err_frame['dayname'] = err_frame['datetime'].dt.day_name()
    err_frame['we_or_wk'] = err_frame['dayname'].map(d1)

In [None]:
# Keep only November 2020 rows (October and December are almost empty in both
# train and test). `.copy()` makes each result an independent frame, so the
# column assignments in later cells do not write into a slice of the raw data.

NOV_START = pd.Timestamp('2020-11-01 00:00:00')
NOV_END = pd.Timestamp('2020-11-30 23:59:59')

train_err = train_err.loc[train_err['datetime'].between(NOV_START, NOV_END)].copy()
test_err = test_err.loc[test_err['datetime'].between(NOV_START, NOV_END)].copy()

In [None]:
# Split the timestamp of every error row into ISO week, day of month and hour.

for err_frame in (train_err, test_err):
    stamps = err_frame['datetime']
    err_frame['week'] = stamps.dt.isocalendar().week
    err_frame['day'] = stamps.dt.day
    err_frame['hour'] = stamps.dt.hour

# 파생변수 생성
+ 유저별 최신 펌웨어 버전 (fwver_most_recent)
+ 유저별 시간대별 (시간대를 4 섹션으로 구분) error 발생 횟수 카운트 (time_count)
+ 유저별 가장 오래 사용한 펌웨어 버전 (longest_used_fwver)
+ 유저별 펌웨어 버전 변화하는 것 잡아내는 것 (fwver_change)
+ 유저별 가장 많이 사용한 펌웨어 버전 (most_used_fwver)
+ train_problem에 있는 펌웨어 버전에 해당하는지 (problem_fwver_check)
+ train_problem에 있는 펌웨어 버전에 몇 번 해당하는지 (problem_fwver_check_count)
+ 유저별 connection error 카운트 (connection_user_count)
+ err 데이터에서 유저별 요일별 error 발생 횟수 카운트 (day_count)
+ 유저별 펌웨어 버전 변경 횟수 카운트 (fw_changed_count)
+ 요일별 errtype의 평균, 표준편차, 최댓값 (day_errtype)
+ 유저 - 모델명 사용 여부만 (user_model_count1)
+ 유저 - 모델명 err에 찍힌 개수만큼 (user_model_count2)
+ 유저 - 모델명 err에 찍힌 개수만큼 (연월일 기준 중복 제거) (user_model_count3)
+ 주말/주중, 주별, 일별, 시간별 에러코드의 평균, 표준편차, 최댓값

+ model_nm별 카운트
+ err_type별 카운트
+ err_code 상위 50개 카운트
+ quality issue 유무
+ connect 관련 error 발생 카운트
+ 전체 err_type 합계

## fwver_most_recent

In [None]:
def fwver_most_recent(data):
    """Return each user's most recent firmware version as a short label.

    The row with the largest ``time`` per ``user_id`` is selected, the firmware
    string is cut to its first five characters, ``'8.5.3'`` is collapsed to
    ``'8.5'`` and leading zeros are stripped.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id``, ``time`` and ``fwver`` columns. It is not
        modified (the original implementation sorted it in place).

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``user_id`` and ``fwver``.
    """
    latest = (
        data[['user_id', 'time', 'fwver']]
        # stable sort keeps the outcome deterministic when timestamps tie
        .sort_values(by=['user_id', 'time'], kind='stable')
        .drop_duplicates('user_id', keep='last')
    )
    short = latest['fwver'].str[:5].replace('8.5.3', '8.5').str.lstrip('0')
    return pd.DataFrame({
        'user_id': latest['user_id'].to_numpy(),
        'fwver': short.to_numpy(),
    })

## time_count

In [3]:
def time_count1(x):
    """Return 1 when the two-character hour string ``x`` is in '06'..'11', else 0."""
    return 1 if '06' <= x <= '11' else 0

def time_count2(x):
    """Return 1 when the two-character hour string ``x`` is in '12'..'17', else 0."""
    return 1 if '12' <= x <= '17' else 0
    
def time_count3(x):
    """Return 1 when the two-character hour string ``x`` is in '18'..'23', else 0."""
    return 1 if '18' <= x <= '23' else 0

def time_count4(x):
    """Return 1 when the two-character hour string ``x`` is in '00'..'05', else 0."""
    return 1 if '00' <= x <= '05' else 0

In [None]:
def time_count(data):
    """Count each user's error rows in four six-hour windows of the day.

    The hour is taken from characters 8-9 of the stringified ``time`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``time`` columns. It is left untouched:
        the original version replaced ``time`` with the hour string and added
        helper columns to the caller's frame, which broke every later step
        that parses ``time`` as a full timestamp.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``user_id``, ``time_06_11``,
        ``time_12_17``, ``time_18_23`` and ``time_00_05``.
    """
    hour = data['time'].astype(str).str[8:10]
    # column name -> inclusive (first, last) hour, compared as zero-padded strings
    windows = {
        'time_06_11': ('06', '11'),
        'time_12_17': ('12', '17'),
        'time_18_23': ('18', '23'),
        'time_00_05': ('00', '05'),
    }

    flags = pd.DataFrame({'user_id': data['user_id']})
    for column, (first, last) in windows.items():
        flags[column] = hour.between(first, last).astype(int)

    # a list (not a tuple) of columns is required after groupby on current pandas
    return flags.groupby('user_id')[list(windows)].sum().reset_index()

## longest_used_fwver

In [None]:
def make_datetime2(x):
    """Convert a ``YYYYMMDD...`` timestamp into a date-resolution datetime.

    Only the first eight characters of ``str(x)`` are read; the time of day
    is ignored.
    """
    stamp = str(x)
    fields = (stamp[:4], stamp[4:6], stamp[6:8])
    return dt.datetime(*(int(field) for field in fields))

In [None]:
def most_used_fwver(data):
    """One-hot encode the firmware version each user logged most often.

    NOTE: a later cell (section "most_used_fwver") defines a function with the
    same name and a different signature; after Run All that later definition
    is the one in effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns. It is not modified
        (the original added a ``datetime2`` column to the caller's frame and
        grouped per day before summing, which yields the same totals as the
        direct count used here).

    Returns
    -------
    pandas.DataFrame
        ``user_id`` plus one ``fwv_<version>`` column per version that is the
        top version of at least one user; 1 marks the user's top version.
    """
    counts = data.groupby(['user_id', 'fwver']).size().reset_index(name='fwver_count')
    top = (
        counts
        # stable sort: ties go to the lexicographically smallest version
        .sort_values('fwver_count', ascending=False, kind='stable')
        .drop_duplicates('user_id', keep='first')
        .sort_values('user_id', ascending=True)
        .assign(count=1)
    )
    table = pd.pivot_table(top, index='user_id', columns='fwver',
                           values='count', fill_value=0).reset_index()
    table.columns = ['user_id'] + ['fwv_' + str(name) for name in table.columns[1:]]
    return table

## fwver_change

In [None]:
# fwver 변화하는 것 잡아내는 함수

def get_fwver_change(data):
    """Collect the distinct firmware-version combinations seen per user.

    Each user contributes one tuple holding the firmware versions found in
    their rows, in ascending (string-sorted) order - not chronological order.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns. It is not modified
        (the original added a ``datetime2`` column to the caller's frame).

    Returns
    -------
    list of tuple
        The distinct tuples across all users, in no particular order.
    """
    pairs = (
        data[['user_id', 'fwver']]
        .dropna()
        .drop_duplicates()
        .sort_values(['user_id', 'fwver'])
    )
    # one grouped pass instead of filtering the whole frame once per user
    combos = pairs.groupby('user_id')['fwver'].agg(tuple)
    return list(set(combos))

In [None]:
# Firmware-version combinations observed in the train and test error logs.
tr_fw, te_fw = (get_fwver_change(err_frame) for err_frame in (train_err, test_err))

In [None]:
# Union of the firmware combinations observed in train and test.
# Sorted so that the feature columns built from this list come out in the same
# order on every run (plain set iteration order changes with string hashing).

fw_change_list = sorted(set(tr_fw + te_fw))
fw_change_list

In [None]:
# fwver 변화 잡는거 이어서 데이터프레임 만들기

def make_fwver_change(data, train=True):
    """One-hot encode each user's firmware-version combination.

    A user's combination is the '_'-joined, string-sorted set of firmware
    versions in their rows. One column is created per entry of the global
    ``fw_change_list``; a user gets 1 in the column matching their combination
    and 0 elsewhere. Users without rows in ``data`` get all zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns. It is not modified
        (the original added a ``datetime2`` column to the caller's frame).
    train : bool
        True -> user ids 10000..24999, False -> user ids 30000..44998.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by one 0/1 column per combination, in
        ``fw_change_list`` order. Combinations missing from ``fw_change_list``
        are ignored.
    """
    pairs = (
        data[['user_id', 'fwver']]
        .dropna()
        .drop_duplicates()
        .sort_values(['user_id', 'fwver'])
    )
    combo_by_user = pairs.groupby('user_id')['fwver'].agg('_'.join)

    user_ids = range(10000, 25000) if train else range(30000, 44999)
    fwver_change = pd.DataFrame({'user_id': user_ids})
    user_combo = fwver_change['user_id'].map(combo_by_user)

    # vectorised comparison per column replaces one .loc write per user
    for col in fw_change_list:
        column = '_'.join(col)
        fwver_change[column] = (user_combo == column).astype(int)

    return fwver_change

## most_used_fwver

In [None]:
def most_used_fwver(data, train = True):
    """One-hot encode each user's most frequently logged firmware version.

    One ``fwv_<version>`` column is created per entry of the global
    ``fw_list`` (not defined in this part of the notebook; it must exist
    before this function is called).

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns. It is not modified
        (the original added a ``datetime2`` column to the caller's frame).
    train : bool
        True -> user ids 10000..24999, False -> user ids 30000..44998.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by the 0/1 ``fwv_*`` columns. Users without rows,
        or whose top version is not in ``fw_list``, get all zeros.
    """
    counts = data.groupby(['user_id', 'fwver']).size().reset_index(name='fwver_count')
    top = (
        counts
        # stable sort: ties go to the lexicographically smallest version
        .sort_values(by='fwver_count', ascending=False, kind='stable')
        .drop_duplicates('user_id', keep='first')
    )
    top_by_user = top.set_index('user_id')['fwver']

    user_ids = range(10000, 25000) if train else range(30000, 44999)
    most_used = pd.DataFrame({'user_id': user_ids})
    user_top = most_used['user_id'].map(top_by_user)

    # the original indexed columns with a one-element Series per user; a
    # vectorised comparison per version gives the same flags in one pass
    for col in fw_list:
        most_used['fwv_' + col] = (user_top == col).astype(int)

    return most_used

## problem_fwver_check

In [None]:
def problem_fwver_check(data, train = True):
    """Flag, per user, which firmware versions from ``train_problem_fw_list`` they logged.

    ``train_problem_fw_list`` is a global that is not defined in this part of
    the notebook; it must exist before this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns.
    train : bool
        True -> user ids 10000..24999, False -> user ids 30000..44998.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by one 0/1 column per listed firmware version.
    """
    user_ids = range(10000, 25000) if train else range(30000, 44999)
    flags = pd.DataFrame({'user_id': user_ids})

    # distinct (user, version) pairs restricted to the listed versions; this
    # replaces re-filtering the full log for every user/version combination
    seen = data.loc[data['fwver'].isin(train_problem_fw_list), ['user_id', 'fwver']].drop_duplicates()

    for col in train_problem_fw_list:
        hit_users = seen.loc[seen['fwver'] == col, 'user_id']
        flags[col] = flags['user_id'].isin(hit_users).astype(int)

    return flags

## problem_fwver_check_count

In [None]:
def problem_fwver_check_count(data, train=True):
    """Count how many of the listed problem firmware versions each user logged.

    Builds the 0/1 table from ``problem_fwver_check`` and sums it per row.
    (The original called itself here, which recursed without end.)

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``fwver`` columns.
    train : bool, default True
        Forwarded to ``problem_fwver_check`` to pick the user-id range; pass
        False for the test set.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``problem_fw_count``.
    """
    flags = problem_fwver_check(data, train)
    totals = flags[['user_id']].copy()
    # every column after user_id is a 0/1 firmware flag
    totals['problem_fw_count'] = flags[flags.columns[1:]].sum(axis = 1)
    return totals

## connection_err_count

In [None]:
def connection_err_count(data):
    """Count each user's rows for five connection-related error codes.

    Returns one row per user present in ``data``; users with none of the
    listed codes get 0 in every code column.
    """
    connection_codes = [
        'connection timeout',
        'connection fail to establish',
        'connectionterminated by local host',
        'connection fail for LMP response timout',
        'L2CAP connection cancelled',
    ]
    events = data[['user_id', 'time', 'errcode']].assign(count=1)
    connection_events = events[events['errcode'].isin(connection_codes)]
    per_user = pd.pivot_table(
        connection_events, index='user_id', columns='errcode',
        values='count', aggfunc='sum', fill_value=0,
    ).reset_index()
    all_users = events[['user_id']].drop_duplicates().reset_index(drop=True)
    # left merge keeps users that never hit a connection error
    return pd.merge(all_users, per_user, how='left').fillna(0)

## day_count

In [None]:
def user_dayname_count(data):
    """Count each user's error rows per weekday name.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``dayname`` (English weekday name).

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by ``Monday`` .. ``Sunday`` counts. A weekday
        that never occurs in ``data`` becomes an all-zero column (the original
        raised KeyError in that case).
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    counts = data.groupby(['user_id', 'dayname']).size().unstack(fill_value=0)
    return counts.reindex(columns=weekdays, fill_value=0).reset_index()

## day_errtype

In [None]:
def day_errtype(data, num_of_users, first_index):
    """Per user and error type: mean, std and max of the counts over the 7 weekdays.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id``, ``errtype`` (1..42) and ``datetime``.
        It is not modified. The original overwrote ``data['day']`` with the
        ISO weekday, destroying the day-of-month column that ``dy`` reads.
    num_of_users : int
        Number of rows of the result.
    first_index : int
        User id that maps to row 0; row ``i`` belongs to ``first_index + i``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mean, std, max)``, each of shape ``(num_of_users, 41)``; the column
        for error type 29 is dropped (presumably that type never occurs -
        TODO confirm).
    """
    n_errtypes = 42

    # ISO weekday 1 (Monday) .. 7 (Sunday), kept local instead of stored on `data`
    weekday = data['datetime'].dt.dayofweek + 1
    counts = (
        data[['user_id', 'errtype']]
        .assign(weekday=weekday)
        .groupby(['user_id', 'weekday', 'errtype'])
        .size()
        .reset_index(name='counts')  # explicit name; independent of pandas version
    )

    # (user, errtype, weekday) cube filled in one vectorised assignment
    day_data = np.zeros((num_of_users, n_errtypes, 7))
    day_data[
        counts['user_id'].to_numpy(dtype=int) - first_index,
        counts['errtype'].to_numpy(dtype=int) - 1,
        counts['weekday'].to_numpy(dtype=int) - 1,
    ] = counts['counts'].to_numpy()

    frames = []
    for stat, values in (('mean', day_data.mean(axis = 2)),
                         ('std', day_data.std(axis = 2)),
                         ('max', day_data.max(axis = 2))):
        prefix = 'err_type_day_' + stat
        names = [prefix + str(i) for i in range(1, n_errtypes + 1)]
        frames.append(pd.DataFrame(values, columns = names).drop(columns = prefix + '29'))

    ddmean, ddstd, ddmax = frames
    return ddmean, ddstd, ddmax

In [None]:
# Weekday-level error-type statistics for the training log (15000 rows, user ids starting at 10000).
train_errtype_day_mean, train_errtype_day_std, train_errtype_day_max = day_errtype(train_err, 15000, 10000)

In [None]:
# Weekday-level error-type statistics for the test log (14999 rows, user ids starting at 30000).
test_errtype_day_mean, test_errtype_day_std, test_errtype_day_max = day_errtype(test_err, 14999, 30000)

## user_model_count1 (유저 - 모델명 사용 여부만)

In [None]:
def user_model_count1(data):
    """Flag which model names appear in each user's rows (1 = seen, 0 = not seen)."""
    usage = data[['user_id', 'time', 'model_nm']].assign(count=1)
    # default aggregation (mean of the constant 1) yields 1 for every seen pair
    return pd.pivot_table(
        usage, index='user_id', columns='model_nm', values='count', fill_value=0
    ).reset_index()

## user_model_count2 (유저 - 모델명 err에 찍힌 개수만큼)

In [None]:
def user_model_count2(data):
    """Count each user's error rows per model name.

    The original also built a ``count`` column and a per-row date string that
    were never used; that dead work is removed. The result is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Error log with ``user_id`` and ``model_nm`` columns.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by one count column per model name.
    """
    counts = data.groupby(['user_id', 'model_nm']).size().reset_index(name='model_count')
    return pd.pivot_table(
        counts, index = 'user_id', columns = 'model_nm', values = 'model_count', fill_value = 0
    ).reset_index()

## user_model_count3 (유저 - 모델명 err에 찍힌 개수만큼 (연월일 기준 중복 제거))

In [None]:
def user_model_count3(data):
    """Count, per user and model name, the number of distinct dates with an error row.

    The date is the first eight characters of the stringified ``time`` value.
    """
    daily = data[['user_id', 'time', 'model_nm']].assign(count=1)
    daily['time'] = daily['time'].apply(lambda stamp: str(stamp)[:8])
    daily = daily.drop_duplicates(['user_id', 'time', 'model_nm'], keep = 'last')
    counts = daily.groupby(['user_id', 'model_nm']).size().reset_index(name='model_count')
    return pd.pivot_table(
        counts, index = 'user_id', columns = 'model_nm', values = 'model_count', fill_value = 0
    ).reset_index()

## 주말/주중, 주별, 일별, 시간별 에러코드의 평균, 표준편차, 최댓값

In [None]:
def we_or_wk(df,num_df_user,first_index):
    """Per user and error type: mean, std and max of the weekday vs weekend counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Error log with ``user_id``, ``errtype`` (1..42), ``we_or_wk``
        (0 = weekday, 1 = weekend) and ``datetime``. Only November 2020 rows
        are used.
    num_df_user : int
        Number of rows of the result.
    first_index : int
        User id that maps to row 0; row ``i`` belongs to ``first_index + i``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mean, std, max)``, each ``(num_df_user, 41)``; the column for error
        type 29 is dropped (presumably that type never occurs - TODO confirm).
    """
    n_errtypes = 42
    in_november = df['datetime'].between(pd.Timestamp('2020-11-01 00:00:00'),
                                         pd.Timestamp('2020-11-30 23:59:59'))
    counts = (
        df.loc[in_november, ['user_id', 'we_or_wk', 'errtype']]
        .groupby(['user_id', 'we_or_wk', 'errtype'])
        .size()
        .reset_index(name='counts')  # explicit name; independent of pandas version
    )
    counts = counts[counts['we_or_wk'].isin([0, 1])]

    # (user, errtype, weekday/weekend) cube filled in one vectorised assignment
    day_data = np.zeros((num_df_user, n_errtypes, 2))
    day_data[
        counts['user_id'].to_numpy(dtype=int) - first_index,
        counts['errtype'].to_numpy(dtype=int) - 1,
        counts['we_or_wk'].to_numpy(dtype=int),
    ] = counts['counts'].to_numpy()

    frames = []
    for stat, values in (('mean', day_data.mean(axis = 2)),
                         ('std', day_data.std(axis = 2)),
                         ('max', day_data.max(axis = 2))):
        prefix = 'err_type_weorwk_' + stat
        names = [prefix + str(i) for i in range(1, n_errtypes + 1)]
        frames.append(pd.DataFrame(values, columns = names).drop(columns = prefix + '29'))

    m, std, m_2 = frames
    return m, std, m_2

In [None]:
def wk(df,num_df_user,first_index):
    """Per user and error type: mean, std and max of the counts over ISO weeks 44-48.

    Parameters
    ----------
    df : pandas.DataFrame
        Error log with ``user_id``, ``errtype`` (1..42), ``week`` (ISO week
        number) and ``datetime``. Only November 2020 rows are used.
    num_df_user : int
        Number of rows of the result.
    first_index : int
        User id that maps to row 0; row ``i`` belongs to ``first_index + i``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mean, std, max)``, each ``(num_df_user, 41)``; the column for error
        type 29 is dropped (presumably that type never occurs - TODO confirm).
    """
    n_errtypes = 42
    first_week, last_week = 44, 48
    # NOTE(review): 2020-11-30 falls in ISO week 49, so its rows are excluded
    # here exactly as in the original implementation - confirm this is intended.

    in_november = df['datetime'].between(pd.Timestamp('2020-11-01 00:00:00'),
                                         pd.Timestamp('2020-11-30 23:59:59'))
    counts = (
        df.loc[in_november, ['user_id', 'week', 'errtype']]
        .groupby(['user_id', 'week', 'errtype'])
        .size()
        .reset_index(name='counts')  # explicit name; independent of pandas version
    )
    counts = counts[counts['week'].between(first_week, last_week)]

    # (user, errtype, week) cube filled in one vectorised assignment
    day_data = np.zeros((num_df_user, n_errtypes, last_week - first_week + 1))
    day_data[
        counts['user_id'].to_numpy(dtype=int) - first_index,
        counts['errtype'].to_numpy(dtype=int) - 1,
        counts['week'].to_numpy(dtype=int) - first_week,
    ] = counts['counts'].to_numpy()

    frames = []
    for stat, values in (('mean', day_data.mean(axis = 2)),
                         ('std', day_data.std(axis = 2)),
                         ('max', day_data.max(axis = 2))):
        prefix = 'err_type_wk_' + stat
        names = [prefix + str(i) for i in range(1, n_errtypes + 1)]
        frames.append(pd.DataFrame(values, columns = names).drop(columns = prefix + '29'))

    m, std, m_2 = frames
    return m, std, m_2

In [None]:
def dy(df,num_df_user,first_index): 
    """Per user and error type: mean, std and max of the counts over days 1-30.

    Parameters
    ----------
    df : pandas.DataFrame
        Error log with ``user_id``, ``errtype`` (1..42), ``day`` (day of
        month) and ``datetime``. Only November 2020 rows are used.
    num_df_user : int
        Number of rows of the result.
    first_index : int
        User id that maps to row 0; row ``i`` belongs to ``first_index + i``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mean, std, max)``, each ``(num_df_user, 41)``; the column for error
        type 29 is dropped (presumably that type never occurs - TODO confirm).
    """
    n_errtypes = 42
    n_days = 30

    in_november = df['datetime'].between(pd.Timestamp('2020-11-01 00:00:00'),
                                         pd.Timestamp('2020-11-30 23:59:59'))
    counts = (
        df.loc[in_november, ['user_id', 'day', 'errtype']]
        .groupby(['user_id', 'day', 'errtype'])
        .size()
        .reset_index(name='counts')  # explicit name; independent of pandas version
    )
    counts = counts[counts['day'].between(1, n_days)]

    # (user, errtype, day) cube filled in one vectorised assignment
    day_data = np.zeros((num_df_user, n_errtypes, n_days))
    day_data[
        counts['user_id'].to_numpy(dtype=int) - first_index,
        counts['errtype'].to_numpy(dtype=int) - 1,
        counts['day'].to_numpy(dtype=int) - 1,
    ] = counts['counts'].to_numpy()

    frames = []
    for stat, values in (('mean', day_data.mean(axis=2)),
                         ('std', day_data.std(axis=2)),
                         ('max', day_data.max(axis=2))):
        prefix = 'err_type_dy_' + stat
        names = [prefix + str(i) for i in range(1, n_errtypes + 1)]
        frames.append(pd.DataFrame(values, columns=names).drop(columns=prefix + '29'))

    m, std, m_2 = frames
    return m,std,m_2

In [None]:
def hr(df,num_df_user,first_index):     
    """Per user and error type: mean, std and max of the counts over hours 0-23.

    Parameters
    ----------
    df : pandas.DataFrame
        Error log with ``user_id``, ``errtype`` (1..42), ``hour`` (0..23) and
        ``datetime``. Only November 2020 rows are used.
    num_df_user : int
        Number of rows of the result.
    first_index : int
        User id that maps to row 0; row ``i`` belongs to ``first_index + i``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mean, std, max)``, each ``(num_df_user, 41)``; the column for error
        type 29 is dropped (presumably that type never occurs - TODO confirm).
    """
    n_errtypes = 42
    n_hours = 24

    in_november = df['datetime'].between(pd.Timestamp('2020-11-01 00:00:00'),
                                         pd.Timestamp('2020-11-30 23:59:59'))
    counts = (
        df.loc[in_november, ['user_id', 'hour', 'errtype']]
        .groupby(['user_id', 'hour', 'errtype'])
        .size()
        .reset_index(name='counts')  # explicit name; independent of pandas version
    )
    counts = counts[counts['hour'].between(0, n_hours - 1)]

    # (user, errtype, hour) cube filled in one vectorised assignment
    day_data = np.zeros((num_df_user, n_errtypes, n_hours))
    day_data[
        counts['user_id'].to_numpy(dtype=int) - first_index,
        counts['errtype'].to_numpy(dtype=int) - 1,
        counts['hour'].to_numpy(dtype=int),
    ] = counts['counts'].to_numpy()

    frames = []
    for stat, values in (('mean', day_data.mean(axis=2)),
                         ('std', day_data.std(axis=2)),
                         ('max', day_data.max(axis=2))):
        prefix = 'err_type_hr_' + stat
        names = [prefix + str(i) for i in range(1, n_errtypes + 1)]
        frames.append(pd.DataFrame(values, columns=names).drop(columns=prefix + '29'))

    m, std, m_2 = frames
    return m,std,m_2

In [None]:
# Merge error codes that differ only by surrounding whitespace.

for err_frame in (train_err, test_err):
    err_frame['errcode'] = err_frame['errcode'].astype(str).str.strip()

In [None]:
def make_int(row):
    """Parse a quality-log value into an int.

    Commas are treated as thousands separators and removed. A value with a
    decimal point is parsed as a float and truncated. The original deleted the
    '.' character instead, which turned '-1.0' into -10 and '3.0' into 30 for
    float-typed columns.

    Raises
    ------
    ValueError
        If the text is not numeric (including 'nan').
    """
    text = str(row).replace(',', '').strip()
    try:
        # direct path keeps large integers exact
        return int(text)
    except ValueError:
        return int(float(text))

def make_col_prefix(dataframe):
    """Return the column labels with ``'errtype_'`` prepended to all-digit labels.

    Labels whose string form is not purely digits are returned unchanged.
    """
    return [
        'errtype_' + str(label) if str(label).isdigit() else label
        for label in dataframe.columns
    ]

def make_dataframe(err, qual, train=True):
    """Assemble the per-user feature table (plus the ``angry`` label for train).

    Features, in column order: model_nm usage flags, errtype counts, counts of
    the 50 most frequent training errcodes, ``quality_issue`` flag, summed
    quality log columns, weekday/weekend error counts, ``total_err``, the
    precomputed time / firmware / problem-firmware frames, and the
    weekend-, week-, day- and hour-level error-type statistics.

    Fixes relative to the original implementation:
    * the weekend pivot was stored in a local named ``wk``, which shadowed the
      ``wk`` function and made the later ``wk(err, ...)`` call fail;
    * the statistics frames were attached with a positional ``concat`` and so
      misaligned whenever a user id was missing or out of order; they are now
      merged on ``user_id``;
    * the quality log is summed with ``numeric_only=True`` so the string
      ``fwver`` column does not break the aggregation;
    * the caller's ``qual`` frame is no longer modified.

    Parameters
    ----------
    err : pandas.DataFrame
        Error log with ``user_id``, ``model_nm``, ``errtype``, ``errcode``,
        ``we_or_wk``, ``week``, ``day``, ``hour`` and ``datetime``. A constant
        ``count`` column is added to it (kept in place to avoid copying the
        full log).
    qual : pandas.DataFrame
        Quality log with ``user_id``, ``time`` and ``quality_0``..``quality_12``.
    train : bool
        True builds the training table (users taken from ``err``, label
        added); False builds the test table (users from ``test_submission``).

    Returns
    -------
    pandas.DataFrame
        ``user_id`` first; for train, ``angry`` is the last column.

    Notes
    -----
    Reads these globals: ``test_submission``, ``train_err``, ``train_problem``
    and ``time_tr``/``fwver_tr``/``pro_tr_fwver_cnt`` (or the ``*_te``
    counterparts), none of which are created in this part of the notebook.
    """
    if train:
        df = pd.DataFrame({'user_id': err['user_id'].unique()})  # users present in the error log
        num_users, first_index = 15000, 10000
    else:
        df = test_submission[['user_id']].copy()
        num_users, first_index = 14999, 30000

    err['count'] = 1  # helper column used as the pivot value

    # 1) model_nm: usage flag per model name
    print('making model_nm data...')
    err_model_nm = pd.pivot_table(err, index='user_id', columns='model_nm', values='count', fill_value=0)
    err_model_nm = err_model_nm.reset_index()
    print('done.')

    # 2) errtype: row count per error type
    print('making errtype data...')
    err_type = pd.pivot_table(err, index='user_id', columns='errtype', values='count', aggfunc='sum', fill_value=0)
    err_type = err_type.reset_index()
    err_type.columns = make_col_prefix(err_type)
    print('done.')

    # 3) errcode: row count for the 50 most frequent errcodes of the TRAINING log
    print('making errcode data...')
    errcode_50 = train_err['errcode'].value_counts(ascending=False)[:50].index
    err_code = err[err['errcode'].isin(errcode_50)]
    err_code = pd.pivot_table(err_code, index=['user_id'], columns='errcode', values='count', fill_value=0, aggfunc='sum').reset_index()
    print('done.')

    # 4) quality_issue: users that appear in the quality log at all
    print('making quality_issue data...')
    qual_id = qual.user_id.unique()
    print('done.')

    # 5) quality_0 ~ quality_12: fill NaN with 0, parse to int, sum per user
    print('making quality_log data...')
    qual = qual.fillna(0)  # new frame; the caller's data stays as loaded
    for i in range(13):
        qual[f'quality_{i}'] = qual[f'quality_{i}'].apply(make_int)
    qual = qual.groupby(['user_id']).sum(numeric_only=True).reset_index()
    qual = qual.drop(columns='time', errors='ignore')
    print('done.')

    # 6) weekend: row count on weekdays (0) and weekends (1)
    print('making weekend data...')
    weekend_counts = pd.pivot_table(err, index=['user_id'], columns=['we_or_wk'], values='count', aggfunc='sum', fill_value=0).reset_index()
    weekend_counts.columns = ['user_id'] + ['weekend_' + str(i) for i in weekend_counts.columns[1:]]
    print('done.')

    # merge order: model_nm, errtype, errcode, quality_issue, quality_0~12, weekend
    print('Merging data into DataFrame...')
    df = pd.merge(df, err_model_nm, on=['user_id'], how='left')
    df = pd.merge(df, err_type, on=['user_id'], how='left')
    df = pd.merge(df, err_code, on=['user_id'], how='left')
    df['quality_issue'] = 0
    df.loc[df['user_id'].isin(qual_id), 'quality_issue'] = 1
    df = pd.merge(df, qual, on=['user_id'], how='left')
    df = pd.merge(df, weekend_counts, on=['user_id'], how='left')

    # total error count over errtype 1..42 except 29; reindex tolerates an
    # error type that is absent from this log
    errtype_cols = ['errtype_' + str(i) for i in range(1, 43) if i != 29]
    df['total_err'] = df.reindex(columns=errtype_cols).sum(axis=1)

    # precomputed derived features
    if train:
        extra_frames = (time_tr, fwver_tr, pro_tr_fwver_cnt)
    else:
        extra_frames = (time_te, fwver_te, pro_te_fwver_cnt)
    for extra in extra_frames:
        df = pd.merge(df, extra, on=['user_id'], how='left')
    print('done.')

    # weekend-, week-, day- and hour-level error-type statistics.
    # Row i of every statistics frame belongs to user first_index + i, so the
    # frames are keyed by that id and merged instead of concatenated by position.
    stat_frames = []
    for build_stats in (we_or_wk, wk, dy, hr):
        stat_frames.extend(build_stats(err, num_users, first_index))
    stats = pd.concat(stat_frames, axis=1)
    stats.insert(0, 'user_id', np.arange(first_index, first_index + num_users))
    df = pd.merge(df, stats, on=['user_id'], how='left')

    print('done.')

    df = df.fillna(0)  # users without a quality log / some feature get 0

    if train:  # label: 1 when the user filed a problem report, else 0
        df['angry'] = df['user_id'].isin(train_problem['user_id']).astype(int)

    return df

In [None]:
# Build the training feature table (third argument True = train mode, which also adds the `angry` label).
train_df = make_dataframe(train_err, train_qual, True)

In [None]:
# Build the test feature table (third argument False = test mode, no label column).
test_df = make_dataframe(test_err, test_qual, False)

## Modeling 1

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
# from bayes_opt import BayesianOptimization

In [None]:
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold,StratifiedKFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings                             
warnings.filterwarnings("ignore") 

In [None]:
# Features and target for the LightGBM run, selected by column name so the
# split does not depend on `user_id` being first and `angry` being last.
train_x = train_df.drop(columns=['user_id', 'angry'])
test_x = test_df.drop(columns=['user_id'])
y = train_df['angry']

In [None]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    """Stratified K-fold cross-validation of an LGBMClassifier.

    Parameters
    ----------
    num_leaves, learning_rate, n_estimators, subsample, colsample_bytree,
    reg_alpha, reg_lambda : float
        Hyper-parameters as proposed by the optimiser. ``num_leaves`` and
        ``n_estimators`` are truncated to int; ``num_leaves`` is raised to at
        least 2 and the two fraction parameters are clipped into (0, 1],
        because LightGBM rejects values outside those ranges.
    x_data : pandas.DataFrame
        Feature matrix.
    y_data : array-like
        Binary labels; indexed by position, so a Series with any index works.
    n_splits : int
        Number of folds.
    output : {'score', 'model'}
        'score' returns the mean validation AUC, 'model' the fitted models.

    Returns
    -------
    float or list
        See ``output``.

    Raises
    ------
    ValueError
        If ``output`` is neither 'score' nor 'model' (the original trained
        every fold and then silently returned None).
    """
    if output not in ('score', 'model'):
        raise ValueError("output must be 'score' or 'model', got %r" % (output,))

    min_fraction = 1e-3  # smallest sampling fraction passed on to LightGBM
    labels = np.asarray(y_data)

    score = 0
    kf = StratifiedKFold(n_splits=n_splits,random_state=2021,shuffle = True)
    models = []
    for train_index, valid_index in kf.split(x_data, labels):
        x_train, y_train = x_data.iloc[train_index], labels[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], labels[valid_index]

        model = lgb.LGBMClassifier(
            num_leaves = max(2, int(num_leaves)),
            learning_rate = learning_rate,
            n_estimators = int(n_estimators),
            subsample = np.clip(subsample, min_fraction, 1),
            colsample_bytree = np.clip(colsample_bytree, min_fraction, 1),
            reg_alpha = reg_alpha,
            reg_lambda = reg_lambda
        )

        # NOTE(review): these fit keywords follow the LightGBM API this notebook
        # was written against; newer releases expect callbacks instead - confirm
        # against the installed version.
        model.fit(x_train, y_train,eval_set=[(x_valid,y_valid)],early_stopping_rounds=300,eval_metric=['auc'],verbose=0)
        best_iter = model.best_iteration_
        models.append(model)

        pred = model.predict_proba(x_valid,num_iteration=best_iter)[:, 1]
        score += roc_auc_score(y_valid, pred)/n_splits

    if output == 'score':
        return score
    return models

In [None]:
# Fix the arguments that are not hyper-parameters (data, fold count, output mode).
# NOTE(review): the next cell rebuilds `lgbBO` with the lower bound of
# subsample / colsample_bytree raised to 0.1, so the result of this run is
# overwritten - consider keeping only one of the two cells.
func_fixed = partial(lgb_cv, x_data=train_x, y_data=y, n_splits=5, output='score') 
# Search bounds for the Bayesian optimisation
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (1, 500),        # num_leaves,       range 1 - 500
        'learning_rate': (0.0001, 0.1),  # learning_rate,    range 0.0001 - 0.1
        'n_estimators': (1, 2000),      # n_estimators,     range 1 - 2000
        'subsample': (0, 1),             # subsample,        range 0 - 1
        'colsample_bytree': (0, 1),      # colsample_bytree, range 0 - 1
        'reg_alpha': (0, 10),            # reg_alpha,        range 0 - 10
        'reg_lambda': (0, 50),           # reg_lambda,       range 0 - 50
    }, 
    random_state=4321                    # fixed seed
)
lgbBO.maximize(init_points=15, n_iter=20) # init_points=15 initial evaluations, then n_iter=20 optimisation steps

In [None]:
# Fix the arguments that are not hyper-parameters (data, fold count, output mode).
func_fixed = partial(lgb_cv, x_data=train_x, y_data=y, n_splits=5, output='score') 
# Search bounds for the Bayesian optimisation
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (1, 500),        # num_leaves,       range 1 - 500
        'learning_rate': (0.0001, 0.1),  # learning_rate,    range 0.0001 - 0.1
        'n_estimators': (1, 2000),      # n_estimators,     range 1 - 2000
        'subsample': (0.1, 1),             # subsample,        range 0.1 - 1
        'colsample_bytree': (0.1, 1),      # colsample_bytree, range 0.1 - 1
        'reg_alpha': (0, 10),            # reg_alpha,        range 0 - 10
        'reg_lambda': (0, 50),           # reg_lambda,       range 0 - 50
    }, 
    random_state=4321                    # fixed seed
)
lgbBO.maximize(init_points=15, n_iter=20) # init_points=15 initial evaluations, then n_iter=20 optimisation steps


In [None]:
# Hyper-parameters of the best-scoring trial; shown as the cell output.
params = lgbBO.max['params']
params

In [None]:
# Refit with the best hyper-parameters on 10 folds and keep the fitted fold models.
params = lgbBO.max['params']
models = lgb_cv(**params, x_data=train_x, y_data=y, n_splits=10, output='model')

In [None]:
# Cross-validated AUC of the best hyper-parameters on 5 folds.
params = lgbBO.max['params']
score = lgb_cv(**params, x_data=train_x, y_data=y, n_splits=5, output='score')

In [None]:
# Model score (value returned by lgb_cv with output='score')
score

In [None]:
# Average the positive-class probabilities of all fold models on the test set.
preds = [fold_model.predict_proba(test_x)[:, 1] for fold_model in models]
pred = np.mean(preds, axis=0)

In [None]:
# Attach the averaged predictions to the submission template and display it.
test_submission['problem'] = pred
test_submission

## Modeling 2

In [None]:
# Inputs for the CatBoost run: label array plus feature frames without id / label columns.
y = train_df['angry'].to_numpy()
train_x = train_df.drop(columns=['user_id', 'angry'])
test_x = test_df.drop(columns='user_id')

In [None]:
def catboost_modeling(x_train, y_train, x_test, grow_policy, depth, learning_rate, l2_leaf_reg, random_seed,n_split):
    """Train one CatBoostClassifier per stratified fold and average the test predictions.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Binary labels; indexed by position, so a Series with any index works
        (the original used label-based indexing on whatever was passed).
    x_test : pandas.DataFrame
        Test features.
    grow_policy, depth, learning_rate, l2_leaf_reg, random_seed
        Passed through to ``CatBoostClassifier``.
    n_split : int
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(test_pred, cat_model, aucs)``: the fold-averaged positive-class
        probability as a Series indexed like ``x_test``, the fitted models,
        and each fold's best validation AUC.
    """
    cat_model = []
    aucs = []
    labels = np.asarray(y_train)

    # float accumulator for the fold-averaged test probabilities
    test_pred = pd.Series(0.0, index=x_test.index)

    kf = StratifiedKFold(n_splits=n_split,random_state=2021,shuffle = True)
    for train_index, valid_index in kf.split(x_train, labels):
        train_X, train_y = x_train.iloc[train_index], labels[train_index]
        valid_X, valid_y = x_train.iloc[valid_index], labels[valid_index]

        model = CatBoostClassifier(eval_metric = 'AUC',
                                 iterations = 2000,
                                 metric_period = 100,
                                 early_stopping_rounds = 300,
                                 task_type = 'CPU',
                                 grow_policy = grow_policy,
                                 depth = depth,
                                 learning_rate = learning_rate,
                                 l2_leaf_reg = l2_leaf_reg,
                                 random_seed = random_seed
                                 )

        model.fit(train_X, train_y, eval_set=(valid_X, valid_y))
        aucs.append(model.best_score_['validation']['AUC'])
        cat_model.append(model)

        test_pred += model.predict_proba(x_test)[:,1] / (n_split)

    return test_pred,cat_model,aucs

In [None]:
# 10-fold CatBoost run: grow_policy='Depthwise', depth=10, learning_rate=0.03, l2_leaf_reg=30, random_seed=2021.
cat_result,models,aucs=catboost_modeling(train_x, y, test_x, 'Depthwise', 10, 0.03, 30, 2021,10)

In [None]:
# Mean validation AUC over the folds (divides by the actual number of folds
# instead of a hard-coded 10, so it stays correct if n_split changes).
sum(aucs) / len(aucs)

In [None]:
# Mean of the test-set predictions of the k-fold models.
preds = [fold_model.predict_proba(test_x)[:, 1] for fold_model in models]
pred = np.mean(preds, axis=0)

In [None]:
# Save the submission file. The template is read from ./data/, the same
# location the test-data cell loads it from (the original looked for it in the
# working directory instead).
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['problem'] = pred
sample_submission.to_csv('CatBoost_result.csv', index=False)