In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

def sort_time(x):
    x     = str(x)
    hour  = int(x[8:10])
    if (hour>=0) and (hour<5): # 새벽
        t = 0
    elif (hour>=5) and (hour<12): # 오전
        t = 1
    elif (hour>=12) and (hour<18): # 오후
        t = 2
    else: # 저녁
        t = 3
    return t

train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

PATH = 'D:/[0] Study/[9] 데이터분석/lg_system_err_data/'

In [2]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_err['datetime'] = train_err['time'].apply(make_datetime)
train_err['time_category'] = train_err['time'].apply(sort_time)

In [3]:
train_quality = pd.read_csv(PATH+'train_quality_data.csv')
train_quality['datetime'] = train_quality['time'].apply(make_datetime)

In [4]:
train_problem = pd.read_csv(PATH+'train_problem_data.csv')

In [5]:
train_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,datetime,time_category
0,10000,20201101025616,model_3,05.15.2138,15,1,2020-11-01 02:00:00,0
1,10000,20201101030309,model_3,05.15.2138,12,1,2020-11-01 03:00:00,0
2,10000,20201101030309,model_3,05.15.2138,11,1,2020-11-01 03:00:00,0
3,10000,20201101050514,model_3,05.15.2138,16,1,2020-11-01 05:00:00,1
4,10000,20201101050515,model_3,05.15.2138,4,0,2020-11-01 05:00:00,1


In [6]:
train_quality.head()

Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,datetime
0,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0,2020-11-29 09:00:00
1,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0,2020-11-29 09:00:00
2,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0,2020-11-29 09:00:00
3,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0,2020-11-29 09:00:00
4,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0,2020-11-29 09:00:00


### 라벨(1, 0)

In [7]:
problem = np.zeros(15000)
# error와 동일한 방법으로 person_idx -10000 위치에
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_problem.user_id.unique()-10000] = 1
problem.shape

(15000,)

In [8]:
problem

array([0., 1., 0., ..., 1., 1., 0.])

### Input 만들기

- model_nm: categorical data
- errtype: categorical data
- errcode: categorical data
- time_category: categorical data
- quality: categorical data

#### model_nm

In [9]:
# 모든 일자에 대해서 model_nm count
# numpy로 placeholder를 만들어 구현함.
# categorical data라 encoding 필요
id_model_nm = train_err[['user_id','model_nm']].values
n_model_nm = len(train_err.model_nm.unique())

In [10]:
model_nm_table = train_err.groupby(['user_id'])['model_nm'].apply(lambda x: x.value_counts().head(1))

In [11]:
model_nm = []
for i in range(len(model_nm_table)):
    model_nm.append(model_nm_table.keys()[:][i][1])

In [12]:
model_nm

['model_3',
 'model_2',
 'model_3',
 'model_2',
 'model_0',
 'model_0',
 'model_1',
 'model_0',
 'model_0',
 'model_1',
 'model_1',
 'model_0',
 'model_2',
 'model_0',
 'model_2',
 'model_0',
 'model_3',
 'model_2',
 'model_3',
 'model_3',
 'model_2',
 'model_1',
 'model_2',
 'model_1',
 'model_2',
 'model_0',
 'model_3',
 'model_3',
 'model_2',
 'model_0',
 'model_3',
 'model_2',
 'model_1',
 'model_2',
 'model_2',
 'model_3',
 'model_3',
 'model_3',
 'model_2',
 'model_1',
 'model_2',
 'model_1',
 'model_0',
 'model_3',
 'model_0',
 'model_2',
 'model_1',
 'model_2',
 'model_2',
 'model_3',
 'model_1',
 'model_3',
 'model_3',
 'model_3',
 'model_2',
 'model_1',
 'model_2',
 'model_3',
 'model_2',
 'model_0',
 'model_3',
 'model_1',
 'model_2',
 'model_2',
 'model_2',
 'model_0',
 'model_0',
 'model_2',
 'model_3',
 'model_3',
 'model_2',
 'model_3',
 'model_3',
 'model_0',
 'model_2',
 'model_7',
 'model_3',
 'model_1',
 'model_1',
 'model_1',
 'model_4',
 'model_0',
 'model_1',
 'mo

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
model_nm = np.reshape(model_nm, (len(model_nm), 1))

In [15]:
onehot_encoder = OneHotEncoder()
model_nm_onehot = onehot_encoder.fit_transform(model_nm)

In [16]:
model_nm_onehot = model_nm_onehot.toarray()

In [17]:
model_nm_onehot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### errtype

In [18]:
# 모든 일자에 대해서 errtype count
# numpy로 placeholder를 만들어 구현함.
id_errtype = train_err[['user_id','errtype']].values
n_errtype = len(train_err.errtype.unique())
errtype = np.zeros((train_user_number,42))

for person_idx, errt in tqdm(id_errtype):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    errtype[person_idx - train_user_id_min, errt - 1] += 1
errtype.shape

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16554663/16554663 [00:47<00:00, 350046.62it/s]


(15000, 42)

In [19]:
errtype

array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

#### errcode

In [20]:
# 모든 일자에 대해서 errcode count
# numpy로 placeholder를 만들어 구현함.
errcode = train_err.groupby(['user_id', 'errcode']).size().unstack(fill_value=0)
errcode.head()

errcode,-269,-270,0,0001,1,100,10005,10018,10043,10073,...,Y-00008,active,connection fail for LMP response timout,connection fail to establish,connection timeout,connectionterminated by local host,http,scanning timeout,standby,terminate by peer user
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0,0,104,0,212,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10001,0,0,183,0,1274,0,0,0,0,0,...,0,126,0,0,0,0,0,0,625,0
10002,0,0,132,0,172,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,42,0,229,0,0,0,0,0,...,0,12,0,0,7,0,0,0,0,0
10004,0,0,98,0,529,0,0,0,0,0,...,0,7,0,1,104,0,0,0,5,2


In [21]:
errcode.shape

(15000, 2805)

#### time_category

In [22]:
# 모든 일자에 대해서 time_category count
# numpy로 placeholder를 만들어 구현함.
time_category = train_err.groupby(['user_id', 'time_category']).size().unstack(fill_value=0)
time_category.head()

time_category,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000,103,121,45,48
10001,212,221,1068,864
10002,50,98,48,110
10003,110,79,19,98
10004,145,176,102,354


In [23]:
time_category = time_category.to_numpy()

In [24]:
time_category

array([[ 103,  121,   45,   48],
       [ 212,  221, 1068,  864],
       [  50,   98,   48,  110],
       ...,
       [ 191,  292,   82,  261],
       [  26,   70,   15,   44],
       [  51,  299,  179,   41]], dtype=int64)

#### quality

In [25]:
quality = train_quality.groupby(['user_id'])[['quality_0','quality_1','quality_2','quality_3','quality_4','quality_5','quality_6','quality_7','quality_8','quality_9','quality_10','quality_11', 'quality_12']].sum().reset_index()

In [26]:
quality

Unnamed: 0,user_id,quality_0,quality_1,quality_2,quality_3,quality_4,quality_6,quality_11,quality_12
0,10000,0.0,0,0.0,0,0,0,0,0
1,10002,0.0,-2,-1.0,0,0,44,-2,0
2,10004,-2.0,-2,-2.0,0,0,85,-2,0
3,10005,-10.0,-10,-10.0,0,0,26,-10,0
4,10006,0.0,0,0.0,0,0,4,0,0
...,...,...,...,...,...,...,...,...,...
8276,24990,0.0,-8,-8.0,0,0,-8,-8,0
8277,24992,-12.0,-12,-12.0,0,0,-12,-12,0
8278,24993,-7.0,-7,-7.0,0,0,-7,-7,0
8279,24995,-10.0,-10,-10.0,0,0,-10,-10,0


In [27]:
train_quality_base = pd.DataFrame(np.zeros((15000, 1))).reset_index().rename(columns={"index": "user_id"})

In [28]:
user_id = train_quality_base['user_id']+10000

In [29]:
train_quality_base['user_id'] = user_id

In [30]:
train_quality_base

Unnamed: 0,user_id,0
0,10000,0.0
1,10001,0.0
2,10002,0.0
3,10003,0.0
4,10004,0.0
...,...,...
14995,24995,0.0
14996,24996,0.0
14997,24997,0.0
14998,24998,0.0


In [31]:
merge_outer = pd.merge(quality,train_quality_base, how='right',on='user_id')

In [32]:
merge_outer = merge_outer.iloc[:, 1:-1].fillna(0)
merge_outer.head()

Unnamed: 0,quality_0,quality_1,quality_2,quality_3,quality_4,quality_6,quality_11,quality_12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,-2.0,-1.0,0.0,0.0,44.0,-2.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.0,-2.0,-2.0,0.0,0.0,85.0,-2.0,0.0


In [33]:
quality_list = merge_outer.columns

### 테스트 피처만들기

In [46]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [47]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_err['time_category'] = test_err['time'].apply(sort_time)
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,time_category
0,30000,20201101030227,model_1,04.16.3553,31,1,0
1,30000,20201101030227,model_1,04.16.3553,33,2,0
2,30000,20201101030228,model_1,04.16.3553,15,1,0
3,30000,20201101030256,model_1,04.16.3553,22,1,0
4,30000,20201101030300,model_1,04.16.3553,11,1,0


#### model_nm

In [67]:
# 모든 일자에 대해서 model_nm count
# numpy로 placeholder를 만들어 구현함.
# categorical data라 encoding 필요
test_id_model_nm = test_err[['user_id','model_nm']].values
test_n_model_nm = len(test_err.model_nm.unique())
test_model_nm_table = test_err.groupby(['user_id'])['model_nm'].apply(lambda x: x.value_counts().head(1))
test_model_nm = []
for i in range(len(test_model_nm_table)):
    test_model_nm.append(test_model_nm_table.keys()[:][i][1])

In [69]:
test_err.groupby(['user_id'])['model_nm'].head()

0           model_1
1           model_1
2           model_1
3           model_1
4           model_1
             ...   
16531775    model_1
16531776    model_1
16531777    model_1
16531778    model_1
16531779    model_1
Name: model_nm, Length: 74801, dtype: object

In [49]:
from sklearn.preprocessing import OneHotEncoder
test_model_nm = np.reshape(test_model_nm, (len(test_model_nm), 1))
onehot_encoder = OneHotEncoder()
test_model_nm_onehot = onehot_encoder.fit_transform(test_model_nm)
test_model_nm_onehot = test_model_nm_onehot.toarray()

In [51]:
test_model_nm_onehot

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

#### errtype

In [39]:
# 모든 일자에 대해서 errtype count
# numpy로 placeholder를 만들어 구현함.
test_id_errtype = test_err[['user_id','errtype']].values
test_n_errtype = len(test_err.errtype.unique())
test_errtype = np.zeros((test_user_number,42))

for person_idx, errt in tqdm(test_id_errtype):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_errtype[person_idx - test_user_id_min, errt - 1] += 1
test_errtype.shape

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 16531775/16532648 [00:48<00:00, 342008.14it/s]


IndexError: index 14998 is out of bounds for axis 0 with size 14998

In [45]:
len(test_err.errtype.unique())

41

In [None]:
np.shape(test_errtype)

#### errcode

- training과 test에서 특성이 다름

In [None]:
# 모든 일자에 대해서 errcode count
# numpy로 placeholder를 만들어 구현함.
test_errcode = test_err.groupby(['user_id', 'errcode']).size().unstack(fill_value=0)
test_errcode

In [None]:
test_errcode_list = test_err.errcode.unique()

In [None]:
test_errcode_list

In [None]:
test_errcode.shape

#### time_category

In [None]:
# 모든 일자에 대해서 time_category count
# numpy로 placeholder를 만들어 구현함.
test_time_category = test_err.groupby(['user_id', 'time_category']).size().unstack(fill_value=0)
test_time_category.head()

In [None]:
test_time_category

In [None]:
test_time_category = test_time_category.to_numpy()

#### quality

- quality의 종류가 training, test에서 달라, 공통 특성만 사용

In [None]:
test_quality  = pd.read_csv(PATH+'test_quality_data.csv')

In [None]:
test_quality = test_quality.groupby(['user_id'])[['quality_0','quality_1','quality_2','quality_3','quality_4','quality_5','quality_6','quality_7','quality_8','quality_9','quality_10','quality_11', 'quality_12']].sum().reset_index()
test_quality_base = pd.DataFrame(np.zeros((14998, 1))).reset_index().rename(columns={"index": "user_id"})
test_user_id = test_quality_base['user_id']+30000

In [None]:
test_user_id

In [None]:
test_quality_base['user_id'] = test_user_id

In [None]:
test_merge_outer = pd.merge(test_quality,test_quality_base, how='right',on='user_id')

In [None]:
test_merge_outer = test_merge_outer.iloc[:,:-1].fillna(0)

In [None]:
test_quality_list = test_merge_outer.columns

#### feature 합치기

In [None]:
# test와 training 특성 같게 만들기
train_errcode_list = train_err.errcode.unique()

In [None]:
common_errcode_feature = list(set(train_errcode_list) & set(test_errcode_list))

In [None]:
len(common_errcode_feature)

In [None]:
train_errcode_list

In [None]:
test_errcode_list

In [None]:
errcode = errcode.loc[:,  errcode.columns.isin(common_errcode_feature)]
errcode.to_numpy()

In [None]:
test_errcode = test_errcode.loc[:,  test_errcode.columns.isin(common_errcode_feature)]
test_errcode.to_numpy()

In [None]:
train_quality = merge_outer.loc[:, quality_list]
train_quality.to_numpy()

In [None]:
test_quality = test_merge_outer.loc[:, test_quality_list]
test_quality.to_numpy()

In [None]:
test_feature = np.concatenate((test_model_nm_onehot, test_errtype, test_errcode, test_time_category, test_quality), axis=1)

### 분류 모델 비교

#### feature 합치기

In [None]:
train_feature = np.concatenate((model_nm_onehot, errtype, errcode, time_category, train_quality), axis=1)

In [None]:
train_feature.shape

In [None]:
# label
problem

### 분류 모델

In [None]:
# Import required libraries for machine learning classifiers
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('SVM', LinearSVC()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', GaussianNB()))

In [None]:
results = []
names = []
scoring = 'accuracy'

In [None]:
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=72)
    cv_results = model_selection.cross_val_score(model, train_feature, problem, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print_result = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(print_result)