In [6]:
import pandas as pd 
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm 
import gc
import random
import lightgbm as lgbm
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [7]:
# Load the raw training error log from the working directory.
train=pd.read_csv('train_err_data.csv')

In [8]:
# Quick size check of the loaded frame: (rows, columns).
train.shape

(16554663, 6)

In [9]:
# Frequency table of the 30 most common error codes in train.
# value_counts() is sorted by descending frequency, so the first 30 entries
# are the 30 most frequent codes.
top_counts = train['errcode'].value_counts().iloc[:30]
errcode_30 = top_counts.to_frame().reset_index()
errcode_30.columns = ['errcode', 'counts']
errcode_30

Unnamed: 0,errcode,counts
0,1,8906967
1,0,2599123
2,connection timeout,1835600
3,B-A8002,897863
4,80,334018
5,79,332422
6,14,263577
7,active,219238
8,2,166382
9,84,129876


### errcode_30을 원본 데이터에 left merge하면 상위 30개에 속하지 않는 errcode의 행은 counts가 NaN이 되는데, 이 점을 이용한다.

In [13]:
# Left-join the top-30 frequency table onto every log row. Rows whose errcode
# is not among the top 30 find no match and receive NaN in `counts`.
train_errcode = train.merge(errcode_30, how='left', on='errcode')
# Turn those NaNs into 0. Counts coming from the frequency table were never 0
# before the merge, so 0 works as a marker for "not in the top 30".
train_errcode['counts'] = train_errcode['counts'].fillna(value=0)

In [14]:
# Sanity check of the merge: every top-30 code carries its own frequency in
# `counts`, and the 0.0 bucket holds the rows that fall outside the top 30.
train_errcode['counts'].value_counts()

8906967.0    8906967
2599123.0    2599123
1835600.0    1835600
897863.0      897863
334018.0      334018
332422.0      332422
263577.0      263577
219238.0      219238
166382.0      166382
129876.0      129876
127744.0      127744
110370.0      110370
104138.0      104138
100940.0      100940
91223.0        91223
65992.0        65992
64799.0        64799
0.0            38497
34631.0        34631
23452.0        23452
22118.0        22118
20249.0        20249
12040.0        12040
11959.0        11959
9317.0          9317
7392.0          7392
7335.0          7335
5028.0          5028
4339.0          4339
4023.0          4023
4011.0          4011
Name: counts, dtype: int64

### counts가 0인(30개에 들지 못하는) errcode들을 'other'로 변경해준다.

In [15]:
# Work on a copy so the merged frame stays available unchanged.
train_ec = train_errcode.copy()
# Collapse every errcode outside the top 30 (marked by counts == 0) into a
# single 'other' bucket. A single .loc call is used on purpose: the chained
# form df['errcode'][mask] = ... is not guaranteed to write into the frame
# and has no effect when pandas Copy-on-Write is enabled.
train_ec.loc[train_ec['counts'] == 0, 'errcode'] = 'other'
train_ec['errcode'].value_counts()

1                                     8906967
0                                     2599123
connection timeout                    1835600
B-A8002                                897863
80                                     334018
79                                     332422
14                                     263577
active                                 219238
2                                      166382
84                                     129876
85                                     127744
standby                                110370
NFANDROID2                             104138
connection fail to establish           100940
3                                       91223
90                                      65992
89                                      64799
other                                   38497
S-61001                                 34631
95                                      23452
94                                      22118
4                                 

### errcode를 원-핫 인코딩한 뒤 user_id, model_nm, fwver를 기준으로 groupby하여 합계(sum)를 구한다.


In [16]:
# Build `group`, the key used for the later groupby (created explicitly in
# case the row order differs from data that was grouped earlier).

from sklearn.preprocessing import LabelEncoder

# Replace model_nm with integer labels 0..n_models-1.
encoder_model_tr = LabelEncoder()
label_model_tr = encoder_model_tr.fit_transform(train_ec['model_nm'])
train_ec['model_nm'] = label_model_tr

# Replace fwver with integer labels 0..n_fwver-1.
encoder_fwver_tr = LabelEncoder()
label_fwver_tr = encoder_fwver_tr.fit_transform(train_ec['fwver'])
train_ec['fwver'] = label_fwver_tr

print('train_model_nm의 라벨 값:',train_ec['model_nm'].unique())
print('train_fwver의 라벨값:',train_ec['fwver'].unique())

# The key packs the three fields into decimal positions:
#   user_id | model_nm (1 digit) | fwver (2 digits)
# That is only collision-free while model labels stay below 10 and fwver
# labels stay below 100, so fail loudly instead of silently merging groups.
if len(encoder_model_tr.classes_) > 10 or len(encoder_fwver_tr.classes_) > 100:
    raise ValueError(
        'group key would collide: it supports at most 10 model_nm labels and '
        '100 fwver labels, got %d and %d'
        % (len(encoder_model_tr.classes_), len(encoder_fwver_tr.classes_))
    )

# New variable used for grouping.
train_ec['group'] = train_ec['user_id']*1000 + train_ec['model_nm']*100 + train_ec['fwver']


train_model_nm의 라벨 값: [3 2 0 1 7 4 5 8 6]
train_fwver의 라벨값: [31 19 20 13 14  6 17  8 33 34  1 12  2 23 25 18 21 35 24 22 36 11  0 29
 16  5  9 15  7 26 32 30 10  3 28  4 27]


In [17]:
# errcode one-hot인코딩
train_errcode_OH=pd.get_dummies(train_ec,columns=['errcode'])
print(train_errcode_OH.shape)
print(train_errcode_OH.columns)

(16554663, 38)
Index(['user_id', 'time', 'model_nm', 'fwver', 'errtype', 'counts', 'group',
       'errcode_0', 'errcode_1', 'errcode_13', 'errcode_14', 'errcode_2',
       'errcode_3', 'errcode_4', 'errcode_5', 'errcode_6', 'errcode_78',
       'errcode_79', 'errcode_8.0', 'errcode_80', 'errcode_81', 'errcode_84',
       'errcode_85', 'errcode_86', 'errcode_89', 'errcode_90', 'errcode_94',
       'errcode_95', 'errcode_B-A8002', 'errcode_NFANDROID2',
       'errcode_Q-64002', 'errcode_S-61001', 'errcode_active',
       'errcode_connection fail to establish', 'errcode_connection timeout',
       'errcode_connectionterminated by local host', 'errcode_other',
       'errcode_standby'],
      dtype='object')


In [21]:
# Indicator columns to aggregate. 'errcode_other' is not in this list, so the
# catch-all bucket does not appear in the aggregated features.
train_errcode_cols = [
    'errcode_0', 'errcode_1', 'errcode_13', 'errcode_14', 'errcode_2',
    'errcode_3', 'errcode_4', 'errcode_5', 'errcode_6', 'errcode_78',
    'errcode_79', 'errcode_8.0', 'errcode_80', 'errcode_81', 'errcode_84',
    'errcode_85', 'errcode_86', 'errcode_89', 'errcode_90', 'errcode_94',
    'errcode_95', 'errcode_B-A8002', 'errcode_NFANDROID2',
    'errcode_Q-64002', 'errcode_S-61001', 'errcode_active',
    'errcode_connection fail to establish', 'errcode_connection timeout',
    'errcode_connectionterminated by local host', 'errcode_standby',
]

# Summing the 0/1 indicators inside each group gives, per group, how many
# times each errcode occurred. The columns are selected with a list: selecting
# with a bare tuple of names (groupby(...)['a', 'b']) is deprecated and raises
# in pandas >= 2.0.
errcode_train_get = train_errcode_OH.groupby(['group'])[train_errcode_cols].sum()

In [22]:
# Make sure the aggregated result is held as a DataFrame.
# NOTE(review): the grouped sum over several columns presumably already is a
# DataFrame, in which case this wrap changes nothing - confirm and drop if so.
errcode_train_get=pd.DataFrame(errcode_train_get)

In [23]:
# Show the per-group errcode counts (index = group key, one column per errcode).
errcode_train_get

Unnamed: 0_level_0,errcode_0,errcode_1,errcode_13,errcode_14,errcode_2,errcode_3,errcode_4,errcode_5,errcode_6,errcode_78,...,errcode_95,errcode_B-A8002,errcode_NFANDROID2,errcode_Q-64002,errcode_S-61001,errcode_active,errcode_connection fail to establish,errcode_connection timeout,errcode_connectionterminated by local host,errcode_standby
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000331,104.0,212.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001219,86.0,235.0,1.0,9.0,3.0,0.0,8.0,0.0,0.0,0.0,...,0.0,9.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001220,97.0,1039.0,0.0,7.0,4.0,5.0,10.0,0.0,0.0,0.0,...,0.0,44.0,28.0,0.0,0.0,126.0,0.0,0.0,0.0,625.0
10002331,132.0,172.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10003219,13.0,55.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24997013,88.0,338.0,1.0,2.0,15.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0
24997014,58.0,260.0,0.0,0.0,5.0,6.0,0.0,0.0,0.0,0.0,...,0.0,8.0,8.0,0.0,0.0,4.0,0.0,0.0,0.0,11.0
24998013,0.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998014,13.0,118.0,0.0,1.0,11.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [31]:
# Save the per-group errcode counts as CSV.
# NOTE(review): index=False drops the `group` index, so the file contains only
# the count columns and rows can be matched back to groups by position only -
# confirm that downstream code relies on row order and not on the key.
errcode_train_get.to_csv(r"errcode30_train.csv", index=False)

In [None]:
#csv

## 동일한 방식으로 test도 진행


### 우선 test와 train의 상위 errcode들의 경향을 한번 살펴보자

In [3]:
# Load the raw test error log from the working directory.
test=pd.read_csv('test_err_data.csv')

In [12]:
# To compare errcode frequencies between the two sets, take the 40 most
# common test codes; they are lined up against train's top 30 afterwards.
test_top_counts = test['errcode'].value_counts().iloc[:40]
errcode_test = test_top_counts.to_frame().reset_index()
errcode_test.columns = ['errcode', 'counts']

In [13]:
# Side-by-side frequencies per errcode: counts_x is the test count, counts_y
# the train count (NaN where a test code is not in train's top 30).
errcode_test_train = errcode_test.merge(errcode_30, how='left', on='errcode')
errcode_test_train


Unnamed: 0,errcode,counts_x,counts_y
0,1,8750154,8906967.0
1,0,2565532,2599123.0
2,connection timeout,1890632,1835600.0
3,B-A8002,855747,897863.0
4,80,326179,334018.0
5,79,324186,332422.0
6,14,316110,263577.0
7,active,225519,219238.0
8,2,155401,166382.0
9,84,140642,129876.0


#### connectionterminated by local host를 제외하면 train과 test의 errcode 빈도가 대체로 비슷한 범위에 있다. 이 errcode를 제거하면 두 데이터의 분포를 어느 정도 맞출 수 있을 것으로 보인다.

# train기준으로 test의 errcode에서 상위 30개를 추출

In [6]:
# Attach the train-derived top-30 table (errcode_30) to every test row.
# Codes that are not in that table get NaN in `counts` ...
test_errcode = test.merge(errcode_30, how='left', on='errcode')
# ... which is then replaced by 0 to mark "not in train's top 30".
test_errcode['counts'] = test_errcode['counts'].fillna(value=0)

In [7]:
# Work on a copy so the merged frame stays available unchanged.
test_ec = test_errcode.copy()
# Codes outside train's top 30 (marked by counts == 0) become 'other'.
# A single .loc call is used on purpose: the chained form
# df['errcode'][mask] = ... is not guaranteed to write into the frame and has
# no effect when pandas Copy-on-Write is enabled.
test_ec.loc[test_ec['counts'] == 0, 'errcode'] = 'other'
test_ec['errcode'].value_counts()

1                                     8750154
0                                     2565532
connection timeout                    1890632
B-A8002                                855747
80                                     326179
79                                     324186
14                                     316110
active                                 225519
2                                      155401
84                                     140642
standby                                139947
85                                     139255
connection fail to establish           109690
NFANDROID2                             108244
3                                       86451
connectionterminated by local host      84809
90                                      58820
89                                      57718
other                                   37704
S-61001                                 35026
4                                       19084
95                                

In [8]:
# Build `group`, the key used for the later groupby (created explicitly in
# case the row order differs from data that was grouped earlier).

from sklearn.preprocessing import LabelEncoder

# Replace model_nm with integer labels 0..n_models-1. The encoder is fitted
# on the test values themselves.
encoder_model_te = LabelEncoder()
label_model_te = encoder_model_te.fit_transform(test_ec['model_nm'])
test_ec['model_nm'] = label_model_te

# Replace fwver with integer labels 0..n_fwver-1, again fitted on test only.
encoder_fwver_te = LabelEncoder()
label_fwver_te = encoder_fwver_te.fit_transform(test_ec['fwver'])
test_ec['fwver'] = label_fwver_te

print('test_model_nm의 라벨 값:',test_ec['model_nm'].unique())
print('test_fwver의 라벨값:',test_ec['fwver'].unique())

# The key packs the three fields into decimal positions:
#   user_id | model_nm (1 digit) | fwver (2 digits)
# That is only collision-free while model labels stay below 10 and fwver
# labels stay below 100, so fail loudly instead of silently merging groups.
if len(encoder_model_te.classes_) > 10 or len(encoder_fwver_te.classes_) > 100:
    raise ValueError(
        'group key would collide: it supports at most 10 model_nm labels and '
        '100 fwver labels, got %d and %d'
        % (len(encoder_model_te.classes_), len(encoder_fwver_te.classes_))
    )

# New variable used for grouping.
test_ec['group'] = test_ec['user_id']*1000 + test_ec['model_nm']*100 + test_ec['fwver']

test_model_nm의 라벨 값: [1 2 3 0 4 5 6 7 8]
test_fwver의 라벨값: [ 4  6 21 32 14 16 18  2 20 13 31 26 28 36 34 23 24 22 25 27  9 39 12 11
  1 19 35  3 29  0 10 33 17  8 30 37 38  5 15  7]


In [9]:
# One-hot encode errcode: each distinct code (including 'other') becomes its
# own errcode_<code> indicator column; all other columns are kept as they are.
test_errcode_OH = pd.get_dummies(data=test_ec, columns=['errcode'])
print(test_errcode_OH.shape, test_errcode_OH.columns, sep='\n')

(16532648, 38)
Index(['user_id', 'time', 'model_nm', 'fwver', 'errtype', 'counts', 'group',
       'errcode_0', 'errcode_1', 'errcode_13', 'errcode_14', 'errcode_2',
       'errcode_3', 'errcode_4', 'errcode_5', 'errcode_6', 'errcode_78',
       'errcode_79', 'errcode_8.0', 'errcode_80', 'errcode_81', 'errcode_84',
       'errcode_85', 'errcode_86', 'errcode_89', 'errcode_90', 'errcode_94',
       'errcode_95', 'errcode_B-A8002', 'errcode_NFANDROID2',
       'errcode_Q-64002', 'errcode_S-61001', 'errcode_active',
       'errcode_connection fail to establish', 'errcode_connection timeout',
       'errcode_connectionterminated by local host', 'errcode_other',
       'errcode_standby'],
      dtype='object')


In [10]:
# Indicator columns to aggregate. 'errcode_other' is not in this list, so the
# catch-all bucket does not appear in the aggregated features.
test_errcode_cols = [
    'errcode_0', 'errcode_1', 'errcode_13', 'errcode_14', 'errcode_2',
    'errcode_3', 'errcode_4', 'errcode_5', 'errcode_6', 'errcode_78',
    'errcode_79', 'errcode_8.0', 'errcode_80', 'errcode_81', 'errcode_84',
    'errcode_85', 'errcode_86', 'errcode_89', 'errcode_90', 'errcode_94',
    'errcode_95', 'errcode_B-A8002', 'errcode_NFANDROID2',
    'errcode_Q-64002', 'errcode_S-61001', 'errcode_active',
    'errcode_connection fail to establish', 'errcode_connection timeout',
    'errcode_connectionterminated by local host', 'errcode_standby',
]

# Summing the 0/1 indicators inside each group gives, per group, how many
# times each errcode occurred. The columns are selected with a list: selecting
# with a bare tuple of names (groupby(...)['a', 'b']) is deprecated and raises
# in pandas >= 2.0.
errcode_test_get = test_errcode_OH.groupby(['group'])[test_errcode_cols].sum()

In [11]:
# Make sure the aggregated result is held as a DataFrame.
# NOTE(review): the grouped sum over several columns presumably already is a
# DataFrame, in which case this wrap changes nothing - confirm and drop if so.
errcode_test_get=pd.DataFrame(errcode_test_get)

In [12]:
# Show the per-group errcode counts (index = group key, one column per errcode).
errcode_test_get

Unnamed: 0_level_0,errcode_0,errcode_1,errcode_13,errcode_14,errcode_2,errcode_3,errcode_4,errcode_5,errcode_6,errcode_78,...,errcode_95,errcode_B-A8002,errcode_NFANDROID2,errcode_Q-64002,errcode_S-61001,errcode_active,errcode_connection fail to establish,errcode_connection timeout,errcode_connectionterminated by local host,errcode_standby
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30000104,52.0,1024.0,0.0,8.0,11.0,1.0,0.0,0.0,0.0,0.0,...,0.0,53.0,0.0,0.0,7.0,0.0,55.0,1107.0,0.0,0.0
30000106,29.0,188.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,137.0,0.0,0.0,0.0,3.0
30000221,9.0,39.0,0.0,4.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0
30001332,98.0,180.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
30002014,157.0,502.0,0.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,29.0,1.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44996014,126.0,324.0,0.0,0.0,525.0,569.0,6.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,4.0,44.0,0.0,0.0
44996016,125.0,374.0,3.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,104.0,0.0,0.0,0.0,11.0
44997014,54.0,9862.0,1.0,3.0,8.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,44.0,0.0,0.0,0.0,1010.0,10449.0,1.0,0.0
44997016,7.0,1511.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,3.0,5.0,0.0,0.0,1655.0,0.0,0.0,0.0,20.0


In [13]:
# Save the per-group errcode counts as CSV.
# NOTE(review): index=False drops the `group` index, so the file contains only
# the count columns and rows can be matched back to groups by position only -
# confirm that downstream code relies on row order and not on the key.
errcode_test_get.to_csv(r"errcode30_test.csv", index=False)