In [1]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==0.8.2 (from autogluon)
  Downloading autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.7/285.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.8.2 (from autogluon)
  Downloading autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from copy import deepcopy

from collections import Counter

In [3]:
# Mount Google Drive so the dataset and model directories under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Show up to 100 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

In [6]:
# Load the competition CSVs from the mounted Drive:
# data1 <- Q2_train.csv, data2 <- Q2_test.csv.
data1 = pd.read_csv('/content/drive/MyDrive/my_data/kt_network/q2/Q2_train.csv')
data2 = pd.read_csv('/content/drive/MyDrive/my_data/kt_network/q2/Q2_test.csv')
data1.shape, data2.shape

((9322, 13), (37671, 12))

In [9]:
import pytz

# Name of the label column; passed as `label=` to every TabularPredictor below.
target = 'alarmmsg_original'

def feature_engineering(data):
    '''
    Build the feature frame used for modelling from a raw alarm table.

    The input frame is never modified; all work happens on a copy and the
    result keeps the index of the input.

    params
        data : DataFrame - frame to engineer. The columns read here are
               'ticketno', 'alarmno', 'alarmtime', 'site', 'sysname',
               'slot' and 'port'.
    return
        DataFrame - copy of `data` where
            * 'ticketno' is cast to int64,
            * 'ticketno_log1p' / 'alarmno_log1p' hold log1p of those columns,
            * 'alarmtime' is a timezone-aware datetime in Asia/Seoul,
            * 'site_0'..'site_3' and 'sysname_0'..'sysname_3' hold the first
              four characters of 'site' / 'sysname' (NaN where the value is
              missing or has fewer characters),
            * 'port' / 'slot' have missing values filled and are cast to int32.
    '''
    temp = data.copy()

    # Cast the ticketno column to int64.
    temp['ticketno'] = temp['ticketno'].astype('int64')

    # Log-transform the variables whose raw values are very large.
    temp['ticketno_log1p'] = np.log1p(temp['ticketno'])
    temp['alarmno_log1p'] = np.log1p(temp['alarmno'])

    # Read alarmtime in nanosecond units as UTC, then express it in Korean local time.
    temp['alarmtime'] = pd.to_datetime(temp['alarmtime'], unit='ns', utc=True).dt.tz_convert('Asia/Seoul')

    # Split site / sysname into one column per character position.
    # Assigning through `.str[pos]` keeps the rows aligned with temp's own index
    # (a separately built frame would be aligned on a fresh 0..n-1 index) and
    # yields NaN instead of raising when a value has fewer than four characters.
    for source_col in ('site', 'sysname'):
        for pos in range(4):
            temp[f'{source_col}_{pos}'] = temp[source_col].str[pos]

    # Fill missing values: port -> 0, slot -> 100, then cast both to int32.
    # NOTE(review): the original comment said both columns are filled with 100,
    # while the code has always used 0 for port; the code's values are kept.
    # Plain reassignment is used because in-place fillna on a selected column
    # is not guaranteed to write back to the frame.
    temp['port'] = temp['port'].fillna(0.0).astype('int32')
    temp['slot'] = temp['slot'].fillna(100.0).astype('int32')

    return temp

In [10]:
# Apply the same feature engineering to both loaded frames.
# NOTE: data1/data2 are rebound to the engineered frames, so re-running this cell
# feeds already-engineered data back into feature_engineering; re-run the load
# cell first if this cell has to be executed again.
data1 = feature_engineering(data1)
data2 = feature_engineering(data2)
data1

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain,root_cause_type,ticketno_log1p,alarmno_log1p,site_0,site_1,site_2,site_3,sysname_0,sysname_1,sysname_2,sysname_3
0,21122633,1669820428245,2022-12-01 00:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143737,A,C,E,N,a,c,n,t
1,21122633,1669821318728,2022-12-01 00:17:15+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143738,A,C,E,N,a,c,n,t
2,21122633,1669822214832,2022-12-01 00:32:11+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143738,A,C,E,N,a,c,n,t
3,21122633,1669823114128,2022-12-01 00:47:10+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143739,A,C,E,N,a,c,n,t
4,21122633,1669824028082,2022-12-01 01:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143739,A,C,E,N,a,c,n,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,21774618,1671892499215,2022-12-24 23:37:14+09:00,7,OPT-LOS,AFAA,agow,EQPT,5,5,NSA,A,LinkCut,16.896256,28.144977,A,F,A,A,a,g,o,w
9318,15693425,877949375,2022-12-25 10:13:46+09:00,7,OPT-REMOVE,AGFD,aibb,EQPT,1,8,NSA,A,UnitFail,16.568752,20.593099,A,G,F,D,a,i,b,b
9319,21809789,1671974758375,2022-12-25 22:28:14+09:00,7,OPT-LOS,ADKA,aeaq,EQPT,3,3,NSA,A,LinkCut,16.897870,28.145027,A,D,K,A,a,e,a,q
9320,21811213,1671978167736,2022-12-25 23:25:03+09:00,7,OPT-LOS,ABZO,acie,EQPT,5,6,NSA,A,LinkCut,16.897935,28.145029,A,B,Z,O,a,c,i,e


In [11]:
# Check dtypes and non-null counts after feature engineering.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9322 entries, 0 to 9321
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype                     
---  ------             --------------  -----                     
 0   ticketno           9322 non-null   int64                     
 1   alarmno            9322 non-null   int64                     
 2   alarmtime          9322 non-null   datetime64[ns, Asia/Seoul]
 3   alarmlevel         9322 non-null   int64                     
 4   alarmmsg_original  9322 non-null   object                    
 5   site               9322 non-null   object                    
 6   sysname            9322 non-null   object                    
 7   unit               9322 non-null   object                    
 8   slot               9322 non-null   int32                     
 9   port               9322 non-null   int32                     
 10  sva                9322 non-null   object                    
 11  root_cause_domain

In [12]:
# Keep only the rows of the test file whose root-cause domain is 'A'.
is_domain_a = data2['root_cause_domain'].eq('A')
data2_a = data2[is_domain_a]
data2_a

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain,ticketno_log1p,alarmno_log1p,site_0,site_1,site_2,site_3,sysname_0,sysname_1,sysname_2,sysname_3
39,21812793,1671895004991,2022-12-25 00:19:00+09:00,7,OPT-LOS,ACCJ,acll,EQPT,3,1,NSA,A,16.898007,28.144979,A,C,C,J,a,c,l,l
54,21812926,1671895306472,2022-12-25 00:24:01+09:00,7,OPT-LOS,AFIE,ahbz,EQPT,3,1,NSA,A,16.898013,28.144979,A,F,I,E,a,h,b,z
100,21812926,1671896687631,2022-12-25 00:47:02+09:00,7,ETH-LINK-FAIL,AFIE,ahbz,EQPT,3,1,NSA,A,16.898013,28.144980,A,F,I,E,a,h,b,z
101,21812926,1671896693079,2022-12-25 00:47:08+09:00,7,OPT-LOS,AFIE,ahbz,EQPT,3,1,NSA,A,16.898013,28.144980,A,F,I,E,a,h,b,z
435,21812926,1671904827921,2022-12-25 03:02:41+09:00,7,OPT-LOS,AFIE,ahbz,EQPT,3,1,NSA,A,16.898013,28.144985,A,F,I,E,a,h,b,z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37494,21968655,1672409112378,2022-12-30 23:07:32+09:00,5,LSP-LOC,AEOM,afwp,EQPT,1,1,NSA,A,16.905127,28.145286,A,E,O,M,a,f,w,p
37541,21968655,1672409982810,2022-12-30 23:22:02+09:00,5,LSP-LOC,AEOM,afwp,EQPT,1,1,NSA,A,16.905127,28.145287,A,E,O,M,a,f,w,p
37563,21986156,1672410376930,2022-12-30 23:28:36+09:00,7,OPT-LOS,ABZO,acie,EQPT,5,6,NSA,A,16.905924,28.145287,A,B,Z,O,a,c,i,e
37598,21966016,1672410896732,2022-12-30 23:37:16+09:00,7,OPT-LOS,AFAA,agow,EQPT,5,5,NSA,A,16.905007,28.145287,A,F,A,A,a,g,o,w


In [13]:
# Stack the training frame and the domain-'A' rows of the test file into one frame
# with a fresh 0..n-1 index. data2_a has no 'root_cause_type' column, so that
# column is NaN for the appended rows.
train_df = pd.concat([data1, data2_a], ignore_index=True)
train_df

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain,root_cause_type,ticketno_log1p,alarmno_log1p,site_0,site_1,site_2,site_3,sysname_0,sysname_1,sysname_2,sysname_3
0,21122633,1669820428245,2022-12-01 00:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143737,A,C,E,N,a,c,n,t
1,21122633,1669821318728,2022-12-01 00:17:15+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143738,A,C,E,N,a,c,n,t
2,21122633,1669822214832,2022-12-01 00:32:11+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143738,A,C,E,N,a,c,n,t
3,21122633,1669823114128,2022-12-01 00:47:10+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143739,A,C,E,N,a,c,n,t
4,21122633,1669824028082,2022-12-01 01:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,NSA,A,LinkCut,16.865856,28.143739,A,C,E,N,a,c,n,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12090,21968655,1672409112378,2022-12-30 23:07:32+09:00,5,LSP-LOC,AEOM,afwp,EQPT,1,1,NSA,A,,16.905127,28.145286,A,E,O,M,a,f,w,p
12091,21968655,1672409982810,2022-12-30 23:22:02+09:00,5,LSP-LOC,AEOM,afwp,EQPT,1,1,NSA,A,,16.905127,28.145287,A,E,O,M,a,f,w,p
12092,21986156,1672410376930,2022-12-30 23:28:36+09:00,7,OPT-LOS,ABZO,acie,EQPT,5,6,NSA,A,,16.905924,28.145287,A,B,Z,O,a,c,i,e
12093,21966016,1672410896732,2022-12-30 23:37:16+09:00,7,OPT-LOS,AFAA,agow,EQPT,5,5,NSA,A,,16.905007,28.145287,A,F,A,A,a,g,o,w


In [14]:
# Labels that occur more than 400 times are kept as individual classes.
label_counts = train_df[target].value_counts()
target_classes = set(label_counts[label_counts > 400].index)
target_classes

{'ETH-ERR',
 'ETH-LINK-FAIL',
 'ETH-NO-TX-TRAFFIC',
 'LSP-LOC',
 'OPT-LOS',
 'OPT-PWR-LOW',
 'OPT-REMOVE',
 'PSU-FAIL'}

In [15]:
# Label classes that will be merged into a single bucket:
# every label with at most 400 occurrences.
label_counts = train_df[target].value_counts()
target_leftover = set(label_counts[label_counts <= 400].index)
target_leftover

{'48V-FAIL',
 'AUTONEGO_MISMATCH',
 'BATT-LOW',
 'BOOTING',
 'DCC-FAIL',
 'ETH-NO-RX-TRAFFIC',
 'FAN-48V-FAIL',
 'FAN-FAIL',
 'LINK_FAIL_MANUAL',
 'NVRAM-FAIL',
 'OPT-MIS',
 'OPT-PWR-HIGH',
 'OS_MISMATCH',
 'PSU-REM',
 'PW-LOC',
 'PortShutdown',
 'ProtectionSwitched',
 'QL_FAIL',
 'Restarted',
 'TDM-PW-LOF',
 'TDM-PW-RMT_FAIL',
 'TRK-CONN-MIS',
 'UNIT-IPC-FAIL',
 'UNIT-REM'}

In [16]:
# Collapse every rare label (the members of target_leftover) into the single class '-'.
# This edits train_df in place, so the label counts computed above change once it has run.
train_df.loc[train_df[target].isin(target_leftover), target] = '-'

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Feature columns used for this experiment, and the label series.
feature_cols = ['ticketno_log1p', 'alarmno_log1p', 'alarmtime', 'alarmlevel', 'site_1', 'sysname_1']
x = train_df[feature_cols]
y = train_df[target]

x.shape, y.shape

((12095, 6), (12095,))

In [19]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): rows are split individually, and the frame displayed above contains
# many rows that share one ticketno, so alarms of the same ticket can land on both
# sides of the split — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2023, stratify=y, test_size=0.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((9676, 6), (2419, 6), (9676,), (2419,))

In [20]:
# Re-attach the label to the training features (AutoGluon takes one frame that
# includes the label column) and reset to a 0..n-1 index.
train = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
train

Unnamed: 0,ticketno_log1p,alarmno_log1p,alarmtime,alarmlevel,site_1,sysname_1,alarmmsg_original
0,16.865088,28.143759,2022-12-01 09:56:48+09:00,5,D,e,OPT-PWR-LOW
1,16.874048,28.144146,2022-12-08 21:39:58+09:00,5,E,f,OPT-PWR-LOW
2,16.869298,28.143969,2022-12-05 11:44:54+09:00,5,E,f,OPT-PWR-LOW
3,16.871468,28.144035,2022-12-06 18:13:56+09:00,5,E,f,OPT-PWR-LOW
4,16.865152,28.143750,2022-12-01 05:48:37+09:00,5,E,f,OPT-PWR-LOW
...,...,...,...,...,...,...,...
9671,16.900376,28.145099,2022-12-27 07:54:19+09:00,7,C,c,OPT-LOS
9672,16.876094,28.144267,2022-12-11 05:39:40+09:00,5,E,f,OPT-PWR-LOW
9673,16.889173,28.144698,2022-12-19 13:41:07+09:00,5,B,c,OPT-PWR-LOW
9674,16.884815,28.144553,2022-12-16 18:25:09+09:00,7,C,c,OPT-LOS


In [21]:
# Class distribution of the training split.
train[target].value_counts()

OPT-PWR-LOW          2402
LSP-LOC              2166
OPT-LOS              1667
ETH-ERR              1132
PSU-FAIL              595
-                     514
ETH-LINK-FAIL         457
OPT-REMOVE            405
ETH-NO-TX-TRAFFIC     338
Name: alarmmsg_original, dtype: int64

In [22]:
# Re-attach the label to the hold-out features and reset to a 0..n-1 index.
test = pd.concat([x_test, y_test], axis=1).reset_index(drop=True)
test

Unnamed: 0,ticketno_log1p,alarmno_log1p,alarmtime,alarmlevel,site_1,sysname_1,alarmmsg_original
0,16.869401,28.143931,2022-12-04 18:01:30+09:00,5,E,f,OPT-PWR-LOW
1,16.890912,28.144753,2022-12-20 15:22:10+09:00,5,F,h,-
2,16.893222,28.144834,2022-12-22 05:03:37+09:00,5,C,c,ETH-ERR
3,16.898013,28.145001,2022-12-25 10:44:38+09:00,7,F,h,OPT-LOS
4,16.726789,20.639178,2022-12-15 01:00:55+09:00,7,E,f,OPT-REMOVE
...,...,...,...,...,...,...,...
2414,16.878352,28.144341,2022-12-12 16:08:50+09:00,5,E,f,OPT-PWR-LOW
2415,16.884026,28.144571,2022-12-17 03:08:34+09:00,5,E,f,OPT-PWR-LOW
2416,16.898013,28.144989,2022-12-25 05:11:53+09:00,5,F,h,ETH-NO-TX-TRAFFIC
2417,16.869078,28.143898,2022-12-04 02:51:33+09:00,5,E,f,LSP-LOC


In [23]:
# Class distribution of the hold-out split.
test[target].value_counts()

OPT-PWR-LOW          601
LSP-LOC              541
OPT-LOS              417
ETH-ERR              283
PSU-FAIL             149
-                    129
ETH-LINK-FAIL        114
OPT-REMOVE           101
ETH-NO-TX-TRAFFIC     84
Name: alarmmsg_original, dtype: int64

In [24]:
# Wrap the training frame (features + label) in AutoGluon's TabularDataset.
train_data = TabularDataset(train)
train_data

Unnamed: 0,ticketno_log1p,alarmno_log1p,alarmtime,alarmlevel,site_1,sysname_1,alarmmsg_original
0,16.865088,28.143759,2022-12-01 09:56:48+09:00,5,D,e,OPT-PWR-LOW
1,16.874048,28.144146,2022-12-08 21:39:58+09:00,5,E,f,OPT-PWR-LOW
2,16.869298,28.143969,2022-12-05 11:44:54+09:00,5,E,f,OPT-PWR-LOW
3,16.871468,28.144035,2022-12-06 18:13:56+09:00,5,E,f,OPT-PWR-LOW
4,16.865152,28.143750,2022-12-01 05:48:37+09:00,5,E,f,OPT-PWR-LOW
...,...,...,...,...,...,...,...
9671,16.900376,28.145099,2022-12-27 07:54:19+09:00,7,C,c,OPT-LOS
9672,16.876094,28.144267,2022-12-11 05:39:40+09:00,5,E,f,OPT-PWR-LOW
9673,16.889173,28.144698,2022-12-19 13:41:07+09:00,5,B,c,OPT-PWR-LOW
9674,16.884815,28.144553,2022-12-16 18:25:09+09:00,7,C,c,OPT-LOS


In [25]:
# Summarise the label column: row count, number of classes, most frequent class.
label = target
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count            9676
unique              9
top       OPT-PWR-LOW
freq             2402
Name: alarmmsg_original, dtype: object


## 간단한 모델들로 빠르게 탐색


### 'ticketno_log1p', 'alarmno_log1p', 'alarmtime', 'alarmlevel'

In [None]:
# Baseline run with default settings on the four base features only.
# The columns are selected explicitly so this experiment does not depend on which
# feature set `train_data` happened to hold when the cell was executed.
easy_features = ['ticketno_log1p', 'alarmno_log1p', 'alarmtime', 'alarmlevel']
predictor = TabularPredictor(label=target,
                             path='/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg_easy').fit(
    train_data[easy_features + [target]],
)

Beginning AutoGluon training ...
AutoGluon will save models to "/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg_easy/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   8.41 GB / 16.11 GB (52.2%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Train Data Rows:    9676
Train Data Columns: 4
Label Column: alarmmsg_original
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	9 unique label values:  ['OPT-PWR-LOW', 'ETH-ERR', 'OPT-LOS', 'ETH-LINK-FAIL', 'LSP-LOC', 'ETH-NO-TX-TRAFFIC', 'OPT-REMOVE', '-', 'PSU-FAIL']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data C

[1000]	valid_set's multi_error: 0.216942
[2000]	valid_set's multi_error: 0.17562
[3000]	valid_set's multi_error: 0.163223
[4000]	valid_set's multi_error: 0.153926
[5000]	valid_set's multi_error: 0.14876
[6000]	valid_set's multi_error: 0.14876


	0.8533	 = Validation score   (accuracy)
	60.43s	 = Training   runtime
	7.25s	 = Validation runtime
Fitting model: LightGBM ...
	0.936	 = Validation score   (accuracy)
	5.02s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9225	 = Validation score   (accuracy)
	2.3s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9215	 = Validation score   (accuracy)
	5.29s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: CatBoost ...
	0.9194	 = Validation score   (accuracy)
	17.63s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9215	 = Validation score   (accuracy)
	2.66s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9205	 = Validation score   (accuracy)
	2.75s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: XGBoost ...
	0.9298	 = Validation score   (accuracy)
	6.34s	 = Training   runtime
	0.09s	 = Vali

### 'ticketno_log1p', 'alarmno_log1p', 'alarmtime', 'alarmlevel', 'site_1', 'sysname_1'

In [26]:
# Fit with default settings on the current train_data; models are saved under `path`.
# NOTE(review): every fit cell rebinds `predictor`. The leaderboard recorded in the
# validation section lists the same fit times as this run's log, so those cells appear
# to have been executed with this predictor rather than with a later fit.
predictor = TabularPredictor(label=target,
                             path='/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg_0820_1').fit(
    train_data,
)

Beginning AutoGluon training ...
AutoGluon will save models to "/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg_0820_1/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   8.02 GB / 16.11 GB (49.8%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Train Data Rows:    9676
Train Data Columns: 6
Label Column: alarmmsg_original
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	9 unique label values:  ['OPT-PWR-LOW', 'ETH-ERR', 'OPT-LOS', 'ETH-LINK-FAIL', 'LSP-LOC', 'ETH-NO-TX-TRAFFIC', 'OPT-REMOVE', '-', 'PSU-FAIL']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data

[1000]	valid_set's multi_error: 0.155992
[2000]	valid_set's multi_error: 0.142562


	0.8595	 = Validation score   (accuracy)
	33.61s	 = Training   runtime
	2.11s	 = Validation runtime
Fitting model: LightGBM ...
	0.9349	 = Validation score   (accuracy)
	3.83s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9225	 = Validation score   (accuracy)
	4.08s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9215	 = Validation score   (accuracy)
	5.13s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: CatBoost ...
	0.9308	 = Validation score   (accuracy)
	219.93s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9174	 = Validation score   (accuracy)
	2.42s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9174	 = Validation score   (accuracy)
	2.52s	 = Training   runtime
	0.16s	 = Validation runtime
Fitting model: XGBoost ...
	0.936	 = Validation score   (accuracy)
	5.48s	 = Training   runtime
	0.04s	 = Va

### 'ticketno_log1p', 'alarmno_log1p', 'alarmtime', 'alarmlevel', 'site_{x}', 'sysname_{x}'

In [None]:
# Fit with default settings on the current train_data; models are saved under `path`.
# NOTE(review): this section is meant to use every site_* / sysname_* character column,
# but the feature-selection cell above only puts 'site_1' and 'sysname_1' into
# train_data — rebuild train_data with the intended columns before running this.
predictor = TabularPredictor(label=target,
                             path='/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg_0820_2').fit(
    train_data,
)

## 최신 모델들로 자세히 탐색

In [None]:
# Fit with the 'best_quality' preset, using 5 bagging folds and one bagging set.
# NOTE(review): this rebinds `predictor`, and the recorded log reports 7 training
# columns, which differs from the 6-feature train_data built above — confirm which
# feature set and which predictor the validation section is supposed to use.
predictor = TabularPredictor(label=target,
                             path='/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg').fit(
    train_data,
    presets='best_quality',
    num_bag_folds=5,
    num_bag_sets=1
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "/content/drive/MyDrive/kt_network_competition/AutogluonModels_msg/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   9.11 GB / 16.11 GB (56.6%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Train Data Rows:    9676
Train Data Columns: 7
Label Column: alarmmsg_original
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	9 unique label values:  ['OPT-PWR-LOW', 'ETH-ERR', 'OPT-LOS', 'ETH-LINK-FAIL', 'LSP-LOC', 'ETH-NO-TX-TRAFFIC', 'OPT-REMOVE', '-', 'PSU-FAIL']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type par

## 검증

In [27]:
# Wrap the hold-out frame and separate the label from the features.
# y_test is rebound here to the label column of the re-indexed hold-out frame.
test_data = TabularDataset(test)
y_test = test_data[target]  # values to predict
test_data_nolab = test_data.drop(columns=[target])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,ticketno_log1p,alarmno_log1p,alarmtime,alarmlevel,site_1,sysname_1
0,16.869401,28.143931,2022-12-04 18:01:30+09:00,5,E,f
1,16.890912,28.144753,2022-12-20 15:22:10+09:00,5,F,h
2,16.893222,28.144834,2022-12-22 05:03:37+09:00,5,C,c
3,16.898013,28.145001,2022-12-25 10:44:38+09:00,7,F,h
4,16.726789,20.639178,2022-12-15 01:00:55+09:00,7,E,f


In [28]:
# Predict labels for the hold-out features and score them against the true labels.
# With auxiliary_metrics=True the recorded report contains accuracy, balanced accuracy and MCC.
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.9330301777594047
Evaluations on test data:
{
    "accuracy": 0.9330301777594047,
    "balanced_accuracy": 0.87682272067141,
    "mcc": 0.9197342103639392
}


Predictions:  
 0             OPT-PWR-LOW
1             OPT-PWR-LOW
2                 ETH-ERR
3                 OPT-LOS
4              OPT-REMOVE
              ...        
2414          OPT-PWR-LOW
2415          OPT-PWR-LOW
2416    ETH-NO-TX-TRAFFIC
2417              LSP-LOC
2418                    -
Name: alarmmsg_original, Length: 2419, dtype: object


In [29]:
# Per-model test/validation scores and timings; transposed so each model is one column.
predictor.leaderboard(test_data, extra_info=True, silent=True).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
model,WeightedEnsemble_L2,LightGBMLarge,LightGBM,XGBoost,ExtraTreesGini,RandomForestGini,RandomForestEntr,ExtraTreesEntr,CatBoost,LightGBMXT,NeuralNetTorch,NeuralNetFastAI,KNeighborsDist,KNeighborsUnif
score_test,0.93303,0.932203,0.93055,0.926416,0.926416,0.925176,0.924762,0.924349,0.921455,0.84291,0.822654,0.804051,0.663497,0.625465
score_val,0.941116,0.940083,0.934917,0.93595,0.917355,0.922521,0.921488,0.917355,0.930785,0.859504,0.831612,0.811983,0.677686,0.634298
pred_time_test,7.68599,0.047064,0.230801,0.156847,0.483875,0.390054,0.398194,0.394638,0.063406,7.635241,0.028938,0.073627,0.028279,0.028477
pred_time_val,2.134044,0.018101,0.058345,0.038229,0.172531,0.172912,0.153718,0.162445,0.036218,2.114498,0.01427,0.03509,0.014297,0.015224
fit_time,43.130024,8.14257,3.830778,5.484988,2.416958,4.076232,5.134068,2.51886,219.930226,33.605287,28.110458,11.984251,0.020814,0.027176
pred_time_test_marginal,0.003685,0.047064,0.230801,0.156847,0.483875,0.390054,0.398194,0.394638,0.063406,7.635241,0.028938,0.073627,0.028279,0.028477
pred_time_val_marginal,0.001445,0.018101,0.058345,0.038229,0.172531,0.172912,0.153718,0.162445,0.036218,2.114498,0.01427,0.03509,0.014297,0.015224
fit_time_marginal,1.382168,8.14257,3.830778,5.484988,2.416958,4.076232,5.134068,2.51886,219.930226,33.605287,28.110458,11.984251,0.020814,0.027176
stack_level,2,1,1,1,1,1,1,1,1,1,1,1,1,1


In [30]:
# True labels of the hold-out set, for side-by-side comparison with the predictions.
y_test

0             OPT-PWR-LOW
1                       -
2                 ETH-ERR
3                 OPT-LOS
4              OPT-REMOVE
              ...        
2414          OPT-PWR-LOW
2415          OPT-PWR-LOW
2416    ETH-NO-TX-TRAFFIC
2417              LSP-LOC
2418                    -
Name: alarmmsg_original, Length: 2419, dtype: object

In [31]:
# Predicted labels of the hold-out set.
y_pred

0             OPT-PWR-LOW
1             OPT-PWR-LOW
2                 ETH-ERR
3                 OPT-LOS
4              OPT-REMOVE
              ...        
2414          OPT-PWR-LOW
2415          OPT-PWR-LOW
2416    ETH-NO-TX-TRAFFIC
2417              LSP-LOC
2418                    -
Name: alarmmsg_original, Length: 2419, dtype: object

In [32]:
from sklearn.metrics import classification_report, confusion_matrix

In [33]:
# Confusion matrix (rows = true class, columns = predicted class) and per-class metrics.
# No labels are passed, so the matrix rows/columns follow the same sorted class order
# that the classification report lists ('-' first).
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 55   6   1   1  44  13   5   2   2]
 [  4 276   0   0   0   0   3   0   0]
 [  0   0  84   0   0  27   0   3   0]
 [  2   0   0  82   0   0   0   0   0]
 [ 11   3   0   0 527   0   0   0   0]
 [  3   0   2   0   0 407   0   5   0]
 [  7   0   0   0   2   0 592   0   0]
 [  0   0   3   0   0  13   0  85   0]
 [  0   0   0   0   0   0   0   0 149]]
                   precision    recall  f1-score   support

                -       0.67      0.43      0.52       129
          ETH-ERR       0.97      0.98      0.97       283
    ETH-LINK-FAIL       0.93      0.74      0.82       114
ETH-NO-TX-TRAFFIC       0.99      0.98      0.98        84
          LSP-LOC       0.92      0.97      0.95       541
          OPT-LOS       0.88      0.98      0.93       417
      OPT-PWR-LOW       0.99      0.99      0.99       601
       OPT-REMOVE       0.89      0.84      0.87       101
         PSU-FAIL       0.99      1.00      0.99       149

         accuracy                           0.93      24

In [34]:
# Permutation feature importance measured on the hold-out set (label column included).
predictor.feature_importance(test_data)

Computing feature importance via permutation shuffling for 6 features using 2419 rows with 5 shuffle sets...
	243.04s	= Expected runtime (48.61s per shuffle set)
	192.59s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
alarmlevel,0.398512,0.008106,2.053176e-08,5,0.415202,0.381821
ticketno_log1p,0.388094,0.001991,8.314759e-11,5,0.392194,0.383994
alarmtime,0.173377,0.006702,2.673336e-07,5,0.187176,0.159579
site_1,0.100868,0.001402,4.476162e-09,5,0.103755,0.097982
sysname_1,0.097726,0.004262,4.330691e-07,5,0.106502,0.08895
alarmno_log1p,0.044564,0.001352,1.01611e-07,5,0.047348,0.04178


### hyperparameter tuning

In [None]:
from autogluon.common import space

# Hyperparameter search over LightGBM and the PyTorch neural network.

# Metric used to score models during tuning. It was previously referenced without
# ever being defined in this notebook; 'accuracy' is the metric the fits above report.
metric = 'accuracy'

nn_options = {  # specifies non-default hyperparameter values for neural network models
    'num_epochs': 10,  # number of training epochs (controls training time of NN models)
    'learning_rate': space.Real(1e-4, 1e-2, default=5e-4, log=True),  # learning rate used in training (real-valued hyperparameter searched on log-scale)
    'activation': space.Categorical('relu', 'softrelu', 'tanh'),  # activation function used in NN (categorical hyperparameter, default = first entry)
    'dropout_prob': space.Real(0.0, 0.5, default=0.1),  # dropout probability (real-valued hyperparameter)
}

gbm_options = {  # specifies non-default hyperparameter values for lightGBM gradient boosted trees
    'num_boost_round': 100,  # number of boosting rounds (controls training time of GBM models)
    'num_leaves': space.Int(lower=26, upper=66, default=36),  # number of leaves in trees (integer hyperparameter)
}

hyperparameters = {  # hyperparameters of each model type
                   'GBM': gbm_options,
                   'NN_TORCH': nn_options,  # NOTE: comment this line out if you get errors on Mac OSX
                  }  # When these keys are missing from hyperparameters dict, no models of that type are trained

time_limit = 5*60  # train various models for ~5 min
num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
search_strategy = 'auto'  # to tune hyperparameters using random search routine with a local scheduler

hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
    'num_trials': num_trials,
    'scheduler' : 'local',
    'searcher': search_strategy,
}  # Refer to TabularPredictor.fit docstring for all valid values

predictor = TabularPredictor(label=target, eval_metric=metric).fit(
    train_data,
    time_limit=time_limit,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
)