In [142]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, FastText

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import scipy.stats as spst
from copy import deepcopy

from collections import Counter
import pytz
import re

In [143]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Hyperparameters

In [281]:
# embedding_model = 'w2v'
embedding_model = 'fast'
options = {
    'vector_size' :  64,
    'window' :  9,
    'sg' : 0,
           }
alarmno_len = 1
msgvec_len = 4

In [282]:
target = "root_cause_type"

data = pd.read_csv('/content/drive/MyDrive/my_data/kt_network/q2/Q2_train.csv')
data

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain,root_cause_type
0,21122633.0,1669820428245,2022-12-01 00:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3.0,1.0,NSA,A,LinkCut
1,21122633.0,1669821318728,2022-12-01 00:17:15+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3.0,1.0,NSA,A,LinkCut
2,21122633.0,1669822214832,2022-12-01 00:32:11+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3.0,1.0,NSA,A,LinkCut
3,21122633.0,1669823114128,2022-12-01 00:47:10+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3.0,1.0,NSA,A,LinkCut
4,21122633.0,1669824028082,2022-12-01 01:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3.0,1.0,NSA,A,LinkCut
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,21774618.0,1671892499215,2022-12-24 23:37:14+09:00,7,OPT-LOS,AFAA,agow,EQPT,5.0,5.0,NSA,A,LinkCut
9318,15693425.0,877949375,2022-12-25 10:13:46+09:00,7,OPT-REMOVE,AGFD,aibb,EQPT,1.0,8.0,NSA,A,UnitFail
9319,21809789.0,1671974758375,2022-12-25 22:28:14+09:00,7,OPT-LOS,ADKA,aeaq,EQPT,3.0,3.0,NSA,A,LinkCut
9320,21811213.0,1671978167736,2022-12-25 23:25:03+09:00,7,OPT-LOS,ABZO,acie,EQPT,5.0,6.0,NSA,A,LinkCut


In [283]:
test = pd.read_csv("/content/drive/MyDrive/my_data/kt_network/q2/Q2_test.csv")
test

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain
0,21812391.0,1671894138838,2022-12-25 00:02:16+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#15#1,16.0,15.0,NSA,C
1,21775988.0,1671894172511,2022-12-25 00:02:51+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
2,21792259.0,1671894204682,2022-12-25 00:03:22+09:00,4,DDM_RX_PWR_HIGH,AECE,afeg,X2FUA,2.0,1.0,NSA,B
3,21812412.0,1671894215702,2022-12-25 00:03:33+09:00,5,BATT_ENV_FAIL,ACCN,aclp,---,,,SA,B
4,21812417.0,1671894220812,2022-12-25 00:03:39+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C
...,...,...,...,...,...,...,...,...,...,...,...,...
37666,21986223.0,1672412311698,2022-12-30 23:58:30+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
37667,22015278.0,1672412316271,2022-12-30 23:58:33+09:00,4,DDM_RX_PWR_HIGH,AEMD,afsr,G16FU,5.0,6.0,SA,B
37668,21986426.0,1672412317238,2022-12-30 23:58:34+09:00,5,MEP_LSP_RDI,ACMY,acxj,G2FUA,1.0,1.0,NSA,B
37669,22015300.0,1672412373531,2022-12-30 23:59:32+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C


In [284]:
full_name =  {
'ais': 'alarm indication signal',
'batt': 'battery',
'cep': 'critical event prevention',
'conn': 'connection',
'crc': 'cyclic redundancy check',
'csf': 'client signal fail',
'dcc': 'data communication channel',
'dcn': 'data communication network',
'dc': 'direct current',
'ddm': 'digital diagnostic monitoring',
'env': 'environment',
'err': 'error',
'eth': 'ethernet',
'fan': 'fan',
'fl': 'frame loss',
'idf': 'in-service data frame',
'init': 'initialization',
'ioc': 'input output controller',
'ipc': 'inter process communication',
'iof': 'ingress only filtering',
'll': 'link-layer',
'llcf': 'link level control field',
'loc': 'loss of communication',
'loc': 'loss of continuity',
'lof': 'loss of frame',
'los': 'loss of signal',
'lsp': 'label switched path',
'meg': 'mega',
'mep': 'maintenance entity group end point',
'mis': 'misalignment',
'ne': 'network element',
'nvram': 'non volatile random access memory',
'oam': 'operations, administration, and maintenance',
'oamloss': 'operations administration and maintenance loss',
'odi': 'optical delay interferometer',
'opt': 'optical',
'pde': 'path delay estimation',
'pdm': 'physical data model',
'poam': 'physical operations, administration, and maintenance',
'ps': 'packet switching',
'psm': 'power save mode',
'psu': 'power supply unit',
'pw': 'pseudowire',
'pwe': 'pseudowire emulation',
'ql': 'queue length',
'rdi': 'remote defect indication',
'rem': 'remote',
'rfa': 'remote failure analysis',
'rmt': 'remote',
'rx': 'receiver',
'rsmt': 'required signal-to-noise ratio margin threshold',
'sfp': 'small form factor pluggable',
'stm': 'synchronous transport module',
'sys': 'system',
'tca': 'threshold crossing alert',
'tdm': 'time division multiplexing',
'trk': 'track',
'tx': 'transmit',
'urc': 'unidirectional remote control',
}

In [285]:
pattern = r'[\s\-_\(\)\/]'

def get_unique_lower(data1, data2):

    msg1 = data1["alarmmsg_original"].unique().tolist()
    msg2 = data2['alarmmsg_original'].unique().tolist()
    msg = [s.lower() for s in list(set(msg1 + msg2))]

    return msg

def cleaning(msg, pattern):

    cleaned_msg = []
    for m in msg:
        # 기호 제거
        m = re.sub(pattern, ' ', m)
        # 단어 분리
        words = [word for word in re.findall('[A-Za-z0-9]+', m) if len(word) > 1]
        # 분리된 단어들을 리스트에 추가
        cleaned_msg.append(words)

    return cleaned_msg

def mapping(cleaned_msg, abbr_dict):

    mapped_msg = []
    for sub_list in cleaned_msg:
        temp = []
        for item in sub_list:
            if item in abbr_dict:
                temp.append(abbr_dict[item])
            else:
                temp.append(item)
        mapped_msg.append(temp)

    return mapped_msg

def inner_split(mapped_msg, pattern):
    msg_uni = []
    for error in mapped_msg:
        new_error = []
        for word in error:
            word_list = re.split(pattern, word)
            new_error += word_list
        msg_uni.append(new_error)

    return msg_uni


def embedding(data1, data2, pattern, abbr_dict, embedding_model, opt_dict):

    msg = get_unique_lower(data1, data2)

    cleaned_msg = cleaning(msg, pattern)

    mapped_msg = mapping(cleaned_msg, abbr_dict)

    sentences = inner_split(mapped_msg, pattern)

    # 모델 생성
    if embedding_model == 'w2v':
        model = Word2Vec(sentences, min_count=1, workers=4, seed=42, **options)
    elif embedding_model == 'fast':
        model = FastText(sentences, min_count=1, workers=4, seed=42, **options)

    return model

In [286]:
model = embedding(data, test, pattern, full_name, embedding_model, options)

In [287]:
model.wv['communication', 'data', 'network']

array([[ 1.24213810e-03, -1.09651906e-03, -7.23810750e-04,
        -1.16857782e-03, -1.08754393e-04, -7.67750375e-04,
        -1.21648738e-03, -3.37042264e-03,  1.40471326e-03,
         7.73073523e-04,  1.23210950e-03,  1.51721388e-03,
        -6.29641523e-04,  1.05468993e-04, -1.83319685e-03,
        -2.21670452e-05, -1.63140113e-03, -7.09523680e-04,
        -1.29296666e-03,  1.48213119e-03, -3.67651635e-04,
         8.77157319e-04,  3.54132557e-04,  7.65398145e-04,
         9.01407213e-04, -7.92366453e-04, -1.70790043e-03,
        -2.62252288e-04, -3.10657290e-03,  1.20304793e-03,
         2.36860081e-03,  5.06571494e-04, -6.71247195e-04,
        -1.21735584e-03, -2.60466149e-05,  1.33184704e-03,
        -1.52034569e-03,  1.13445807e-04,  1.94197914e-04,
         6.64544175e-04, -2.47604609e-03,  1.02492503e-03,
         1.05316669e-03, -2.45759310e-03,  2.48038839e-03,
         2.36529027e-04,  1.03658276e-04,  2.44147331e-03,
         4.01681493e-04, -5.63725655e-04, -2.04910408e-0

# Feature engineering

In [288]:
import pytz

target = 'root_cause_type'

def expand_abbr(abbr_lst, abbr_dict):
    temp = []
    for abbr in abbr_lst:
        if abbr in abbr_dict:
            temp.extend(abbr_dict[abbr].split())
        else:
            if len(abbr) > 1:
                temp.append(abbr)
    return temp


def mean_vector(vectors):
    return np.mean(vectors, axis=0)


def feature_engineering(data, pattern, abbr_dict):
    '''
    params
        data : DataFrame - feature engineering의 대상이 되는 데이터
    return
        DataFrame
    '''
    # ticketno column을 int64로 변경
    temp = deepcopy(data)
    temp['ticketno'] = temp['ticketno'].astype('int64')

    # ticketno, alarmno를 로그변환
    temp['ticketno_log1p'] = np.log1p(temp['ticketno'])
    temp['alarmno_log1p'] = np.log1p(temp['alarmno'])

    # alarmtime column을 datetime64로 변경
    tz_pytz = pytz.timezone('Asia/Seoul')
    temp['alarmtime'] = pd.to_datetime(temp['alarmtime'], unit='ns', utc=True).dt.tz_convert('Asia/Seoul')

    # alarmmsg
    temp["alarmmsg_vector"] = temp["alarmmsg_original"].str.lower()
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : re.sub(pattern, ' ', x))
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : re.findall('[A-Za-z0-9]+', x))
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : expand_abbr(x, abbr_dict))
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : model.wv[x])
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : mean_vector(x))
    temp['alarmmsg_vector'] = temp['alarmmsg_vector'].apply(lambda x : np.expand_dims(x, axis=0))


    # alarmlevel
    temp['alarmlevel'] = temp['alarmlevel'].astype('object')

    # site & sysname
    site_col_dict = {}
    for i in range(4):
        site_col_dict[f'site{i}'] = []


    for idx, value in temp['site'].items():
        for num, each_char in enumerate(value):
            site_col_dict[f'site{num}'].append(each_char)
    site_df = pd.DataFrame(site_col_dict)

    sysname_col_dict = {}
    for i in range(4):
        sysname_col_dict[f'sysname{i}'] = []


    for idx, value in temp['sysname'].items():
        for num, each_char in enumerate(value):
            sysname_col_dict[f'sysname{num}'].append(each_char)

    sysname_df = pd.DataFrame(sysname_col_dict)
    temp = pd.concat([temp, site_df, sysname_df], axis=1)

    #slot, port의 결측치는 100으로 채운다.
    temp['port'].fillna(0.0, inplace=True)
    temp['slot'].fillna(100.0, inplace=True)

    # port와 slot의 dtype을 int32로 바꾼다
    temp['port'] = temp['port'].astype('int32')
    temp['slot'] = temp['slot'].astype('int32')

    return temp

In [289]:
def make_seq_data(data, train=True):

    temp = deepcopy(data)

    # seq 데이터 생성
    temp_seq = temp.groupby(['ticketno'], group_keys=True, as_index=False)['alarmno_log1p'].apply(lambda x : ' '.join(map(str, x)))
    temp_seq['ticketno_log1p'] = temp.groupby(['ticketno'], group_keys=True, as_index=False)['ticketno_log1p'].apply(lambda x : ' '.join(map(str, x)))['ticketno_log1p']
    temp_seq['alarmlevel'] = temp.groupby(['ticketno'], group_keys=True, as_index=False)['alarmlevel'].apply(lambda x : ' '.join(map(str, x)))['alarmlevel']
    temp_seq['alarmmsg_vector'] = temp.groupby(['ticketno'], group_keys=True, as_index=False)['alarmmsg_vector'].apply(lambda x: np.array(x)).apply(lambda x : np.concatenate(x, axis=0))
    temp_seq['site1'] = temp.groupby(['ticketno'], group_keys=True, as_index=False)['site1'].apply(lambda x : ' '.join(x))['site1']
    temp_seq['sysname1'] = temp.groupby(['ticketno'], group_keys=True, as_index=False)['sysname1'].apply(lambda x : ' '.join(x))['sysname1']


    # seq에서 정보 추출 및 형 변환
    temp_seq['alarmno_log1p'] = temp_seq['alarmno_log1p'].apply(lambda x: list(map(float, x.split(' '))))
    temp_seq['ticketno_log1p'] = temp_seq['ticketno_log1p'].apply(lambda x: list(map(float, x.split(' '))))
    temp_seq['timesteps'] = temp_seq['alarmlevel'].apply(lambda x: len(x.split(' ')))


    if train == True:
    # 라벨링
        # 데이터 프레임의 feature중 list 또는 ndarray를 값으로 가지고 있는 feature가 있다면 drop_duplicate 실행시 오류가 난다.
        # 중복 제거 기준을 축소시킴으로서 해결하자.
        temp_seq = pd.merge(temp_seq, temp[['ticketno', target]], how='left', on='ticketno').drop_duplicates(subset='ticketno', keep='first', ignore_index=True)
    else:
        pass

    return temp_seq

In [290]:
temp = feature_engineering(data, pattern, full_name)
temp

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,...,alarmno_log1p,alarmmsg_vector,site0,site1,site2,site3,sysname0,sysname1,sysname2,sysname3
0,21122633,1669820428245,2022-12-01 00:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,28.143737,"[[-0.0015244028, 0.0018897726, 7.763447e-08, -...",A,C,E,N,a,c,n,t
1,21122633,1669821318728,2022-12-01 00:17:15+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,28.143738,"[[-0.0015244028, 0.0018897726, 7.763447e-08, -...",A,C,E,N,a,c,n,t
2,21122633,1669822214832,2022-12-01 00:32:11+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,28.143738,"[[-0.0015244028, 0.0018897726, 7.763447e-08, -...",A,C,E,N,a,c,n,t
3,21122633,1669823114128,2022-12-01 00:47:10+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,28.143739,"[[-0.0015244028, 0.0018897726, 7.763447e-08, -...",A,C,E,N,a,c,n,t
4,21122633,1669824028082,2022-12-01 01:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,28.143739,"[[-0.0015244028, 0.0018897726, 7.763447e-08, -...",A,C,E,N,a,c,n,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,21774618,1671892499215,2022-12-24 23:37:14+09:00,7,OPT-LOS,AFAA,agow,EQPT,5,5,...,28.144977,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",A,F,A,A,a,g,o,w
9318,15693425,877949375,2022-12-25 10:13:46+09:00,7,OPT-REMOVE,AGFD,aibb,EQPT,1,8,...,20.593099,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",A,G,F,D,a,i,b,b
9319,21809789,1671974758375,2022-12-25 22:28:14+09:00,7,OPT-LOS,ADKA,aeaq,EQPT,3,3,...,28.145027,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",A,D,K,A,a,e,a,q
9320,21811213,1671978167736,2022-12-25 23:25:03+09:00,7,OPT-LOS,ABZO,acie,EQPT,5,6,...,28.145029,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",A,B,Z,O,a,c,i,e


In [291]:
label_dict = {'LinkCut' : 0, 'PowerFail' : 1, 'UnitFail' : 2}
temp['root_cause_type'] = temp['root_cause_type'].map(label_dict)
temp['root_cause_type']

0       0
1       0
2       0
3       0
4       0
       ..
9317    0
9318    2
9319    0
9320    0
9321    0
Name: root_cause_type, Length: 9322, dtype: int64

In [292]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9322 entries, 0 to 9321
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype                     
---  ------             --------------  -----                     
 0   ticketno           9322 non-null   int64                     
 1   alarmno            9322 non-null   int64                     
 2   alarmtime          9322 non-null   datetime64[ns, Asia/Seoul]
 3   alarmlevel         9322 non-null   object                    
 4   alarmmsg_original  9322 non-null   object                    
 5   site               9322 non-null   object                    
 6   sysname            9322 non-null   object                    
 7   unit               9322 non-null   object                    
 8   slot               9322 non-null   int32                     
 9   port               9322 non-null   int32                     
 10  sva                9322 non-null   object                    
 11  root_cause_domain

In [293]:
temp_seq = make_seq_data(temp, train=True)
temp_seq

Unnamed: 0,ticketno,alarmno_log1p,ticketno_log1p,alarmlevel,alarmmsg_vector,site1,sysname1,timesteps,root_cause_type
0,14753084,"[20.581846324460642, 20.581846326764456, 20.58...","[16.506962771423133, 16.506962771423133, 16.50...",4 7 4 7 7 5,"[[0.0017409321, -0.0008008105, -0.00010471754,...",B B B B B B,b b b b b b,6,1
1,14771766,"[20.582054895929588, 20.582054894777922, 20.58...","[16.508228281745982, 16.508228281745982, 16.50...",7 4 4 7,"[[-0.0002770644, 0.0002764136, 0.0025757093, 0...",B B B B,b b b b,4,1
2,14777089,"[20.58210410432185, 20.582104102018626, 20.582...","[16.50858856641334, 16.50858856641334, 16.5085...",4 4 7 7 7,"[[0.0017409321, -0.0008008105, -0.00010471754,...",B B B B B,b b b b b,5,1
3,14790052,"[20.582242987011327, 20.582242988162776]","[16.509465418182927, 16.509465418182927]",4 4,"[[0.0017409321, -0.0008008105, -0.00010471754,...",C C,d d,2,1
4,14879922,"[20.583055085271422, 20.58305508642194, 20.583...","[16.51552341262465, 16.51552341262465, 16.5155...",7 7 4 4 7 5,"[[-0.0002770644, 0.0002764136, 0.0025757093, 0...",B B B B B B,b b b b b b,6,1
...,...,...,...,...,...,...,...,...,...
1109,21792877,[28.14495120214248],[16.89709377713444],7,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",B,c,1,0
1110,21793984,"[28.144952713177496, 28.14496006365392]","[16.897144572255907, 16.897144572255907]",7 7,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",D D,e e,2,0
1111,21799077,"[28.144959379526114, 28.144959379448355, 28.14...","[16.897378233286936, 16.897378233286936, 16.89...",7 7 5 5 5 5,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",B B B B B B,c c c c c c,6,0
1112,21809789,"[28.1449748391285, 28.14502653381203]","[16.897869509520728, 16.897869509520728]",7 7,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",D D,e e,2,0


In [294]:
x, y = temp_seq.drop(columns=[target]), temp_seq[target]
x.shape, y.shape

((1114, 8), (1114,))

In [295]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state=2023, test_size=0.2, stratify=y)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((891, 8), (223, 8), (891,), (223,))

In [296]:
train_seq = pd.concat([x_train, y_train], axis=1)
valid_seq = pd.concat([x_valid, y_valid], axis=1)
train_seq.shape, valid_seq.shape

((891, 9), (223, 9))

In [297]:
train_seq

Unnamed: 0,ticketno,alarmno_log1p,ticketno_log1p,alarmlevel,alarmmsg_vector,site1,sysname1,timesteps,root_cause_type
356,18383456,[20.63926354303098],[16.726961742080935],7,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",D,e,1,2
315,18380373,[20.63917973534545],[16.72679402288936],7,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",E,f,1,2
883,21604385,"[28.14467274888314, 28.144672749446144, 28.144...","[16.88840690759696, 16.88840690759696, 16.8884...",5 5 5 5,"[[-0.0013098357, -0.001297574, 0.0013339105, 0...",F F F F,h h h h,4,0
747,21401465,"[28.144332643661315, 28.14433300316006]","[16.87896998231864, 16.87896998231864]",7 7,"[[-0.0018558681, -0.00063707435, 0.0003860231,...",E E,f f,2,0
169,17715343,"[20.626056248707442, 20.626056259728355]","[16.689941714665004, 16.689941714665004]",7 4,"[[-0.0002770644, 0.0002764136, 0.0025757093, 0...",C C,c c,2,1
...,...,...,...,...,...,...,...,...,...
894,21605550,[28.14467452146415],[16.88846083037867],5,"[[-0.0013098357, -0.001297574, 0.0013339105, 0...",B,c,1,0
313,18380359,[20.639179518888643],[16.726793261207074],7,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",G,i,1,2
23,15367349,[20.588888710790084],[16.547755686856707],7,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",C,d,1,2
327,18381440,"[20.639209952908903, 20.639210341214042]","[16.726852072253916, 16.726852072253916]",7 7,"[[-0.0006297901, 0.0017390233, 0.00017772795, ...",F F,h h,2,2


In [298]:
maxlen=12
def tokenizing(tokenizer, corpus, document):

    tokenizer.fit_on_texts(corpus)
    vocab_size = tokenizer.num_words + 1
    idx_seq = tokenizer.texts_to_sequences(document)

    return vocab_size, idx_seq

def padding(idx_seq, maxlen=12, dtype='int32'):

    temp_padseq = pad_sequences(idx_seq, maxlen=maxlen, dtype=dtype, padding='pre', truncating='post')

    return temp_padseq

### alarmno seq

In [299]:
train_alarmno_padseq = pad_sequences(list(train_seq['alarmno_log1p']), maxlen=alarmno_len, dtype='float', padding='pre', truncating='post')
valid_alarmno_padseq = pad_sequences(list(valid_seq['alarmno_log1p']), maxlen=alarmno_len, dtype='float', padding='pre', truncating='post')
train_alarmno_padseq = np.expand_dims(train_alarmno_padseq, axis=-1)
valid_alarmno_padseq = np.expand_dims(valid_alarmno_padseq, axis=-1)
train_alarmno_padseq[0]

array([[20.63926354]])

### alarmlv seq

In [300]:
alarmlv_tokenizer = Tokenizer(num_words=4, filters='', split=' ')
alarmlv_tokenizer.fit_on_texts(temp_seq['alarmlevel'])
alarmlv_tokenizer.index_word

{1: '5', 2: '7', 3: '4', 4: '3'}

In [301]:
alarmlevel_vocab_size = alarmlv_tokenizer.num_words + 1
alarmlevel_vocab_size

5

In [302]:
train_alarmlv_idxseq = alarmlv_tokenizer.texts_to_sequences(train_seq['alarmlevel'])
valid_alarmlv_idxseq = alarmlv_tokenizer.texts_to_sequences(valid_seq['alarmlevel'])

In [303]:
train_alarmlv_padseq = pad_sequences(train_alarmlv_idxseq, maxlen=maxlen, padding='pre', truncating='post')
valid_alarmlv_padseq = pad_sequences(valid_alarmlv_idxseq, maxlen=maxlen, padding='pre', truncating='post')
train_alarmlv_padseq[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3]], dtype=int32)

### alarmmsg_original seq

In [304]:
train_msgvec_padseq = pad_sequences(train_seq['alarmmsg_vector'], maxlen=msgvec_len, dtype='float', padding='pre', truncating='post')
valid_msgvec_padseq = pad_sequences(valid_seq['alarmmsg_vector'], maxlen=msgvec_len, dtype='float', padding='pre', truncating='post')

### site&sysname seq

In [305]:
site1_tokenizer = Tokenizer(num_words=6, filters='', split=' ', oov_token='<oov>')
sysname1_tokenizer = Tokenizer(num_words=6 , filters='', split=' ', oov_token='<oov>')

site1_tokenizer.fit_on_texts(temp['site1'])
sysname1_tokenizer.fit_on_texts(temp['sysname1'])

train_site1_idxseq = site1_tokenizer.texts_to_sequences(train_seq['site1'])
train_sysname1_idxseq = sysname1_tokenizer.texts_to_sequences(train_seq['sysname1'])
valid_site1_idxseq = site1_tokenizer.texts_to_sequences(valid_seq['site1'])
valid_sysname1_idxseq = sysname1_tokenizer.texts_to_sequences(valid_seq['sysname1'])

train_site1_padseq = pad_sequences(train_site1_idxseq, maxlen=maxlen, padding='pre', truncating='post')
train_sysname1_padseq = pad_sequences(train_sysname1_idxseq, maxlen=maxlen, padding='pre', truncating='post')
valid_site1_padseq = pad_sequences(valid_site1_idxseq, maxlen=maxlen, padding='pre', truncating='post')
valid_sysname1_padseq = pad_sequences(valid_sysname1_idxseq, maxlen=maxlen, padding='pre', truncating='post')

In [306]:
site1_vocab_size = site1_tokenizer.num_words + 1
sysname1_vocab_size = sysname1_tokenizer.num_words + 1
site1_vocab_size, sysname1_vocab_size

(7, 7)

In [307]:
site1_tokenizer.word_counts, sysname1_tokenizer.word_counts

(OrderedDict([('c', 1678),
              ('e', 4872),
              ('b', 650),
              ('d', 1039),
              ('f', 1017),
              ('a', 34),
              ('g', 32)]),
 OrderedDict([('c', 2176),
              ('f', 4678),
              ('e', 993),
              ('b', 98),
              ('g', 417),
              ('h', 802),
              ('d', 98),
              ('a', 36),
              ('i', 24)]))

In [308]:
train_alarmno_padseq.shape, train_alarmlv_padseq.shape, train_msgvec_padseq.shape, train_site1_padseq.shape, train_sysname1_padseq.shape

((891, 1, 1), (891, 12), (891, 4, 64), (891, 12), (891, 12))

In [309]:
y_train = to_categorical(train_seq[target], num_classes=len(label_dict))
y_valid = to_categorical(valid_seq[target], num_classes=len(label_dict))
y_train.shape, y_valid.shape

((891, 3), (223, 3))

In [310]:
from tensorflow.keras.backend import clear_session

from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate
from tensorflow.keras.layers import Bidirectional, GRU
from tensorflow.keras.layers import Flatten, MaxPool1D, Conv1D, Dropout, BatchNormalization

from tensorflow.keras.models import Model, load_model

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

## RNN model

In [359]:
clear_session()

# alarmno는 이미 수치화된 값이기에 Embedding layer가 필요 없다.
il_no = Input(shape=train_alarmno_padseq.shape[1:])
x_no = Bidirectional(layer=GRU(16, 'tanh'))(il_no)
x_no = Flatten()(x_no)

# alarmlevel
il_lv = Input(shape=train_alarmlv_padseq.shape[1:])
x_lv = Embedding(input_dim=alarmlevel_vocab_size, output_dim=32)(il_lv)
x_lv = Bidirectional(layer=GRU(64, 'tanh'))(x_lv)
x_lv = Flatten()(x_lv)

# alarmmsg_vector
il_msg = Input(shape=train_msgvec_padseq.shape[1:])
x_msg = Bidirectional(layer=GRU(64, 'tanh'))(il_msg)
x_msg = Flatten()(x_msg)

# site_1
il_site1 = Input(shape=train_site1_padseq.shape[1:])
x_site1 = Embedding(input_dim=site1_vocab_size, output_dim=32)(il_site1)
x_site1 = Bidirectional(layer=GRU(32, 'tanh'))(x_site1)
x_site1 = Flatten()(x_site1)

# sysname_1
il_sysname1 = Input(shape=train_sysname1_padseq.shape[1:])
x_sysname1 = Embedding(input_dim=sysname1_vocab_size, output_dim=32)(il_sysname1)
x_sysname1 = Bidirectional(layer=GRU(32, 'tanh'))(x_sysname1)
x_sysname1 = Flatten()(x_sysname1)

x = Concatenate()([x_no, x_lv, x_msg, x_site1, x_sysname1])
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(128, 'swish')(x)
x = Dropout(0.25)(x)
x = Dense(64, 'swish')(x)
x = Dropout(0.25)(x)
x = Dense(32, 'swish')(x)

ol = Dense(3, 'softmax')(x)


rnn_model = Model(inputs=[il_no, il_lv, il_msg, il_site1, il_sysname1], outputs=ol)

rnn_model.compile(loss='categorical_crossentropy',
              metrics=['accuracy'],
              optimizer='adam')
rnn_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 12)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 12)]         0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 12)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1, 1)]       0           []                               
                                                                                              

In [360]:
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=1, restore_best_weights=1)
mcp = ModelCheckpoint('/content/drive/MyDrive/my_data/kt_network/model/q2model.h5',
                      monitor='val_loss',
                      min_delta=0,
                      patience=20,
                      save_best_only=True,
                      verbose=1)

In [361]:
rnn_model.fit([train_alarmno_padseq, train_alarmlv_padseq, train_msgvec_padseq, train_site1_padseq, train_sysname1_padseq], y_train,
          validation_split=0.2, callbacks=[es, mcp],
          epochs=1000,
          verbose=1
          )

Epoch 1/1000
Epoch 1: val_loss improved from inf to 0.99433, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 2/1000
Epoch 2: val_loss improved from 0.99433 to 0.90795, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 3/1000
Epoch 3: val_loss improved from 0.90795 to 0.86969, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 4/1000
Epoch 4: val_loss improved from 0.86969 to 0.83450, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 5/1000
Epoch 5: val_loss improved from 0.83450 to 0.79890, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 6/1000
Epoch 6: val_loss improved from 0.79890 to 0.75385, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 7/1000
Epoch 7: val_loss improved from 0.75385 to 0.71699, saving model to /content/drive/MyDrive/my_data/kt_network/model/q2model.h5
Epoch 8/1000
Epoch 8: va

<keras.callbacks.History at 0x7b11275dff10>

In [362]:
rnn_model.evaluate(x=[valid_alarmno_padseq, valid_alarmlv_padseq, valid_msgvec_padseq, valid_site1_padseq, valid_sysname1_padseq], y=y_valid)



[0.06800471991300583, 0.9820627570152283]

In [363]:
temp2 = feature_engineering(test, pattern, full_name)
temp2

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,...,alarmno_log1p,alarmmsg_vector,site0,site1,site2,site3,sysname0,sysname1,sysname2,sysname3
0,21812391,1671894138838,2022-12-25 00:02:16+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#15#1,16,15,...,28.144978,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",A,E,A,Q,a,f,b,d
1,21775988,1671894172511,2022-12-25 00:02:51+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13,20,...,28.144978,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",A,D,Z,W,a,e,z,n
2,21792259,1671894204682,2022-12-25 00:03:22+09:00,4,DDM_RX_PWR_HIGH,AECE,afeg,X2FUA,2,1,...,28.144978,"[[0.0004656975, -0.00030120322, 0.0020906215, ...",A,E,C,E,a,f,e,g
3,21812412,1671894215702,2022-12-25 00:03:33+09:00,5,BATT_ENV_FAIL,ACCN,aclp,---,100,0,...,28.144978,"[[0.00026704816, -0.0014933241, 0.0003042725, ...",A,C,C,N,a,c,l,p
4,21812417,1671894220812,2022-12-25 00:03:39+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16,16,...,28.144978,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",A,E,A,Q,a,f,b,d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,21986223,1672412311698,2022-12-30 23:58:30+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13,20,...,28.145288,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",A,D,Z,W,a,e,z,n
37667,22015278,1672412316271,2022-12-30 23:58:33+09:00,4,DDM_RX_PWR_HIGH,AEMD,afsr,G16FU,5,6,...,28.145288,"[[0.0004656975, -0.00030120322, 0.0020906215, ...",A,E,M,D,a,f,s,r
37668,21986426,1672412317238,2022-12-30 23:58:34+09:00,5,MEP_LSP_RDI,ACMY,acxj,G2FUA,1,1,...,28.145288,"[[-0.0005147392, 0.00037882986, 0.0005863824, ...",A,C,M,Y,a,c,x,j
37669,22015300,1672412373531,2022-12-30 23:59:32+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16,16,...,28.145288,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",A,E,A,Q,a,f,b,d


In [364]:
test_seq = make_seq_data(temp2, train=False)
test_seq

Unnamed: 0,ticketno,alarmno_log1p,ticketno_log1p,alarmlevel,alarmmsg_vector,site1,sysname1,timesteps
0,15238899,"[20.587147990331687, 20.587147994914957, 20.58...","[16.539361927140092, 16.539361927140092, 16.53...",4 7 4 7,"[[0.0017409321, -0.0008008105, -0.00010471754,...",B B B B,b b b b,4
1,15712444,"[20.593143995476403, 20.593143994337435]","[16.56996363147674, 16.56996363147674]",4 4,"[[0.0017409321, -0.0008008105, -0.00010471754,...",E E,f f,2
2,15723187,"[20.593261748634625, 20.5932651571561, 20.5932...","[16.57064712338659, 16.57064712338659, 16.5706...",5 4 4,"[[0.0009292621, 0.00026019348, 0.0005671754, 0...",C C C,d d d,3
3,15737103,"[20.593405691325867, 20.593405694741875, 20.59...","[16.57153179419051, 16.57153179419051, 16.5715...",4 7 7 4,"[[0.0017409321, -0.0008008105, -0.00010471754,...",C C C C,d d d d,4
4,15737132,"[20.59340588603827, 20.59340588376093, 20.5934...","[16.57153363696751, 16.57153363696751, 16.5715...",7 4 7 4,"[[-0.0002770644, 0.0002764136, 0.0025757093, 0...",C C C C,d d d d,4
...,...,...,...,...,...,...,...,...
4322,22015278,[28.145288200831263],[16.907247270269067],4,"[[0.0004656975, -0.00030120322, 0.0020906215, ...",E,f,1
4323,22015300,[28.145288235069234],[16.90724826957455],5,"[[-0.003150155, -0.0010620999, 0.0011404278, 0...",E,f,1
4324,23818326,"[28.148832739791523, 28.148832717288478, 28.14...","[16.98596588432827, 16.98596588432827, 16.9859...",5 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4,"[[0.0007967136, 0.00012854437, 0.0002511567, -...",E E E E E E E E E E E E E E E E E,f f f f f f f f f f f f f f f f f,17
4325,23819373,[28.148834578650376],[16.98600984110948],5,"[[0.0007967136, 0.00012854437, 0.0002511567, -...",E,f,1


In [365]:
alarmno_padseq = pad_sequences(list(test_seq['alarmno_log1p']), maxlen=alarmno_len, dtype='float', padding='pre', truncating='post')
alarmno_padseq = np.expand_dims(alarmno_padseq, axis=-1)
alarmno_padseq[0]

array([[20.58714799]])

In [366]:
alarmlv_idxseq = alarmlv_tokenizer.texts_to_sequences(test_seq['alarmlevel'])
alarmlv_padseq = pad_sequences(alarmlv_idxseq, maxlen=maxlen, padding='pre', truncating='post')
alarmlv_padseq[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 3, 2], dtype=int32)

In [367]:
msgvec_padseq = pad_sequences(test_seq['alarmmsg_vector'], maxlen=msgvec_len, dtype='float', padding='pre', truncating='post')
msgvec_padseq[0]

array([[ 1.74093212e-03, -8.00810521e-04, -1.04717539e-04,
        -1.00708241e-03,  9.15109296e-04,  2.02772935e-04,
        -7.09030544e-04,  1.16093503e-03, -1.16356695e-03,
         8.52525234e-04,  1.64779299e-03,  1.05820806e-03,
         3.79032863e-05, -3.21125146e-04, -1.26856077e-03,
        -3.13219323e-04,  9.28583031e-04, -8.22664122e-04,
        -1.23601092e-03,  1.71430805e-03, -2.14400259e-03,
        -4.45401412e-04, -3.23385175e-04, -1.46647531e-03,
        -7.99539674e-04, -9.53468727e-04,  1.39919284e-04,
        -8.73879762e-04,  2.95329723e-04, -1.10861089e-03,
         4.71357780e-04, -2.56226747e-04, -2.91762827e-03,
        -3.58304562e-04,  2.32277671e-04,  4.34209069e-04,
         3.85205029e-04,  1.77526017e-04, -4.12180508e-03,
         2.97549949e-03, -1.05733576e-04,  8.66125571e-04,
         8.00630878e-05, -7.52362132e-04, -3.68527661e-04,
        -1.61929382e-03,  2.26483238e-03,  7.61098228e-04,
        -2.75302853e-04, -1.01323531e-05, -5.37149957e-0

In [368]:
site1_idxseq = site1_tokenizer.texts_to_sequences(test_seq['site1'])
sysname1_idxseq = sysname1_tokenizer.texts_to_sequences(test_seq['sysname1'])

site1_padseq = pad_sequences(site1_idxseq, maxlen=maxlen, padding='pre', truncating='post')
sysname1_padseq = pad_sequences(sysname1_idxseq, maxlen=maxlen, padding='pre', truncating='post')
site1_padseq[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int32)

In [369]:
prediction = rnn_model.predict([alarmno_padseq, alarmlv_padseq, msgvec_padseq, site1_padseq, sysname1_padseq])
prediction



array([[9.3416830e-08, 9.9997056e-01, 2.9334524e-05],
       [9.3901615e-09, 9.9999726e-01, 2.7005176e-06],
       [5.6790850e-10, 9.9999797e-01, 1.9835411e-06],
       ...,
       [9.5786554e-01, 4.1529987e-02, 6.0448673e-04],
       [9.9846411e-01, 1.3630079e-03, 1.7291441e-04],
       [1.0000000e+00, 4.9175920e-08, 1.4272892e-09]], dtype=float32)

In [370]:
submission = pd.read_csv('/content/drive/MyDrive/my_data/kt_network/q2/Q2_label_sample.csv')
submission['root_cause_type'] = prediction.argmax(axis=1)
submission

Unnamed: 0,ticketno,root_cause_type
0,15238899.0,1
1,15712444.0,1
2,15723187.0,1
3,15737103.0,1
4,15737132.0,1
...,...,...
4322,22015278.0,1
4323,22015300.0,0
4324,23818326.0,0
4325,23819373.0,0


In [371]:
label_dict_r = dict((v, k) for k, v in label_dict.items())
label_dict, label_dict_r

({'LinkCut': 0, 'PowerFail': 1, 'UnitFail': 2},
 {0: 'LinkCut', 1: 'PowerFail', 2: 'UnitFail'})

In [372]:
submission['root_cause_type'] = submission['root_cause_type'].map(label_dict_r)
submission['root_cause_type']

0       PowerFail
1       PowerFail
2       PowerFail
3       PowerFail
4       PowerFail
          ...    
4322    PowerFail
4323      LinkCut
4324      LinkCut
4325      LinkCut
4326      LinkCut
Name: root_cause_type, Length: 4327, dtype: object

## Result


In [373]:
unique_elements, counts_elements = np.unique(submission["root_cause_type"], return_counts=True)
print("각 유니크 요소의 개수:", unique_elements, counts_elements)

각 유니크 요소의 개수: ['LinkCut' 'PowerFail' 'UnitFail'] [3612  581  134]


In [374]:
submission.to_csv('/content/drive/MyDrive/my_data/kt_network/prediction/submission.csv', index=False)
checker = pd.read_csv('/content/drive/MyDrive/my_data/kt_network/prediction/submission.csv')
checker

Unnamed: 0,ticketno,root_cause_type
0,15238899.0,PowerFail
1,15712444.0,PowerFail
2,15723187.0,PowerFail
3,15737103.0,PowerFail
4,15737132.0,PowerFail
...,...,...
4322,22015278.0,PowerFail
4323,22015300.0,LinkCut
4324,23818326.0,LinkCut
4325,23819373.0,LinkCut
