In [165]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import random
from random import shuffle
random.seed(42)

In [166]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
validation = pd.read_csv('./data/validation_sample.csv')

## ideas
- 'level NUM log'을 삭제하기 (뒤 내용에 집중할 수 있도록?)
- 'DATE localhost'를 삭제하기 (0, 1에는 별로 안 좋을 수도. localhost 뒤 단어에 따라 갈리는 느낌이 있어서)
- 단순 증강

In [167]:
def search_by_index(idx):
    idx = int(idx)
    print(train.full_log.loc[idx])
    
def search_by_regex(word):
    print(train[train.full_log.str.contains(word, regex=True)==True])

# 기본 전처리
소문자, 줄바꿈 제거

In [168]:
train.full_log = train.full_log.apply(lambda x : x.lower())
train.full_log = train.full_log.str.replace('\n',' ', regex=True) 
train.full_log = train.full_log.str.replace('\\n',' ', regex=True) 

test.full_log = test.full_log.apply(lambda x : x.lower())
test.full_log = test.full_log.str.replace('\n',' ', regex=True) 
test.full_log = test.full_log.str.replace('\\n',' ', regex=True) 

validation.full_log = validation.full_log.apply(lambda x : x.lower())
validation.full_log = validation.full_log.str.replace('\n',' ', regex=True) 
validation.full_log = validation.full_log.str.replace('\\n',' ', regex=True) 

# 숫자 전처리

In [None]:
pattern = [
    ('(?<=\d{4}-\d{2}-\d{2}t)\d{2}:\d{2}:\d{2}z?', '<TIME>'), 
    ('\d{4}-\d{2}-\d{2}t(?= TIME )', '<DATE>'),
    ('\d+\.\d+\.\d+\.\d+(?:\.\d+)?', '<IP>')]

def masking(df, pattern):
    for reg, repl in pattern:
        df.full_log = df.full_log.str.replace(reg, repl, regex=True)

In [169]:
# TIME & DATE

timestamp_pattern = [
    # 2020-10-21t17:50:25z
    ('(?<=\d{4}-\d{2}-\d{2}t)\d{2}:\d{2}:\d{2}z?', ' TIME '), 
    ('\d{4}-\d{2}-\d{2}t(?= TIME )', ' DATE'),

    # 2020 oct 21 17:50:21
    ('\d{4} (jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec) \d{1,2} *', ' DATE '),
    ('(?<=DATE )\d{2}:\d{2}:\d{2}', ' TIME '),

    # fri jan 22 06:28:59 2021
    ('(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}', 'DATE TIME'),
    ('[a-z]{3}\s+DATE TIME\s+\d{4}', ' DATE TIME '),
    
    ('DATE t TIME', 'DATE TIME'),

    # 혹시 모르니
    ('(\d{1,2}\:\d{1,2}\:\d{1,2})|(\D\d{1,2}\:\d{1,2}\:\d{1,2})|(\d{1,2}\:\d{1,2})', ' TIME '),
    ('((jan|feb|mar|apr|may|jun|jul|aug|dec|oct|sep|nov|mar) +[0-9]{1,2})|((19|20)[0-9]{2}-(0|1)[0-9]{1}-[0-3][0-9]{1})', ' DATE ')]


In [170]:
# IP

ip_pattern = [('127.0.0.1', 'localhost'), ('\d+\.\d+\.\d+\.\d+(?:\.\d+)?', ' IP ')]


In [171]:
# NUMBER

number_pattern = [
    # '0x'로 시작하는 것
    ('(?<![0-9a-f])0x[0-9a-f]+(?=\W|$)', ' NUM '), 
    
    # '": = 뒤에 오는 헥사 
    ('(?<=[=\'" :])(?=[a-f0-9]*[a-f])(?=[a-f0-9]*[0-9])[a-f0-9]{4,}(?=\W|$)', ' NUM '), 
    
    # 숫자 뭉텅이
    ('(?<=[^a-z0-9])(\d+)(?=[^a-z0-9]|$)', ' NUM '),
    
    # 시리얼 넘버
    ('(?<=\W)(?=[a-f0-9\-]*[0-9])(?=[a-f0-9\-]*[a-f])[a-f0-9]{3,}(?:-[a-f0-9]{3,})+(?=\W|$)', ' SERIAL '),

    # 5/13 추가 (id 444014)
    ('(?<=\<)f{1,}[a-z0-9]+(?=\>)', ' NUM ')]


In [172]:
def masking(df, pattern):
    for reg, repl in pattern:
        df.full_log = df.full_log.str.replace(reg, repl, regex=True)

In [173]:
for df in [train, test, validation]:
    masking(df, timestamp_pattern)
    masking(df, ip_pattern)
    masking(df, number_pattern)
    print(df.head())

   id  level                                           full_log
0   0      0  DATE TIME localhost kibana: {"type":"error","@...
1   1      0  DATE TIME localhost logstash: [ DATE TIME , NU...
2   2      0  DATE TIME localhost kibana: {"type":"error","@...
3   3      0  DATE TIME localhost kibana: {"type":"error","@...
4   4      1  type=syscall msg=audit( NUM . NUM  TIME  NUM )...
        id                                           full_log
0  1000000  DATE TIME localhost kibana: {"type":"error","@...
1  1000001  DATE TIME localhost kibana: {"type":"error","@...
2  1000002  type=syscall msg=audit( NUM . NUM  TIME  NUM )...
3  1000003  DATE TIME localhost kibana: {"type":"error","@...
4  1000004  type=syscall msg=audit( NUM . NUM  TIME  NUM )...
                                            full_log
0  type=anom_promiscuous msg=audit( NUM . NUM  TI...
1  oscap: msg: "xccdf-result", scan-id: " NUM ", ...
2  DATE TIME localhost kernel: out of memory: kil...


# 특수문자

In [174]:
# 모두 삭제
train.full_log = train.full_log.str.replace('[^a-zA-Zㄱ-ㅣ가-힣0-9]', ' ', regex=True)
test.full_log = test.full_log.str.replace('[^a-zA-Zㄱ-ㅣ가-힣0-9]', ' ', regex=True)
validation.full_log = validation.full_log.str.replace('[^a-zA-Zㄱ-ㅣ가-힣0-9]', ' ', regex=True)

In [175]:
# 모두 냅두고 앞뒤 공백
# train.full_log = train.full_log.str.replace(' ?(?P<note>[^a-zA-Z0-9ㄱ-ㅣ가-힣]) ?', ' \g<note> ', regex=True)
# test.full_log = test.full_log.str.replace(' ?(?P<note>[^a-zA-Z0-9ㄱ-ㅣ가-힣]) ?', ' \g<note> ', regex=True)
# validation.full_log = validation.full_log.str.replace(' ?(?P<note>[^a-zA-Z0-9ㄱ-ㅣ가-힣]) ?', ' \g<note> ', regex=True)

for df in [train, test, validation]:
    print(df.head())

   id  level                                           full_log
0   0      0  DATE TIME localhost kibana    type   error    ...
1   1      0  DATE TIME localhost logstash    DATE TIME   NU...
2   2      0  DATE TIME localhost kibana    type   error    ...
3   3      0  DATE TIME localhost kibana    type   error    ...
4   4      1  type syscall msg audit  NUM   NUM  TIME  NUM  ...
        id                                           full_log
0  1000000  DATE TIME localhost kibana    type   error    ...
1  1000001  DATE TIME localhost kibana    type   error    ...
2  1000002  type syscall msg audit  NUM   NUM  TIME  NUM  ...
3  1000003  DATE TIME localhost kibana    type   error    ...
4  1000004  type syscall msg audit  NUM   NUM  TIME  NUM  ...
                                            full_log
0  type anom promiscuous msg audit  NUM   NUM  TI...
1  oscap  msg   xccdf result   scan id    NUM    ...
2  DATE TIME localhost kernel  out of memory  kil...


# 숫자 여러개 하나로, 공백 한 칸으로

In [176]:
train.full_log = train.full_log.str.replace(' +', ' ', regex=True)
train.full_log = train.full_log.str.replace('NUM (NUM ?)+', ' NUM ', regex=True)
train.full_log = train.full_log.str.replace(' +', ' ', regex=True)

test.full_log = test.full_log.str.replace(' +', ' ', regex=True)
test.full_log = test.full_log.str.replace('NUM (NUM ?)+', ' NUM ', regex=True)
test.full_log = test.full_log.str.replace(' +', ' ', regex=True)

validation.full_log = validation.full_log.str.replace(' +', ' ', regex=True)
validation.full_log = validation.full_log.str.replace('NUM (NUM ?)+', ' NUM ', regex=True)
validation.full_log = validation.full_log.str.replace(' +', ' ', regex=True)

In [177]:
search_by_index(random.randrange(1, train.shape[0]+1))



In [178]:
# save

train.to_csv('./data/cleaned_train_special.csv')
test.to_csv('./data/cleaned_test_special.csv')
validation.to_csv('./data/cleaned_validation_special.csv')

# train 위험도 중복 제거
+ 전처리 모두 끝낸 파일로 확인하는 게 좋겠다

In [179]:
def count_level(df):
    columns = df.columns
    count = 0
    for col in columns:
        count += (df[col] > 0)
    return count

In [180]:
pivot = train.pivot_table(values='id', index='full_log', columns='level', aggfunc='count', fill_value=0)
pivot['count'] = count_level(pivot)

In [181]:
dupl = pivot[pivot['count'] > 1].copy()
dupl

level,0,1,2,3,4,5,6,count
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DATE TIME localhost sudo apache tty unknown pwd var www html management user root command bin curl xget localhost NUM cat snapshots esild backup,0,4,1,0,0,0,0,2
juniper,4,0,0,1,0,0,0,2
level NUM log abort executable name pid version version built by builder on date error message,1,1,0,0,0,0,0,2
level NUM log aborting unable to run in the background as a daemon error message,1,4,0,0,0,0,0,2
level NUM log action error code error message,1,1,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...
level NUM log versions of rpd and librpd did not match not retrying,1,1,0,0,0,0,0,2
level NUM log vpn vpn name from remote address is down,2,1,0,0,0,0,0,2
level NUM log vpn vpn name from remote address is up,1,1,0,0,0,0,0,2
level NUM log write error on pipe to client name,1,0,0,0,0,1,0,2


In [182]:
pd.Series(np.argmax(dupl.iloc[:, :-1].values, axis=-1)).value_counts()

0    289
1     48
3      1
dtype: int64

In [183]:
dupl['level'] = np.argmax(dupl.iloc[:, :-1].values, axis=-1)
dupl

level,0,1,2,3,4,5,6,count,level
full_log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DATE TIME localhost sudo apache tty unknown pwd var www html management user root command bin curl xget localhost NUM cat snapshots esild backup,0,4,1,0,0,0,0,2,1
juniper,4,0,0,1,0,0,0,2,0
level NUM log abort executable name pid version version built by builder on date error message,1,1,0,0,0,0,0,2,0
level NUM log aborting unable to run in the background as a daemon error message,1,4,0,0,0,0,0,2,1
level NUM log action error code error message,1,1,0,0,0,1,0,3,0
...,...,...,...,...,...,...,...,...,...
level NUM log versions of rpd and librpd did not match not retrying,1,1,0,0,0,0,0,2,0
level NUM log vpn vpn name from remote address is down,2,1,0,0,0,0,0,2,0
level NUM log vpn vpn name from remote address is up,1,1,0,0,0,0,0,2,0
level NUM log write error on pipe to client name,1,0,0,0,0,1,0,2,0


In [184]:
for index in dupl.index:
    targets = train[(train['full_log']==index) & (train['level']!=dupl['level'][index])].index
    print(targets)
    train.drop(index=targets, inplace=True)

Int64Index([365348], dtype='int64')
Int64Index([170195], dtype='int64')
Int64Index([250804], dtype='int64')
Int64Index([240078], dtype='int64')
Int64Index([298133, 453023], dtype='int64')
Int64Index([105740], dtype='int64')
Int64Index([339593], dtype='int64')
Int64Index([316815], dtype='int64')
Int64Index([175756], dtype='int64')
Int64Index([139455], dtype='int64')
Int64Index([98809], dtype='int64')
Int64Index([168990], dtype='int64')
Int64Index([357230], dtype='int64')
Int64Index([29302, 205309], dtype='int64')
Int64Index([201612], dtype='int64')
Int64Index([112330], dtype='int64')
Int64Index([161686, 205196], dtype='int64')
Int64Index([437568], dtype='int64')
Int64Index([19711, 108574], dtype='int64')
Int64Index([105732], dtype='int64')
Int64Index([390321], dtype='int64')
Int64Index([283912], dtype='int64')
Int64Index([97446], dtype='int64')
Int64Index([292296], dtype='int64')
Int64Index([241377, 255263], dtype='int64')
Int64Index([367165], dtype='int64')
Int64Index([325913], dtype='

Int64Index([155895], dtype='int64')
Int64Index([69574, 359974, 404161], dtype='int64')
Int64Index([373627], dtype='int64')
Int64Index([267642], dtype='int64')
Int64Index([302735], dtype='int64')
Int64Index([448315], dtype='int64')
Int64Index([18139], dtype='int64')
Int64Index([167526], dtype='int64')
Int64Index([262657], dtype='int64')
Int64Index([36415], dtype='int64')
Int64Index([206366], dtype='int64')
Int64Index([436872], dtype='int64')
Int64Index([132623], dtype='int64')
Int64Index([391166], dtype='int64')
Int64Index([426662], dtype='int64')
Int64Index([114910], dtype='int64')
Int64Index([198830], dtype='int64')
Int64Index([ 10266,  61246,  97932, 110375, 110792, 117107, 170875, 177943,
            198652, 203453, 213078, 221135, 223337, 244642, 318387, 392111,
            419025, 447472, 460282, 461833],
           dtype='int64')
Int64Index([18415], dtype='int64')
Int64Index([226634], dtype='int64')
Int64Index([215724, 410377], dtype='int64')
Int64Index([329609, 419697], dtype='i

In [185]:
train['level'].value_counts()

0    334020
1    132181
3      4139
5      2180
2        11
4        10
6         8
Name: level, dtype: int64

# 5/14 추가 전처리 (반복되는 단어 삭제)
- DATE localhost
- level NUM log

In [186]:
train.full_log = train.full_log.apply(lambda x : x.strip())
test.full_log = test.full_log.apply(lambda x : x.strip())
validation.full_log = validation.full_log.apply(lambda x : x.strip())

In [187]:
train.full_log = train.full_log.str.replace('DATE TIME localhost','', regex=True) 
test.full_log = test.full_log.str.replace('DATE TIME localhost','', regex=True) 
validation.full_log = validation.full_log.str.replace('DATE TIME localhost','', regex=True) 

In [188]:
train.full_log = train.full_log.str.replace('level NUM log','', regex=True) 
test.full_log = test.full_log.str.replace('level NUM log','', regex=True) 
validation.full_log = validation.full_log.str.replace('level NUM log','', regex=True) 

### 위험도 2~6

- 2, 4, 6은 2000배 늘리고 
- 5는 10배 늘리고
- 3은 5배 늘려

In [190]:
dummy_2000times = train[train.level.isin([2, 4, 6])]
dummy_10times = train[train.level.isin([5])]
dummy_5times = train[train.level.isin([3])]

In [191]:
train.value_counts('level')

level
0    334020
1    132181
3      4139
5      2180
2        11
4        10
6         8
dtype: int64

In [192]:
print('증강 전 데이터 개수 : ', train.shape[0])

증강 전 데이터 개수 :  472549


In [193]:
dummy_dict = {1999:dummy_2000times, 9:dummy_10times, 4:dummy_5times}

In [194]:
def easiest(log, num):
    augmented = [log]
    augmented = augmented * num
    return augmented

In [197]:
for times, df in dummy_dict.items():
    print('.')
    for idx in range(df.shape[0]):
        
        level = df.iloc[idx][1]
        log = df.iloc[idx][2]
        
        augmented = easiest(log, times)

        for sent in augmented: # 증강된 문장 하나씩 꺼내서 기존 데이터에 추가!!
            new_data = {'level':level, 'full_log':sent}
            train = train.append(new_data, ignore_index=True)

.
.
.


In [198]:
train.value_counts('level')

level
0    334020
1    132181
2     22000
5     21800
3     20695
4     20000
6     16000
dtype: int64

In [199]:
print('증강 후 데이터 개수 : ', train.shape[0])

증강 후 데이터 개수 :  566696


In [200]:
train.to_csv('./data/train_firstword_cleaned_simple_augmented.csv')

# Easy Data Augmentation
https://github.com/jasonwei20/eda_nlp/blob/master/code/eda.py

### 랜덤 삭제

In [93]:
def random_deletion(words, p):

    #obviously, if there's only one word, don't delete it
    if len(words) == 1:
        return words

    #randomly delete words with probability p
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    #if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words


### 랜덤 스왑

In [94]:
def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words

### 증강

In [114]:
def easy_aug(sentence, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    tokens = sentence.split(' ')
    words = [word for word in tokens if word is not '']
    num_words = len(words)
    
    augmented_sentences = []
    num_new_per_new_technique = int(num_aug/2+1) # 기술마다 몇 문장씩?
    
    if (p_rd > 0):
        for _ in range(num_new_per_new_technique):
            a_words = random_deletion(words, p_rd)
            augmented_sentences.append(' '.join(a_words))
            
    if (alpha_rs > 0):
        n_rs = max(1, int(alpha_rs * num_words))
        for _ in range(num_new_per_new_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))
    
    shuffle(augmented_sentences)
    
    #trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
    
    #append the original sentence
    augmented_sentences.append(sentence)
    return augmented_sentences[:-1] # 마지막 요소는 원본 문장이길래 제외해봤음.

  words = [word for word in tokens if word is not '']


In [119]:
# test
easy_aug('this is a test', alpha_rs=0.3, p_rd=0.2, num_aug=9)

['is this a test',
 'this a test',
 'this a is test',
 'is a test',
 'is a',
 'is this a test',
 'a is this test',
 'this is a test',
 'this a test']

In [121]:
for idx in range(dummy_2000times.shape[0]): # 대상 데이터의 개수만큼 반복
    
    level = dummy_2000times.iloc[idx][1]
    log = dummy_2000times.iloc[idx][2]
    
    augmented = easy_aug(log, alpha_rs=0.2, p_rd=0.1, num_aug=1999)
    
    for sent in augmented: # 증강된 문장 하나씩 꺼내서 기존 데이터에 추가!!
        new_data = {'level':level, 'full_log':sent}
        train = train.append(new_data, ignore_index=True)

In [122]:
for idx in range(dummy_10times.shape[0]): # 대상 데이터의 개수만큼 반복
    
    level = dummy_10times.iloc[idx][1]
    log = dummy_10times.iloc[idx][2]
    
    augmented = easy_aug(log, alpha_rs=0.2, p_rd=0.1, num_aug=9)
    
    for sent in augmented: # 증강된 문장 하나씩 꺼내서 기존 데이터에 추가!!
        new_data = {'level':level, 'full_log':sent}
        train = train.append(new_data, ignore_index=True)

In [123]:
for idx in range(dummy_5times.shape[0]): # 대상 데이터의 개수만큼 반복
    
    level = dummy_5times.iloc[idx][1]
    log = dummy_5times.iloc[idx][2]
    
    augmented = easy_aug(log, alpha_rs=0.2, p_rd=0.1, num_aug=4)
    
    for sent in augmented: # 증강된 문장 하나씩 꺼내서 기존 데이터에 추가!!
        new_data = {'level':level, 'full_log':sent}
        train = train.append(new_data, ignore_index=True)