# 전처리

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Load the training logs; the first CSV column becomes the index.
train = pd.read_csv('../data/train.csv', index_col=0)

# Report (rows, columns), then display the log text stored under index label 0.
print(train.shape)
train.loc[0, 'full_log']

(472972, 2)




In [3]:
# Count how many rows carry each value of the `level` column.
train['level'].value_counts()

0    334065
1    132517
3      4141
5      2219
2        12
4        10
6         8
Name: level, dtype: int64

### 첫 번째 단어 위주로 로그 유형 파악

In [4]:
# Leading token of each log: the run of non-space characters at the very start.
# expand=False returns the single capture group as a Series.
train['first_word'] = train['full_log'].str.extract('^([^ ]+)(?: |$)', expand=False)

In [5]:
def show_contains(target, *, col='full_log', first_words=None, cnt=None):
    """Print every value of ``train[col]`` that matches ``target``.

    Parameters
    ----------
    target : str
        Regular expression handed to ``Series.str.contains``.
    col : str, keyword-only
        Column of the global ``train`` frame that is searched and printed.
    first_words : collection of str, optional
        When given, only rows whose ``first_word`` value is in this
        collection are kept.
    cnt : int, optional
        Maximum number of matching rows to print; ``None`` prints all of them.

    Returns
    -------
    None
        Each match is printed followed by a blank line.
    """
    # na=False treats missing values as "no match", keeping the mask strictly
    # boolean; otherwise a column with missing entries cannot be used to index.
    condition = train[col].str.contains(target, na=False)
    if first_words is not None:
        condition = condition & train['first_word'].isin(first_words)

    # A slice bound of None means "no limit", so one expression covers both
    # the capped and the uncapped case.
    for log in train.loc[condition, col].iloc[:cnt]:
        print(log, '\n')

In [6]:
# Leading-token counts for the logs that contain an HH:MM:SS time.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
train.loc[train['full_log'].str.contains(r'\d{2}:\d{2}:\d{2}'), 'first_word'].value_counts()

Jan             207025
Feb              35544
Dec              25882
Oct              23883
Sep              23193
Nov              21875
Mar              12174
type=SYSCALL       152
File                72
2021                22
2020                 9
Name: first_word, dtype: int64

In [7]:
# Leading-token counts for the logs that contain a YYYY-MM-DD date.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
train.loc[train['full_log'].str.contains(r'\d{4}-\d{2}-\d{2}'), 'first_word'].value_counts()

Jan             171098
Dec              23428
Sep              22389
Feb              20855
Oct              14669
Nov               3722
Mar               3285
type=SYSCALL       152
Name: first_word, dtype: int64

# 1. 마스킹 처리
- 날짜 및 시간  
`2020 Oct 30 08:39:31`  
`Fri Sep 18 13:36:11 2020`  
`2021-01-12T07:22:32Z`

 
- IP 주소  
`211.253.243.71`  
(단, `127.0.0.1`은 `localhost`로 대체)


- 헥사, 시리얼 넘버, 숫자

In [5]:
# Column whose text is rewritten by the masking rules.
TARGET_COLUMN = 'full_log'

# Ordered (regex, replacement) rules. Order matters: later rules run on the
# output of earlier ones (e.g. the <DAY> rule matches the "<DATE> <TIME>"
# text produced by the rule before it, and loopback is replaced before the
# generic IP rule). All patterns are raw strings so that regex escapes such
# as \d are not parsed as (invalid) Python string escapes.
PATTERNS = [
    # ISO-style timestamp, e.g. 2021-01-12T07:22:32Z
    (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z?', '<TS>'),
    # YEAR, MON, DAY, TIME
    (r'\d{4}(?= (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))', '<YEAR>'),
    (r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}', '<DATE> <TIME>'),
    (r'[A-Za-z]{3}\s+<DATE> <TIME>\s+\d{4}', '<DAY> <DATE> <TIME> <YEAR>'),
    # IP
    # Loopback only: dots are escaped so they match literal dots, and the
    # digit/dot guards stop the rule from firing inside a longer address
    # such as 127.0.0.10 or 10.127.0.0.1 (those fall through to <IP>).
    (r'(?<![\d.])127\.0\.0\.1(?!\d)', 'localhost'),
    (r'\d+\.\d+\.\d+\.\d+(?:\.\d+)?', '<IP>'),
    # HEX, NUM
    (r'(?<![0-9a-fA-F])0x[0-9a-fA-F]+(?=\W|$)', '<HEX>'),
    # Dash-separated hex groups containing at least one digit and one letter.
    (r'(?<=\W)(?=[a-fA-F0-9\-]*[0-9])(?=[a-fA-F0-9\-]*[a-fA-F])[a-fA-F0-9]{3,}(?:-[a-fA-F0-9]{3,})+(?=\W|$)', '<SN>'),
    # Hex/decimal runs directly after '=', after ':', or between single quotes.
    (r'(?<==)[a-fA-F0-9]+(?=\W|$)', '<NUM>'),
    (r'(?<=:)[a-fA-F0-9]+(?=\s|$)', '<NUM>'),
    (r"(?<=')[a-fA-F0-9]+(?=')", '<NUM>'),
    # Space-preceded hex runs of 4+ characters that contain a digit.
    (r'(?<= )(?=[a-fA-F0-9]*[0-9])[a-fA-F0-9]{4,}(?=\W|$)', '<NUM>'),
    # Any remaining stand-alone decimal number.
    (r'(?<=[^a-zA-Z0-9])(\d+)(?=[^a-zA-Z0-9]|$)', '<NUM>'),
]

def apply_masking(df):
    """Rewrite ``df[TARGET_COLUMN]`` in place by applying every rule in ``PATTERNS``.

    The rules are applied in list order, each one operating on the output of
    the previous one. The frame is modified directly; nothing is returned.
    """
    masked = df[TARGET_COLUMN]
    for regex, token in PATTERNS:
        masked = masked.str.replace(regex, token, regex=True)
    df[TARGET_COLUMN] = masked

In [12]:
# Hard-coded sample log line (syslog prefix followed by a kibana JSON payload) used to try the masking rules on a single string.
example = 'Sep 24 10:02:22 localhost kibana: {"type":"error","@timestamp":"2020-09-24T01:02:22Z","tags":["warning","stats-collection"],"pid":6458,"level":"error","error":{"message":"No Living connections","name":"Error","stack":"Error: No Living connections\\n    at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\\n    at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\\n    at process._tickCallback (internal/process/next_tick.js:61:11)"},"message":"No Living connections"}'

In [16]:
import re

# Apply every masking rule, in list order, to the sample string.
# `example` is overwritten on each pass, so re-running this cell feeds the
# already-masked text through the rules again.
for pat, repl in PATTERNS:
    example = re.sub(pat, repl, example)

In [17]:
# Display the current value of the sample string.
example



In [46]:
# Map each split name to its DataFrame; the name is reused in the output filename.
# NOTE(review): `validation` and `test` are not defined in the visible cells —
# confirm they are loaded before this cell runs on a fresh kernel.
FILES = {'train': train, 'validation': validation, 'test': test}

for file, df in FILES.items():
    # apply_masking rewrites the frame's log column in place, so the frames
    # referenced in FILES are themselves modified.
    apply_masking(df)
    # NOTE(review): output is written under ./data while train.csv is read from
    # ../data — confirm this directory is intended and exists.
    # NOTE(review): `train` may also carry the `first_word` helper column at this
    # point, in which case it is written out as well — confirm that is wanted.
    df.to_csv(f'./data/{file}_masked_02.csv')

In [43]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,level,full_log
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err..."
1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...
2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err..."
3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err..."
4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...


In [44]:
# Preview the first rows of the validation frame.
# NOTE(review): `validation` is not defined in the visible cells — confirm where it is loaded.
validation.head()

Unnamed: 0,full_log
0,type=ANOM_PROMISCUOUS msg=audit(1600402733.466...
1,"oscap: msg: ""xccdf-result"", scan-id: ""00016007..."
2,Sep 22 10:56:19 localhost kernel: Out of memor...


In [45]:
# Preview the first rows of the test frame.
# NOTE(review): `test` is not defined in the visible cells — confirm where it is loaded.
test.head()

Unnamed: 0_level_0,full_log
id,Unnamed: 1_level_1
1000000,"Feb 8 15:47:26 localhost kibana: {""type"":""err..."
1000001,"Sep 24 03:46:39 localhost kibana: {""type"":""err..."
1000002,type=SYSCALL msg=audit(1611888200.428:210563):...
1000003,"Jan 18 11:24:06 localhost kibana: {""type"":""err..."
1000004,type=SYSCALL msg=audit(1603081202.050:46851): ...
