In [402]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

In [403]:
df = pd.read_csv('./train.csv')

In [404]:
label_ = list(set(df['label_action']))
label_cnt = len(label_)

idx = []
for i in range (0, label_cnt):
    idx.append(i)
ldf = df.replace(label_, idx)

# 0. Preprocessing

## 0.1. Decode URL
URL은 %NN 형태로 인코딩되어있기 때문에 디코딩을 해줄 것이다.

In [405]:
from urllib.parse import unquote
#test
test_url = ldf.payload[4]
dec_url = unquote(test_url)
print("Before:", test_url)
print("After:", dec_url)



Before: GET /sub_04_1_read.php?page=1&id=31%29%3BSELECT%20%2A%20FROM%20GENERATE_SERIES%289156%2C9156%2CCASE%20WHEN%20%289156%3D5350%29%20THEN%201%20ELSE%200%20END%29%20LIMIT%201-- HTTP/1.1\r\nCache-Control: no-cache\r\nUser-Agent: sqlmap/1.6.10#stable (https://sqlmap.org)\r\nReferer: http://10.0.17.20:80/sub_04_1_read.php\r\nHost: 10.0.17.20\r\nAccept: */*\r\nAccept-Encoding: gzip,deflate\r\nConnection: close\r\n\r\n
After: GET /sub_04_1_read.php?page=1&id=31);SELECT * FROM GENERATE_SERIES(9156,9156,CASE WHEN (9156=5350) THEN 1 ELSE 0 END) LIMIT 1-- HTTP/1.1\r\nCache-Control: no-cache\r\nUser-Agent: sqlmap/1.6.10#stable (https://sqlmap.org)\r\nReferer: http://10.0.17.20:80/sub_04_1_read.php\r\nHost: 10.0.17.20\r\nAccept: */*\r\nAccept-Encoding: gzip,deflate\r\nConnection: close\r\n\r\n


이런 형태로 모든 스트링에 적용시켜준다

In [406]:
from urllib.parse import unquote
decoded_payload = []
for url in ldf.payload:
    decoded_payload.append(unquote(url))
pd.options.display.max_colwidth = 100
ldf.payload = decoded_payload


In [407]:
ldf.to_csv("./decoded.csv", escapechar='\\')

잘 디코딩이 된 것을 확인할 수 있다.

## 0.2. 불필요한 데이터 제거
payload의 모든 내용이 학습에 도움이 되는 것은 아니다. 
문자열을 feature data로 사용할 때 중요한 것은 실제로 공격에 사용된 부분이다. 
따라서 크게 영향을 주지 않을 것으로 부분들을 제거할것이다

In [408]:
duplicate_values = ldf['payload'].duplicated()
i = 0
cnt = 0
for pl in df.payload:
    if (duplicate_values[i] == True):
        cnt += 1
    i += 1
print("Number of dupes: ", cnt)

Number of dupes:  13246


In [409]:
ldf = ldf.drop_duplicates(subset=['payload'])

In [410]:
from hashlib import sha256

# for payload in ldf.payload:
#     if (len(payload) == 185):
#         print(payload)
new_payload = []
for payload in ldf.payload:
    #hashed_payload = []
    # for field in payload:
    #     hash = sha256(field.encode())
    #     hash2 = hash.hexdigest()
    #     hashed_payload.append(hash2)
    hash = sha256(payload.encode())
    hash2 = hash.hexdigest()
    new_payload.append(hash2)

ldf.payload = new_payload

In [415]:
ldf

Unnamed: 0,Log_Number,payload,label_action
0,0,[0d027677821e24b2ea01b13f6dbe67d658d7fae7cccfa31f768e01838cd0e0df],8
1,1,[6d62336c123095a448ef1213b1327e299baa39921c25a18aed32d3715bbb9c66],8
2,2,[bc359a6c366ead580634c024ba37773db949a86be44a48092766b3dca55f38eb],4
3,3,[4951ea67db0afe468c5b64b5aa1ecaafefaf3fe79466f195d65a6beceeb25c9d],1
4,4,[4c108a8378c3b063a2a8707a569674c8eb2463170725da7b44ac79ec53d5171b],8
...,...,...,...
44993,44993,[560164bf69caef493fe72762c967c781ea5e86503428d32d05b5393615a4bb76],3
44995,44995,[ff267828502774b4f564b296fc13913ddaf406ce2ec963db1feef06c06b34b9b],8
44996,44996,[18e2e70e29a48d037ddcb13ae11e7002c21cec45e3df159ba87bb67a7d7f3727],8
44997,44997,[594dbbb6efabfd872affcfd36a1854be251f9ee05b0e6bee7381c023f97fdbf9],4


## Tokenizing
공백, &, =, :, ;, +로 문자열 분리하기

In [411]:

split_payload = []
for payload in ldf.payload:
    #table = payload.maketrans("&=;:+ ", ",,,,,,")
    #payload = payload.translate(table)
    #tokens = payload.split(',')
    tokens = payload.split('\\r\\n')
    while("" in tokens):
        tokens.remove('')
    split_payload.append(tokens)
ldf.payload = split_payload

In [412]:
ldf.to_csv("./split.csv")

In [413]:
ldf

Unnamed: 0,Log_Number,payload,label_action
0,0,[0d027677821e24b2ea01b13f6dbe67d658d7fae7cccfa31f768e01838cd0e0df],8
1,1,[6d62336c123095a448ef1213b1327e299baa39921c25a18aed32d3715bbb9c66],8
2,2,[bc359a6c366ead580634c024ba37773db949a86be44a48092766b3dca55f38eb],4
3,3,[4951ea67db0afe468c5b64b5aa1ecaafefaf3fe79466f195d65a6beceeb25c9d],1
4,4,[4c108a8378c3b063a2a8707a569674c8eb2463170725da7b44ac79ec53d5171b],8
...,...,...,...
44993,44993,[560164bf69caef493fe72762c967c781ea5e86503428d32d05b5393615a4bb76],3
44995,44995,[ff267828502774b4f564b296fc13913ddaf406ce2ec963db1feef06c06b34b9b],8
44996,44996,[18e2e70e29a48d037ddcb13ae11e7002c21cec45e3df159ba87bb67a7d7f3727],8
44997,44997,[594dbbb6efabfd872affcfd36a1854be251f9ee05b0e6bee7381c023f97fdbf9],4


In [414]:
from hashlib import sha256

# for payload in ldf.payload:
#     if (len(payload) == 185):
#         print(payload)
new_payload = []
for payload in ldf.payload:
    #hashed_payload = []
    # for field in payload:
    #     hash = sha256(field.encode())
    #     hash2 = hash.hexdigest()
    #     hashed_payload.append(hash2)
    hash = sha256(payload.encode())
    hash2 = hash.hexdigest()
    new_payload.append(hash2)

ldf.payload = new_payload



AttributeError: 'list' object has no attribute 'encode'

In [None]:
ldf

Unnamed: 0,Log_Number,payload,label_action
0,0,"[9d828b99ee83fca7abb3d5fb59a851208c20be4adfabf07d4eb34ead93ce1234, 511e1e1bfd37827742869ddf0996e...",8
1,1,"[135e09f43f86aec89da9a0b52dda0406058be4a44d31ee057d933b3092c4928a, 7350f08c376b672ce87da99c1fe77...",8
2,2,"[2f06064595d62a9383acf7a12eb80b0d332c72fd2b1d8ce6741cb604e19a7566, 94d613cc7461cdc32b457a5f45ced...",4
3,3,[fc5d49bbaf39f98b659cbafac841654f529b9690293d106902a27acb82aad4b6],1
4,4,"[f1e0e7bcfe440c734169802110cbfd3f4ac0204d6b7fa853b3841ccce2e2c56d, af13fd0efc2ed0de793f083651982...",8
...,...,...,...
44993,44993,"[a0944faa9f4faf87f441ff6fc3115559049c067c7a7957245e8d9f29d54d5ef5, ce8edff17c3f0da70695308651d5c...",3
44995,44995,"[dcca888ee4f358c0126591d7bb2d59c913699976190d8c2e6c4d0c87d4a1fccc, ce8edff17c3f0da70695308651d5c...",8
44996,44996,"[54ccda9ea315a23048262e444fd6ea0be22cb098a10154e8ef88e95aa33c514b, ce8edff17c3f0da70695308651d5c...",8
44997,44997,"[28ffbee02ca8c6259de54588d8104a04a3f5174cf5b591dc45a37165fc9b9278, 710695fcb4cae17bdb8743763d2be...",4


In [None]:
max_len = 0
for payload in ldf.payload:
    print(len(payload))
    max_len = max(max_len, len(payload))
i = 0
for payload in ldf.payload:
    if (len(payload) == max_len):
        print(payload)
        ldf.la
    i += 1
        



7
10
9
1
8
10
8
8
7
7
4
11
7
9
8
7
9
10
9
12
9
9
10
8
11
7
6
7
9
9
9
9
9
7
7
10
7
7
10
4
7
3
4
4
9
10
9
8
7
9
9
9
10
4
9
12
3
7
7
10
9
8
10
9
9
8
8
5
9
9
9
8
7
11
1
9
1
10
10
7
9
10
7
4
10
9
7
3
4
11
7
10
11
8
10
10
7
8
7
4
9
9
9
11
12
7
9
7
9
8
8
7
9
4
10
11
7
7
5
7
10
5
7
14
4
7
9
7
10
9
9
8
7
8
9
7
3
1
9
4
10
7
9
7
7
11
9
9
7
11
10
10
4
7
7
11
11
9
9
8
7
9
8
9
8
10
9
11
10
7
10
9
9
10
8
9
9
9
7
7
5
9
7
7
7
10
7
9
9
9
7
7
10
8
4
10
10
7
9
7
7
8
11
7
3
8
4
9
8
7
11
7
10
5
7
7
4
8
5
10
9
7
4
7
4
9
5
9
7
10
7
8
11
9
6
7
5
8
9
10
7
4
1
10
7
9
9
9
9
9
10
9
7
7
7
7
10
10
8
8
7
9
4
9
7
4
9
8
1
10
7
9
10
4
7
7
12
10
4
9
9
7
7
7
9
9
10
10
7
7
1
7
5
8
7
7
7
9
8
4
7
7
4
7
10
6
7
7
9
9
7
7
10
7
9
8
8
7
9
9
10
7
8
4
10
10
10
5
4
10
4
10
7
4
7
10
3
14
7
7
7
1
7
4
7
2
9
9
11
7
7
7
2
12
8
9
10
7
9
4
7
11
8
5
9
7
10
5
7
1
12
9
8
9
9
8
7
5
9
7
7
10
7
8
9
14
9
4
7
7
10
11
9
10
3
7
10
9
9
9
10
8
9
9
7
11
10
11
7
5
9
9
7
5
4
9
4
10
8
4
4
4
8
9
7
10
7
7
9
4
11
7
7
8
3
7
8
8
7
5
9
10
8
7
9
8
7
7
3
4
7
9
5
