# Part1 -- Data Prepare

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
def from_txt_to_dataframe(src_file,is_malicious,injection_type):
    
    #read file
    payloads_txt = open('data/{}.txt'.format(src_file),'r',encoding='UTF-8').readlines()
    
    #create dataframe
    
    payloads = pd.DataFrame(payloads_txt,columns=['payload'])
    payloads['is_malicious'] = [is_malicious]*len(payloads)
    payloads['injection_type'] = [injection_type]*len(payloads)
    
    print('First 5 lines of ' + injection_type)
    display(payloads.head(10)) # default is 5
    
    return payloads

In [3]:
# payloads = pd.DataFrame(columns=['payload','is_malicious','injection_type'])
# payloads = payloads.append(from_txt_to_dataframe('SQLCollection',1,'SQL'))

In [4]:
# payloads = payloads.append(from_txt_to_dataframe('XSSCollection',1,'XSS'))

In [5]:
# payloads = payloads.append(from_txt_to_dataframe('ShellCollection',1,'SHELL'))

In [6]:
# payloads = payloads.append(from_txt_to_dataframe('non-maliciousCollection',0,'LEGAL'))

In [7]:
payloads = pd.read_csv("data/payloads.csv",index_col='index')
display(payloads.head(20))

Unnamed: 0_level_0,payload,is_malicious,injection_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37662577P,0.0,LEGAL
1,shirting,0.0,LEGAL
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS
3,obeying,0.0,LEGAL
4,dictating,0.0,LEGAL
5,lafleur,0.0,LEGAL
6,capturers,0.0,LEGAL
7,8nca58z48,0.0,LEGAL
8,autocratic,0.0,LEGAL
9,grocery+warehouses,0.0,LEGAL


# Part 2 -- Features Extract

In [8]:
print(type(payloads), payloads.shape)
display(payloads.head(50))

<class 'pandas.core.frame.DataFrame'> (110357, 3)


Unnamed: 0_level_0,payload,is_malicious,injection_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37662577P,0.0,LEGAL
1,shirting,0.0,LEGAL
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS
3,obeying,0.0,LEGAL
4,dictating,0.0,LEGAL
5,lafleur,0.0,LEGAL
6,capturers,0.0,LEGAL
7,8nca58z48,0.0,LEGAL
8,autocratic,0.0,LEGAL
9,grocery+warehouses,0.0,LEGAL


In [9]:
def create_length_feature(payloads):
    payloads['length'] = [len(str(r)) for r in payloads['payload']]
    return payloads

In [10]:
payloads = create_length_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,37662577P,0.0,LEGAL,9
1,shirting,0.0,LEGAL,8
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31
3,obeying,0.0,LEGAL,7
4,dictating,0.0,LEGAL,9
5,lafleur,0.0,LEGAL,7
6,capturers,0.0,LEGAL,9
7,8nca58z48,0.0,LEGAL,9
8,autocratic,0.0,LEGAL,10
9,grocery+warehouses,0.0,LEGAL,18


In [11]:
print(payloads['length'].name)
print(payloads['length'].describe())
print(payloads['length'].describe().name)
print(payloads['length'].describe().dtype)
print(payloads['length'].describe().mean)
print(payloads['length'].describe()[7])

length
count    110357.000000
mean         16.559629
std          32.108640
min           1.000000
25%           6.000000
50%           9.000000
75%          14.000000
max         974.000000
Name: length, dtype: float64
length
float64
<bound method Series.mean of count    110357.000000
mean         16.559629
std          32.108640
min           1.000000
25%           6.000000
50%           9.000000
75%          14.000000
max         974.000000
Name: length, dtype: float64>
974.0


In [12]:
import string
def create_non_printable_characters_feature(payloads):
    payloads['non_printable_chars'] = [len([1 for c in str(r) if c not in string.printable]) for r in payloads['payload']]
    return payloads

In [13]:
print(string.printable, len(string.printable), string.printable[:62], string.printable[62:100])
# for i, c in enumerate(string.printable[62:100]): print(i,c)
payloads = create_non_printable_characters_feature(payloads)
display(payloads.head(50))

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	
 100 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,37662577P,0.0,LEGAL,9,0
1,shirting,0.0,LEGAL,8,0
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0
3,obeying,0.0,LEGAL,7,0
4,dictating,0.0,LEGAL,9,0
5,lafleur,0.0,LEGAL,7,0
6,capturers,0.0,LEGAL,9,0
7,8nca58z48,0.0,LEGAL,9,0
8,autocratic,0.0,LEGAL,10,0
9,grocery+warehouses,0.0,LEGAL,18,0


In [14]:
print(payloads['non_printable_chars'].name)
print(payloads['non_printable_chars'].describe())
print(payloads['non_printable_chars'].describe().name)
print(payloads['non_printable_chars'].describe().dtype)
print(payloads['non_printable_chars'].describe().mean)
print(payloads['non_printable_chars'].describe()[7])

non_printable_chars
count    110357.000000
mean          0.007412
std           0.216736
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          30.000000
Name: non_printable_chars, dtype: float64
non_printable_chars
float64
<bound method Series.mean of count    110357.000000
mean          0.007412
std           0.216736
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          30.000000
Name: non_printable_chars, dtype: float64>
30.0


In [15]:
def create_punctuation_chars_feature(payloads):
    payloads['punctuation'] = [ len([1 for c in str(r) if c in string.punctuation]) for r in payloads['payload']]
    return payloads

In [16]:
print(string.punctuation)
payloads = create_punctuation_chars_feature(payloads)
display(payloads.head(50))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,37662577P,0.0,LEGAL,9,0,0
1,shirting,0.0,LEGAL,8,0,0
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11
3,obeying,0.0,LEGAL,7,0,0
4,dictating,0.0,LEGAL,9,0,0
5,lafleur,0.0,LEGAL,7,0,0
6,capturers,0.0,LEGAL,9,0,0
7,8nca58z48,0.0,LEGAL,9,0,0
8,autocratic,0.0,LEGAL,10,0,0
9,grocery+warehouses,0.0,LEGAL,18,0,1


In [17]:
print(payloads['punctuation'].name)
print(payloads['punctuation'].describe())
print(payloads['punctuation'].describe().name)
print(payloads['punctuation'].describe().dtype)
print(payloads['punctuation'].describe().mean)
print(payloads['punctuation'].describe()[7])

punctuation
count    110357.000000
mean          2.363729
std           9.771260
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max         538.000000
Name: punctuation, dtype: float64
punctuation
float64
<bound method Series.mean of count    110357.000000
mean          2.363729
std           9.771260
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max         538.000000
Name: punctuation, dtype: float64>
538.0


In [18]:
# for r in payloads['payload']: print(r, min(str(r)))

In [19]:
def create_min_byte_value_feature(payloads):
    payloads['min-byte'] = [ min(bytearray(str(r), 'utf-8')) for r in payloads['payload']]
    return payloads

In [20]:
payloads = create_min_byte_value_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,37662577P,0.0,LEGAL,9,0,0,50
1,shirting,0.0,LEGAL,8,0,0,103
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37
3,obeying,0.0,LEGAL,7,0,0,98
4,dictating,0.0,LEGAL,9,0,0,97
5,lafleur,0.0,LEGAL,7,0,0,97
6,capturers,0.0,LEGAL,9,0,0,97
7,8nca58z48,0.0,LEGAL,9,0,0,52
8,autocratic,0.0,LEGAL,10,0,0,97
9,grocery+warehouses,0.0,LEGAL,18,0,1,43


In [21]:
print(payloads['min-byte'].name)
print(payloads['min-byte'].describe())
print(payloads['min-byte'].describe().name)
print(payloads['min-byte'].describe().dtype)
print(payloads['min-byte'].describe().mean)
print(payloads['min-byte'].describe()[7])

min-byte
count    110357.000000
mean         71.225749
std          26.545783
min           9.000000
25%          48.000000
50%          75.000000
75%          97.000000
max         125.000000
Name: min-byte, dtype: float64
min-byte
float64
<bound method Series.mean of count    110357.000000
mean         71.225749
std          26.545783
min           9.000000
25%          48.000000
50%          75.000000
75%          97.000000
max         125.000000
Name: min-byte, dtype: float64>
125.0


In [22]:
def create_max_byte_value_feature(payloads):
    payloads['max-byte'] = [ max(bytearray(str(r), 'utf-8')) for r in payloads['payload'] ]
    return payloads

In [23]:
payloads = create_max_byte_value_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80
1,shirting,0.0,LEGAL,8,0,0,103,116
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119
3,obeying,0.0,LEGAL,7,0,0,98,121
4,dictating,0.0,LEGAL,9,0,0,97,116
5,lafleur,0.0,LEGAL,7,0,0,97,117
6,capturers,0.0,LEGAL,9,0,0,97,117
7,8nca58z48,0.0,LEGAL,9,0,0,52,122
8,autocratic,0.0,LEGAL,10,0,0,97,117
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121


In [24]:
print(payloads['max-byte'].name)
print(payloads['max-byte'].describe())
print(payloads['max-byte'].describe().name)
print(payloads['max-byte'].describe().dtype)
print(payloads['max-byte'].describe().mean)
print(payloads['max-byte'].describe()[7])

max-byte
count    110357.000000
mean        109.495166
std          20.327684
min          33.000000
25%         114.000000
50%         116.000000
75%         118.000000
max         240.000000
Name: max-byte, dtype: float64
max-byte
float64
<bound method Series.mean of count    110357.000000
mean        109.495166
std          20.327684
min          33.000000
25%         114.000000
50%         116.000000
75%         118.000000
max         240.000000
Name: max-byte, dtype: float64>
240.0


In [25]:
def create_mean_byte_value_feature(payloads):
    payloads['mean-byte'] = [ sum(bytearray(str(r), 'utf-8'))/ len(bytearray(str(r), 'utf-8')) for r in payloads['payload']]
    return payloads

In [26]:
payloads = create_mean_byte_value_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte,mean-byte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80,56.333333
1,shirting,0.0,LEGAL,8,0,0,103,116,109.0
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119,65.806452
3,obeying,0.0,LEGAL,7,0,0,98,121,107.0
4,dictating,0.0,LEGAL,9,0,0,97,116,105.666667
5,lafleur,0.0,LEGAL,7,0,0,97,117,106.714286
6,capturers,0.0,LEGAL,9,0,0,97,117,109.444444
7,8nca58z48,0.0,LEGAL,9,0,0,52,122,77.888889
8,autocratic,0.0,LEGAL,10,0,0,97,117,107.1
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121,105.555556


In [27]:
print(payloads['mean-byte'].name)
print(payloads['mean-byte'].describe())
print(payloads['mean-byte'].describe().name)
print(payloads['mean-byte'].describe().dtype)
print(payloads['mean-byte'].describe().mean)
print(payloads['mean-byte'].describe()[7])

mean-byte
count    110357.000000
mean         92.953226
std          19.316953
min          33.000000
25%          83.416667
50%         101.875000
75%         107.142857
max         164.000000
Name: mean-byte, dtype: float64
mean-byte
float64
<bound method Series.mean of count    110357.000000
mean         92.953226
std          19.316953
min          33.000000
25%          83.416667
50%         101.875000
75%         107.142857
max         164.000000
Name: mean-byte, dtype: float64>
164.0


In [28]:
def create_standard_deviation_byte_value_feature(payloads):
    payloads['standard-deviation-byte'] = [ np.std(bytearray(str(r), 'utf-8')) for r in payloads['payload']]
    return payloads

In [29]:
payloads = create_standard_deviation_byte_value_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte,mean-byte,standard-deviation-byte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80,56.333333,8.537499
1,shirting,0.0,LEGAL,8,0,0,103,116,109.0,5.049752
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119,65.806452,26.617263
3,obeying,0.0,LEGAL,7,0,0,98,121,107.0,7.151423
4,dictating,0.0,LEGAL,9,0,0,97,116,105.666667,6.599663
5,lafleur,0.0,LEGAL,7,0,0,97,117,106.714286,6.670067
6,capturers,0.0,LEGAL,9,0,0,97,117,109.444444,7.558823
7,8nca58z48,0.0,LEGAL,9,0,0,52,122,77.888889,26.904954
8,autocratic,0.0,LEGAL,10,0,0,97,117,107.1,8.117266
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121,105.555556,16.836542


In [30]:
print(payloads['standard-deviation-byte'].name)
print(payloads['standard-deviation-byte'].describe())
print(payloads['standard-deviation-byte'].describe().name)
print(payloads['standard-deviation-byte'].describe().dtype)
print(payloads['standard-deviation-byte'].describe().mean)
print(payloads['standard-deviation-byte'].describe()[7])

standard-deviation-byte
count    110357.000000
mean         12.656653
std           9.197185
min           0.000000
25%           5.914355
50%           7.812810
75%          20.754440
max          75.013332
Name: standard-deviation-byte, dtype: float64
standard-deviation-byte
float64
<bound method Series.mean of count    110357.000000
mean         12.656653
std           9.197185
min           0.000000
25%           5.914355
50%           7.812810
75%          20.754440
max          75.013332
Name: standard-deviation-byte, dtype: float64>
75.0133321483588


In [31]:
def create_distinct_byte_value_feature(payloads):
    payloads['distinct-byte'] = [ len(set(str(r))) for r in payloads['payload']]
    return payloads

In [32]:
payloads = create_distinct_byte_value_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte,mean-byte,standard-deviation-byte,distinct-byte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80,56.333333,8.537499,6
1,shirting,0.0,LEGAL,8,0,0,103,116,109.0,5.049752,7
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119,65.806452,26.617263,18
3,obeying,0.0,LEGAL,7,0,0,98,121,107.0,7.151423,7
4,dictating,0.0,LEGAL,9,0,0,97,116,105.666667,6.599663,7
5,lafleur,0.0,LEGAL,7,0,0,97,117,106.714286,6.670067,6
6,capturers,0.0,LEGAL,9,0,0,97,117,109.444444,7.558823,8
7,8nca58z48,0.0,LEGAL,9,0,0,52,122,77.888889,26.904954,7
8,autocratic,0.0,LEGAL,10,0,0,97,117,107.1,8.117266,7
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121,105.555556,16.836542,12


In [33]:
print(payloads['distinct-byte'].name)
print(payloads['distinct-byte'].describe())
print(payloads['distinct-byte'].describe().name)
print(payloads['distinct-byte'].describe().dtype)
print(payloads['distinct-byte'].describe().mean)
print(payloads['distinct-byte'].describe()[7])

distinct-byte
count    110357.000000
mean          9.477577
std           7.398486
min           1.000000
25%           5.000000
50%           7.000000
75%          10.000000
max          76.000000
Name: distinct-byte, dtype: float64
distinct-byte
float64
<bound method Series.mean of count    110357.000000
mean          9.477577
std           7.398486
min           1.000000
25%           5.000000
50%           7.000000
75%          10.000000
max          76.000000
Name: distinct-byte, dtype: float64>
76.0


In [34]:
sql_keywords = pd.read_csv('data/SQLKeywords.txt', index_col=False)
def create_sql_keywords_feature(payloads):
    payloads['sql-keywords'] = [ len([1 for keyword in sql_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads

In [35]:
create_sql_keywords_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte,mean-byte,standard-deviation-byte,distinct-byte,sql-keywords
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80,56.333333,8.537499,6,0
1,shirting,0.0,LEGAL,8,0,0,103,116,109.0,5.049752,7,0
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119,65.806452,26.617263,18,0
3,obeying,0.0,LEGAL,7,0,0,98,121,107.0,7.151423,7,0
4,dictating,0.0,LEGAL,9,0,0,97,116,105.666667,6.599663,7,0
5,lafleur,0.0,LEGAL,7,0,0,97,117,106.714286,6.670067,6,0
6,capturers,0.0,LEGAL,9,0,0,97,117,109.444444,7.558823,8,0
7,8nca58z48,0.0,LEGAL,9,0,0,52,122,77.888889,26.904954,7,0
8,autocratic,0.0,LEGAL,10,0,0,97,117,107.1,8.117266,7,0
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121,105.555556,16.836542,12,0


In [36]:
js_keywords = pd.read_csv('data/JavascriptKeywords.txt', index_col=False)
def create_javascript_keywords_feature(payloads):
    payloads['js-keywords'] = [len([1 for keyword in js_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads

In [37]:
create_javascript_keywords_feature(payloads)
display(payloads.head(50))

Unnamed: 0_level_0,payload,is_malicious,injection_type,length,non_printable_chars,punctuation,min-byte,max-byte,mean-byte,standard-deviation-byte,distinct-byte,sql-keywords,js-keywords
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,37662577P,0.0,LEGAL,9,0,0,50,80,56.333333,8.537499,6,0,0
1,shirting,0.0,LEGAL,8,0,0,103,116,109.0,5.049752,7,0,1
2,&kw=%27;alert%28%27XSS%27%29;//,1.0,XSS,31,0,11,37,119,65.806452,26.617263,18,0,1
3,obeying,0.0,LEGAL,7,0,0,98,121,107.0,7.151423,7,0,1
4,dictating,0.0,LEGAL,9,0,0,97,116,105.666667,6.599663,7,0,1
5,lafleur,0.0,LEGAL,7,0,0,97,117,106.714286,6.670067,6,0,0
6,capturers,0.0,LEGAL,9,0,0,97,117,109.444444,7.558823,8,0,0
7,8nca58z48,0.0,LEGAL,9,0,0,52,122,77.888889,26.904954,7,0,0
8,autocratic,0.0,LEGAL,10,0,0,97,117,107.1,8.117266,7,0,0
9,grocery+warehouses,0.0,LEGAL,18,0,1,43,121,105.555556,16.836542,12,0,0


In [63]:
payloads.to_csv("data/processed_payloads.csv", encoding='utf-8', index = True, header=True)

In [49]:
login_keywords = pd.read_csv('data/darkweb2017-top10000.txt', index_col=False)
# print(login_keywords, type(login_keywords), login_keywords.shape)
def create_web_login_keywords_feature(payloads):
    payloads['login_keywords'] = [len([1 for keyword in login_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads

In [51]:
# create_web_login_keywords_feature(payloads)
# display(payloads.head(50))

In [52]:
name_keywords = pd.read_csv('data/names.txt', index_col=False)
def create_name_keywords_feature(payloads):
    payloads['name-keywords'] = [len([1 for keyword in name_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads

In [53]:
# create_name_keywords_feature(payloads)
# display(payloads.head(50))

In [54]:
def create_features(payloads):
    features = create_length_feature(payloads)
    features = create_non_printable_characters_feature(features)
    features = create_punctuation_chars_feature(features)
    features = create_max_byte_value_feature(features)
    features = create_min_byte_value_feature(features)
    features = create_mean_byte_value_feature(features)
    features = create_standard_deviation_byte_value_feature(features)
    features = create_distinct_byte_value_feature(features)
    features = create_sql_keywords_feature(features)
    del features['payload']
    return features

In [55]:
def create_powerful_features(payloads):
    features = create_web_login_keywords_feature(payloads)
    features = create_name_keywords_feature(features)
    del features['payload']
    return features

In [56]:
Y = payloads['is_malicious']
X = create_features(pd.DataFrame(payloads['payload'][:]))

In [57]:
display(X.head(50))

Unnamed: 0_level_0,length,non_printable_chars,punctuation,max-byte,min-byte,mean-byte,standard-deviation-byte,distinct-byte,sql-keywords
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,9,0,0,80,50,56.333333,8.537499,6,0
1,8,0,0,116,103,109.0,5.049752,7,0
2,31,0,11,119,37,65.806452,26.617263,18,0
3,7,0,0,121,98,107.0,7.151423,7,0
4,9,0,0,116,97,105.666667,6.599663,7,0
5,7,0,0,117,97,106.714286,6.670067,6,0
6,9,0,0,117,97,109.444444,7.558823,8,0
7,9,0,0,122,52,77.888889,26.904954,7,0
8,10,0,0,117,97,107.1,8.117266,7,0
9,18,0,1,121,43,105.555556,16.836542,12,0


In [58]:
display(Y.head(50))

index
0     0.0
1     0.0
2     1.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    1.0
20    0.0
21    1.0
22    1.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
31    0.0
32    0.0
33    0.0
34    0.0
35    0.0
36    0.0
37    0.0
38    0.0
39    0.0
40    0.0
41    1.0
42    0.0
43    0.0
44    0.0
45    0.0
46    0.0
47    0.0
48    0.0
49    0.0
Name: is_malicious, dtype: float64

In [59]:
print(X.shape, Y.shape)

(110357, 9) (110357,)


# Part 3 -- Model Selection

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [67]:
def create_classifier(type):
    if type == 'AdaBoost': clf = AdaBoostClassifier(n_estimators=100)
    elif type == "LogisticRegressionL1": clf = LogisticRegression(penalty='l1', tol=0.0001, C=1.0) 
    elif type == "LogisticRegressionL2": clf = LogisticRegression(penalty='l2', tol=0.0001, C=1.0) 
    elif type == "SGD": clf = SGDClassifier(loss="log", penalty="l2")
    elif type == "MLPClassifier": clf = MLPClassifier(activation='relu', solver='adam', early_stopping=False, verbose=True)
    elif type == 'SVC':  clf = SVC(probability=True)
    elif type == 'RandomForest': clf = RandomForestClassifier(max_depth=None, min_samples_split=2, random_state=0)
    elif type == 'DecisionTreeClassifier': clf = DecisionTreeClassifier()
    elif type == 'MultinomialNB': clf = MultinomialNB()
    else: clf = LogisticRegression(penalty='l2', tol=0.0001, C=1.0)
    return clf 

In [68]:
AdaBoost = create_classifier("AdaBoost")
LR1 = create_classifier("LogisticRegressionL1")
LR2 = create_classifier("LogisticRegressionL2")
SGD = create_classifier("SGD")
MLP = create_classifier("MLPClassifier")
SVC = create_classifier("SVC")
RandomForest = create_classifier("RandomForest")
DecisionTree = create_classifier("DecisionTreeClassifier")
MNB = create_classifier("MultinomialNB")

In [81]:
shuffle(X)
shuffle(Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.75,test_size=0.25)

In [82]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(82767, 9) (82767,)
(27590, 9) (27590,)


In [83]:
AdaBoost.fit(x_train[:],y_train[:])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [84]:
Ada_y_test = AdaBoost.predict(x_test[:])
print(Ada_y_test.shape)

(27590,)


# Part 4 -- Model Evalution

In [87]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

In [170]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [160]:
print(confusion_matrix(y_test, Ada_y_test))
tn, fp, fn, tp = confusion_matrix(y_test, Ada_y_test).ravel()
print(tn, fp, fn, tp)

[[25030   117]
 [  196  2247]]
25030 117 196 2247


In [161]:
print(classification_report(y_test, Ada_y_test))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     25147
         1.0       0.95      0.92      0.93      2443

   micro avg       0.99      0.99      0.99     27590
   macro avg       0.97      0.96      0.96     27590
weighted avg       0.99      0.99      0.99     27590



In [162]:
print('hamming loss: ', hamming_loss(y_test, Ada_y_test))
print('log loss: ', log_loss(y_test, Ada_y_test))
print('zero one loss: ', zero_one_loss(y_test, Ada_y_test))

hamming loss:  0.011344690105110548
log loss:  0.3918351056363321
zero one loss:  0.011344690105110522


In [167]:
print('mean absolute error: ',  mean_absolute_error(y_test, Ada_y_test) )
print('mean squared error: ', mean_squared_error(y_test, Ada_y_test) )
print('mean squared log error: ',  mean_squared_log_error(y_test, Ada_y_test) )
print('median absolute error: ', median_absolute_error(y_test, Ada_y_test) )

mean absolute error:  0.011344690105110548
mean squared error:  0.011344690105110548
mean squared log error:  0.00545059055296836
median absolute error:  0.0


In [168]:
print( 'accuracy score: ', accuracy_score(y_test, Ada_y_test) )
print('precision score: ' ,  metrics.precision_score(y_test, Ada_y_test))
print('recall score: ' , metrics.recall_score(y_test, Ada_y_test))
print('F1 score: ' , metrics.f1_score(y_test, Ada_y_test) )

accuracy score:  0.9886553098948895
precision score:  0.950507614213198
recall score:  0.9197707736389685
F1 score:  0.934886623673809


In [169]:
print('precision_recall_curve: \n', precision_recall_curve(y_test, Ada_y_test))
print('roc curve: \n', roc_curve(y_test, Ada_y_test))
fpr, tpr, thresholds = roc_curve(y_test, Ada_y_test)
print(fpr, tpr, thresholds)
print('roc_auc_score: ', roc_auc_score(y_test, Ada_y_test))

precision_recall_curve: 
 (array([0.08854657, 0.95050761, 1.        ]), array([1.        , 0.91977077, 0.        ]), array([0., 1.]))
roc curve: 
 (array([0.        , 0.00465264, 1.        ]), array([0.        , 0.91977077, 1.        ]), array([2., 1., 0.]))
[0.         0.00465264 1.        ] [0.         0.91977077 1.        ] [2. 1. 0.]
roc_auc_score:  0.9575590655883235


In [None]:
auc_score = auc(fpr, tpr)
if plot:
    plt.figure(figsize=(7, 6))
    plt.plot(fpr, tpr, color='blue', label='ROC (AUC = %0.4f)' % auc_score)
    plt.legend(loc='lower right')
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.show()