In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing
import string
from urllib.parse import urlparse, parse_qs


In [2]:
# Comma-separated list of every column name in the capture export; it is split
# into a list and later passed as `names=` to pd.read_csv, which reads the file
# as header-less.
# NOTE(review): the names look like tshark/Wireshark field names plus a trailing
# 'alert' label column — confirm against the tool that produced the CSV.
columns_str = 'frame.number,frame.len,frame.time,frame.time_epoch,frame.protocols,eth.src,eth.dst,eth.type,ip.src,ip.dst,ip.len,ip.ttl,ip.flags,ip.frag_offset,ip.proto,ip.version,ip.dsfield,ip.checksum,tcp.srcport,tcp.dstport,tcp.len,tcp.seq,tcp.ack,tcp.flags,tcp.flags.syn,tcp.flags.ack,tcp.flags.fin,tcp.flags.reset,tcp.window_size,tcp.checksum,tcp.stream,udp.srcport,udp.dstport,udp.length,udp.checksum,icmp.type,icmp.code,icmp.checksum,http.request.method,http.request.uri,http.request.version,http.request.full_uri,http.response.code,http.user_agent,http.content_length_header,http.content_type,http.cookie,http.host,http.referer,http.location,http.authorization,http.connection,dns.qry.name,dns.qry.type,dns.qry.class,dns.flags.response,dns.flags.recdesired,dns.flags.rcode,dns.resp.ttl,dns.resp.len,smtp.req.command,smtp.data.fragment,pop.request.command,pop.response,imap.request.command,imap.response,ftp.request.command,ftp.request.arg,ftp.response.code,ftp.response.arg,ipv6.src,ipv6.dst,ipv6.plen,alert'
data_columns = columns_str.split(',')
# Display the resulting list of column names as the cell output.
data_columns

['frame.number',
 'frame.len',
 'frame.time',
 'frame.time_epoch',
 'frame.protocols',
 'eth.src',
 'eth.dst',
 'eth.type',
 'ip.src',
 'ip.dst',
 'ip.len',
 'ip.ttl',
 'ip.flags',
 'ip.frag_offset',
 'ip.proto',
 'ip.version',
 'ip.dsfield',
 'ip.checksum',
 'tcp.srcport',
 'tcp.dstport',
 'tcp.len',
 'tcp.seq',
 'tcp.ack',
 'tcp.flags',
 'tcp.flags.syn',
 'tcp.flags.ack',
 'tcp.flags.fin',
 'tcp.flags.reset',
 'tcp.window_size',
 'tcp.checksum',
 'tcp.stream',
 'udp.srcport',
 'udp.dstport',
 'udp.length',
 'udp.checksum',
 'icmp.type',
 'icmp.code',
 'icmp.checksum',
 'http.request.method',
 'http.request.uri',
 'http.request.version',
 'http.request.full_uri',
 'http.response.code',
 'http.user_agent',
 'http.content_length_header',
 'http.content_type',
 'http.cookie',
 'http.host',
 'http.referer',
 'http.location',
 'http.authorization',
 'http.connection',
 'dns.qry.name',
 'dns.qry.type',
 'dns.qry.class',
 'dns.flags.response',
 'dns.flags.recdesired',
 'dns.flags.rcode',
 'd

In [3]:
# Read the capture export as a header-less CSV, naming the columns from
# data_columns and keeping only the HTTP request URI column.
df = pd.read_csv(
    'attack-simulation-http-url.csv',
    names=data_columns,
    header=None,
    usecols=['http.request.uri'],
    low_memory=False,
)
df.head(10)

Unnamed: 0,http.request.uri
0,/dvwa/config/
1,/dvwa/docs/
2,/dvwa/external/
3,/mutillidae/ajax/
4,/mutillidae/classes/
5,/mutillidae/data/
6,/mutillidae/documentation/
7,/mutillidae/images/
8,/mutillidae/includes/
9,/mutillidae/javascript/


In [4]:
# Preprocessing: derive numeric features from each request URI

def uri_len(uri):
    """Return the number of characters in the string form of ``uri``.

    The argument is converted with ``str()`` before measuring, so non-string
    values are measured by their textual representation (for example a float
    NaN becomes ``'nan'`` and yields 3).

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: ``len(str(uri))``.
    """
    return len(str(uri))

# Add the URI length as a feature column, then preview the first rows.
request_uris = df['http.request.uri']
df['uri_len'] = request_uris.apply(uri_len)
df.head(10)

Unnamed: 0,http.request.uri,uri_len
0,/dvwa/config/,13
1,/dvwa/docs/,11
2,/dvwa/external/,15
3,/mutillidae/ajax/,17
4,/mutillidae/classes/,20
5,/mutillidae/data/,17
6,/mutillidae/documentation/,26
7,/mutillidae/images/,19
8,/mutillidae/includes/,21
9,/mutillidae/javascript/,23


In [5]:
def count_uri_segments(uri):
    """Count the '/'-separated segments in the path component of a URI.

    The URI is stringified and parsed with ``urlparse``; only the path is
    considered (query string and fragment are ignored). Leading and trailing
    slashes are stripped before splitting.

    An empty path (``''``, ``'/'``, ``'/?a=1'``) has zero segments. Empty
    segments produced by consecutive slashes inside the path are still
    counted, e.g. ``'/a//b'`` yields 3.

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: Number of path segments.
    """
    path = urlparse(str(uri)).path.strip('/')  # Remove leading and trailing slashes
    # ''.split('/') returns [''], which would wrongly report one segment for
    # a root or empty path, so handle that case explicitly.
    if not path:
        return 0
    return len(path.split('/'))

# Add the number of path segments as a feature column, then preview the first rows.
request_uris = df['http.request.uri']
df['uri_segs'] = request_uris.apply(count_uri_segments)
df.head(10)

Unnamed: 0,http.request.uri,uri_len,uri_segs
0,/dvwa/config/,13,2
1,/dvwa/docs/,11,2
2,/dvwa/external/,15,2
3,/mutillidae/ajax/,17,2
4,/mutillidae/classes/,20,2
5,/mutillidae/data/,17,2
6,/mutillidae/documentation/,26,2
7,/mutillidae/images/,19,2
8,/mutillidae/includes/,21,2
9,/mutillidae/javascript/,23,2


In [6]:
def count_special_chars(uri):
    """Count characters of the URI that are not ASCII letters, digits or '/'.

    The whole stringified URI is scanned (path, query string and anything
    else), so delimiters such as ``?``, ``&``, ``=`` and ``.`` are counted,
    as are non-ASCII characters.

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: Number of "special" characters found.
    """
    # Build the allowed alphabet once instead of re-concatenating it for
    # every character; a frozenset gives constant-time membership tests.
    allowed = frozenset(string.ascii_letters + string.digits + '/')
    # Count directly rather than collecting the characters into a list.
    return sum(1 for char in str(uri) if char not in allowed)

# Add the special-character count as a feature column, then preview the first rows.
request_uris = df['http.request.uri']
df['uri_s_chars'] = request_uris.apply(count_special_chars)
df.head(30)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars
0,/dvwa/config/,13,2,0
1,/dvwa/docs/,11,2,0
2,/dvwa/external/,15,2,0
3,/mutillidae/ajax/,17,2,0
4,/mutillidae/classes/,20,2,0
5,/mutillidae/data/,17,2,0
6,/mutillidae/documentation/,26,2,0
7,/mutillidae/images/,19,2,0
8,/mutillidae/includes/,21,2,0
9,/mutillidae/javascript/,23,2,0


In [7]:
def check_reserved_characters(uri):
    """Count reserved characters inside the path component of a URI.

    Only the path returned by ``urlparse`` is inspected, after its leading
    and trailing slashes are stripped. Characters in the query string or
    fragment are therefore not counted, while interior ``/`` separators of
    the path are.

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: Number of path characters that appear in ``!*'();:@&=+$,/?#[]``.
    """
    reserved = "!*'();:@&=+$,/?#[]"
    trimmed_path = urlparse(str(uri)).path.strip('/')
    hits = 0
    for ch in trimmed_path:
        if ch in reserved:
            hits += 1
    return hits

# Add the reserved-character count of the path as a feature column, then preview.
request_uris = df['http.request.uri']
df['uri_res_chars'] = request_uris.apply(check_reserved_characters)
df.head(30)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars
0,/dvwa/config/,13,2,0,1
1,/dvwa/docs/,11,2,0,1
2,/dvwa/external/,15,2,0,1
3,/mutillidae/ajax/,17,2,0,1
4,/mutillidae/classes/,20,2,0,1
5,/mutillidae/data/,17,2,0,1
6,/mutillidae/documentation/,26,2,0,1
7,/mutillidae/images/,19,2,0,1
8,/mutillidae/includes/,21,2,0,1
9,/mutillidae/javascript/,23,2,0,1


In [8]:
def count_query_parameters(uri):
    """Return how many distinct query-parameter names the URI carries.

    The query string is extracted with ``urlparse`` and decoded with
    ``parse_qs`` using its default options: repeated names are grouped under
    a single key, and parameters with blank values are omitted.

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: Number of keys in the parsed query dictionary.
    """
    query_string = urlparse(str(uri)).query
    return len(parse_qs(query_string))

# Add the query-parameter count as a feature column, then preview the last rows.
request_uris = df['http.request.uri']
df['uri_q_params'] = request_uris.apply(count_query_parameters)
df.tail(30)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars,uri_q_params
1964477,/awstats/awstats.pl?config=owaspbwa&framename=...,139,2,18,1,5
1964478,/awstats/awstats.pl?config=owaspbwa&framename=...,128,2,17,1,6
1964479,/awstats/awstats.pl?config=owaspbwa&framename=...,123,2,17,1,5
1964480,/awstats/awstats.pl?config=owaspbwa&framename=...,120,2,15,1,6
1964481,/awstats/awstats.pl?config=owaspbwa&framename=...,128,2,18,1,5
1964482,/awstats/awstats.pl?config=owaspbwa&framename=...,149,2,19,1,6
1964483,/awstats/awstats.pl?config=owaspbwa&framename=...,120,2,16,1,5
1964484,/awstats/awstats.pl?config=owaspbwa&framename=...,154,2,20,1,6
1964485,/awstats/awstats.pl?config=owaspbwa&framename=...,149,2,20,1,5
1964486,/awstats/awstats.pl?config=owaspbwa&framename=...,136,2,19,1,6


In [9]:
def max_query_param_length(uri):
    """Return the length of the longest query-parameter value in the URI.

    The query string is decoded with ``parse_qs`` (default options), so
    lengths are measured on the decoded values and blank values are not
    present. Every value of a repeated parameter is considered.

    Args:
        uri: The request URI, or any object convertible with ``str()``.

    Returns:
        int: Longest value length, or 0 when there are no query values.
    """
    query_string = urlparse(str(uri)).query
    value_lists = parse_qs(query_string).values()
    return max(
        (len(item) for group in value_lists for item in group),
        default=0,
    )

# Add the longest query-value length as a feature column, then preview the last rows.
request_uris = df['http.request.uri']
df['uri_maxq_len'] = request_uris.apply(max_query_param_length)
df.tail(30)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars,uri_q_params,uri_maxq_len
1964477,/awstats/awstats.pl?config=owaspbwa&framename=...,139,2,18,1,5,38
1964478,/awstats/awstats.pl?config=owaspbwa&framename=...,128,2,17,1,6,18
1964479,/awstats/awstats.pl?config=owaspbwa&framename=...,123,2,17,1,5,22
1964480,/awstats/awstats.pl?config=owaspbwa&framename=...,120,2,15,1,6,14
1964481,/awstats/awstats.pl?config=owaspbwa&framename=...,128,2,18,1,5,25
1964482,/awstats/awstats.pl?config=owaspbwa&framename=...,149,2,19,1,6,35
1964483,/awstats/awstats.pl?config=owaspbwa&framename=...,120,2,16,1,5,21
1964484,/awstats/awstats.pl?config=owaspbwa&framename=...,154,2,20,1,6,38
1964485,/awstats/awstats.pl?config=owaspbwa&framename=...,149,2,20,1,5,42
1964486,/awstats/awstats.pl?config=owaspbwa&framename=...,136,2,19,1,6,22


In [10]:
# Drop every row that contains a NaN in any column, modifying df in place.
# NOTE(review): the feature functions above stringify their input with str(),
# so a row with a missing URI was already featurized from its string form
# before being removed here; since the derived columns are integer counts,
# presumably only a missing 'http.request.uri' can trigger a drop — confirm.
df.dropna(inplace=True)
# Display the cleaned frame as the cell output.
df


Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars,uri_q_params,uri_maxq_len
0,/dvwa/config/,13,2,0,1,0,0
1,/dvwa/docs/,11,2,0,1,0,0
2,/dvwa/external/,15,2,0,1,0,0
3,/mutillidae/ajax/,17,2,0,1,0,0
4,/mutillidae/classes/,20,2,0,1,0,0
...,...,...,...,...,...,...,...
1964502,/awstats/awstats.pl?config=owaspbwa&framename=...,149,2,19,1,6,35
1964503,/awstats/awstats.pl?config=owaspbwa&framename=...,124,2,16,1,5,21
1964504,/awstats/awstats.pl?config=owaspbwa&framename=...,154,2,20,1,6,38
1964505,/awstats/awstats.pl?config=owaspbwa&framename=...,153,2,20,1,5,42


In [11]:
# Create an Isolation Forest model.
# contamination=0.1 sets the expected share of outliers, which the model uses
# to place its decision threshold (about 10% of the training rows get -1).
# random_state is fixed because tree construction is randomized; without a
# seed the scores and anomaly labels change on every Restart & Run All.
model = IsolationForest(contamination=0.1, random_state=42)
model


In [12]:
# Train the Isolation Forest on the engineered numeric URI features only
# (the raw URI string column is left out).
feature_columns = [
    'uri_len',
    'uri_segs',
    'uri_s_chars',
    'uri_res_chars',
    'uri_q_params',
    'uri_maxq_len',
]
required_df = df[feature_columns]
model.fit(required_df)

In [13]:
# Score and label every row with the fitted model, storing both on df.
# 'scores' holds the decision-function value and 'anomaly' the predicted label
# (-1 is the label the next cell filters on as anomalous).
anomaly_scores = model.decision_function(required_df)
df['scores'] = anomaly_scores
anomaly_labels = model.predict(required_df)
df['anomaly'] = anomaly_labels
df.head(20)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars,uri_q_params,uri_maxq_len,scores,anomaly
0,/dvwa/config/,13,2,0,1,0,0,0.090492,1
1,/dvwa/docs/,11,2,0,1,0,0,0.087122,1
2,/dvwa/external/,15,2,0,1,0,0,0.096042,1
3,/mutillidae/ajax/,17,2,0,1,0,0,0.100049,1
4,/mutillidae/classes/,20,2,0,1,0,0,0.102356,1
5,/mutillidae/data/,17,2,0,1,0,0,0.100049,1
6,/mutillidae/documentation/,26,2,0,1,0,0,0.10265,1
7,/mutillidae/images/,19,2,0,1,0,0,0.101738,1
8,/mutillidae/includes/,21,2,0,1,0,0,0.104347,1
9,/mutillidae/javascript/,23,2,0,1,0,0,0.107356,1


In [14]:
# Keep only the rows the model labelled -1 and preview the first 50 of them.
is_outlier = df['anomaly'] == -1
anomaly = df[is_outlier]
anomaly.head(50)

Unnamed: 0,http.request.uri,uri_len,uri_segs,uri_s_chars,uri_res_chars,uri_q_params,uri_maxq_len,scores,anomaly
20,/assets/,8,1,0,0,0,0,-0.031069,-1
21,/cgi-bin/,9,1,1,0,0,0,-0.017994,-1
22,/evil/,6,1,0,0,0,0,-0.034518,-1
23,/gallery2/,10,1,0,0,0,0,-0.023963,-1
26,/icon/,6,1,0,0,0,0,-0.034518,-1
27,/images/,8,1,0,0,0,0,-0.031069,-1
28,/javascript/,12,1,0,0,0,0,-0.020447,-1
29,/joomla/,8,1,0,0,0,0,-0.031069,-1
30,/phpBB2/,8,1,0,0,0,0,-0.031069,-1
31,/phpmyadmin/,12,1,0,0,0,0,-0.020447,-1
