In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import plotly.express as px
import matplotlib.pyplot as plt
import joblib
import wandb
from dotenv import load_dotenv
import os

# Populate the process environment via python-dotenv, then look up the
# Weights & Biases API key (None when the variable is not set).
load_dotenv()
WANDB_API_KEY = os.getenv('WANDB_API_KEY')

In [2]:
# Load the attack sample CSV into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the recorded run echoed this line in a warning, presumably
# pandas' mixed-dtype DtypeWarning -- confirm it is gone after re-running.
df = pd.read_csv('../attack-sample.csv', low_memory=False)

  df = pd.read_csv('../attack-sample.csv')


In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(1000000, 74)

Based on yesterday's analysis, I would like to train an Isolation Forest on the HTTP data.

In [4]:
# List every column available in the raw frame.
df.columns


Index(['frame.number', 'frame.len', 'frame.time', 'frame.time_epoch',
       'frame.protocols', 'eth.src', 'eth.dst', 'eth.type', 'ip.src', 'ip.dst',
       'ip.len', 'ip.ttl', 'ip.flags', 'ip.frag_offset', 'ip.proto',
       'ip.version', 'ip.dsfield', 'ip.checksum', 'tcp.srcport', 'tcp.dstport',
       'tcp.len', 'tcp.seq', 'tcp.ack', 'tcp.flags', 'tcp.flags.syn',
       'tcp.flags.ack', 'tcp.flags.fin', 'tcp.flags.reset', 'tcp.window_size',
       'tcp.checksum', 'tcp.stream', 'udp.srcport', 'udp.dstport',
       'udp.length', 'udp.checksum', 'icmp.type', 'icmp.code', 'icmp.checksum',
       'http.request.method', 'http.request.uri', 'http.request.version',
       'http.request.full_uri', 'http.response.code', 'http.user_agent',
       'http.content_length_header', 'http.content_type', 'http.cookie',
       'http.host', 'http.referer', 'http.location', 'http.authorization',
       'http.connection', 'dns.qry.name', 'dns.qry.type', 'dns.qry.class',
       'dns.flags.response', 'dns.f

In [11]:
# Frequency of each distinct value in the http.authorization column.
# NOTE(review): the rendered output lists Base64-encoded Basic credentials,
# which are trivially decodable -- consider clearing or redacting this output
# before sharing the notebook.
df['http.authorization'].value_counts()

http.authorization
Basic YWRtaW46aXJvbnBvcnQ=                    1
Basic d2ViYWRtaW46d2ViYWRtaW4=                1
Basic d2ViYWRtaW46MTIzNA==                    1
Basic YWRtaW46YmFycmljYWRl                    1
Basic dGVzdDp0ZXN0                            1
Basic ZnRwOg==                                1
Basic cm9vdDpjaGFuZ2VtZQ==                    1
Basic YWRtaW46MDAwMA==                        1
Basic dXNlcl9lZGl0b3I6ZGVtbw==                1
Basic ZTI1MDplMjUwY2hhbmdlbWU=                1
Basic YWRtaW46c2VjdXJl                        1
Basic UUNDOlFMb2dpYzY2                        1
Basic YmlsbHktYm9iOg==                        1
Basic c3RvcndhdGNoOnNwZWNpYWxpc3Q=            1
Basic YWRtaW5pc3RyYXRvcjphZG1pbmlzdHJhdG9y    1
Basic OnBhc3N3b3Jk                            1
Basic eGFtcHA6eGFtcHA=                        1
Basic YWRtaW46b3BlcmF0b3I=                    1
Basic YWRtaW46aGFncG9sbTE=                    1
Basic dXNlcl9wdWJsaXNoZXI6ZGVtbw==            1
Basic YWRtaW46Y2hhbmd

In [12]:
# 'http.request.version' --> I will treat this column as categorical. Since this is a categorical field with a limited number of expected values (plus some anomalies), label encoding can be a good approach.

# 'http.response.code' --> Treat this column as categorical. one-hot encoding might be more appropriate than label encoding. This is because HTTP response codes are not ordinal (i.e., a higher code doesn't necessarily imply a 'greater' or 'lesser' category in a meaningful way for modeling). One-hot encoding will treat each response code as a separate feature, which could help the model distinguish between different types of responses more effectively.

# 'http.content_length_header'--> Since this is inherently a numerical feature, I can use it directly without the need for categorical encoding. Given the likely wide range of values, normalization might be beneficial. Techniques like Min-Max Scaling or Standardization (Z-score normalization) could be used to bring this feature onto a comparable scale with other features.

# 'http.content_type' --> This column contains various MIME types indicating the nature of the content in the HTTP response. Common types include text/html, application/x-www-form-urlencoded, image/png, image/gif, etc. There are entries that appear to be irregular or not standard MIME types (e.g., entries with sfish and session IDs). These could be indicative of non-standard behavior or anomalies --> I could map all text/html variants to a single category, irrespective of the character set details. This would reduce the dimensionality and focus on the broader content type. The anomalous or non-standard types should be handled carefully. Depending on their frequency and significance, I might treat them as a separate category or group them under a generic 'other' category. One-Hot Encoding: After simplification, one-hot encoding can be a good approach, as it will treat each content type as a separate feature.

# 'http.cookie' --> The values are quite complex and varied. There are standard session cookies like PHPSESSID, JSESSIONID, and ASP.NET_SessionId. However, there are also entries that look non-standard or potentially indicative of unusual activity (e.g., strings with -->>'>'" and sfish). Need to simplify the features --> Simplifying the feature might involve extracting specific, common patterns (like the presence of standard session IDs) or quantifying the complexity of the cookie string (e.g., length, number of distinct components). --> Binary or Categorical Encoding: Depending on the simplification approach, I might encode this feature as a binary variable (e.g., indicating the presence or absence of a certain pattern) or as a categorical variable with a manageable number of categories. The anomalous or unusual cookie values should be carefully considered, as they might be significant for detecting anomalies in web traffic.

# 'http.host' --> The http.host column primarily consists of IP addresses and domain names. The most frequent host is 10.20.30.101, which appears to be a local or private IP address, followed by other hosts including a multicast address 239.255.255.250:1900 (commonly used for SSDP), and various domain names. Some entries include specific ports (e.g., 10.20.30.101:8080), while others appear irregular or malformed (e.g., entries with sfish and keep-alive). Categorical Encoding with Simplification: Given the high frequency of the primary IP address and the diversity of other hosts, it might be beneficial to simplify this feature. For instance, I could categorize the data into 'Local IP', 'Multicast IP', 'Domain Name', 'Irregular', etc. The unusual and non-standard entries are potentially significant for anomaly detection and should be encoded in a way that highlights their presence. Depending on the final number of categories after simplification, either one-hot encoding or label encoding can be used. One-hot encoding is preferable if the number of categories is not too high.

# 'http.location' --> This feature contains URLs to which the client is being redirected or specific pages being accessed. It includes both relative paths (like error.php, login.php) and absolute URLs --> The variety ranges from standard application paths (like login pages) to more specific and potentially unusual URLs (such as those ending with .sfish). Potential indicators of web application flow: Standard paths like login.php or signup pages could indicate normal application flow, while frequent redirects to error.php or database-offline.php might signify operational issues or anomalies --> Given the high diversity of URLs, direct use might be impractical due to high dimensionality. A possible approach is to simplify the URLs into broader categories, like 'Error Page', 'Login Page', 'Signup Page', 'External Redirect', 'Unusual Path', etc. --> Particular attention should be given to unusual or non-standard paths, as they might be significant for anomaly detection. These could be encoded separately --> One-Hot Encoding or Label Encoding: After categorization, either one-hot encoding or label encoding can be used, depending on the number of categories. One-hot encoding would be preferable for maintaining distinct information about each category.

# http.authorization --> This column contains Base64 encoded authorization credentials, typically in the format Basic [encoded credentials]. Each encoded string represents a username and password pair. --> I will remove this column

# http.request.full_uri --> Feature Transformation for Dimensionality Reduction: Given the high diversity and specificity of the URIs, direct usage in their current form could lead to extremely high dimensionality. A transformation approach might be needed. Some options include: Extracting the base path or resource (e.g., basket.jsp, login.php) and ignoring query parameters or unique identifiers. Grouping URIs into broader categories based on their structure or content. Handling Anomalies: I need to pay special attention to unusual or rarely occurring URIs, as they might be significant for detecting anomalies. However, I need to ensure that the transformation method does not overly generalize these unique URIs. Categorical Encoding: After simplification, I can use categorical encoding methods like one-hot encoding or label encoding. The choice depends on the number of categories after transformation. Text Processing Techniques: Depending on the complexity, text processing techniques like tokenization or feature hashing might be applicable to extract meaningful patterns from the URIs.


In [3]:
# Columns carried forward for the HTTP-focused analysis: eight http.* fields
# plus `alert`.
features_to_keep = [
    'http.request.version',
    'http.response.code',
    'http.request.full_uri',
    'http.content_length_header',
    'http.content_type',
    'http.cookie',
    'http.host',
    'http.location',
    'alert',
]

In [4]:
# Restrict the raw frame to the selected columns.
# The explicit .copy() makes df_http an independent frame, so later column
# assignments on it neither touch `df` nor trigger SettingWithCopyWarning.
df_http = df[features_to_keep].copy()


In [7]:
# Dimensions of the HTTP subset as (rows, columns).
df_http.shape

(1000000, 8)

In [8]:
# Per-column non-null counts and dtypes for the HTTP subset.
df_http.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   http.request.version        319771 non-null   object 
 1   http.response.code          319200 non-null   float64
 2   http.content_length_header  333432 non-null   object 
 3   http.content_type           333479 non-null   object 
 4   http.cookie                 161531 non-null   object 
 5   http.host                   319298 non-null   object 
 6   http.location               7996 non-null     object 
 7   alert                       1000000 non-null  object 
dtypes: float64(1), object(7)
memory usage: 61.0+ MB


I will drop the `http.response.code` feature for now, as about 68% of its values are missing.

In [13]:
# Tally missing (True) versus present (False) entries in http.response.code.
df_http['http.response.code'].isna().value_counts()

http.response.code
True     680800
False    319200
Name: count, dtype: int64

In [14]:
# Remove the mostly-empty http.response.code column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_http = df_http.drop(columns=['http.response.code'], errors='ignore')


In [6]:
# Persist the HTTP subset to disk with joblib.
# NOTE(review): joblib.dump writes a pickled object, not CSV text, so the
# '.csv' extension is misleading and pd.read_csv cannot parse this file.
# Either rename the target (e.g. '.joblib') or switch to
# df_http.to_csv(..., index=False) -- confirm how downstream cells load it.
joblib.dump(df_http, 'df_http.csv')

['df_http.csv']

In [8]:
# Preview the first ten rows of the HTTP subset.
df_http.head(10)

Unnamed: 0,http.request.version,http.response.code,http.request.full_uri,http.content_length_header,http.content_type,http.cookie,http.host,http.location,alert
0,HTTP/1.1,,http://10.20.30.101/joomla/libraries/joomla/ht...,,,,10.20.30.101,,suspicious
1,,404.0,,2196.0,text/html; charset=utf-8,,,,suspicious
2,,,,,,,,,suspicious
3,HTTP/1.1,,http://10.20.30.101/joomla/plugins/editors/tin...,,,,10.20.30.101,,suspicious
4,,404.0,,2200.0,text/html; charset=utf-8,,,,suspicious
5,,,,,,,,,suspicious
6,,404.0,,186.0,text/html; charset=iso-8859-1,,,,suspicious
7,,404.0,,2210.0,text/html; charset=utf-8,,,,suspicious
8,HTTP/1.1,,http://10.20.30.101/joomla/plugins/editors/tin...,,,,10.20.30.101,,suspicious
9,HTTP/1.1,,http://10.20.30.101/mono/owasp.net/head.sfish/...,,,PHPSESSID=3ks8clrkpt2b3l0p7hkemej9q7; JSESSION...,10.20.30.101,,suspicious


In [13]:
# Report the frame's dimensions, then per-column missingness: first as a
# percentage of all rows (printed), then as raw counts (cell output).
na_counts = df_http.isna().sum()
n_rows = len(df_http)
print(f'df_http shape is: {df_http.shape}')
print(na_counts / n_rows * 100)
na_counts

df_http shape is: (1000000, 9)
http.request.version          68.0229
http.response.code            68.0800
http.request.full_uri         68.0664
http.content_length_header    66.6568
http.content_type             66.6521
http.cookie                   83.8469
http.host                     68.0702
http.location                 99.2004
alert                          0.0000
dtype: float64


http.request.version          680229
http.response.code            680800
http.request.full_uri         680664
http.content_length_header    666568
http.content_type             666521
http.cookie                   838469
http.host                     680702
http.location                 992004
alert                              0
dtype: int64

Next, I need to do feature engineering and decide how to handle the missing values in each feature.