# Load libraries

In [522]:
# pandas / numpy / plotly imports.
import pandas as pd
import numpy as np
import plotly.express as px
# Enable the Intel Extension for Scikit-learn.
# NOTE(review): presumably this patch has to run before any `sklearn` import
# for it to take effect — confirm against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Load data

In [523]:
# Read the raw HTTP-request dataset from disk and preview it.
raw_csv_path = 'dataset.csv'
data = pd.read_csv(raw_csv_path)
data

Unnamed: 0,Class,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
0,Anomalous,GET,HTTP/1.0,invalid,"audio/*;q=0.7, audio/*;q=0.0",*;q=0.6,*;q=0.3,invalid,,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,,,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...
1,Anomalous,PUT,HTTP/1.0,close,"image/*, image/jpeg;q=0.1","x-mac-greek;q=0.9, euc-cn, x-mac-greek, utf-7",*,invalid,no-cache,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,application/x-www-form-urlencoded,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,
2,Anomalous,POST,HTTP/1.0,close,"audio/*, audio/basic, text/html;q=0.4",*,non-standard,invalid,no-cache,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,application/x-www-form-urlencoded,,
3,Anomalous,GET,HTTP/1.0,close,*/*;q=0.3,"x-mac-japanese;q=0.2, iso-8859-3",*,invalid,no-cache,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,,,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...
4,Anomalous,GET,HTTP/1.1,close,*/*,*,non-standard,invalid,no-cache,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,,,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84952,Valid,GET,HTTP/1.0,close,"application/*;q=0.5, text/*",windows-1255,non-standard,,no-cache,invalid,,,
84953,Valid,PUT,HTTP/1.1,close,*/*,"iso-2022-jp;q=0.9, koi8, shift_jis;q=0.8, iso-...",non-standard,,no-cache,invalid,application/x-www-form-urlencoded,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,
84954,Valid,GET,HTTP/1.1,keep-alive,"image/*;q=0.3, image/*;q=0.4",x-mac-korean;q=0.7,non-standard,,invalid,invalid,,,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...
84955,Valid,GET,HTTP/1.1,keep-alive,"video/*, image/*, application/x-tar","utf-8;q=0.3, utf-8;q=0.4",non-standard,,invalid,invalid,,,kesRt=leodn%25&daeAzrero=5&shavingI0=sr


# EDA

## Data Understanding

In [524]:
# Dimensions of the raw dataset as (rows, columns).
data.shape

(84957, 13)

In [525]:
# Per-column non-null counts and dtypes of the raw dataset.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84957 entries, 0 to 84956
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Class            84957 non-null  object
 1   Method           84957 non-null  object
 2   Host-Header      84957 non-null  object
 3   Connection       84953 non-null  object
 4   Accept           84916 non-null  object
 5   Accept-Charset   84867 non-null  object
 6   Accept-Language  84867 non-null  object
 7   Cache-control    84596 non-null  object
 8   Pragma           84210 non-null  object
 9   User-Agent       84954 non-null  object
 10  Content-Type     21950 non-null  object
 11  POST-Data        21942 non-null  object
 12  GET-Query        34707 non-null  object
dtypes: object(13)
memory usage: 8.4+ MB


In [526]:
# Summary statistics (count / unique / top / freq) of the raw columns.
data.describe()


Unnamed: 0,Class,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
count,84957,84957,84957,84953,84916,84867,84867,84596,84210,84954,21950,21942,34707
unique,2,3,2,3,6791,9053,13,3068,2,11677,1,16051,28395
top,Valid,GET,HTTP/1.1,close,"text/xml,application/xml,application/xhtml+xml...","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,no-cache,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,application/x-www-form-urlencoded,B2=Vaciar+carrito,B2=Vaciar+carrito
freq,46287,63007,73067,69078,61065,61065,61065,63907,72544,61065,21950,1046,1000


In [527]:
# Quick data-quality counters: total missing cells, fully duplicated rows,
# and the sum of the per-column distinct-value counts.
missing_total = data.isnull().sum().sum()
duplicate_total = data.duplicated().sum()
distinct_total = data.nunique().sum()

print(f"Number of missing values: {missing_total}")
print(f"Number of duplicated values: {duplicate_total}")
print(f"Number of unique values: {distinct_total}")

Number of missing values: 177608
Number of duplicated values: 37114
Number of unique values: 75061


In [528]:
# Number of rows per target class.
data["Class"].value_counts()

Class
Valid        46287
Anomalous    38670
Name: count, dtype: int64

## Data Cleaning

In [529]:
# Rename the target column from "Class" to "Label".
column_renames = {"Class": "Label"}
data = data.rename(columns=column_renames)
data

Unnamed: 0,Label,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
0,Anomalous,GET,HTTP/1.0,invalid,"audio/*;q=0.7, audio/*;q=0.0",*;q=0.6,*;q=0.3,invalid,,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,,,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...
1,Anomalous,PUT,HTTP/1.0,close,"image/*, image/jpeg;q=0.1","x-mac-greek;q=0.9, euc-cn, x-mac-greek, utf-7",*,invalid,no-cache,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,application/x-www-form-urlencoded,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,
2,Anomalous,POST,HTTP/1.0,close,"audio/*, audio/basic, text/html;q=0.4",*,non-standard,invalid,no-cache,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,application/x-www-form-urlencoded,,
3,Anomalous,GET,HTTP/1.0,close,*/*;q=0.3,"x-mac-japanese;q=0.2, iso-8859-3",*,invalid,no-cache,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,,,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...
4,Anomalous,GET,HTTP/1.1,close,*/*,*,non-standard,invalid,no-cache,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,,,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84952,Valid,GET,HTTP/1.0,close,"application/*;q=0.5, text/*",windows-1255,non-standard,,no-cache,invalid,,,
84953,Valid,PUT,HTTP/1.1,close,*/*,"iso-2022-jp;q=0.9, koi8, shift_jis;q=0.8, iso-...",non-standard,,no-cache,invalid,application/x-www-form-urlencoded,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,
84954,Valid,GET,HTTP/1.1,keep-alive,"image/*;q=0.3, image/*;q=0.4",x-mac-korean;q=0.7,non-standard,,invalid,invalid,,,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...
84955,Valid,GET,HTTP/1.1,keep-alive,"video/*, image/*, application/x-tar","utf-8;q=0.3, utf-8;q=0.4",non-standard,,invalid,invalid,,,kesRt=leodn%25&daeAzrero=5&shavingI0=sr


In [530]:
# Remove exact duplicate rows, then re-count duplicates to confirm none remain.
data = data.drop_duplicates()
remaining_duplicates = data.duplicated().sum()
print(f"Number of duplicated values: {remaining_duplicates}")


Number of duplicated values: 0


In [531]:
# Replace every missing value with 0.
# `data` was produced by `drop_duplicates()`, so it is rebound to the frame
# returned by `fillna` rather than being mutated with `inplace=True`.
data = data.fillna(0)

In [532]:
# Display the frame after duplicate removal and missing-value filling.
data


Unnamed: 0,Label,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
0,Anomalous,GET,HTTP/1.0,invalid,"audio/*;q=0.7, audio/*;q=0.0",*;q=0.6,*;q=0.3,invalid,0,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,0,0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...
1,Anomalous,PUT,HTTP/1.0,close,"image/*, image/jpeg;q=0.1","x-mac-greek;q=0.9, euc-cn, x-mac-greek, utf-7",*,invalid,no-cache,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,application/x-www-form-urlencoded,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,0
2,Anomalous,POST,HTTP/1.0,close,"audio/*, audio/basic, text/html;q=0.4",*,non-standard,invalid,no-cache,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,application/x-www-form-urlencoded,0,0
3,Anomalous,GET,HTTP/1.0,close,*/*;q=0.3,"x-mac-japanese;q=0.2, iso-8859-3",*,invalid,no-cache,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,0,0,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...
4,Anomalous,GET,HTTP/1.1,close,*/*,*,non-standard,invalid,no-cache,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,0,0,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84952,Valid,GET,HTTP/1.0,close,"application/*;q=0.5, text/*",windows-1255,non-standard,0,no-cache,invalid,0,0,0
84953,Valid,PUT,HTTP/1.1,close,*/*,"iso-2022-jp;q=0.9, koi8, shift_jis;q=0.8, iso-...",non-standard,0,no-cache,invalid,application/x-www-form-urlencoded,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,0
84954,Valid,GET,HTTP/1.1,keep-alive,"image/*;q=0.3, image/*;q=0.4",x-mac-korean;q=0.7,non-standard,0,invalid,invalid,0,0,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...
84955,Valid,GET,HTTP/1.1,keep-alive,"video/*, image/*, application/x-tar","utf-8;q=0.3, utf-8;q=0.4",non-standard,0,invalid,invalid,0,0,kesRt=leodn%25&daeAzrero=5&shavingI0=sr


In [533]:
# Outlier Detection
from scipy import stats

# Absolute z-score of every numeric column; a row is kept only when all of its
# scores are below 3.
# NOTE(review): `data.info()` / `data.describe()` report object-dtype columns
# only, so this numeric selection appears to be empty and the filter keeps
# every row — confirm whether numeric features were meant to be derived first.
numeric_columns = data.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_columns))
within_threshold = (z_scores < 3).all(axis=1)
df_no_outliers = data[within_threshold]
df_no_outliers


Unnamed: 0,Label,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
0,Anomalous,GET,HTTP/1.0,invalid,"audio/*;q=0.7, audio/*;q=0.0",*;q=0.6,*;q=0.3,invalid,0,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,0,0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...
1,Anomalous,PUT,HTTP/1.0,close,"image/*, image/jpeg;q=0.1","x-mac-greek;q=0.9, euc-cn, x-mac-greek, utf-7",*,invalid,no-cache,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,application/x-www-form-urlencoded,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,0
2,Anomalous,POST,HTTP/1.0,close,"audio/*, audio/basic, text/html;q=0.4",*,non-standard,invalid,no-cache,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,application/x-www-form-urlencoded,0,0
3,Anomalous,GET,HTTP/1.0,close,*/*;q=0.3,"x-mac-japanese;q=0.2, iso-8859-3",*,invalid,no-cache,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,0,0,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...
4,Anomalous,GET,HTTP/1.1,close,*/*,*,non-standard,invalid,no-cache,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,0,0,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84952,Valid,GET,HTTP/1.0,close,"application/*;q=0.5, text/*",windows-1255,non-standard,0,no-cache,invalid,0,0,0
84953,Valid,PUT,HTTP/1.1,close,*/*,"iso-2022-jp;q=0.9, koi8, shift_jis;q=0.8, iso-...",non-standard,0,no-cache,invalid,application/x-www-form-urlencoded,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,0
84954,Valid,GET,HTTP/1.1,keep-alive,"image/*;q=0.3, image/*;q=0.4",x-mac-korean;q=0.7,non-standard,0,invalid,invalid,0,0,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...
84955,Valid,GET,HTTP/1.1,keep-alive,"video/*, image/*, application/x-tar","utf-8;q=0.3, utf-8;q=0.4",non-standard,0,invalid,invalid,0,0,kesRt=leodn%25&daeAzrero=5&shavingI0=sr


## Descriptive Statistics


In [534]:
# Summary statistics (count / unique / top / freq) of the cleaned columns.
data.describe()


Unnamed: 0,Label,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
count,47843,47843,47843,47843,47843,47843,47843,47843,47843,47843,47843,47843,47843
unique,2,3,2,4,6792,9054,14,3069,3,11678,2,16052,28396
top,Anomalous,GET,HTTP/1.1,close,"text/xml,application/xml,application/xhtml+xml...","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,no-cache,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,0,0,0
freq,27954,31763,35954,31965,23954,23954,23954,26794,35431,23954,31763,31771,18900


## Data Distribution

## Data Relationships



In [535]:
# Save data
# Write the cleaned frame and the outlier-filtered frame to CSV, omitting the
# index column.
for frame, csv_name in (
    (data, "data_cleaned.csv"),
    (df_no_outliers, "data_no_outliers.csv"),
):
    frame.to_csv(csv_name, index=False)


### Feature selection

In [536]:
# Reload the cleaned dataset from the CSV file and preview it.
cleaned_csv_path = "data_cleaned.csv"
data = pd.read_csv(cleaned_csv_path)
data


Unnamed: 0,Label,Method,Host-Header,Connection,Accept,Accept-Charset,Accept-Language,Cache-control,Pragma,User-Agent,Content-Type,POST-Data,GET-Query
0,Anomalous,GET,HTTP/1.0,invalid,"audio/*;q=0.7, audio/*;q=0.0",*;q=0.6,*;q=0.3,invalid,0,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,0,0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...
1,Anomalous,PUT,HTTP/1.0,close,"image/*, image/jpeg;q=0.1","x-mac-greek;q=0.9, euc-cn, x-mac-greek, utf-7",*,invalid,no-cache,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,application/x-www-form-urlencoded,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,0
2,Anomalous,POST,HTTP/1.0,close,"audio/*, audio/basic, text/html;q=0.4",*,non-standard,invalid,no-cache,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,application/x-www-form-urlencoded,0,0
3,Anomalous,GET,HTTP/1.0,close,*/*;q=0.3,"x-mac-japanese;q=0.2, iso-8859-3",*,invalid,no-cache,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,0,0,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...
4,Anomalous,GET,HTTP/1.1,close,*/*,*,non-standard,invalid,no-cache,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,0,0,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47838,Valid,GET,HTTP/1.0,close,"application/*;q=0.5, text/*",windows-1255,non-standard,0,no-cache,invalid,0,0,0
47839,Valid,PUT,HTTP/1.1,close,*/*,"iso-2022-jp;q=0.9, koi8, shift_jis;q=0.8, iso-...",non-standard,0,no-cache,invalid,application/x-www-form-urlencoded,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,0
47840,Valid,GET,HTTP/1.1,keep-alive,"image/*;q=0.3, image/*;q=0.4",x-mac-korean;q=0.7,non-standard,0,invalid,invalid,0,0,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...
47841,Valid,GET,HTTP/1.1,keep-alive,"video/*, image/*, application/x-tar","utf-8;q=0.3, utf-8;q=0.4",non-standard,0,invalid,invalid,0,0,kesRt=leodn%25&daeAzrero=5&shavingI0=sr


In [537]:
# Candidate request fields for feature selection, plus the target column.
candidate_columns = [
    "POST-Data",
    "GET-Query",
    "User-Agent",
    "Host-Header",
    "Method",
    "Content-Type",
    "Label",
]
# `.copy()` makes `newData` an independent frame, so assigning encoded columns
# to it later does not write into a slice of `data`.
newData = data[candidate_columns].copy()
newData

Unnamed: 0,POST-Data,GET-Query,User-Agent,Host-Header,Method,Content-Type,Label
0,0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,HTTP/1.0,GET,0,Anomalous
1,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,0,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,HTTP/1.0,PUT,application/x-www-form-urlencoded,Anomalous
2,0,0,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,HTTP/1.0,POST,application/x-www-form-urlencoded,Anomalous
3,0,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,HTTP/1.0,GET,0,Anomalous
4,0,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,HTTP/1.1,GET,0,Anomalous
...,...,...,...,...,...,...,...
47838,0,0,invalid,HTTP/1.0,GET,0,Valid
47839,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,0,invalid,HTTP/1.1,PUT,application/x-www-form-urlencoded,Valid
47840,0,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,invalid,HTTP/1.1,GET,0,Valid
47841,0,kesRt=leodn%25&daeAzrero=5&shavingI0=sr,invalid,HTTP/1.1,GET,0,Valid


In [538]:
# Feature Engineering
from sklearn.preprocessing import LabelEncoder

# Encode the label
label_encoder = LabelEncoder()
# `assign` builds a new frame with the encoded column in place of the original
# one, so nothing is written into a frame that was sliced out of `data`.
newData = newData.assign(Label=label_encoder.fit_transform(newData["Label"]))
newData


Unnamed: 0,POST-Data,GET-Query,User-Agent,Host-Header,Method,Content-Type,Label
0,0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,HTTP/1.0,GET,0,0
1,tcy0wLnt7sw=lTWFH@dvrU&zUEX8MFk=htaccesonDonin...,0,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,HTTP/1.0,PUT,application/x-www-form-urlencoded,0
2,0,0,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,HTTP/1.0,POST,application/x-www-form-urlencoded,0
3,0,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,HTTP/1.0,GET,0,0
4,0,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,HTTP/1.1,GET,0,0
...,...,...,...,...,...,...,...
47838,0,0,invalid,HTTP/1.0,GET,0,1
47839,lYVd1L2eincludenK=56&EbvGT=9078475&edftegaG6Ow...,0,invalid,HTTP/1.1,PUT,application/x-www-form-urlencoded,1
47840,0,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,invalid,HTTP/1.1,GET,0,1
47841,0,kesRt=leodn%25&daeAzrero=5&shavingI0=sr,invalid,HTTP/1.1,GET,0,1


In [539]:
# Encode the categorical features
categorical_columns = [
    "POST-Data",
    "GET-Query",
    "User-Agent",
    "Host-Header",
    "Method",
    "Content-Type",
]

# One encoder per column: the shared `label_encoder` is no longer refitted, so
# its fitted mapping is not overwritten and every feature mapping stays
# available through `feature_encoders[column]`.
feature_encoders = {column: LabelEncoder() for column in categorical_columns}

# Values are cast to `str` before encoding, and `assign` returns a new frame
# rather than writing into a slice.
newData = newData.assign(**{
    column: encoder.fit_transform(newData[column].astype(str))
    for column, encoder in feature_encoders.items()
})
newData



Unnamed: 0,POST-Data,GET-Query,User-Agent,Host-Header,Method,Content-Type,Label
0,21,3146,11647,0,0,0,0
1,15585,110,11640,0,2,1,0
2,21,110,11627,0,1,1,0
3,21,11733,11608,0,0,0,0
4,21,5444,11594,1,0,0,0
...,...,...,...,...,...,...,...
47838,21,110,11677,0,0,0,1
47839,4937,110,11677,1,2,1,1
47840,21,28370,11677,1,0,0,1
47841,21,12279,11677,1,0,0,1


In [540]:
# `SelectKBest` was not imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel.
from sklearn.feature_selection import SelectKBest

# Split predictors from the target once and reuse them below.
features = newData.drop("Label", axis=1)
target = newData["Label"]

# Ensure k is less than or equal to the number of features
k = min(5, features.shape[1])

# Select the top k features
selector = SelectKBest(k=k)
selector.fit(features, target)

# Debugging: Print shapes to verify
print("DataFrame shape:", features.shape)
print("Support array shape:", selector.get_support().shape)

# Get the selected feature names
selected_features = features.columns[selector.get_support()].tolist()
selected_features



DataFrame shape: (47843, 6)
Support array shape: (6,)


['GET-Query', 'User-Agent', 'Host-Header', 'Method', 'Content-Type']

In [541]:
# Restrict the un-encoded frame to the selected features plus the target.
# `.copy()` yields an independent frame, so columns added to it later are not
# written into a slice of the frame loaded from CSV.
data = data[selected_features + ["Label"]].copy()
data


Unnamed: 0,GET-Query,User-Agent,Host-Header,Method,Content-Type,Label
0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,Mozilla/9.9 (X11; U; Unix 0.4; el-1y; rv:6.5.0...,HTTP/1.0,GET,0,Anomalous
1,0,Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1-ff;...,HTTP/1.0,PUT,application/x-www-form-urlencoded,Anomalous
2,0,Mozilla/9.9 (Windows; U; Windows NT 4.9; lf-wc...,HTTP/1.0,POST,application/x-www-form-urlencoded,Anomalous
3,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,Mozilla/9.9 (Windows; U; Win 9x 9.4; 0t-os; rv...,HTTP/1.0,GET,0,Anomalous
4,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,Mozilla/9.9 (Machintosh; U; PPC 8.5; in-hu; rv...,HTTP/1.1,GET,0,Anomalous
...,...,...,...,...,...,...
47838,0,invalid,HTTP/1.0,GET,0,Valid
47839,0,invalid,HTTP/1.1,PUT,application/x-www-form-urlencoded,Valid
47840,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,invalid,HTTP/1.1,GET,0,Valid
47841,kesRt=leodn%25&daeAzrero=5&shavingI0=sr,invalid,HTTP/1.1,GET,0,Valid


In [542]:
def concatenate_features(row):
    """Join five request fields of ``row`` into one ' | '-separated string.

    The fields are read by key in this fixed order: GET-Query, User-Agent,
    Host-Header, Method, Content-Type. Each value is formatted as text.
    """
    ordered_columns = ('GET-Query', 'User-Agent', 'Host-Header', 'Method', 'Content-Type')
    return " | ".join(f"{row[column]}" for column in ordered_columns)

# Apply the concatenation to the selected feature columns, row by row.
combined_text = data[selected_features].apply(concatenate_features, axis=1)

# Build the two-column frame directly. No `Method_ContentType` interaction
# column is created, because only `combined_text` and `Label` are kept, and
# constructing a fresh frame avoids assigning columns into a slice.
data = pd.DataFrame({
    "combined_text": combined_text,
    "Label": data["Label"],
})
data



Unnamed: 0,combined_text,Label
0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,Anomalous
1,0 | Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1...,Anomalous
2,0 | Mozilla/9.9 (Windows; U; Windows NT 4.9; l...,Anomalous
3,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,Anomalous
4,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,Anomalous
...,...,...
47838,0 | invalid | HTTP/1.0 | GET | 0,Valid
47839,0 | invalid | HTTP/1.1 | PUT | application/x-w...,Valid
47840,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,Valid
47841,kesRt=leodn%25&daeAzrero=5&shavingI0=sr | inva...,Valid


In [543]:
from sklearn.preprocessing import LabelEncoder
# Label-encode the label
le = LabelEncoder()
# `assign` returns a new frame with the encoded labels, so nothing is written
# into a frame that was sliced out of a larger one.
data = data.assign(Label=le.fit_transform(data["Label"]))
data

Unnamed: 0,combined_text,Label
0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,0
1,0 | Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1...,0
2,0 | Mozilla/9.9 (Windows; U; Windows NT 4.9; l...,0
3,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,0
4,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,0
...,...,...
47838,0 | invalid | HTTP/1.0 | GET | 0,1
47839,0 | invalid | HTTP/1.1 | PUT | application/x-w...,1
47840,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,1
47841,kesRt=leodn%25&daeAzrero=5&shavingI0=sr | inva...,1


In [544]:
def preprocess_request(request):
    """Turn a ' | '-separated request string into a labelled description.

    The first four fields are read as URL parameters, User-Agent, HTTP version
    and HTTP method, in that order. Missing fields become "UNKNOWN", and any
    fields after the fourth are ignored.
    """
    fields = request.split(' | ')
    # Pad so that unpacking always has four values to take.
    fields.extend(["UNKNOWN"] * 4)
    url_params, user_agent, http_version, method = fields[:4]

    description = f"HTTP method: {method}, "
    description += f"URL Parameters: {url_params}, "
    description += f"User-Agent: {user_agent}, "
    description += f"HTTP version: {http_version}"
    return description

# Build the descriptive text for every request. `assign` appends `Final` on a
# new frame instead of adding the column to a sliced frame in place.
data = data.assign(Final=data['combined_text'].apply(preprocess_request))
data

Unnamed: 0,combined_text,Label,Final
0,NT0cDJacceptnfOF=138695082&wcatUzz=dmH_g&t1lTs...,0,"HTTP method: GET, URL Parameters: NT0cDJaccept..."
1,0 | Mozilla/9.9 (X11; U; Open BSD i586 4.2; w1...,0,"HTTP method: PUT, URL Parameters: 0, User-Agen..."
2,0 | Mozilla/9.9 (Windows; U; Windows NT 4.9; l...,0,"HTTP method: POST, URL Parameters: 0, User-Age..."
3,imgbo-K@xZdW=tnpp&QVc@=8Ehto&EaEEtEgdis=thxoo8...,0,"HTTP method: GET, URL Parameters: imgbo-K@xZdW..."
4,beqqeAoaUeLxen=%22%29%28targetfilter%3D%28o%3D...,0,"HTTP method: GET, URL Parameters: beqqeAoaUeLx..."
...,...,...,...
47838,0 | invalid | HTTP/1.0 | GET | 0,1,"HTTP method: GET, URL Parameters: 0, User-Agen..."
47839,0 | invalid | HTTP/1.1 | PUT | application/x-w...,1,"HTTP method: PUT, URL Parameters: 0, User-Agen..."
47840,zsnd0edcIssro=00445431&o2d=bStgeseuew0U%5Dm%40...,1,"HTTP method: GET, URL Parameters: zsnd0edcIssr..."
47841,kesRt=leodn%25&daeAzrero=5&shavingI0=sr | inva...,1,"HTTP method: GET, URL Parameters: kesRt=leodn%..."


In [546]:
# Drop the intermediate `combined_text` column.
data = data.drop("combined_text", axis=1)
data

Unnamed: 0,Label,Final
0,0,"HTTP method: GET, URL Parameters: NT0cDJaccept..."
1,0,"HTTP method: PUT, URL Parameters: 0, User-Agen..."
2,0,"HTTP method: POST, URL Parameters: 0, User-Age..."
3,0,"HTTP method: GET, URL Parameters: imgbo-K@xZdW..."
4,0,"HTTP method: GET, URL Parameters: beqqeAoaUeLx..."
...,...,...
47838,1,"HTTP method: GET, URL Parameters: 0, User-Agen..."
47839,1,"HTTP method: PUT, URL Parameters: 0, User-Agen..."
47840,1,"HTTP method: GET, URL Parameters: zsnd0edcIssr..."
47841,1,"HTTP method: GET, URL Parameters: kesRt=leodn%..."


# Data preprocessing

## Tokenize

# Modeling

In [552]:
import torch
from transformers import AutoTokenizer, AutoModel

# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"  # the device to load the model onto

MODEL_NAME = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision and FlashAttention-2 are requested only when a CUDA device is
# present; on CPU the library defaults are used instead.
model_kwargs = {}
if device == "cuda":
    model_kwargs = {
        "torch_dtype": torch.float16,
        "attn_implementation": "flash_attention_2",
    }
model = AutoModel.from_pretrained(MODEL_NAME, **model_kwargs)

# `device` was previously declared but never used; move the model onto it.
model = model.to(device)


ModuleNotFoundError: No module named 'torch'