## Import libraries

In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

from scipy.sparse import issparse
from imblearn.over_sampling import SMOTE

## Load data

In [47]:
data = pd.read_csv("/kaggle/input/httpparamsdataset/HttpParamsDataset-master/payload_full.csv")
data.head()

Unnamed: 0,payload,length,attack_type,label
0,c/ caridad s/n,14,norm,norm
1,"campello, el",12,norm,norm
2,40184,5,norm,norm
3,1442431887503330,16,norm,norm
4,nue37,5,norm,norm


In [48]:
train_data = data["payload"].values
label_data = data["label"].values
X_train, X_test, y_train, y_test = train_test_split(train_data, label_data, test_size=0.2, random_state=42)

In [49]:
print(train_data.shape)

(31067,)


In [50]:
label_binarizer = LabelBinarizer()
y_test_binary = label_binarizer.fit_transform(y_test)
y_train_binary = label_binarizer.fit_transform(y_train)

In [51]:
print(y_test_binary[0])
print(y_test[0])

[1]
norm


# Attack type

In [52]:
from sklearn.preprocessing import LabelEncoder
train_data = data["payload"].values
label_data = data["attack_type"].values
X_train, X_test, y_train, y_test = train_test_split(train_data, label_data, test_size=0.2, random_state=42)

In [53]:
label_encode = LabelEncoder()
y_test_encode = label_encode.fit_transform(y_test)
y_train_encode = label_encode.fit_transform(y_train)

In [54]:
print(np.unique(y_train), np.unique(y_train_encode))


['cmdi' 'norm' 'path-traversal' 'sqli' 'xss'] [0 1 2 3 4]


In [55]:
X_train.shape

(24853,)

## Random Forest

### PCA

In [56]:
reduced_data.shape

(77145, 256)

In [57]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(X_train)

pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
smt = SMOTE()
# random forest
reduced_data, label_train = smt.fit_resample(reduced_data, y_train_encode)
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)


In [58]:
tfidf_test = vectorizer.transform(X_test)
reduced_test  = pca.transform(tfidf_test.toarray())
y_pred = rf_classifier.predict(reduced_test)

In [60]:
clf_report = metrics.classification_report(y_test_encode, y_pred, digits=4)
cnf_matrix = metrics.confusion_matrix(y_test_encode, y_pred)

print(cnf_matrix)


[[  15    3    0    0    0]
 [ 114 3756    1    4    0]
 [   0    0   65    0    0]
 [   0    2    0 2151    0]
 [   0    3    0    0  100]]


### Non-PCA

In [61]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(X_train)

smt = SMOTE()
# random forest
reduced_data, label_train = smt.fit_resample(tfidf_matrix, y_train_encode)
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)

In [62]:
tfidf_test = vectorizer.transform(X_test)
# reduced_test  = pca.transform(tfidf_test.toarray())
y_pred = rf_classifier.predict(tfidf_test)

In [63]:
clf_report = metrics.classification_report(y_test_encode, y_pred, digits=4)
cnf_matrix = metrics.confusion_matrix(y_test_encode, y_pred)

print(cnf_matrix)


[[  15    2    1    0    0]
 [   3 3872    0    0    0]
 [   0    0   65    0    0]
 [   0    1    0 2152    0]
 [   0    2    0    0  101]]
