## Import libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

from scipy.sparse import issparse
import time

## Load data

In [4]:
data = pd.read_csv("/kaggle/input/httpparamsdataset/HttpParamsDataset-master/payload_full.csv")
data.head()

Unnamed: 0,payload,length,attack_type,label
0,c/ caridad s/n,14,norm,norm
1,"campello, el",12,norm,norm
2,40184,5,norm,norm
3,1442431887503330,16,norm,norm
4,nue37,5,norm,norm


In [5]:
train_data = data["payload"].values
label_data = data["label"].values

## Split data

In [6]:
train_data, train_cross, label_train, label_cross = train_test_split(train_data,label_data, 
                                                                     test_size=0.2, random_state = 42)

## K-fold

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

n_folds = 10

kf = StratifiedKFold(n_splits=n_folds)

train_data_k_fold = []
for train_index, test_index in kf.split(train_cross, label_cross):
    X_train, X_test = train_cross[train_index], train_cross[test_index]
    y_train, y_test = label_cross[train_index], label_cross[test_index]
    train_data_k_fold.append((X_train, y_train, X_test, y_test))

## 2-gram

### PCA

#### Train

In [8]:
start_time = time.time()
res_time = 0
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
res_time += time.time() - start_time
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
start_time = time.time()
# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)
res_time += time.time() - start_time
print("Thời gian chạy: ", res_time)

Thời gian chạy:  26.836788177490234


#### Evaluate

In [26]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.6484661102294922 giây ---
Average TP: [228.3 386.9]
Average FP: [0.6 5.6]
Average FN: [5.6 0.6]
Average TN: [386.9 228.3]


In [27]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[228.3, 0.6], [5.6, 386.9]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9761    0.0015    0.0239    0.9900    0.9974    0.9761    0.9866


### Non-PCA

#### Train

In [15]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", 
                             sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(tfidf_matrix.toarray(), label_train)
print("Thời gian chạy: ", time.time() - start_time)

Thời gian chạy:  17.90942120552063


#### Evaluate

In [29]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0

time_exe = 0

start_time = time.time()
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    # random forest
    rf_classifier.fit(tfidf_matrix, y_train)
    
    
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    y_pred = rf_classifier.predict(tfidf_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
    
print("Thời gian chạy: --- %s giây ---" % time_exe)

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.2866401672363281 giây ---
Average TP: [232.4 387.5]
Average FP: [0.  1.5]
Average FN: [1.5 0. ]
Average TN: [387.5 232.4]


In [30]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[232.4, 0.0], [1.5, 387.5]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9936    0.0000    0.0064    0.9976    1.0000    0.9936    0.9968


## 3-gram

### PCA

#### Train

In [10]:
start_time = time.time()
res_time = 0
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
res_time += time.time() - start_time
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
start_time = time.time()
# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)
res_time += time.time() - start_time
print("Thời gian chạy: ", res_time)

Thời gian chạy:  27.522501230239868


#### Evaluate

In [34]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 1.49257230758667 giây ---
Average TP: [230.9 385.1]
Average FP: [2.4 3. ]
Average FN: [3.  2.4]
Average TN: [385.1 230.9]


In [35]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[230.9, 2.4], [3.0, 385.1]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9872    0.0062    0.0128    0.9913    0.9897    0.9872    0.9884


### Non-PCA

#### Train

In [12]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", 
                             sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(tfidf_matrix.toarray(), label_train)
print("Thời gian chạy: ", time.time() - start_time)

Thời gian chạy:  92.99857187271118


#### Evaluate

In [37]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0

time_exe = 0

start_time = time.time()
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    # random forest
    rf_classifier.fit(tfidf_matrix, y_train)
    
    
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    y_pred = rf_classifier.predict(tfidf_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
    
print("Thời gian chạy: --- %s giây ---" % time_exe)

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.40568065643310547 giây ---
Average TP: [231.9 387.5]
Average FP: [0. 2.]
Average FN: [2. 0.]
Average TN: [387.5 231.9]


In [38]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.9, 0.0], [2.0, 387.5]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9914    0.0000    0.0086    0.9968    1.0000    0.9914    0.9957


## 4-gram

### PCA

#### Train

In [13]:
start_time = time.time()
res_time = 0
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
res_time += time.time() - start_time
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
start_time = time.time()
# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)
res_time += time.time() - start_time
print("Thời gian chạy: ", res_time)

Thời gian chạy:  17.34978222846985


#### Evaluate

In [None]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

In [None]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

### Non-PCA

#### Train

In [16]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", 
                             sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(tfidf_matrix.toarray(), label_train)
print("Thời gian chạy: ", time.time() - start_time)

Thời gian chạy:  132.56825613975525


#### Evaluate

In [None]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0

time_exe = 0

start_time = time.time()
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    # random forest
    rf_classifier.fit(tfidf_matrix, y_train)
    
    
#     # pca
#     pca = PCA(n_components=256)
#     reduced_data = pca.fit_transform(tfidf_matrix.toarray())
#     reduced_test  = pca.transform(tfidf_test.toarray())

    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    y_pred = rf_classifier.predict(tfidf_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
    
print("Thời gian chạy: --- %s giây ---" % time_exe)

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

In [None]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)