## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

from scipy.sparse import issparse
import time

In [2]:
data = pd.read_csv("/kaggle/input/httpparamsdataset/HttpParamsDataset-master/payload_full.csv")
data.head()

Unnamed: 0,payload,length,attack_type,label
0,c/ caridad s/n,14,norm,norm
1,"campello, el",12,norm,norm
2,40184,5,norm,norm
3,1442431887503330,16,norm,norm
4,nue37,5,norm,norm


In [3]:
train_data = data["payload"].values
label_data = data["label"].values

## train_test_split

In [4]:
train_data, train_cross, label_train, label_cross = train_test_split(train_data,label_data, 
                                                                     test_size=0.2, random_state = 42)

## Cross Validation

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

n_folds = 10

kf = StratifiedKFold(n_splits=n_folds)

train_data_k_fold = []
for train_index, test_index in kf.split(train_cross, label_cross):
    X_train, X_test = train_cross[train_index], train_cross[test_index]
    y_train, y_test = label_cross[train_index], label_cross[test_index]
    train_data_k_fold.append((X_train, y_train, X_test, y_test))

# 3-gram

## Decision Tree

## PCA

In [21]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# decision tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  43.52415919303894


In [22]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # decision tree
    dt_classifier.fit(reduced_data, y_train)
    
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = dt_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 938.1354036331177 giây ---
Average TP: [230.2 379.1]
Average FP: [8.4 3.7]
Average FN: [3.7 8.4]
Average TN: [379.1 230.2]


In [23]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[230.2, 8.4], [3.7, 379.1]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9842    0.0217    0.0158    0.9805    0.9648    0.9842    0.9744


## SVM

## PCA

In [24]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# svm
svm_classifier = SVC()
svm_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  34.11277723312378


In [25]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # svm
    svm_classifier.fit(reduced_data, y_train)
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = svm_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 3.1471409797668457 giây ---
Average TP: [230.4 387.2]
Average FP: [0.3 3.5]
Average FN: [3.5 0.3]
Average TN: [387.2 230.4]


In [2]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)

cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[230.4, 0.3], [3.5, 387.2]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9850    0.0008    0.0150    0.9909    0.9987    0.9850    0.9918


## Naive Bayes

## PCA

In [27]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# gaussian naive bayes
nb_classifier = GaussianNB()
nb_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  24.594082832336426


In [28]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # gaussian naive bayes
    nb_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = nb_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.9354598522186279 giây ---
Average TP: [225.5 361. ]
Average FP: [26.5  8.4]
Average FN: [ 8.4 26.5]
Average TN: [361.  225.5]


In [29]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[225.5, 26.5], [8.4, 361.0]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9641    0.0684    0.0359    0.9438    0.8948    0.9641    0.9282


# 4-gram

## RF

In [30]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# random forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  57.52792978286743


In [31]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 1.2068507671356201 giây ---
Average TP: [231.5 383.8]
Average FP: [3.7 2.4]
Average FN: [2.4 3.7]
Average TN: [383.8 231.5]


In [32]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.5, 3.7], [2.4, 383.8]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9897    0.0095    0.0103    0.9902    0.9843    0.9897    0.9870


## DT

In [33]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# decision tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  35.84761190414429


In [34]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # decision tree
    dt_classifier.fit(reduced_data, y_train)
    
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = dt_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 851.6556220054626 giây ---
Average TP: [230.3 381.6]
Average FP: [5.9 3.6]
Average FN: [3.6 5.9]
Average TN: [381.6 230.3]


In [35]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[230.3, 5.9], [3.6, 381.6]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9846    0.0152    0.0154    0.9847    0.9750    0.9846    0.9798


## SVM

In [36]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# svm
svm_classifier = SVC()
svm_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  37.169310331344604


In [37]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # svm
    svm_classifier.fit(reduced_data, y_train)
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = svm_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 3.188843250274658 giây ---
Average TP: [231.5 386.9]
Average FP: [0.6 2.4]
Average FN: [2.4 0.6]
Average TN: [386.9 231.5]


In [38]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.5, 0.6], [2.4, 386.9]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9897    0.0015    0.0103    0.9952    0.9974    0.9897    0.9936


## NB

In [39]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# gaussian naive bayes
nb_classifier = GaussianNB()
nb_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  25.03987407684326


In [40]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # gaussian naive bayes
    nb_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = nb_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.8918499946594238 giây ---
Average TP: [231.5 370.4]
Average FP: [17.1  2.4]
Average FN: [ 2.4 17.1]
Average TN: [370.4 231.5]


In [41]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.5, 17.1], [2.4, 370.4]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9897    0.0441    0.0103    0.9686    0.9312    0.9897    0.9596


# 5-gram

## RF

In [42]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(5, 5), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# random forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  44.77918601036072


In [43]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 1.2234625816345215 giây ---
Average TP: [232.  383.9]
Average FP: [3.6 1.9]
Average FN: [1.9 3.6]
Average TN: [383.9 232. ]


In [44]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[232.0, 3.6], [1.9, 383.9]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9919    0.0093    0.0081    0.9911    0.9847    0.9919    0.9883


## DT

In [45]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(5, 5), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# decision tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  31.321612119674683


In [46]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # decision tree
    dt_classifier.fit(reduced_data, y_train)
    
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = dt_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 775.3720941543579 giây ---
Average TP: [231.7 383.4]
Average FP: [4.1 2.2]
Average FN: [2.2 4.1]
Average TN: [383.4 231.7]


In [47]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.7, 4.1], [2.2, 383.4]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9906    0.0106    0.0094    0.9899    0.9826    0.9906    0.9866


## SVM

In [48]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(5, 5), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# svm
svm_classifier = SVC()
svm_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  38.32723689079285


In [49]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # svm
    svm_classifier.fit(reduced_data, y_train)
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = svm_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 3.2942147254943848 giây ---
Average TP: [230.7 387.4]
Average FP: [0.1 3.2]
Average FN: [3.2 0.1]
Average TN: [387.4 230.7]


In [50]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[230.7, 0.1], [3.2, 387.4]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9863    0.0003    0.0137    0.9947    0.9996    0.9863    0.9929


## NB

In [51]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(5, 5), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = PCA(n_components=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# gaussian naive bayes
nb_classifier = GaussianNB()
nb_classifier.fit(reduced_data, label_train)
print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  24.033484935760498


In [52]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    
    # gaussian naive bayes
    nb_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = nb_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
print("Thời gian chạy: --- %s giây ---" % time_exe )
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.9929909706115723 giây ---
Average TP: [231.4 381.3]
Average FP: [6.2 2.5]
Average FN: [2.5 6.2]
Average TN: [381.3 231.4]


In [53]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.4, 6.2], [2.5, 381.3]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9893    0.0160    0.0107    0.9860    0.9739    0.9893    0.9815
