## Import libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

from scipy.sparse import issparse
import time

## Load data

In [6]:
data = pd.read_csv("/kaggle/input/httpparamsdataset/HttpParamsDataset-master/payload_full.csv")
data.head()

Unnamed: 0,payload,length,attack_type,label
0,c/ caridad s/n,14,norm,norm
1,"campello, el",12,norm,norm
2,40184,5,norm,norm
3,1442431887503330,16,norm,norm
4,nue37,5,norm,norm


In [7]:
train_data = data["payload"].values
label_data = data["label"].values

## Split data K-fold

In [8]:
train_data, train_cross, label_train, label_cross = train_test_split(train_data,label_data, 
                                                                     test_size=0.2, random_state = 42)

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

n_folds = 10

kf = StratifiedKFold(n_splits=n_folds)

train_data_k_fold = []
for train_index, test_index in kf.split(train_cross, label_cross):
    X_train, X_test = train_cross[train_index], train_cross[test_index]
    y_train, y_test = label_cross[train_index], label_cross[test_index]
    train_data_k_fold.append((X_train, y_train, X_test, y_test))

# 2-gram

## RF + Correlation

In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

In [7]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.83109712600708


In [8]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.3894035816192627 giây ---
Average TP: [231.  386.9]
Average FP: [0.6 2.9]
Average FN: [2.9 0.6]
Average TN: [386.9 231. ]


In [9]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[231.0, 0.6], [2.9, 386.9]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9876    0.0015    0.0124    0.9944    0.9974    0.9876    0.9925


## DT + Correlation

In [10]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = DecisionTreeClassifier()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)



Thời gian chạy:  2.7568247318267822


In [11]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.2836019992828369 giây ---
Average TP: [229.7 385.9]
Average FP: [1.6 4.2]
Average FN: [4.2 1.6]
Average TN: [385.9 229.7]


In [12]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[229.7, 1.6], [4.2, 385.9]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9820    0.0041    0.0180    0.9907    0.9931    0.9820    0.9875


## SVM + Correlation

In [13]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = SVC()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  14.279884815216064


In [14]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 1.9246580600738525 giây ---
Average TP: [226.6 387. ]
Average FP: [0.5 7.3]
Average FN: [7.3 0.5]
Average TN: [387.  226.6]


In [15]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[226.6, 0.5], [7.3, 387.0]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9688    0.0013    0.0312    0.9874    0.9978    0.9688    0.9831


## NB + Correlation

In [16]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(2, 2), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = GaussianNB()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  2.3654887676239014


In [17]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.3009474277496338 giây ---
Average TP: [225.6 384.7]
Average FP: [2.8 8.3]
Average FN: [8.3 2.8]
Average TN: [384.7 225.6]


In [18]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[225.6, 2.8], [8.3, 384.7]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9645    0.0072    0.0355    0.9821    0.9877    0.9645    0.9760


# 3-gram

## RF + Correlation

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

In [11]:
start_time = time.time()
res_time = 0 
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
res_time += time.time()-start_time
# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
start_time = time.time()
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)
res_time += time.time()-start_time

print("Thời gian chạy: ", res_time)

Thời gian chạy:  3.31902813911438


In [21]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.728630781173706 giây ---
Average TP: [217.  386.7]
Average FP: [ 0.8 16.9]
Average FN: [16.9  0.8]
Average TN: [386.7 217. ]


In [22]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[217.0, 0.8], [16.9, 386.7]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9277    0.0021    0.0723    0.9715    0.9963    0.9277    0.9608


## DT + Correlation

In [23]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = DecisionTreeClassifier()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.96515154838562


In [24]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.6171400547027588 giây ---
Average TP: [216.9 386.1]
Average FP: [ 1.4 17. ]
Average FN: [17.   1.4]
Average TN: [386.1 216.9]


In [25]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[216.9, 1.4], [17.0, 386.1]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9273    0.0036    0.0727    0.9704    0.9936    0.9273    0.9593


## SVM + Correlation

In [26]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = SVC()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  19.559290885925293


In [27]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 2.045280933380127 giây ---
Average TP: [214.6 386.2]
Average FP: [ 1.3 19.3]
Average FN: [19.3  1.3]
Average TN: [386.2 214.6]


In [28]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.6, 1.3], [19.3, 386.2]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9175    0.0034    0.0825    0.9668    0.9940    0.9175    0.9542


## NB + Correlation

In [29]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = GaussianNB()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.635833263397217


In [30]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.6453804969787598 giây ---
Average TP: [214.5 383.6]
Average FP: [ 3.9 19.4]
Average FN: [19.4  3.9]
Average TN: [383.6 214.5]


In [31]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.5, 3.9], [19.4, 383.6]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9171    0.0101    0.0829    0.9625    0.9821    0.9171    0.9485


# 4-gram

## RF + Correlation

In [32]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

In [33]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  5.873429536819458


In [34]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.5241591930389404 giây ---
Average TP: [214.  387.4]
Average FP: [ 0.1 19.9]
Average FN: [19.9  0.1]
Average TN: [387.4 214. ]


In [35]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.0, 0.1], [19.9, 387.4]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9149    0.0003    0.0851    0.9678    0.9995    0.9149    0.9554


## DT + Correlation

In [36]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = DecisionTreeClassifier()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)



Thời gian chạy:  4.596590280532837


In [37]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.49536728858947754 giây ---
Average TP: [214.1 387.3]
Average FP: [ 0.2 19.8]
Average FN: [19.8  0.2]
Average TN: [387.3 214.1]


In [38]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.1, 0.2], [19.8, 387.3]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9153    0.0005    0.0847    0.9678    0.9991    0.9153    0.9554


## SVM + Correlation

In [39]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = SVC()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  16.201651096343994


In [40]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 1.5039410591125488 giây ---
Average TP: [213.3 385.6]
Average FP: [ 1.9 20.6]
Average FN: [20.6  1.9]
Average TN: [385.6 213.3]


In [41]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[213.3, 1.9], [20.6, 385.6]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9119    0.0049    0.0881    0.9638    0.9912    0.9119    0.9499


## NB + Correlation

In [42]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(4, 4), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = GaussianNB()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.4817235469818115


In [43]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.5794570446014404 giây ---
Average TP: [214.7 385. ]
Average FP: [ 2.5 19.2]
Average FN: [19.2  2.5]
Average TN: [385.  214.7]


In [44]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.7, 2.5], [19.2, 385.0]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9179    0.0065    0.0821    0.9651    0.9885    0.9179    0.9519


# 5-gram

## RF + Correlation

In [45]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

In [46]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)
# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = RandomForestClassifier(n_estimators=50)
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  6.868495941162109


In [47]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.7238509654998779 giây ---
Average TP: [217.  386.6]
Average FP: [ 0.9 16.9]
Average FN: [16.9  0.9]
Average TN: [386.6 217. ]


In [48]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[217.0, 0.9], [16.9, 386.6]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9277    0.0023    0.0723    0.9714    0.9959    0.9277    0.9606


## DT + Correlation

In [49]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = DecisionTreeClassifier()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.900875091552734


In [50]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.6199371814727783 giây ---
Average TP: [217.  386.2]
Average FP: [ 1.3 16.9]
Average FN: [16.9  1.3]
Average TN: [386.2 217. ]


In [51]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[217.0, 1.3], [16.9, 386.2]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9277    0.0034    0.0723    0.9707    0.9940    0.9277    0.9598


## SVM + Correlation

In [52]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = SVC()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  19.162943124771118


In [53]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 2.046395778656006 giây ---
Average TP: [214.6 386.2]
Average FP: [ 1.3 19.3]
Average FN: [19.3  1.3]
Average TN: [386.2 214.6]


In [54]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.6, 1.3], [19.3, 386.2]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9175    0.0034    0.0825    0.9668    0.9940    0.9175    0.9542


## NB + Correlation

In [55]:
start_time = time.time()
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3), max_features=8000)
tfidf_matrix = vectorizer.fit_transform(train_data)

# feature_names = vectorizer.get_feature_names()

# feature_names


# pca
pca = SelectKBest(f_classif, k=256)
reduced_data = pca.fit_transform(tfidf_matrix.toarray(), label_train)

# random forest
rf_classifier = GaussianNB()
rf_classifier.fit(reduced_data, label_train)

print("Thời gian chạy: ", time.time()-start_time)

Thời gian chạy:  4.656690835952759


In [56]:
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0
import time

time_exe = 0
for X_train, y_train, X_test, y_test in train_data_k_fold:
    # tf-idf
    tfidf_matrix = vectorizer.fit_transform(X_train)
    
    # pca
    reduced_data = pca.fit_transform(tfidf_matrix.toarray(), y_train)
    
    # random forest
    rf_classifier.fit(reduced_data, y_train)
    
    start_time = time.time()
    tfidf_test = vectorizer.transform(X_test)
    reduced_test  = pca.transform(tfidf_test.toarray())
    y_pred = rf_classifier.predict(reduced_test)
    time_exe += (time.time() - start_time)
    # calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

print("Thời gian chạy: --- %s giây ---" % time_exe )

average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Thời gian chạy: --- 0.6432590484619141 giây ---
Average TP: [214.5 383.6]
Average FP: [ 3.9 19.4]
Average FN: [19.4  3.9]
Average TN: [383.6 214.5]


In [57]:
TP = average_TP[0]
FP = average_FP[0]
FN = average_FN[0]
TN = average_TN[0]

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)



cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[214.5, 3.9], [19.4, 383.6]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.9171    0.0101    0.0829    0.9625    0.9821    0.9171    0.9485
