## Import libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient
from glob import glob
from tqdm.notebook import tqdm
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers as L
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score




## Constant

In [2]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU:', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# user_secrets = UserSecretsClient()
# user_credential = user_secrets.get_gcloud_credential()
# user_secrets.set_tensorflow_credential(user_credential)
# GCS_DS_PATH = KaggleDatasets().get_gcs_path('websiteb dbbb-defacement')

AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
N_JOBS = cpu_count()


IMG_SIZE = 224
EPOCHS = 50

print('N_JOBS:', N_JOBS)
print('Batch size:', BATCH_SIZE)
# print('GCS_DS_PATH:', GCS_DS_PATH)

N_JOBS: 2
Batch size: 8


## Path

In [3]:
dataset_path = '../input/website-defacement/'
img_paths = glob(dataset_path + 'image/*/*')
txt_paths = glob(dataset_path + 'text/*/*')
img_names = [path.split('/')[-1][:-4] for path in img_paths]
txt_names = [path.split('/')[-1][:-4] for path in txt_paths]
names = list(set(img_names).intersection(txt_names))

## Prepare data

In [4]:
def prepare_data(name):
    if glob(dataset_path + f'text/defaced/{name}.txt'):
        path = dataset_path + f'text/defaced/{name}.txt'
        label = 1
    else:
        path = dataset_path + f'text/benign/{name}.txt'
        label = 0
    return (path, label)

output = Parallel(n_jobs=N_JOBS)(delayed(prepare_data)(name) for name in tqdm(names))
X, y = zip(*output)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
# # y_train = to_categorical(y_train, num_classes=2)
# # X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
# #                                                       random_state=42)

# print(len(X_train), len(X_test))

  0%|          | 0/96234 [00:00<?, ?it/s]

## K-fold

In [5]:
import numpy as np
from sklearn.model_selection import KFold

n_splits = 10

X = np.array(X)
y = np.array(y)

# Khởi tạo K-Fold Cross-Validator
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lặp qua các phần đã chia
for fold_number, (train_index, test_index) in enumerate(kf.split(X, y), 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]    
    
    print(f"Fold {fold_number}:")
    print(f"Train set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

Fold 1:
Train set size: 86610
Test set size: 9624
Fold 2:
Train set size: 86610
Test set size: 9624
Fold 3:
Train set size: 86610
Test set size: 9624
Fold 4:
Train set size: 86610
Test set size: 9624
Fold 5:
Train set size: 86611
Test set size: 9623
Fold 6:
Train set size: 86611
Test set size: 9623
Fold 7:
Train set size: 86611
Test set size: 9623
Fold 8:
Train set size: 86611
Test set size: 9623
Fold 9:
Train set size: 86611
Test set size: 9623
Fold 10:
Train set size: 86611
Test set size: 9623


In [6]:
X_fold = np.array_split(X, 10)
y_fold = np.array_split(y, 10)

## Train

In [7]:
from sklearn.preprocessing import FunctionTransformer
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0

to_dense = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)
for X_, y_ in zip(X_fold,y_fold):
    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42, shuffle=True)
    pipeline = Pipeline([
        ('tf', CountVectorizer(input='filename',max_features=300, ngram_range=(2,2), analyzer= 'char')),
        ('to_dense', to_dense),
        ('classifier', MultinomialNB())
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    clf_report = metrics.classification_report(y_test, y_pred, digits=4)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cnf_matrix.ravel()
    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN
n_folds = 10
average_TP = total_TP / n_folds
average_FP = total_FP / n_folds
average_FN = total_FN / n_folds
average_TN = total_TN / n_folds

print("Average TP:", average_TP)
print("Average FP:", average_FP)
print("Average FN:", average_FN)
print("Average TN:", average_TN)

Average TP: 620.4
Average FP: 173.8
Average FN: 162.3
Average TN: 968.5


## Evaluate

In [8]:
TP = average_TP
FP = average_FP
FN = average_FN
TN = average_TN

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)

cnf_matrix = [[TP,FP],[FN,TN]]
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)
ACC = (TP+TN)/(TP+FP+TN+FN)
other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR,
                              'ACC': '%.4f' % ACC,
                             'Precision':'%.4f' % precision,
                             'Recall':'%.4f' % recall,
                             'F1_score':'%.4f' % f1_score,}, index=[0]).to_string(col_space=9, index=False)
# print(clf_report)
print(cnf_matrix)
print(other_metrics)

[[620.4, 173.8], [162.3, 968.5]]
      TPR       FPR       FNR       ACC Precision    Recall  F1_score
   0.7926    0.1521    0.2074    0.8254    0.7812    0.7926    0.7869
