In [1]:
################################################################################
#                                                                              #
#                         Author: Bc. Petr Pouč                                #
#                         Date: April 4, 2024                                  #
#                         School: Brno University of Technology (BUT)          #
#                                                                              #
#         Master's Thesis: Optimization of Classification Models               #
#                         for Malicious Domain Detection                       #
#                                                                              #
################################################################################


# Standard library imports
import pickle
import sys
from pathlib import Path

# Third-party imports for data manipulation
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from multiprocessing import cpu_count
import numpy as np
from joblib import Parallel, delayed
import torch.optim as optim
from functools import partial
import os
import multiprocessing

# Third-party imports for machine learning
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split  # Duplicated import removed
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from sklearn.metrics import f1_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import ParameterGrid

# Third-party imports for visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Local application/library specific imports
sys.path.append('..')  # Simplify the addition of the path to sys.path
from transformers.drop_nontrain import drop_nontrain_table as drop_nontrain
from utils.preprocess_one_domain import NDF2


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras.callbacks import EarlyStopping, ModelCheckpoint
from scikeras.wrappers import KerasClassifier
from math import floor
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold
from keras.layers import LeakyReLU
# NOTE(review): this rebinds the name `LeakyReLU` from the layer class to a
# single configured layer instance (alpha=0.1). After this line the class is no
# longer reachable under this name, and every use shares the same instance.
LeakyReLU = LeakyReLU(alpha=0.1)
# NOTE(review): the tuning cell further down imports a different
# `BayesianOptimization` from keras_tuner, which replaces this binding.
from bayes_opt import BayesianOptimization
import warnings
# Silences every warning for the rest of the session, including deprecation
# warnings raised by the ML libraries used below.
warnings.filterwarnings('ignore')
# Show all DataFrame columns when a frame is displayed.
pd.set_option("display.max_columns", None)

2024-04-20 11:21:07.274261: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 11:21:07.767331: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Parquet sources for the two classes, keyed by class role.
input_data = dict(
    benign='../floor/benign_2312.parquet',
    malign='../floor/phishing_2311.parquet',
)

# Build the dataset with the project preprocessing helper (utils.preprocess_one_domain).
# The result is indexed below by 'feature_names', 'features' and 'labels'.
dataset = NDF2("cnn", True, input_data=input_data, one_line_processing=False)

print(dataset['feature_names'])

# 80/20 hold-out split with a fixed seed; features and labels are wrapped in
# torch tensors before splitting.
X_train, x_test, y_train, y_test = train_test_split(
    torch.Tensor(dataset['features']),
    torch.Tensor(dataset['labels']),
    test_size=0.2,
    random_state=42,
)

# Quick sanity checks: training shape, raw labels, and per-class counts.
for summary in (X_train.shape, y_train, y_train.unique(return_counts=True)):
    print(summary)

2024-04-19 19:49:47,421 - utils.preprocess_one_domain - INFO - Benign dataset path: ../floor/benign_2312.parquet
2024-04-19 19:49:47,424 - utils.preprocess_one_domain - INFO - Malign dataset path: ../floor/phishing_2311.parquet


Malign dataset path: ../floor/phishing_2311.parquet
Benign dataset path: ../floor/benign_2312.parquet


2024-04-19 19:49:48,887 - utils.preprocess_one_domain - INFO - Number of records in benign dataset: 432572
2024-04-19 19:49:48,890 - utils.preprocess_one_domain - INFO - Number of records in malign dataset: 68353
2024-04-19 19:49:50,996 - utils.preprocess_one_domain - INFO - Total percentage of missing values in benign dataset: 0.39%
2024-04-19 19:49:51,000 - utils.preprocess_one_domain - INFO - Total percentage of missing values in malign dataset: 0.45%


Before sampling: (500925, 180)
After sampling: (150278, 180)


2024-04-19 19:49:56,031 - utils.preprocess_one_domain - INFO - Decision tree model saved to trained_borders/decision_tree_model.joblib
2024-04-19 19:49:56,151 - utils.preprocess_one_domain - INFO - New feature 'dtree_prob' created from decision tree predictions.
2024-04-19 19:49:56,430 - utils.preprocess_one_domain - INFO - Decision Tree Train Accuracy: 0.94
2024-04-19 19:49:56,433 - utils.preprocess_one_domain - INFO - Decision Tree Test Accuracy: 0.93
2024-04-19 19:50:00,751 - utils.preprocess_one_domain - INFO - Decision Tree Cross-Validation Scores: [0.92529894 0.92869263 0.92681466]
2024-04-19 19:50:00,756 - utils.preprocess_one_domain - INFO - Generated class map: {'benign_2310:unknown': 0, 'misp_2310:phishing': 1}
2024-04-19 19:50:01,396 - utils.preprocess_one_domain - INFO - Outliers thresholds saved to trained_borders/outliers.joblib
2024-04-19 19:50:01,521 - utils.preprocess_one_domain - INFO - Outliers removed from dns_A_count: 158 rows
2024-04-19 19:50:01,598 - utils.prepro


Dataset Subset:
Name: dataset_../floor/benign2312_../floor/phishing2311_2024-04-19.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0   0.500000   0.557438   0.500000   0.500000   0.500000   0.500000   
1   0.500000   0.500000   0.500000   0.500000   0.500000   0.500000   
2   0.731059   0.500000   0.500000   0.500000   0.500000   0.500000   
3   0.500000   0.538386   0.562177   0.500000   0.500000   0.500000   
4   0.500000   0.500000   0.500000   0.500000   0.500000   0.500000   
5   0.500000   0.500000   0.500000   0.500000   0.500000   0.500000   
6   0.500000   0.519221   0.500000   0.524979   0.557438   0.562177   
7   0.500000   0.538386   0.562177   0.500000   0.500000   0.500000   
8   0.500000   0.557438   0.592667   0.524979   0.538386   0.510415   
9   0.500000   0.557438   0.592667   0.622459   0.538386   0.541570   

   Feature_6  Feature_7  Feature_8  Feature_9  Feature_10  Feature_11  \
0   0.500000   0.500000   0.500000   0.5000

In [3]:
# Show the dimensions of the full feature matrix before it is padded and reshaped.
print(dataset['features'].shape)

import torch.nn.functional as F
import math
import pyarrow.parquet as pq

def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    If ``n`` is already a perfect square it is returned unchanged
    (e.g. ``196 -> 196``, ``179 -> 196``).

    Args:
        n: Non-negative number. Plain ``int`` values are handled with exact
            integer arithmetic; any other numeric type falls back to the
            floating-point square root.

    Returns:
        The perfect square ``k * k`` with the smallest integer ``k`` such that
        ``k * k >= n``.

    Raises:
        ValueError: If ``n`` is a negative ``int``.
    """
    if isinstance(n, int):
        # math.isqrt is exact for arbitrarily large integers, whereas n**0.5
        # rounds through a float and can land on the wrong side of the root.
        root = math.isqrt(n)
        if root * root < n:
            root += 1
        return root * root
    # Non-int inputs (floats, NumPy scalars) keep the float-based computation.
    return math.ceil(n**0.5)**2

# --- Lay each flat feature vector out as a square, single-channel grid ---
# The feature count is rounded up to a perfect square so that every sample
# fits a (side_size x side_size) layout.
feature_size = X_train.shape[1]
desired_size = next_perfect_square(feature_size)
side_size = int(desired_size**0.5)

# Number of zero columns to append on the right of the last dimension.
padding = desired_size - feature_size

if padding <= 0:
    # Already a perfect square: use the tensors as they are.
    x_train_padded, x_test_padded = X_train, x_test
else:
    # (0, padding) pads only the right-hand side of the last dimension with zeros.
    pad_spec = (0, padding)
    x_train_padded = F.pad(X_train, pad_spec, 'constant', 0)
    x_test_padded = F.pad(x_test, pad_spec, 'constant', 0)

# View every sample as (1, side_size, side_size); -1 keeps the sample count.
image_shape = (-1, 1, side_size, side_size)
x_train = x_train_padded.view(*image_shape)
x_test = x_test_padded.view(*image_shape)

# Detached copies of the reshaped inputs and of the labels.
x_train_copy, x_test_copy = (t.clone().detach() for t in (x_train, x_test))
y_test_copy, y_train_copy = (t.clone().detach() for t in (y_test, y_train))

torch.Size([137383, 179])


In [4]:
import tensorflow as tf
from tensorflow.keras.metrics import Metric

class F1Score(Metric):
    """Streaming binary F1 score built from running TP / FP / FN totals.

    Predictions are binarised with ``tf.round``, so ``y_pred`` is expected to
    hold probabilities such as the output of a sigmoid unit. The totals
    accumulate over every ``update_state`` call until the state is reset.
    """

    def __init__(self, name='f1_score', **kwargs):
        """Create the three scalar accumulators, all initialised to zero."""
        super().__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch of labels and predictions to the running totals.

        Args:
            y_true: Ground-truth labels (0 or 1), any numeric dtype.
            y_pred: Predicted probabilities; rounded to 0 or 1 here.
            sample_weight: Accepted for API compatibility; it is not used.
        """
        count_dtype = self.true_positives.dtype
        # Flatten both tensors and bring them to the accumulator dtype. Without
        # this, labels of shape (batch,) multiplied with predictions of shape
        # (batch, 1) broadcast to (batch, batch) and inflate every count, and
        # integer labels cannot be multiplied with float predictions.
        y_true = tf.reshape(tf.cast(y_true, count_dtype), [-1])
        y_pred = tf.reshape(tf.round(tf.cast(y_pred, count_dtype)), [-1])

        tp = tf.reduce_sum(y_true * y_pred)
        fp = tf.reduce_sum((1 - y_true) * y_pred)
        fn = tf.reduce_sum(y_true * (1 - y_pred))

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        """Return F1 = 2PR / (P + R); epsilon terms guard against division by zero."""
        eps = tf.keras.backend.epsilon()
        precision = self.true_positives / (self.true_positives + self.false_positives + eps)
        recall = self.true_positives / (self.true_positives + self.false_negatives + eps)
        f1 = 2 * (precision * recall) / (precision + recall + eps)
        return f1

    def reset_state(self):
        """Zero all accumulators."""
        for counter in (self.true_positives, self.false_positives, self.false_negatives):
            counter.assign(0.0)

    def reset_states(self):
        """Older spelling of ``reset_state``, kept so existing callers keep working."""
        self.reset_state()


In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from keras_tuner.tuners import BayesianOptimization
from keras_tuner.engine.hyperparameters import HyperParameters
from sklearn.model_selection import train_test_split
import numpy as np
#import keras metrics
from tensorflow.keras import metrics as keras_metrics

# Convert the features and labels to float32 NumPy arrays for TensorFlow.
X = np.array(dataset['features'], dtype=np.float32)
y = np.array(dataset['labels'], dtype=np.float32)

# 80/20 hold-out split with a fixed seed.
# NOTE(review): this rebinds X_train, y_train and y_test, which the earlier
# loading cell bound to torch tensors; cells run after this one see NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _batched(features, labels):
    """Wrap an (inputs, targets) pair in a tf.data pipeline: batches of 32, prefetched."""
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    return pairs.batch(32).prefetch(tf.data.AUTOTUNE)


train_data = _batched(X_train, y_train)
test_data = _batched(X_test, y_test)

# Model building function
def build_model(hp):
    """Build and compile a tunable 1D-CNN binary classifier for the tuner.

    The search space covers the number of convolutional blocks (1-3), the
    filters and kernel size of each block, optional per-block dropout, the
    width of the dense layer, the final dropout rate, the optimizer (Adam or
    SGD) and a log-sampled learning rate registered as ``lr_<optimizer>``.

    Args:
        hp: Hyperparameter object supplied by the tuner; every ``hp.*`` call
            below both declares a search dimension and returns its value for
            the current trial.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit,
        binary cross-entropy loss, and accuracy / confusion-count / precision /
        recall metrics.
    """
    # Feature count is read from the notebook-level training matrix.
    n_features = X_train.shape[1]

    model = Sequential()
    # Declare the input once with an explicit Input object instead of passing
    # `input_shape` to the conv layers: the original passed `input_shape=None`
    # to every block after the first.
    model.add(tf.keras.Input(shape=(n_features, 1)))

    for i in range(hp.Int('num_conv_layers', 1, 3)):
        model.add(Conv1D(
            filters=hp.Int(f'filters_{i}', min_value=32, max_value=256, step=32),
            kernel_size=hp.Choice(f'kernel_size_{i}', values=[3, 5]),
            activation='relu'))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(pool_size=2))
        # Dropout is optional per block; its rate is only sampled when enabled.
        if hp.Boolean(f'dropout_{i}'):
            model.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', min_value=0.0, max_value=0.2, step=0.1)))

    model.add(Flatten())
    model.add(Dense(
        units=hp.Int('dense_units', min_value=32, max_value=256, step=32),
        activation='relu'))
    model.add(Dropout(rate=hp.Float('final_dropout', min_value=0.0, max_value=0.4, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # The learning rate is registered under a name that depends on the chosen
    # optimizer ('lr_Adam' or 'lr_SGD'), so each optimizer has its own dimension.
    optimizer_choice = hp.Choice('optimizer', ['Adam', 'SGD'])
    learning_rate = hp.Float(f'lr_{optimizer_choice}', min_value=1e-4, max_value=1e-2, sampling='LOG')
    optimizer = getattr(optimizers, optimizer_choice)(learning_rate=learning_rate)

    METRICS = [
        keras_metrics.BinaryAccuracy(name='accuracy'),
        keras_metrics.TruePositives(name='tp'),
        keras_metrics.FalsePositives(name='fp'),
        keras_metrics.TrueNegatives(name='tn'),
        keras_metrics.FalseNegatives(name='fn'),
        keras_metrics.Precision(name='precision'),
        keras_metrics.Recall(name='recall'),
    ]
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=METRICS)
    return model

# Bayesian search over the space declared in build_model, minimising validation loss.
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=3,
    executions_per_trial=1,
    directory='cnn_keras_tuning_30',
    project_name='cnn_tuning_efficient'
)

# Stop a trial once val_loss has not improved for 3 epochs and keep the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Run the hyperparameter search.
# NOTE(review): test_data is the validation set for both the search and the
# final fit below, so the reported validation scores do not come from an
# untouched hold-out set.
tuner.search(train_data, epochs=4, validation_data=test_data, callbacks=[early_stopping])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# build_model registers the learning rate as 'lr_<optimizer>', so look it up
# under the winning optimizer's name instead of always reading 'lr_Adam'.
best_optimizer = best_hps.get('optimizer')
best_learning_rate = best_hps.get(f'lr_{best_optimizer}')

print(f"""
The best number of convolutional layers is {best_hps.get('num_conv_layers')}
The best optimizer is {best_optimizer}
The best number of filters is {best_hps.get('filters_0')}  
The best kernel size is {best_hps.get('kernel_size_0')}  
The best number of units is {best_hps.get('dense_units')}
The best dropout rate is {best_hps.get('final_dropout')}
The best learning rate is {best_learning_rate}
""")

# Rebuild a fresh model from the best hyperparameters and train it for 10 epochs.
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_data, epochs=10, validation_data=test_data)

def calculate_f1_scores(precision, recall):
    """Combine precision and recall into F1 = 2PR / (P + R + 1e-7).

    Works element-wise on arrays as well as on plain scalars.

    Args:
        precision: Precision value(s).
        recall: Recall value(s), the same shape as ``precision``.

    Returns:
        The F1 score(s); the small constant in the denominator yields 0 rather
        than a division error when both inputs are 0.
    """
    eps = 1e-7
    numerator = 2 * (precision * recall)
    denominator = precision + recall + eps
    return numerator / denominator

# Per-epoch F1 for training and validation, derived from the precision and
# recall values that Keras logged during the final fit.
history_log = history.history
train_f1_scores = calculate_f1_scores(
    np.array(history_log['precision']),
    np.array(history_log['recall']),
)
val_f1_scores = calculate_f1_scores(
    np.array(history_log['val_precision']),
    np.array(history_log['val_recall']),
)

# Best epoch for each curve.
max_train_f1 = train_f1_scores.max()
max_val_f1 = val_f1_scores.max()

print(f"Maximum Training F1 Score: {max_train_f1:.4f}")
print(f"Maximum Validation F1 Score: {max_val_f1:.4f}")


Trial 3 Complete [00h 09m 28s]
val_loss: 0.06313413381576538

Best val_loss So Far: 0.05708251893520355
Total elapsed time: 00h 21m 52s

The best number of convolutional layers is 3
The best optimizer is Adam
The best number of filters is 128  
The best kernel size is 5  
The best number of units is 96
The best dropout rate is 0.1
The best learning rate is 0.00023899521618106246

Epoch 1/10
3435/3435 ━━━━━━━━━━━━━━━━━━━━ 4:10:43 4s/step - accuracy: 0.5000 - fn: 2.0000 - fp: 14.0000 - loss: 0.8611 - precision: 0.0667 - recall: 0.3333 - tn: 15.0000 - tp: 1.000 ━━━━━━━━━━━━━━━━━━━━ 1:42 30ms/step - accuracy: 0.5955 - fn: 8.6667 - fp: 15.3333 - loss: 0.8033 - precision: 0.0963 - recall: 0.2109 - tn: 38.3333 - tp: 1.666 ━━━━━━━━━━━━━━━━━━━━ 1:50 32ms/step - accuracy: 0.6570 - fn: 13.2000 - fp: 15.6000 - loss: 0.7294 - precision: 0.1209 - recall: 0.1796 - tn: 65.0000 - tp: 2.200 ━━━━━━━━━━━━━━━━━━━━ 1:49 32ms/step - accuracy: 0.6910 - fn: 18.1429 - fp: 15.7143 - loss: 0.6804 - precision: 0.1

In [None]:
import matplotlib.pyplot as plt

def _f1_per_epoch(metrics_by_epoch, prefix=''):
    """Return the per-epoch F1 curve for the given key prefix ('' or 'val_').

    Uses a logged '<prefix>f1_score' series when one exists; otherwise derives
    F1 from the logged precision and recall. The model above is compiled
    without an F1 metric, so the derived path is the one normally taken.
    """
    logged_key = f'{prefix}f1_score'
    if logged_key in metrics_by_epoch:
        return metrics_by_epoch[logged_key]
    return calculate_f1_scores(
        np.array(metrics_by_epoch[f'{prefix}precision']),
        np.array(metrics_by_epoch[f'{prefix}recall']),
    )


metrics_by_epoch = history.history
fig, ax = plt.subplots()

# Plot accuracy
ax.plot(metrics_by_epoch['accuracy'], label='Accuracy')
ax.plot(metrics_by_epoch['val_accuracy'], label='Validation Accuracy')

# Plot F1 score
ax.plot(_f1_per_epoch(metrics_by_epoch), label='F1 Score')
ax.plot(_f1_per_epoch(metrics_by_epoch, prefix='val_'), label='Validation F1 Score')

ax.set_title('Model Performance')
ax.set_ylabel('Score')
ax.set_xlabel('Epoch')
ax.legend(loc='lower right')
plt.show()