In [1]:
################################################################################
#                                                                              #
#                         Author: Bc. Petr Pouč                                #
#                         Date: April 4, 2024                                  #
#                         School: Brno University of Technology (BUT)          #
#                                                                              #
#         Master's Thesis: Optimization of Classification Models               #
#                         for Malicious Domain Detection                       #
#                                                                              #
################################################################################

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
import pickle
from tqdm.notebook import tqdm 
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.path.append('..')
from utils.preprocess_one_domain import NDF2
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV




# SVM Training core
1. Min-max scale the data (ideally, categorical features would also be properly encoded...)
2. Core SVM training function

In [2]:

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler


# Parquet files for the two classes that are handed to the preprocessing step.
input_data = {
    'benign': '../floor/benign_2312.parquet',
    'malign': '../floor/phishing_2311.parquet'
}

# Run the project preprocessing for the "svm" configuration.
dataset = NDF2("svm", True, input_data=input_data, one_line_processing=False)

print(dataset['feature_names'])

# Wrap features and labels as tensors first, then hold out 20% for testing.
# random_state is fixed so the split is reproducible.
features_tensor = torch.Tensor(dataset['features'])
labels_tensor = torch.Tensor(dataset['labels'])
x_train, x_test, y_train, y_test = train_test_split(
    features_tensor,
    labels_tensor,
    test_size=0.2,
    random_state=42,
)

print(x_train.shape)

# Show the y_train labels and how many samples carry each label value.
print(y_train)
print(y_train.unique(return_counts=True))

2024-04-24 16:11:20,868 - utils.preprocess_one_domain - INFO - Benign dataset path: ../floor/benign_2312.parquet
2024-04-24 16:11:20,871 - utils.preprocess_one_domain - INFO - Malign dataset path: ../floor/phishing_2311.parquet


Malign dataset path: ../floor/phishing_2311.parquet
Benign dataset path: ../floor/benign_2312.parquet


2024-04-24 16:11:23,777 - utils.preprocess_one_domain - INFO - Number of records in benign dataset: 432572
2024-04-24 16:11:23,782 - utils.preprocess_one_domain - INFO - Number of records in malign dataset: 68353
2024-04-24 16:11:26,602 - utils.preprocess_one_domain - INFO - Total percentage of missing values in benign dataset: 0.39%
2024-04-24 16:11:26,608 - utils.preprocess_one_domain - INFO - Total percentage of missing values in malign dataset: 0.45%


Before sampling: (500925, 180)
After sampling: (75139, 180)


2024-04-24 16:11:40,511 - utils.preprocess_one_domain - INFO - Decision tree model saved to trained_borders/decision_tree_model.joblib
2024-04-24 16:11:40,580 - utils.preprocess_one_domain - INFO - New feature 'dtree_prob' created from decision tree predictions.
2024-04-24 16:11:40,734 - utils.preprocess_one_domain - INFO - Decision Tree Train Accuracy: 0.94
2024-04-24 16:11:40,736 - utils.preprocess_one_domain - INFO - Decision Tree Test Accuracy: 0.92
2024-04-24 16:11:42,947 - utils.preprocess_one_domain - INFO - Decision Tree Cross-Validation Scores: [0.92466164 0.92302164 0.92174399]
2024-04-24 16:11:42,952 - utils.preprocess_one_domain - INFO - Generated class map: {'misp_2310:phishing': 1, 'benign_2310:unknown': 0}
2024-04-24 16:11:43,359 - utils.preprocess_one_domain - INFO - Outliers thresholds saved to trained_borders/outliers.joblib
2024-04-24 16:11:43,425 - utils.preprocess_one_domain - INFO - Outliers removed from dns_A_count: 61 rows
2024-04-24 16:11:43,460 - utils.preproc


Dataset Subset:
Name: dataset_../floor/benign2312_../floor/phishing2311_2024-04-24.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0        0.0        0.0        0.0        1.0        4.0        8.0   
1        1.0        1.0        0.0        0.0        0.0        0.0   
2        1.0       -1.0        0.0        0.0        0.0        0.0   
3        0.0        0.0        0.0        0.0        0.0        0.0   
4        0.0       -1.0        0.0        0.0        0.0        0.0   
5        0.0        0.0        0.0        0.0        0.0        0.0   
6        0.0        0.0        0.0        0.0        0.0        0.0   
7        0.0       -1.0        0.0        0.0        0.0        0.0   
8        0.0       -1.0        0.0        0.0        0.0        0.0   
9        0.0        2.0        3.0        0.0        2.0        0.0   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_169  Feature_170  \
0        1.0        0.0        0.0  

# SVM hyperparameter tuning

In [3]:
# # Define the kernels to test - Different kernels can handle data complexities differently
# kernels = ['linear', 'poly', 'rbf']

# # Initialize dictionary to store performance metrics for each kernel
# performance_metrics = {kernel: {} for kernel in kernels}

# # Evaluate each kernel using SVC and print the process and results
# for kernel in kernels:
#     print(f"Evaluating {kernel} kernel...")
#     svc = SVC(kernel=kernel)
#     svc.fit(x_train, y_train)  # Fit the model to the scaled training data
#     y_pred = svc.predict(x_test)  # Predict using the scaled test data
    
#     # Calculate and store the performance metrics
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     conf_matrix = confusion_matrix(y_test, y_pred)
    
#     performance_metrics[kernel]['accuracy'] = accuracy
#     performance_metrics[kernel]['precision'] = precision
#     performance_metrics[kernel]['recall'] = recall
#     performance_metrics[kernel]['f1_score'] = f1
#     performance_metrics[kernel]['confusion_matrix'] = conf_matrix

#     # Print each kernel's performance metrics
#     print(f"Results for {kernel} kernel:")
#     print(f"Accuracy: {accuracy}")
#     print(f"Precision: {precision}")
#     print(f"Recall: {recall}")
#     print(f"F1 Score: {f1}")
#     print(f"Confusion Matrix:\n{conf_matrix}")
#     print()

# # Determine the best kernel based on F1 score and explain why F1 might be important
# best_kernel = max(performance_metrics, key=lambda k: performance_metrics[k]['f1_score'])
# print(f"Best kernel based on F1 score: {best_kernel}")

# Grid search parameters

To find the optimal parameters for the SVM, we use grid search.
However, grid search is very time-consuming, so we run it on only 10% of the data or less; even 1% is enough.

In [4]:
def fit_svm(kernel, class_weight, C, gamma):
    """Fit one SVC on the notebook-level training split and score it on the test split.

    Args:
        kernel: Kernel name passed to ``SVC``.
        class_weight: Class weighting passed to ``SVC``.
        C: Regularisation parameter passed to ``SVC``.
        gamma: Kernel coefficient passed to ``SVC``.

    Returns:
        tuple: ``(accuracy, f1)`` as computed by ``accuracy_score`` and
        ``f1_score`` on ``y_test`` versus the model's predictions for ``x_test``.
    """
    # x_train / y_train / x_test / y_test are read from the notebook namespace.
    classifier = SVC(
        C=C,
        kernel=kernel,
        gamma=gamma,
        class_weight=class_weight,
        verbose=False,
    )
    predictions = classifier.fit(x_train, y_train).predict(x_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    return accuracy, f1

In [5]:
def define_grids():
    """Build the three candidate SVM parameter grids used for grid search.

    Returns:
        tuple: ``(optimal_grid, wide_range_grid, default_grid)``. Each grid is a
        freshly created dict with the keys ``'C'``, ``'gamma'`` and ``'kernel'``,
        each mapping to a list of candidate values.
    """
    # Narrow grid: a handful of C / gamma values, RBF kernel only.
    # Denser alternative kept for reference:
    #   C = [35, 37, 42, 47, 54, 60], gamma = [0.8, 1.0, 1.25, 1.5]
    optimal_grid = dict(
        C=[35, 42, 54],
        gamma=[0.8, 1.5],
        kernel=['rbf'],
    )

    # Broad grid: spans several orders of magnitude for C and gamma (plus the
    # 'scale' / 'auto' gamma settings) and tries three kernels.
    wide_range_grid = dict(
        C=[0.1, 1, 10, 50, 100, 200, 500, 1000, 2000, 5000],
        gamma=[0.00015, 0.0015, 0.015, 0.075, 0.35, 0.85, 3, 7, 'scale', 'auto'],
        kernel=['rbf', 'poly', 'sigmoid'],
    )

    # Baseline grid: three decades of C and gamma, RBF kernel only.
    default_grid = dict(
        C=[10, 100, 1000],
        gamma=[0.01, 0.1, 1],
        kernel=['rbf'],
    )

    return optimal_grid, wide_range_grid, default_grid

# Build all three candidate grids; the unpacking order matches the tuple returned by define_grids().
optimal_grid, wide_range_grid, default_grid = define_grids()

# param_grid is the grid the search cells below iterate over.
# Swap the assignment for the commented line to run the much larger wide-range sweep instead.
param_grid = optimal_grid
# param_grid = wide_range_grid


In [6]:
import time
import concurrent.futures

# param_grid and fit_svm come from earlier cells.
# Total grid size = (#kernels) x (#C values) x (#gamma values).
n_combinations = len(param_grid['kernel']) * len(param_grid['C']) * len(param_grid['gamma'])
print("Number of combinations: ", n_combinations, "\n---")

# Function to process each combination of parameters
def process_params(kernel, C, gamma):
    """Evaluate a single (kernel, C, gamma) combination and report progress.

    Calls ``fit_svm`` with ``class_weight='balanced'`` and prints a line before
    and after the fit.

    Returns:
        tuple: ``(kernel, C, gamma, accuracy, f1)`` so the caller can match the
        scores to the parameters that produced them.
    """
    print(f"\nStarting process for kernel: {kernel}, C: {C}, gamma: {gamma}")

    scores = fit_svm(kernel, 'balanced', C, gamma)
    accuracy, f1 = scores

    print(f"Completed process for kernel: {kernel}, C: {C}, gamma: {gamma} -> F1: {f1}, Accuracy: {accuracy}\n---")
    return (kernel, C, gamma, accuracy, f1)

# Using ProcessPoolExecutor to parallelize the grid search
def run_parallel_grid_search():
    """Evaluate every combination in ``param_grid`` in parallel and report the best one.

    One ``process_params`` task is submitted to a ``ProcessPoolExecutor`` for each
    (kernel, C, gamma) combination in the notebook-level ``param_grid``. Results are
    consumed in completion order; the combination with the highest F1 seen so far is
    tracked together with the accuracy that belongs to it.

    Progress lines and a final summary are printed. Nothing is returned.

    Raises:
        Exception: whatever a worker raised is re-raised by ``future.result()``.
    """
    highest_f1 = 0
    # Accuracy of the best-F1 combination. Tracked separately because the loop
    # variable ``accuracy`` only holds the value of the most recently completed task.
    highest_accuracy = 0.0
    highest_params = []
    print("Starting parallel grid search...\n---")

    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Create a future for each combination of parameters
        futures = {executor.submit(process_params, kernel, C, gamma): (kernel, C, gamma)
                   for kernel in param_grid['kernel']
                   for C in param_grid['C']
                   for gamma in param_grid['gamma']}

        # As results are completed (in arbitrary order), process them
        for future in concurrent.futures.as_completed(futures):
            kernel, C, gamma, accuracy, f1 = future.result()
            if f1 > highest_f1:
                highest_f1 = f1
                highest_accuracy = accuracy
                highest_params = [kernel, C, gamma]
                print(f"\nNew highest F1 found: {highest_f1:.3f} with Accuracy: {accuracy:.3f} at params: {highest_params}\n---")
            else:
                print(f"Result received: F1: {f1:.3f}, Accuracy: {accuracy:.3f} for params: {[kernel, C, gamma]}\n---")

    # Report the accuracy that goes with the best F1, not that of the last finished task.
    print("\nDONE, highest F1: {:.3f} with accuracy: {:.3f} and params: {}".format(highest_f1, highest_accuracy, highest_params), "\n---")

# Time a single fit_svm call to give a rough idea of how long the grid search will take.
# NOTE(review): the probe uses C=0.001 and gamma=0.0001 rather than values taken from
# param_grid, so its duration may not be representative of the real fits — confirm.
start_time = time.time()
fit_svm('rbf', 'balanced', 0.001, 0.0001)
iteration_time = time.time() - start_time
print(f"Single iteration time estimated: {iteration_time:.2f} seconds.\n---")

# Extrapolate to the whole grid, in minutes: single-fit time x number of combinations.
# NOTE(review): this is a serial figure; run_parallel_grid_search() dispatches the fits
# to a process pool, so the actual wall-clock time may differ.
total_mins = (iteration_time * len(param_grid['kernel']) * len(param_grid['C']) * len(param_grid['gamma'])) / 60
print("Expected time to complete grid search: {:.2f} minutes".format(total_mins), "\n---")

# Call the function to run the parallel grid search
run_parallel_grid_search()


Number of combinations:  24 
---
Single iteration time estimated: 651.63 seconds.
---
Expected time to complete grid search: 260.65 minutes 
---
Starting parallel grid search...
---

Starting process for kernel: rbf, C: 37, gamma: 1.5
Starting process for kernel: rbf, C: 42, gamma: 1.5
Starting process for kernel: rbf, C: 47, gamma: 1.25
Starting process for kernel: rbf, C: 47, gamma: 1.0
Starting process for kernel: rbf, C: 37, gamma: 0.8
Starting process for kernel: rbf, C: 37, gamma: 1.0
Starting process for kernel: rbf, C: 35, gamma: 1.25
Starting process for kernel: rbf, C: 54, gamma: 1.25
Starting process for kernel: rbf, C: 42, gamma: 0.8
Starting process for kernel: rbf, C: 35, gamma: 0.8
Starting process for kernel: rbf, C: 35, gamma: 1.5
Starting process for kernel: rbf, C: 47, gamma: 1.5
Starting process for kernel: rbf, C: 37, gamma: 1.25
Starting process for kernel: rbf, C: 47, gamma: 0.8
Starting process for kernel: rbf, C: 42, gamma: 1.25
Starting process for kernel: rbf