In [1]:
# -*- coding: utf-8 -*-
################################################################################
#                                                                              #
#                         Author: Bc. Petr Pouč                                #
#                         Date: April 4, 2024                                  #
#                         School: Brno University of Technology (BUT)          #
#                                                                              #
#         Master's Thesis: Optimization of Classification Models               #
#                         for Malicious Domain Detection                       #
#                                                                              #
################################################################################
import os
import sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
import pickle
from tqdm.notebook import tqdm 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

sys.path.append('..')  # Simplify the addition of the path to sys.path
from utils.preprocess_one_domain import NDF
import torch.optim as optim
import math
from joblib import load



# Preprocessing Pipeline and Feature Selection

This document outlines the preprocessing and feature engineering steps implemented in the given Python script for handling datasets stored in Parquet files, focusing on cybersecurity data distinguishing between benign and malign behaviors. The script incorporates extensive data processing capabilities, including missing value imputation, outlier removal, categorical variable encoding, scaling, and advanced feature generation through decision tree predictions.

## Overview of Processing Steps

### Initial Setup and Imports:
- Import necessary libraries for data handling (e.g., Pandas, PyArrow), visualization (e.g., Matplotlib, Seaborn), machine learning (e.g., scikit-learn, XGBoost), and utility functions.
- Set up logging and warning suppression to streamline output.

### Class Structure and Initialization:
- Define a class `FeatureEngineeringCLI` for the preprocessing pipeline, capable of handling either single-record inputs or full datasets.
- Initialize class attributes for paths, scalers, outlier detectors, and machine learning models.

### Data Preprocessing Steps:
1. **Data Loading**: Load benign and malign datasets from specified Parquet files.
2. **Feature Cleaning and Engineering**:
   - Remove non-training columns to focus on relevant features.
   - Handle missing values by setting them to a default value of `-1`.
   - Identify and remove outliers based on standard deviation thresholds.
   - Encode categorical variables using techniques such as one-hot encoding and binary encoding.
   - Generate new features using predictions from a trained decision tree classifier.
3. **Scaling and Transformation**:
   - Apply different scaling techniques (StandardScaler, MinMaxScaler, RobustScaler) depending on the dataset characteristics and the downstream machine learning model requirements.
   - For CNN models, MinMax scaling followed by a sigmoid transformation is recommended to normalize the inputs.
4. **Model Training and Evaluation**:
   - Train a decision tree classifier on the processed features to generate a new feature reflecting the likelihood of malign behavior.
   - Evaluate the decision tree model's accuracy and cross-validation scores to ensure robust performance.

### Saving and Loading Models:
- Implement functions to save and load scalers, outlier detection thresholds, and the decision tree model to ensure reproducibility and efficiency in processing new data.

### Utility Functions:
- Provide additional helper methods for categorical encoding, timestamp handling, and logging.

### Execution and Output:
- Detail the process for running the pipeline, including handling command-line arguments and producing a final processed dataset ready for machine learning model training and evaluation.


In [2]:
# Parquet sources for the two classes handed to the preprocessing pipeline.
input_data = dict(
    benign='../floor/benign_2312.parquet',
    malign='../floor/phishing_2311.parquet',
)
dataset = NDF("cnn", True, input_data=input_data, one_line_processing=False)

print(dataset['feature_names'])

# 80/20 hold-out split; the fixed random_state keeps the partition repeatable.
features = torch.Tensor(dataset['features'])
labels = torch.Tensor(dataset['labels'])
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

print(x_train.shape)

# Print the labels of y_train and the distribution of their values
print(y_train)
print(y_train.unique(return_counts=True))

2024-05-11 10:46:16,892 - utils.preprocess_one_domain - INFO - Benign dataset path: ../floor/benign_2312.parquet
2024-05-11 10:46:16,895 - utils.preprocess_one_domain - INFO - Malign dataset path: ../floor/phishing_2311.parquet


Malign dataset path: ../floor/phishing_2311.parquet
Benign dataset path: ../floor/benign_2312.parquet


2024-05-11 10:46:17,967 - utils.preprocess_one_domain - INFO - Number of records in benign dataset: 432572
2024-05-11 10:46:17,970 - utils.preprocess_one_domain - INFO - Number of records in malign dataset: 68353
2024-05-11 10:46:21,220 - utils.preprocess_one_domain - INFO - Total percentage of missing values in benign dataset: 0.39%
2024-05-11 10:46:21,224 - utils.preprocess_one_domain - INFO - Total percentage of missing values in malign dataset: 0.45%
2024-05-11 10:46:36,967 - utils.preprocess_one_domain - INFO - Decision tree model saved to trained_borders/decision_tree_model.joblib
2024-05-11 10:46:37,337 - utils.preprocess_one_domain - INFO - New feature 'dtree_prob' created from decision tree predictions.
2024-05-11 10:46:38,250 - utils.preprocess_one_domain - INFO - Decision Tree Train Accuracy: 0.94
2024-05-11 10:46:38,252 - utils.preprocess_one_domain - INFO - Decision Tree Test Accuracy: 0.93
2024-05-11 10:46:59,866 - utils.preprocess_one_domain - INFO - Decision Tree Cross-


Dataset Subset:
Name: dataset_../floor/benign2312_../floor/phishing2311_2024-05-11.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0        0.5   0.535654        0.5   0.522712   0.576322   0.551896   
1        0.5   0.500000        0.5   0.500000   0.576322   0.510415   
2        0.5   0.517850        0.5   0.500000   0.500000   0.500000   
3        0.5   0.517850        0.5   0.500000   0.500000   0.500000   
4        0.5   0.500000        0.5   0.500000   0.500000   0.500000   
5        0.5   0.517850        0.5   0.500000   0.557438   0.582570   
6        0.5   0.517850        0.5   0.500000   0.500000   0.500000   
7        0.5   0.500000        0.5   0.500000   0.500000   0.500000   
8        0.5   0.500000        0.5   0.500000   0.500000   0.500000   
9        0.5   0.500000        0.5   0.500000   0.500000   0.500000   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_169  Feature_170  \
0   0.731059        0.5   0.500000  

The next cell dynamically calculates the dimensions needed to reshape the input data into a format suitable for a CNN, based on the number of features. The 2-D convolutional layers used here expect image-like input, so each feature vector is zero-padded up to the next perfect-square length and reshaped into a single-channel square "image".

In [3]:

# Report the shape of the feature matrix returned by NDF.
print(dataset['features'].shape)

import torch.nn.functional as F
import math
import pyarrow.parquet as pq

def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    A value that is already a perfect square is returned unchanged
    (``next_perfect_square(196) == 196``). The computation uses exact integer
    arithmetic (``math.isqrt``), so it stays correct for values too large for
    a floating-point square root to represent precisely.

    Args:
        n: Non-negative number. Non-integer values are rounded up first.

    Returns:
        int: The smallest ``k * k`` (``k`` a non-negative integer) with ``k * k >= n``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")
    target = math.ceil(n)  # smallest integer >= n; exact for Python ints
    root = math.isqrt(target)  # floor of the square root
    if root * root < target:
        root += 1  # target is not a perfect square: move up to the next root
    return root * root

# Derive the square "image" geometry from the raw feature matrix rather than
# from x_train: x_train is overwritten below with a 4-D tensor, so reading
# x_train.shape[1] when this cell is re-run would return 1 (the channel axis)
# and silently reshape every sample into 1x1 "images".
feature_size = dataset['features'].shape[1]  # Number of features in the dataset
desired_size = next_perfect_square(feature_size)  # Perfect square the features are padded up to
side_size = int(desired_size**0.5)  # Side length of the square

# Number of zero-valued cells appended to each sample to fill the square
padding = desired_size - feature_size

# Pad and reshape only while the data is still a flat 2-D (samples, features)
# tensor; on a re-run the tensors are already 4-D and are left untouched.
if x_train.dim() == 2:
    if padding > 0:
        # (0, padding) pads only the right side of the last dimension
        x_train_padded = F.pad(x_train, (0, padding), 'constant', 0)
        x_test_padded = F.pad(x_test, (0, padding), 'constant', 0)
    else:
        # If no padding is needed, use the original data
        x_train_padded = x_train
        x_test_padded = x_test

    # Reshape to (samples, channels=1, side_size, side_size)
    x_train = x_train_padded.view(-1, 1, side_size, side_size)
    x_test = x_test_padded.view(-1, 1, side_size, side_size)

# Fail loudly if the tensors do not have the layout the CNN cell expects.
assert x_train.shape[1:] == (1, side_size, side_size), f"unexpected x_train shape {tuple(x_train.shape)}"
assert x_test.shape[1:] == (1, side_size, side_size), f"unexpected x_test shape {tuple(x_test.shape)}"

torch.Size([465991, 179])


In [4]:
# Define the CNN model
class Net(nn.Module):
    """Small CNN producing two class logits from single-channel square inputs.

    Layout: two 3x3 convolutions (1 -> 32 -> 64 channels, stride 1, no
    padding), each followed by ReLU, then a 128-unit fully connected layer
    with ReLU and a final 2-unit linear layer. No softmax is applied here.

    Args:
        side_size: Height and width of the square input.
    """

    def __init__(self, side_size):
        super().__init__()
        # Each unpadded 3x3 convolution trims 2 pixels per spatial axis, so two
        # of them leave a (side_size - 4) x (side_size - 4) feature map.
        conv_out_side = side_size - 4
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(64 * conv_out_side**2, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Map a (batch, 1, side_size, side_size) tensor to (batch, 2) logits."""
        hidden = F.relu(self.conv1(x))
        hidden = F.relu(self.conv2(hidden))
        hidden = F.relu(self.fc1(hidden.flatten(start_dim=1)))
        return self.fc2(hidden)

# Seed PyTorch's global RNG so weight initialisation and the shuffling order of
# the training loader are repeatable across "Restart Kernel -> Run All".
torch.manual_seed(42)

# Prepare dataset and dataloader
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = Net(side_size=side_size)  # side_size comes from the reshaping cell above
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()  # Ensure model is in training mode
    running_loss = 0.0
    all_predictions = []  # per-batch tensors of predicted class indices
    all_targets = []      # per-batch tensors of true class indices

    for data, targets in train_loader:
        # CrossEntropyLoss needs integer class indices; labels are stored as floats
        targets = targets.long()

        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Keep tensors per batch and concatenate once per epoch instead of
        # extending a Python list with hundreds of thousands of scalars.
        all_predictions.append(outputs.detach().argmax(dim=1))
        all_targets.append(targets)

    epoch_predictions = torch.cat(all_predictions).cpu().numpy()
    epoch_targets = torch.cat(all_targets).cpu().numpy()

    # Metrics over the predictions made during this epoch
    accuracy = accuracy_score(epoch_targets, epoch_predictions)
    f1 = f1_score(epoch_targets, epoch_predictions, average='weighted')

    print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}, Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    # Run the test set through the model once, batch by batch, and keep the
    # logits; pushing the whole of x_test through the network in a single
    # forward pass would materialise the activations for every sample at once.
    test_logits = [model(data) for data, _ in test_loader]
    outputs = torch.cat(test_logits)

    # test_loader is not shuffled, so rows of `outputs` line up with y_test
    predicted = outputs.argmax(dim=1)
    total = y_test.size(0)
    correct = (predicted == y_test.long()).sum().item()

    print(f'Accuracy on test set: {100 * correct / total}%')

    # Predictions and probabilities for the test set
    predicted_np = predicted.numpy()

    probabilities = F.softmax(outputs, dim=1)
    probabilities_np = probabilities.numpy()
    probabilities_np_rounded = np.round(probabilities_np, decimals=3)

    print("Predicted classes:", predicted_np)
    print("Probabilities:", probabilities_np_rounded)



model_path = 'models/ndf_classification.pth'
# torch.save does not create missing parent directories
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)


Epoch 1, Loss: 0.08942618751618611, Accuracy: 0.9705, F1 Score: 0.9697
Epoch 2, Loss: 0.059688577547073844, Accuracy: 0.9805, F1 Score: 0.9802
Epoch 3, Loss: 0.05284278821058236, Accuracy: 0.9829, F1 Score: 0.9827
Accuracy on test set: 98.47530552902928%
Predicted classes: [0 0 0 ... 0 0 0]
Probabilities: [[0.999 0.001]
 [1.    0.   ]
 [1.    0.   ]
 ...
 [0.998 0.002]
 [1.    0.   ]
 [0.997 0.003]]


## Processing a Single Domain

This section explains how to process a single domain using previously saved boundaries and models. The process involves reading a single record from a Parquet file and applying the preprocessing steps defined in the NDF function.

### Steps Included in Processing:

- **Parquet File Reading**: A specific Parquet file is read into a pandas DataFrame.
- **Record Selection**: The first record of the DataFrame is selected for processing.
- **Preprocessing Application**: The NDF function is called with parameters set for processing a single domain, including scaling and encoding based on saved boundaries and models.

#### Using `joblib` for Model Persistence:
- **Saving Models**: After training a model (e.g., a decision tree classifier), save it using `joblib.dump(model, 'model_filename.joblib')`. This makes it easy to preserve the state of the model for future use.
- **Loading Models**: When processing a new single domain record, load the saved model using `joblib.load('model_filename.joblib')`. This ensures that the exact configurations and learned patterns of the model are applied to the new data.


In [5]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import os
import datetime
import pyarrow.parquet as pq
import joblib

# Define or import your Net class and other necessary components here
class Net(nn.Module):
    """CNN that turns single-channel square inputs into two class logits.

    A saved state dict is loaded into an instance of this class further down
    in this cell, so the layer names and shapes must stay as they are.

    Args:
        side_size: Height and width of the square input.
    """

    def __init__(self, side_size):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        # Two unpadded 3x3 convolutions shrink each spatial side by 4 in total.
        reduced_side = side_size - 4
        self.fc1 = nn.Linear(64 * reduced_side * reduced_side, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return raw logits of shape (batch, 2); softmax is left to the caller."""
        feature_maps = F.relu(self.conv2(F.relu(self.conv1(x))))
        flat = torch.flatten(feature_maps, start_dim=1)
        return self.fc2(F.relu(self.fc1(flat)))

# Assuming the FeatureEngineeringCLI, NDF function are defined as provided

def read_first_record_parquet(parquet_file):
    """Return the first row of a Parquet file as a one-row pandas DataFrame.

    The Arrow table is sliced to its first row *before* the pandas conversion,
    so only that single row is converted instead of the whole table.

    Args:
        parquet_file: Path (or file-like object) accepted by
            ``pyarrow.parquet.read_table``.

    Returns:
        pandas.DataFrame: A DataFrame holding only the first record.

    Raises:
        IndexError: If the file contains no rows.
    """
    table = pq.read_table(parquet_file)
    if table.num_rows == 0:
        raise IndexError(f"Parquet file {parquet_file!r} contains no rows")
    # NOTE(review): for files whose stored pandas index does not start at 0 the
    # row label may differ from the previous `.iloc[[0]]` result; the row
    # contents are the same.
    return table.slice(0, 1).to_pandas()

def read_first_record_parquet_dict(parquet_file):
    """Return the first row of a Parquet file as a ``{column: value}`` dict."""
    # Reuse the DataFrame helper rather than repeating the read logic here.
    first_row_frame = read_first_record_parquet(parquet_file)
    records = first_row_frame.to_dict(orient='records')
    return records[0]

def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    A value that is already a perfect square is returned unchanged
    (``next_perfect_square(196) == 196``). Exact integer arithmetic is used
    instead of a floating-point square root, so the result is also correct for
    very large ``n``.

    Args:
        n: Non-negative number. Non-integer values are rounded up first.

    Returns:
        int: The smallest ``k * k`` (``k`` a non-negative integer) with ``k * k >= n``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    import math  # local import keeps this helper self-contained within the cell

    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")
    target = math.ceil(n)  # smallest integer >= n
    root = math.isqrt(target)  # floor of the square root
    if root * root < target:
        root += 1  # target is not a perfect square: move up to the next root
    return root * root

# Example usage to preprocess and classify a single record
parquet_file_path = "../floor/benign_2312.parquet"
# NOTE: despite its name, this holds a plain dict ({column: value}), not a DataFrame.
first_record_df = read_first_record_parquet_dict(parquet_file_path)

# Preprocess the single record
print("Preprocessing the single record...")
print(first_record_df)
preprocessed_data = NDF("cnn", True, input_data=first_record_df, one_line_processing=True)

# Derive the input geometry from the preprocessed record itself, *before*
# building the model, so this cell does not depend on a `feature_size` left
# over from an earlier cell.
data_tensor = preprocessed_data['features']
feature_size = data_tensor.shape[1]
desired_size = next_perfect_square(feature_size)  # Perfect square the features are padded up to
side_size = int(np.sqrt(desired_size))
padding = desired_size - feature_size

# Zero-pad on the right of the last dimension to fill the square, then reshape
# to (samples, channels=1, side_size, side_size)
if padding > 0:
    data_tensor_padded = F.pad(data_tensor, (0, padding), 'constant', 0)
else:
    data_tensor_padded = data_tensor
data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size)

# Load the trained CNN model; side_size fixes the size of the first dense
# layer, so it has to match the value the saved weights were trained with.
model = Net(side_size=side_size)
model.load_state_dict(torch.load('models/ndf_classification.pth', map_location=torch.device('cpu')))
model.eval()

# Predict and calculate probabilities for the single record
with torch.no_grad():
    outputs = model(data_tensor_reshaped)
    probabilities = F.softmax(outputs, dim=1)
    probabilities_np = probabilities.cpu().numpy()
    probabilities_np_rounded = np.round(probabilities_np, decimals=3)

    _, predicted = torch.max(outputs, 1)
    predicted_np = predicted.cpu().numpy()

print("Predicted classes:", predicted_np)
print("Probabilities:", probabilities_np_rounded)


Preprocessing the single record...
{'domain_name': 'google.com', 'label': 'benign_2310:unknown', 'dns_has_dnskey': 0.0, 'dns_A_count': 1, 'dns_AAAA_count': 1, 'dns_MX_count': 1, 'dns_NS_count': 4, 'dns_TXT_count': 12, 'dns_SOA_count': 1, 'dns_CNAME_count': 0, 'dns_zone_level': 0, 'dns_zone_digit_count': 0, 'dns_zone_len': 10, 'dns_zone_entropy': 0.26464393446710155, 'dns_resolved_record_types': 6, 'dns_dnssec_score': 0.0, 'dns_ttl_avg': 58360.0, 'dns_ttl_stdev': 128463.52011368831, 'dns_ttl_low': 0.16666666666666666, 'dns_ttl_mid': 0.5, 'dns_ttl_distinct_count': 4.0, 'dns_soa_primary_ns_level': 1.0, 'dns_soa_primary_ns_digit_count': 1.0, 'dns_soa_primary_ns_len': 14.0, 'dns_soa_primary_ns_entropy': 0.22728612962572958, 'dns_soa_email_level': 1.0, 'dns_soa_email_digit_count': 0.0, 'dns_soa_email_len': 20.0, 'dns_soa_email_entropy': 0.17920918598895944, 'dns_soa_refresh': 900.0, 'dns_soa_retry': 900.0, 'dns_soa_expire': 1800.0, 'dns_soa_min_ttl': 60.0, 'dns_domain_name_in_mx': False, 'dn

2024-05-11 10:53:31,988 - utils.preprocess_one_domain - INFO - Single-record processing: 1 rows
2024-05-11 10:53:32,050 - utils.preprocess_one_domain - INFO - Decision tree model loaded from trained_borders/decision_tree_model.joblib
2024-05-11 10:53:32,136 - utils.preprocess_one_domain - INFO - Applied loaded decision tree pipeline to generate 'dtree_prob' for the single record.
2024-05-11 10:53:32,139 - utils.preprocess_one_domain - INFO - Generated class map: {'benign_2310:unknown': 0}
2024-05-11 10:53:32,171 - utils.preprocess_one_domain - INFO - Scaler loaded from trained_borders/scaler.joblib
2024-05-11 10:53:32,183 - utils.preprocess_one_domain - INFO - Outliers thresholds loaded from trained_borders/outliers.joblib
2024-05-11 10:53:32,292 - utils.preprocess_one_domain - INFO - Completed outlier removal.
2024-05-11 10:53:32,323 - utils.preprocess_one_domain - INFO - Applying MinMaxScaler + Sigmoid scaling to the features.
2024-05-11 10:53:32,334 - utils.preprocess_one_domain - I


Dataset Subset:
Name: single_record_dataset_2024-05-11
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0        0.0   0.071429   0.111111   0.090909   0.307692        0.5   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_169  Feature_170  \
0        1.0        0.0        0.0        0.0  ...         0.75      0.46875   

   Feature_171  Feature_172  Feature_173  Feature_174  Feature_175  \
0      0.59375         0.25     0.289062     0.107767     0.411249   

   Feature_176  Feature_177  Feature_178  
0     0.328895     0.352424          0.0  

[1 rows x 179 columns]
Labels:
   Label
0    0.0
Dimension: 179
Predicted classes: [0]
Probabilities: [[1. 0.]]
