## Include all necessary Libraries

In [None]:
# Base Libraries
import os                                                                 # OS Library
import pandas as pd                                                       # Pandas Library
import numpy as np                                                        # Numpy Library
from fastai.tabular.all import df_shrink                                  # Library to shrink 
from fastcore.parallel import *                                           # Library for parallel tasks
import time                                                               # Time function library
import matplotlib.pyplot as plt                                           # plot library
import seaborn as sns                                                     # plot library
# import memory_profiler, memory_                                         # memory usage library
from memory_profiler import memory_usage
from collections import Counter                                           # Counter library
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline
import psutil
from tqdm import tqdm
# from sklearn.compose import ColumnTransformer
# from sklearn.base import BaseEstimator, TransformerMixin

# Libraries for feature engineering
from sklearn.model_selection import train_test_split                      # Dataset split library
# from sklearn.model_selection import GridSearchCV                          # Gridsearch library for hyper parameter tuning
from sklearn.preprocessing import LabelEncoder                            # LabelEnoder
# from sklearn.preprocessing import OneHotEncoder                           # Onehotencoding 
from sklearn.feature_selection import VarianceThreshold                   # For removing zero variance features
from sklearn.preprocessing import MinMaxScaler, StandardScaler            # Apply Standarization and Normalization 
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC                 # Over-sampling Library
from imblearn.under_sampling import RandomUnderSampler, TomekLinks        # Under-Sampling
# from imblearn.combine import SMOTETomek

# ML Model Libraries
# from sklearn.linear_model import LogisticRegression                       # Logistic Regression
# from sklearn.tree import DecisionTreeClassifier                           # Decision Tree Classifier
# from sklearn.ensemble import RandomForestClassifier                       # Random forest Classifier
# from sklearn.ensemble import AdaBoostClassifier                           # AdaBoost Classifier
# from sklearn.svm import SVC                                               # SVM Library
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB   # naive base classifiers library

# Performance measure Libraries for Trained model 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# Deep learning Model Libraries
# import tensorflow as tf
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import DataLoader, TensorDataset
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout  



## Create a csv read function 

In [None]:
def csvRead(sfilePath):
    """Load one CSV file into a pandas DataFrame.

    Parameters
    ----------
    sfilePath : path (or file-like object) handed straight to ``pd.read_csv``.

    Returns
    -------
    The DataFrame built by ``pd.read_csv`` with its default options.
    """
    print("CSV Read function started........")
    return pd.read_csv(sfilePath)

## Create a datatype downsizing function

In [None]:
def dataTypeDown(ind_dfs_read):
    """Return ``ind_dfs_read`` after passing it through fastai's ``df_shrink``.

    Parameters
    ----------
    ind_dfs_read : DataFrame to downsize.

    Returns
    -------
    Whatever ``df_shrink`` returns for the given frame.
    """
    print("Datatype Downsizing function started........")
    return df_shrink(ind_dfs_read)

## List the files in our target directory we are interested to read

In [None]:
# Walk the dataset directory, read every CSV found and keep a downsized copy of each.
filePath = []        # full path of every CSV discovered, in walk order
individual_dfs = []  # one downsized DataFrame per entry of filePath
for root, subdir, files in os.walk('D:/Datasets/CICDDoS2019/testing_03_11/'):
    for file in files:
        # Guard clause: anything that is not a CSV is ignored.
        if not file.endswith('.csv'):
            continue
        fpath = os.path.join(root, file)
        filePath.append(fpath)
        print(fpath)
        print("-----------------------------------------------------------------------")
        ind_dfs_read = csvRead(fpath)
        print("starting downsizing.....")
        individual_dfs.append(dataTypeDown(ind_dfs_read))
        print('No. of Instances in the file are:', ind_dfs_read.shape)
print('**************************')
print('Total no of files meet our criteria are: ', len(filePath))
print("Dataset Read Task Completed!!!!")

## Replace space with "_" in dataset column names

In [None]:
# Normalise the column names of every loaded frame in place:
# strip surrounding whitespace, then turn inner spaces into underscores.
for frame in individual_dfs:
    frame.columns = frame.columns.str.strip().map(lambda name: name.replace(' ', '_'))
print('Task Completed')

## Dataset summary

In [None]:
# Print a concise summary of the first loaded file.
# For the per-column listing use: individual_dfs[0].info(verbose=True)
individual_dfs[0].info()

## Dataset split into X (input) and y (Target)

In [None]:
# Split every loaded frame into its inputs (all columns except 'Label')
# and its target (the 'Label' column).
individual_X = []
individual_y = []
for frame in individual_dfs:
    # Target (output) values
    individual_y.append(frame['Label'])
    print(individual_y[-1].shape)
    # Input values: drop() already returns a new frame, so no explicit copy
    # of the full dataset is needed beforehand.
    individual_X.append(frame.drop('Label', axis=1))
    print(individual_X[-1].shape)
    print('------------------------------')

## Verify the split task

In [None]:
# Show the input column names of the first file; 'Label' was dropped in the split cell.
individual_X[0].columns

In [None]:
# Concise summary of the first input frame.
# For a detailed per-column overview run: individual_X[0].info(verbose=True)
individual_X[0].info()

## Further processing on the dataset by concatenating all files.

## Concatenating the dataset files into a single file

In [None]:
# Stack the per-file inputs and targets into one dataset each.
# ignore_index=True renumbers the rows 0..n-1 across the combined result.
all_X =  pd.concat(individual_X, ignore_index=True)
all_y = pd.concat(individual_y, ignore_index=True)
# Display both shapes as the cell output.
all_X.shape, all_y.shape

In [None]:
plt.figure(figsize=(12, 8))
# Create a count plot of the string labels
ax = sns.countplot(x=all_y)
# Annotate each bar with its count. Bar heights are floats, so format them
# without decimals ("123" rather than "123.0").
for p in ax.patches:
    ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.xlabel('Labels')
plt.ylabel('Count')
plt.title('Count Per Label in CIC-DDoS 2019 Dataset Day-1')
plt.show()


In [None]:
# Number of rows per label value across the combined dataset.
unique_value_count = all_y.value_counts()
# Display the counts as the cell output.
unique_value_count

In [None]:
# Print, for every class, its row count and its share of the whole dataset.
counter = Counter(all_y)
n_samples = len(all_y)
for cls_name, cls_count in counter.items():
    share = cls_count / n_samples * 100
    print('Class = %-15s,     n = %-10d,        (%-8.3f%%)' % (cls_name, cls_count, share))

## Dataset verification

In [None]:
# Show rows 20-29 of the combined inputs as a quick sanity check.
all_X.iloc[20:30].head(10)

## Label encoding performed on all_y dataset 

In [None]:
# Encode the string labels of the target as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(all_y)

# Original label -> integer code (the code is the label's position in classes_).
label_mapping = {original: code for code, original in enumerate(label_encoder.classes_)}

# Show the mapping and the size of the encoded target.
print("Label Mapping:")
for label, encoded_value in label_mapping.items():
    print(f"{label}: {encoded_value}")

print(f"Shape of the target values {y_encoded.shape}")

## Train - Test Split on all 77 features

In [None]:
test_size = 0.3 # fraction of rows held out for testing (70% train / 30% test)
random_state = 42 # random seed

# Shuffled, stratified split: stratify=y_encoded asks train_test_split to keep
# the class proportions of y_encoded in both parts.
X_train_v0, X_test_v0, y_train_v0, y_test_v0 = train_test_split(all_X, y_encoded, 
                                                                test_size=test_size, 
                                                                shuffle=True,
                                                                stratify=y_encoded, 
                                                                random_state=random_state)
# Display the four shapes as the cell output.
X_train_v0.shape, y_train_v0.shape, X_test_v0.shape, y_test_v0.shape

## Dataset Creation with StandardScaler-Random Undersampler-SMOTE with 77 Features

## Calculate the minimum, mean, and median number of instances per class

In [None]:
# Calculate the minimum, mean, and median number of instances per class
# in the training target. These values serve as resampling thresholds below.
class_distribution = pd.Series(y_train_v0).value_counts()
min_instances = class_distribution.min()
mean_instances = int(class_distribution.mean())      # truncated to a whole number of rows
median_instances = int(class_distribution.median())  # truncated to a whole number of rows
# Display the three thresholds as the cell output.
min_instances, mean_instances, median_instances
# sampling_strategy = {cls: mean_instances for cls in class_distribution.index}
# print(sampling_strategy)


In [None]:
def create_datasets(X, y, threshold, scaler, random_state=None):
    """Scale ``X`` and rebalance the classes of ``y`` towards ``threshold`` rows each.

    Classes with more than ``threshold`` instances are randomly under-sampled
    to ``threshold``; every other class is over-sampled with SMOTE up to
    ``threshold``. The per-class counts are derived from ``y`` itself, so the
    function does not depend on any module-level variable.

    Parameters
    ----------
    X : feature matrix; must expose ``.shape``.
    y : class labels aligned with ``X``; must expose ``.shape``.
    threshold : target number of instances per class.
    scaler : scaler object used as the first pipeline step (fitted here).
    random_state : optional seed forwarded to both samplers. The default
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    (X_resampled, y_resampled, pipeline) : the resampled data and the fitted
        pipeline, whose ``'scale'`` step can be reused to transform other data.
    """
    print('The shape of X and y is ', X.shape, y.shape)
    # Count the instances per class directly from the labels passed in.
    class_counts = pd.Series(y).value_counts()
    class_label = pd.Series(y).unique()

    print('------------------------------------------------')
    print('The class encoded labels are ', class_label)
    print('The threshold is ', threshold)
    print('The scaler is ', scaler)
    print('------------------------------------------------')
    print(f'dataset creation process started for : {threshold} instances for each Class' )

    # Classes above the threshold are under-sampled; the rest go through SMOTE.
    under_sample_classes = [label for label, count in class_counts.items() if count > threshold]
    smote_classes = [label for label, count in class_counts.items() if count <= threshold]

    print('under sampling classes are: ', under_sample_classes)
    print('SMOT Over sampling classes are: ', smote_classes)

    # Per-class target sizes for each sampler.
    rus_sampling_strategy = {label: threshold for label in under_sample_classes}
    smt_sampling_strategy = {label: threshold for label in smote_classes}

    # Scale first, then shrink the large classes, then grow the small ones.
    pipeline = Pipeline([
        ('scale', scaler),
        ('rus', RandomUnderSampler(sampling_strategy=rus_sampling_strategy,
                                   random_state=random_state)),
        ('smote', SMOTE(sampling_strategy=smt_sampling_strategy,
                        random_state=random_state))
    ], verbose=True)

    # Apply the pipeline to create a new dataset
    X_resampled, y_resampled = pipeline.fit_resample(X, y)
    print(f'dataset creation process completed for : {threshold} instances for each Class' )
    return X_resampled, y_resampled, pipeline

## Dataset Creation with Standard-Scaler Random Undersampler and SMOTE 

In [None]:
# Build the resampled training set with StandardScaler.
# Only the median-threshold variant is active; the min/mean variants are kept for reference.
# X_min_SS_77, y_min_SS_77, pipeline_min_SS_77 = create_datasets(X_train_v0, y_train_v0, min_instances, StandardScaler()) 
# X_mean_SS_77, y_mean_SS_77, pipeline_mean_SS_77 = create_datasets(X_train_v0, y_train_v0, mean_instances, StandardScaler()) 
X_median_SS_77, y_median_SS_77, pipeline_median_SS_77 = create_datasets(X_train_v0, y_train_v0, median_instances, StandardScaler()) 

In [None]:
# Display the shapes of the resampled (median threshold, StandardScaler) training set.
# X_min_SS_77.shape, y_min_SS_77.shape, X_mean_SS_77.shape, y_mean_SS_77.shape, X_median_SS_77.shape, y_median_SS_77.shape
X_median_SS_77.shape, y_median_SS_77.shape

In [None]:

# Function to visualize class distribution
def plot_class_distribution(y, title):
    """Draw a bar chart of the number of instances per class label in ``y``.

    Parameters
    ----------
    y : labels to count (anything ``pd.Series`` accepts).
    title : text used as the plot title.
    """
    plt.figure(figsize=(10, 5))
    per_class = pd.Series(y).value_counts()
    sns.barplot(x=per_class.index, y=per_class.values)
    plt.title(title)
    plt.xlabel("Class Label")
    plt.ylabel("Number of Instances")
    plt.xticks(rotation=90)
    plt.show()

# Plot the class distribution of the resampled training target.
# Only the median variant is active; the min/mean variants are kept for reference.
# plot_class_distribution(y_min_SS_77, "Class Distribution (Min Instances)")
# plot_class_distribution(y_mean_SS_77, "Class Distribution (mean Instances)")
plot_class_distribution(y_median_SS_77, "Class Distribution (Median Instances)")


## X_test dataset scaling with pipeline scaler

In [None]:
# Scale the held-out test inputs with the 'scale' step of the median pipeline.
# Only transform() is called, so the scaler is not refitted on the test data.
# X_test_min_SS_77 = pipeline_min_SS_77.named_steps['scale'].transform(X_test_v0)
# X_test_mean_SS_77 = pipeline_mean_SS_77.named_steps['scale'].transform(X_test_v0)
X_test_median_SS_77 = pipeline_median_SS_77.named_steps['scale'].transform(X_test_v0)

# X_test_min_SS_77.shape, X_test_mean_SS_77.shape, X_test_median_SS_77.shape
X_test_median_SS_77.shape

### Create Validation Set from median-training set of 20%

In [None]:

# Create a validation set (20% of the resampled training data), stratified by class
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_median_SS_77, y_median_SS_77, test_size=0.2, random_state=42, stratify=y_median_SS_77)

# Check dataset shapes
print(f"Training Data Shape: {X_train_final.shape}, {y_train_final.shape}")
print(f"Validation Data Shape: {X_val.shape}, {y_val.shape}")
print(f"Test Data Shape: {X_test_median_SS_77.shape}, {y_test_v0.shape}")


In [None]:
# Convert training data to tensors: float32 inputs, int64 (long) class indices
X_train_tensor = torch.tensor(X_train_final, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_final, dtype=torch.long)  # Classification task

# Convert validation data to tensors
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)  # split from the scaled, resampled training set
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Convert test data to tensors
X_test_tensor = torch.tensor(X_test_median_SS_77, dtype=torch.float32)  # scaled with the pipeline's fitted scaler
y_test_tensor = torch.tensor(y_test_v0, dtype=torch.long)

# Print shapes to verify
print("Training Data Shape:", X_train_tensor.shape, y_train_tensor.shape)
print("Validation Data Shape:", X_val_tensor.shape, y_val_tensor.shape)
print("Test Data Shape:", X_test_tensor.shape, y_test_tensor.shape)

In [None]:
# Create PyTorch Datasets that pair each input tensor with its label tensor
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

### Create a DataLoader
#### DataLoader helps with batching, shuffling, and parallel processing for efficient training.

In [None]:
# Define batch size
batch_size = 64  # You can change this based on your system's memory
# Use at most 4 loader workers, fewer when torch reports fewer threads
num_workers = min(4, torch.get_num_threads())  # Adjust based on your CPU
print("Number of workers:", num_workers)
# Pinned host memory only helps host-to-GPU copies, so request it only when CUDA is available
pin_memory = torch.cuda.is_available()
# Create DataLoaders; only the training loader shuffles, so validation and
# test batches keep the order of their datasets.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)


### Verify DataLoader
#### To check if the DataLoader works correctly, let's load a single batch and inspect the shapes.

In [None]:
# Fetch the first batch from the training loader and print its shapes
X_batch, y_batch = next(iter(train_loader))
print("Batch X Shape:", X_batch.shape)
print("Batch y Shape:", y_batch.shape)

# Same check for the test loader
Xtest_batch, ytest_batch = next(iter(test_loader))
print("Batch xtest Shape:", Xtest_batch.shape)
print("Batch ytest Shape:", ytest_batch.shape)


## Training and making prediction on resampled dataset

In [None]:

# Define the neural network model
class DDoSClassifier(nn.Module):
    """Fully connected classifier: input -> 128 -> 64 -> num_classes.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it softmax output would normalise
    twice. Use ``predict_proba`` when probabilities are needed; ``argmax``
    over logits and over probabilities selects the same class.

    Parameters
    ----------
    input_size : number of input features per sample.
    num_classes : number of output classes.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()

        # Fully connected layers
        self.fc1 = nn.Linear(input_size, 128)  # Input layer
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)  # Dropout for regularization

        self.fc2 = nn.Linear(128, 64)  # Hidden layer
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(64, num_classes)  # Output layer
        self.softmax = nn.Softmax(dim=1)  # Only used by predict_proba

    def forward(self, x):
        """Return the logits of shape (batch, num_classes) for input ``x``."""
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)

        # No softmax here: the loss function expects unnormalised scores.
        return self.fc3(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits) for ``x``."""
        return self.softmax(self.forward(x))

# Get input size (number of features) and number of classes
input_size = X_train_tensor.shape[1]  # Number of features
# Labels are integer class indices, so the output layer needs max index + 1 units.
# (len(set(tensor)) would count every element: tensors hash by identity, not value.)
num_classes = int(y_train_tensor.max().item()) + 1

# Create the model instance
model = DDoSClassifier(input_size, num_classes)

# Print the model summary
print(model)


In [None]:


def memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    NOTE: this definition shares its name with ``memory_usage`` imported from
    ``memory_profiler`` at the top of the file and replaces it from here on.
    """
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024 ** 2  # bytes -> MB

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, device):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Parameters
    ----------
    model : module to train; moved to ``device`` first.
    train_loader : iterable of ``(X_batch, y_batch)`` training batches.
    val_loader : iterable of ``(X_batch, y_batch)`` validation batches.
    criterion : loss called as ``criterion(model_output, y_batch)``.
    optimizer : optimizer already bound to the model's parameters.
    epochs : number of passes over ``train_loader``.
    device : device the model and every batch are moved to.

    Returns
    -------
    (train_time, train_mem_used, model, history) : wall-clock seconds, the
        change in ``memory_usage()`` over training, the trained model, and a
        dict with per-epoch lists ``train_loss``, ``train_acc``, ``val_loss``
        and ``val_acc``.
    """
    model.to(device)
    print(f" Training on device: {device}")
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    start_time = time.time()  # Track training start time
    mem_usage_before = memory_usage()  # Track memory usage before training

    for epoch in range(epochs):
        model.train()
        train_loss, correct, total = 0, 0, 0

        # Use tqdm progress bar
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

        for batches_seen, (X_batch, y_batch) in enumerate(progress_bar, start=1):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            correct += (y_pred.argmax(1) == y_batch).sum().item()
            total += y_batch.size(0)

            # Running mean over the batches processed so far, so the progress
            # bar shows a meaningful loss from the first batch on.
            progress_bar.set_postfix(loss=train_loss / batches_seen, acc=correct / total)

        train_loss /= len(train_loader)
        train_acc = correct / total
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation phase: no gradient tracking, dropout disabled by eval()
        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)
                loss = criterion(y_pred, y_batch)

                val_loss += loss.item()
                correct += (y_pred.argmax(1) == y_batch).sum().item()
                total += y_batch.size(0)

        val_acc = correct / total
        val_loss /= len(val_loader)

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
              f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    # Track time and memory usage after training
    end_time = time.time()
    mem_usage_after = memory_usage()

    train_time = end_time - start_time
    train_mem_used = mem_usage_after - mem_usage_before

    print(f"Training Completed! ")
    print(f"Training Time: {train_time:.2f} seconds")
    print(f"Memory Used: {train_mem_used:.2f} MB")

    return train_time, train_mem_used, model, history


In [None]:

def make_predictions(model, test_loader, device):
    """Predict a class index for every sample served by ``test_loader``.

    Parameters
    ----------
    model : module used for inference; switched to evaluation mode here.
    test_loader : iterable of ``(X_batch, y_batch)`` pairs; labels are ignored.
    device : device each input batch is moved to before the forward pass.

    Returns
    -------
    (predict_time, mem_used, y_pred) : wall-clock seconds, the change in
        ``memory_usage()`` over the run, and a NumPy array with the argmax
        class index of every sample, in loader order.
    """
    model.eval()  # Set to evaluation mode
    started_at = time.time()
    mem_before = memory_usage()

    batch_predictions = []
    with torch.no_grad():
        for features, _ in test_loader:
            scores = model(features.to(device))
            batch_predictions.append(scores.argmax(dim=1).cpu().numpy())

    y_pred = np.concatenate(batch_predictions, axis=0)

    predict_time = time.time() - started_at
    mem_used = memory_usage() - mem_before

    return predict_time, mem_used, y_pred


In [None]:

def make_report(y_test, y_pred, model, file_name, train_time, train_mem_used, predict_time, predict_mem_used):
    """Compute classification metrics and export them to an Excel workbook.

    The metrics are computed outside the ``try`` block, so errors in the
    inputs surface immediately. Only the Excel export is best-effort: if it
    fails, the error is printed and the metrics are still returned, so callers
    can always unpack two values.

    Parameters
    ----------
    y_test : true labels (tensor or array-like).
    y_pred : predicted labels (tensor or array-like).
    model : model whose ``str()`` is stored in the 'Model Summary' sheet.
    file_name : stem of the workbook name (``<file_name>_report.xlsx``).
    train_time, train_mem_used, predict_time, predict_mem_used : values
        written to their own sheets.

    Returns
    -------
    (report, conf_matrix) : the ``classification_report`` dict and the
        confusion matrix.
    """
    # Convert inputs to NumPy arrays if they are tensors
    if isinstance(y_test, torch.Tensor):
        y_test = y_test.cpu().numpy()
    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.cpu().numpy()

    # Classification report & Confusion matrix
    report = classification_report(y_test, y_pred, digits=5, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)

    try:
        # Convert to DataFrame
        report_df = pd.DataFrame(report).transpose()
        conf_matrix_df = pd.DataFrame(conf_matrix)

        # Create directories
        output_directory = os.path.join('DL_Model_Reports', 'MLP_pytorch_reports')
        os.makedirs(output_directory, exist_ok=True)

        output_file = os.path.join(output_directory, f'{file_name}_report.xlsx')

        # Save to Excel, one sheet per item
        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
            report_df.to_excel(writer, sheet_name='Classification Report')
            conf_matrix_df.to_excel(writer, sheet_name='Confusion Matrix')

            pd.DataFrame({'Training Time': [train_time]}).to_excel(writer, sheet_name='Training Time')
            pd.DataFrame({'Training Memory Used': [train_mem_used]}).to_excel(writer, sheet_name='Training Memory Used')
            pd.DataFrame({'Prediction Time': [predict_time]}).to_excel(writer, sheet_name='Prediction Time')
            pd.DataFrame({'Prediction Memory Used': [predict_mem_used]}).to_excel(writer, sheet_name='Prediction Memory Used')

            # Save model architecture
            pd.DataFrame({'Model Summary': [str(model)]}).to_excel(writer, sheet_name='Model Summary')

        print(f"Report saved successfully: {output_file}")
    except Exception as e:
        # Export boundary: a failed write (missing engine, locked file, ...)
        # must not discard the metrics that were already computed.
        print(f"An error occurred: {e}")

    return report, conf_matrix


In [None]:
# Training and prediction with median_dataset (StandardScaler variant)

# Define Hyperparameters
EPOCHS = 20
# BATCH_SIZE = 64   (the batch size is set where the DataLoaders are created)
LEARNING_RATE = 0.001

# Define Loss Function & Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Move model to GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train Model; history holds the per-epoch loss/accuracy lists
train_time, train_mem_used, trained_model, history = train_model(
    model, train_loader, val_loader, criterion, optimizer, EPOCHS, device)

print(f"Training time: {train_time:.2f} seconds, Memory used: {train_mem_used:.2f} MB")

# Make Predictions on the test loader
predict_time, predict_mem_used, y_pred_test = make_predictions(
    trained_model, test_loader, device)

print(f"Prediction time: {predict_time:.2f} seconds, Memory used: {predict_mem_used:.2f} MB")

# Extract true labels from test_loader (created with shuffle=False, so the
# order matches the predictions)
y_true_test = np.concatenate([y_batch.numpy() for _, y_batch in test_loader], axis=0)

# Generate Classification Report and save it under this file stem
file_name = "MLP_model_median_ss_77"
report, confusion_mat = make_report(y_true_test, y_pred_test, trained_model, file_name, 
                                    train_time, train_mem_used, predict_time, predict_mem_used)

print("Model evaluation completed!")


In [None]:
# import matplotlib.pyplot as plt

# Pull the per-epoch curves out of the history dict returned by train_model
train_loss, val_loss = history['train_loss'], history['val_loss']
train_acc, val_acc = history['train_acc'], history['val_acc']

# One figure, two side-by-side panels: loss on the left, accuracy on the right
plt.figure(figsize=(10, 4))
panels = [
    (train_loss, val_loss, 'Loss'),
    (train_acc, val_acc, 'Accuracy'),
]
for position, (train_curve, val_curve, metric) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(train_curve, label=f'Training {metric}')
    plt.plot(val_curve, label=f'Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.title(f'{metric} Curve')
    plt.legend()

plt.show()


In [None]:
# Plot loss. `history` is the plain dict returned by train_model, so the
# curves are read by key.
plt.plot(history['train_loss'], label='Training Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training vs Validation Loss')
plt.show()


In [None]:
# Generate the confusion matrix

# Convert numerical predictions back to original labels
y_test_labels = label_encoder.inverse_transform(y_test_v0)  # Convert y_test to original labels
# y_pred_test holds the StandardScaler model's predictions from make_predictions
y_pred_labels = label_encoder.inverse_transform(y_pred_test)  # Convert predictions to original labels

# Pin the row/column order to label_encoder.classes_ so the matrix always
# lines up with the tick labels, even if a class never appears.
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=label_encoder.classes_)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.xticks(rotation=90)  # Rotate labels if they are long
plt.yticks(rotation=0)
plt.show()


## Dataset Creation with MinMax-Random Undersampler-SMOTE 

In [None]:
# Build the resampled training set with MinMaxScaler.
# Only the median-threshold variant is active; the min/mean variants are kept for reference.
# X_min_mm, y_min_mm, pipeline_min_mm = create_datasets(X_train_v1, y_train_v1, min_instances, MinMaxScaler())
# X_mean_mm_77, y_mean_mm_77, pipeline_mean_mm_77 = create_datasets(X_train_v0, y_train_v0, mean_instances, MinMaxScaler()) 
X_median_mm_77, y_median_mm_77, pipeline_median_mm_77 = create_datasets(X_train_v0, y_train_v0, median_instances, MinMaxScaler()) 

In [None]:
# Display the shapes of the resampled (median threshold, MinMaxScaler) training set.
X_median_mm_77.shape, y_median_mm_77.shape

In [None]:

# Function to visualize class distribution
def plot_class_distribution(y, title):
    """Show a bar chart with the instance count of each class label in ``y``.

    Parameters
    ----------
    y : labels to count (anything ``pd.Series`` accepts).
    title : text used as the plot title.
    """
    plt.figure(figsize=(10, 5))
    label_counts = pd.Series(y).value_counts()
    sns.barplot(x=label_counts.index, y=label_counts.values)
    plt.title(title)
    plt.xlabel("Class Label")
    plt.ylabel("Number of Instances")
    plt.xticks(rotation=90)
    plt.show()

# Plot the class distribution of the MinMax-scaled, median-threshold training target
plot_class_distribution(y_median_mm_77, "Class Distribution (Median Instances)")


## X_test dataset scaling with pipeline scaler

In [None]:
# Scale the held-out test inputs with the 'scale' step of the MinMax median pipeline.
# Only transform() is called, so the scaler is not refitted on the test data.
# X_test_mean_mm_77 = pipeline_mean_mm_77.named_steps['scale'].transform(X_test_v0)
X_test_median_mm_77 = pipeline_median_mm_77.named_steps['scale'].transform(X_test_v0)

# Display the shape of the scaled test inputs
X_test_median_mm_77.shape

## Create Validation Set (20%) from the median training set

In [None]:

# Create a validation set (20% of the MinMax-scaled resampled training data), stratified by class
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_median_mm_77, y_median_mm_77, test_size=0.2, random_state=42, stratify=y_median_mm_77)

# Check dataset shapes
print(f"Training Data Shape: {X_train_final.shape}, {y_train_final.shape}")
print(f"Validation Data Shape: {X_val.shape}, {y_val.shape}")
print(f"Test Data Shape: {X_test_median_mm_77.shape}, {y_test_v0.shape}")


## Training and making prediction on resampled dataset

In [None]:
# Training and prediction with the MinMax-scaled median dataset

# train_model / make_predictions work on DataLoaders, so wrap the MinMax
# splits first. Batch size 32 and 20 epochs as intended for this run.
mm_batch_size = 32
mm_train_loader = DataLoader(
    TensorDataset(torch.tensor(np.asarray(X_train_final), dtype=torch.float32),
                  torch.tensor(np.asarray(y_train_final), dtype=torch.long)),
    batch_size=mm_batch_size, shuffle=True)
mm_val_loader = DataLoader(
    TensorDataset(torch.tensor(np.asarray(X_val), dtype=torch.float32),
                  torch.tensor(np.asarray(y_val), dtype=torch.long)),
    batch_size=mm_batch_size, shuffle=False)
# shuffle=False keeps the predictions aligned with y_test_v0
mm_test_loader = DataLoader(
    TensorDataset(torch.tensor(np.asarray(X_test_median_mm_77), dtype=torch.float32),
                  torch.tensor(np.asarray(y_test_v0), dtype=torch.long)),
    batch_size=mm_batch_size, shuffle=False)

# Start from a fresh, untrained network so this run does not continue from
# the weights of the StandardScaler experiment.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_mm_77 = DDoSClassifier(np.asarray(X_train_final).shape[1], int(np.max(y_train_final)) + 1)
criterion_mm = torch.nn.CrossEntropyLoss()
optimizer_mm = torch.optim.Adam(model_mm_77.parameters(), lr=LEARNING_RATE)

# Train the model
train_time_median, train_mem_used_median, trained_model_mm_77, history = train_model(
    model_mm_77, mm_train_loader, mm_val_loader, criterion_mm, optimizer_mm, 20, device)

print(f"Training time: {train_time_median} seconds, Memory used: {train_mem_used_median} MB")

# Make predictions
predict_time_median, predict_mem_used_median, y_pred_median_mm_77 = make_predictions(
    trained_model_mm_77, mm_test_loader, device)

print(f"Prediction time: {predict_time_median} seconds, Memory used: {predict_mem_used_median} MB")

# Generate Classification Report from this run's own predictions
report_median_mm_77, confusion_matrix_median_mm_77 = make_report(
    y_test_v0, y_pred_median_mm_77, trained_model_mm_77, 'MLP_model_median_mm_77', 
    train_time_median, train_mem_used_median, predict_time_median, predict_mem_used_median)

print(report_median_mm_77)

In [None]:

# Pull the per-epoch curves out of the history dict returned by train_model
train_loss, val_loss = history['train_loss'], history['val_loss']
train_acc, val_acc = history['train_acc'], history['val_acc']

# One figure, two side-by-side panels: loss on the left, accuracy on the right
plt.figure(figsize=(10, 4))
curve_sets = (
    ('Loss', train_loss, val_loss),
    ('Accuracy', train_acc, val_acc),
)
for slot, (metric, train_curve, val_curve) in enumerate(curve_sets, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(train_curve, label=f'Training {metric}')
    plt.plot(val_curve, label=f'Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.title(f'{metric} Curve')
    plt.legend()

plt.show()



In [None]:
# Plot loss. `history` is the plain dict returned by train_model, so the
# curves are read by key.
plt.plot(history['train_loss'], label='Training Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training vs Validation Loss')
plt.show()


In [None]:
# Generate the confusion matrix

# Convert numerical predictions back to original labels
y_test_labels = label_encoder.inverse_transform(y_test_v0)  # Convert y_test to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred_median_mm_77)  # Convert predictions to original labels

# Pin the row/column order to label_encoder.classes_ so the matrix always
# lines up with the tick labels, even if a class never appears.
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=label_encoder.classes_)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.xticks(rotation=90)  # Rotate labels if they are long
plt.yticks(rotation=0)
plt.show()
