In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import Can_Algorithms as alg
import numpy as nmp
import logging as log
import datetime
import os
import umap
import seaborn as sns  # for visualizing the confusion matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from scipy.interpolate import interp1d
from sklearn.metrics import confusion_matrix
from sklearn.manifold import TSNE
from appconfig import Config
from sklearn.feature_selection import RFE
import itertools
from tabulate import tabulate

In [None]:
# Load the settings for the selected environment ('dev' or 'prod').
appconfig = Config('prod')
log_folder = appconfig.getconfig_from_key("log_folder")
log_filename = appconfig.getconfig_from_key("log_filename")
datasets = appconfig.getconfig_from_key("datasets")
train_models = appconfig.getconfig_from_key("train_models")

# Send INFO-and-above log records to <log_folder>/<log_filename>,
# creating the folder first if it does not exist yet.
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, log_filename)
log.basicConfig(
    filename=log_file,
    level=log.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [None]:
def label_encode(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each column whose dtype compares equal to ``'object'`` is overwritten with
    the integer codes produced by a fresh ``LabelEncoder`` fitted on that
    column. Columns of any other dtype are left untouched. Nothing is
    returned; the caller's DataFrame is modified directly.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = LabelEncoder().fit_transform(df[name])

#Function to convert hex string to integer
def hex_to_int(x):
    """Parse a hexadecimal string into an integer.

    Non-string inputs are handed back unchanged. Strings that ``int(x, 16)``
    rejects yield ``nmp.nan`` instead of raising.
    """
    if not isinstance(x, str):
        return x
    try:
        parsed = int(x, 16)
    except ValueError:
        parsed = nmp.nan
    return parsed

def getDate():
    """Return the current local time as ``(datetime, 'YYYY-MM-DD HH:MM:SS' text)``."""
    now = datetime.datetime.now()
    return now, now.strftime("%Y-%m-%d %H:%M:%S")

def _print_dataset_summary(ds):
    """Print schema, object-column statistics, shape, null counts and duplicate count of ``ds``."""
    print("---Dataset Info---")
    # DataFrame.info() prints by itself and returns None; wrapping it in
    # print() used to emit a stray "None" line.
    ds.info()
    print("---Dataset Objects---")
    print(ds.describe(include='object'))
    print("---Dataset Shape---")
    print(ds.shape)
    print("---Missing values---")
    null_counts = ds.isnull().sum()
    print(null_counts)

    # Per-column breakdown, only for columns that actually contain nulls.
    total = ds.shape[0]
    for col, null_count in null_counts[null_counts > 0].items():
        per = (null_count / total) * 100
        print(f"{col}: {null_count} ({round(per, 3)}%)")

    print(f"Number of duplicate rows: {ds.duplicated().sum()}")


def _select_features(X, y, n_features, random_state):
    """Return the names of the ``n_features`` columns kept by RFE over a random forest."""
    # Seed the forest so the selected feature set is reproducible between runs.
    selector = RFE(RandomForestClassifier(random_state=random_state),
                   n_features_to_select=n_features)
    selector.fit(X, y)
    # get_support() is a boolean mask aligned with X.columns.
    return list(X.columns[selector.get_support()])


def PreprocessData(data, testSize=0.2, n_features=5, random_state=42):
    """Load a header-less CSV, select features and return an imputed train/test split.

    Steps, in order: read ``data`` with ``pd.read_csv(header=None)``; assign the
    column names from the ``dataset_columns`` config key; print a summary and
    draw a count plot of the ``Flag`` column; label-encode object columns; drop
    ``Timestamp``; use ``Flag`` as the target; keep ``n_features`` columns
    chosen by RFE; split; mean-impute missing values (imputer fitted on the
    training split only).

    Args:
        data: Path or buffer accepted by ``pd.read_csv``.
        testSize: Fraction of rows placed in the test split.
        n_features: Number of features RFE keeps (previously hard-coded to 5).
        random_state: Seed used for both the RFE forest and the split
            (the split was already seeded with 42).

    Returns:
        ``(X_train_imputed, X_test_imputed, y_train, y_test)`` where the X
        values are NumPy arrays and the y values are pandas Series.
    """
    ds = pd.read_csv(data, header=None)
    # Assign column names
    ds.columns = appconfig.getconfig_from_key("dataset_columns")

    _print_dataset_summary(ds)

    # Plot the class counts of the 'Flag' column
    sns.countplot(x=ds['Flag'])

    print('Class distribution Training set:')
    print(ds['Flag'].value_counts())

    # Label-encode object columns in place
    label_encode(ds)

    ds = ds.drop(columns=['Timestamp'])
    X = ds.drop(columns=['Flag'])
    y = ds['Flag']

    # NOTE(review): feature selection is fitted on the full dataset, i.e.
    # before the train/test split below - confirm this is acceptable.
    selected_features = _select_features(X, y, n_features, random_state)

    print("-----------Selected features----------")
    print(selected_features)

    X = X[selected_features]  # Keep only the selected features for training

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=testSize, random_state=random_state)

    # NOTE(review): the standardised matrix is only printed; the arrays
    # returned below are NOT scaled. Kept as-is so model inputs do not change.
    print("X Train")
    print(StandardScaler().fit_transform(X))

    # Impute missing values with the column mean learned from the training split
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(x_train)
    X_test_imputed = imputer.transform(x_test)
    return X_train_imputed, X_test_imputed, y_train, y_test
       

def PlotBarGraph(models,metrics,values,title,x_label,y_label):
    """Draw a grouped bar chart with one group per model and one bar per metric.

    Args:
        models: Sequence of tick labels, one per group.
        metrics: Sequence of legend labels, one per bar within a group.
        values: ``values[i]`` holds the bar heights of ``metrics[i]``, one per model.
        title: Figure title.
        x_label: X-axis label.
        y_label: Y-axis label.
    """
    plt.figure(figsize=(5,3))

    n_metrics = len(metrics)
    # 0.2 is the original width (exactly 0.8 / 4). With more than four metrics
    # the bars shrink so a group never spills into the neighbouring group.
    bar_width = min(0.2, 0.8 / n_metrics) if n_metrics else 0.2
    index = nmp.arange(len(models))

    # One bar series per metric, shifted right by its position in the group
    for i, metric in enumerate(metrics):
        plt.bar(index + i * bar_width, values[i], bar_width, label=metric)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # Centre each tick under its group for any number of metrics
    # (for four metrics this is the previous fixed offset of 1.5 bar widths).
    plt.xticks(index + bar_width * max(n_metrics - 1, 0) / 2, models)
    plt.legend()
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

def PlotLineGraph(models,metrics,values,title,x_label,y_label):
    """Plot one smoothed line per metric across the models.

    Args:
        models: Sequence of tick labels, one per x position.
        metrics: Sequence of legend labels, one per line.
        values: ``values[i]`` holds the y values of ``metrics[i]``, one per model.
        title: Figure title.
        x_label: X-axis label.
        y_label: Y-axis label.
    """
    n_models = len(models)
    x_known = nmp.arange(n_models)
    # A spline of order k needs at least k + 1 points, so 'cubic' fails with
    # fewer than four models. Use the highest order the data supports.
    kind = {2: 'linear', 3: 'quadratic'}.get(n_models, 'cubic')

    for i, metric in enumerate(metrics):
        if n_models < 2:
            # Nothing to interpolate between: draw the raw point(s) instead.
            plt.plot(x_known, values[i], marker='o', label=metric)
            continue
        f = interp1d(x_known, values[i], kind=kind)
        x_new = nmp.linspace(0, n_models - 1, 100)
        y_new = f(x_new)
        plt.plot(x_new, y_new, label=metric)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.xticks(x_known, models)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_true, y_pred, title=None, cmap=plt.cm.Blues):
    """
    Print the 2x2 confusion matrix for labels 0 and 1 and show it as an annotated heatmap.

    Label 0 is displayed as 'Normal' and label 1 as 'Attack'. A falsy ``title``
    falls back to 'Confusion matrix'.
    """
    heading = title if title else 'Confusion matrix'
    class_names = ['Normal', 'Attack']

    # Fix the label order so rows/columns are always [0, 1] -> [Normal, Attack]
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print('Confusion matrix, without normalization')
    print(matrix)

    _, axes = plt.subplots()
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap=cmap,
        ax=axes,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    axes.set(xlabel='Predicted labels', ylabel='True labels', title=heading)
    plt.show()

def plot_attack_scatter(y_true, y_pred):
    """
    Scatter the actual and the predicted labels against the sample index on one figure.

    Actual values are drawn as blue circles, predictions as red crosses.
    """
    plt.figure(figsize=(10, 6))
    sample_positions = nmp.arange(len(y_true))  # x-axis: 0 .. n-1

    # (labels, colour, marker, legend entry) - actual first, then predicted
    layers = (
        (y_true, 'blue', 'o', 'Actual'),
        (y_pred, 'red', 'x', 'Predicted'),
    )
    for labels, colour, symbol, legend_name in layers:
        plt.scatter(sample_positions, labels, color=colour, alpha=0.5,
                    marker=symbol, label=legend_name)

    plt.title('Scatter Plot of Actual vs. Predicted Attacks/Normal Traffic')
    plt.xlabel('Sample Index')
    plt.ylabel('Attack Status (1 for Attack, 0 for Normal)')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_tsne(X, y_true, y_pred, perplexity=30, n_components=2, learning_rate=200):
    """
    Function to plot t-SNE with actual and predicted labels.
    Args:
    - X: Feature matrix.
    - y_true: Actual labels (list, array or Series; flattened to 1-D).
    - y_pred: Predicted labels (list, array or Series; flattened to 1-D).
    - perplexity: t-SNE perplexity parameter.
    - n_components: Number of dimensions t-SNE should reduce to
      (only the first two are plotted).
    - learning_rate: t-SNE learning rate.
    """
    # Flatten labels to 1-D NumPy arrays so `labels == cls` is always an
    # element-wise, positional mask. A plain list would compare to a single
    # bool and a column vector would give a 2-D mask.
    y_true = nmp.asarray(y_true).ravel()
    y_pred = nmp.asarray(y_pred).ravel()

    # Initialize t-SNE
    tsne = TSNE(n_components=n_components, perplexity=perplexity, learning_rate=learning_rate, random_state=42)

    # Fit and transform the data
    X_tsne = tsne.fit_transform(X)

    # Plotting the results: left panel = actual labels, right panel = predicted labels
    plt.figure(figsize=(12, 6))
    panels = ((1, y_true, 'Actual'), (2, y_pred, 'Predicted'))
    for position, labels, kind in panels:
        plt.subplot(1, 2, position)
        for cls in nmp.unique(labels):
            mask = labels == cls
            plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1], label=f'{kind} {cls}', alpha=0.5)
        plt.title(f't-SNE based on {kind} Labels')
        plt.legend()

    plt.show()

def plot_umap(X, y_true, y_pred, n_neighbors=15, min_dist=0.1, n_components=2):
    """
    Function to plot UMAP with actual and predicted labels.
    Args:
    - X: Feature matrix.
    - y_true: Actual labels (list, array or Series; flattened to 1-D).
    - y_pred: Predicted labels (list, array or Series; flattened to 1-D).
    - n_neighbors: The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
    - min_dist: The minimum distance apart that points are allowed to be in the low-dimensional representation.
    - n_components: Number of dimensions UMAP should reduce to
      (only the first two are plotted).
    """
    # Flatten labels to 1-D NumPy arrays so `labels == cls` is always an
    # element-wise, positional mask. A plain list would compare to a single
    # bool and a column vector would give a 2-D mask.
    y_true = nmp.asarray(y_true).ravel()
    y_pred = nmp.asarray(y_pred).ravel()

    # Initialize UMAP
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=42)

    # Fit and transform the data
    X_umap = reducer.fit_transform(X)

    # Plotting the results: left panel = actual labels, right panel = predicted labels
    plt.figure(figsize=(12, 6))
    panels = ((1, y_true, 'Actual'), (2, y_pred, 'Predicted'))
    for position, labels, kind in panels:
        plt.subplot(1, 2, position)
        for cls in nmp.unique(labels):
            mask = labels == cls
            plt.scatter(X_umap[mask, 0], X_umap[mask, 1], label=f'{kind} {cls}', alpha=0.5)
        plt.title(f'UMAP based on {kind} Labels')
        plt.legend()

    plt.show()

In [None]:
# Supported model keys -> (function name in Can_Algorithms,
#                          label used in log messages,
#                          label used in the score table).
# The function is looked up lazily with getattr so that only the models
# actually listed in `train_models` need to exist in the module.
MODEL_REGISTRY = {
    'SVC': ('SVC_Scan', 'SVC', 'Support Vector Classifier'),
    'LOGR': ('Logistic_regression_Scan', 'Logistic Regression', 'Logistic Regression'),
    'MLP': ('MLP_Scan', 'MLP', 'MLP Classifier'),
    'SGD': ('SGD_Scan', 'SGD', 'SGD Classifier'),
    'LRG': ('Linear_regression_Scan', 'Linear regression', 'Linear Regression'),
    'CNN': ('CNN_Scan', 'CNN', 'CNN model'),
}

for ds_name in datasets:
    log.info("#####################################")
    log.info("Running dataset: [%s]", ds_name)
    log.info("#####################################")

    # Per-dataset accumulators, one entry per evaluated model
    precision = []
    recall = []
    f1_score = []
    support = []
    data = []              # rows of the train/test score table
    evaluated_models = []  # keys of the models that actually ran (x-axis of the chart)

    dataPath = appconfig.getconfig_from_keys(["data_path", ds_name])
    print(f"Datapath: {dataPath}")
    # Load Data
    x_train, x_test, y_train, y_test = PreprocessData(dataPath, 0.2)

    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    print("x train")

    print(x_train)

    total_samples = len(y_test)
    print(f"Total Samples: {total_samples}")

    # Run and evaluate each model
    for model_name in train_models:
        print('-------------------------------')
        print('Model: ', model_name)
        print('-------------------------------')

        if model_name not in MODEL_REGISTRY:
            # Previously an unknown key fell through the if/elif chain and the
            # report below used an undefined or stale `y_pred`.
            log.warning("Unknown model key [%s]; skipping", model_name)
            print(f"Unknown model key '{model_name}', skipping")
            continue

        scan_name, log_label, table_label = MODEL_REGISTRY[model_name]
        scan = getattr(alg, scan_name)

        start_ts, formatted_start_ts = getDate()
        log.info("Starting %s model...[%s]", log_label, formatted_start_ts)
        y_pred, training_score, testing_score = scan(x_train, y_train, x_test, y_test)
        end_ts, _ = getDate()
        data.append([table_label, training_score, testing_score])
        log.info('%s model finished. Elapsed time: %s', log_label, end_ts - start_ts)

        print(f"--Report for {model_name}--")
        print(classification_report(y_test, y_pred))

        precision_score, recall_score, f1_score_val, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted', zero_division=1)

        # Fraction of predictions equal to class 0. Computed with a mask so a
        # model that never predicts 0 yields 0.0 instead of a KeyError, and a
        # column-vector prediction is accepted.
        # NOTE(review): this is reported as "Support" but is not sklearn's
        # support (per-class sample count) - confirm the intended metric.
        support_score = nmp.mean(nmp.asarray(y_pred).ravel() == 0)

        evaluated_models.append(model_name)
        precision.append(precision_score)
        recall.append(recall_score)
        f1_score.append(f1_score_val)
        support.append(support_score)

        print('Precision:', precision_score)
        print('Recall:', recall_score)
        print('f1 score:', f1_score_val)
        print('Support:', support_score)

    col_names = ["Model", "Train Score", "Test Score"]
    print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

    # Define the metrics to plot
    metrics = ['Precision', 'Recall', 'F1 Score', 'Support']
    values = [precision, recall, f1_score, support]

    print(values)
    # Use the models that actually ran so labels and bar counts always match.
    PlotBarGraph(evaluated_models, metrics, values, 'Metrics by dataset - ' + ds_name, 'Models', 'Score')
        