In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
import random

In [131]:
def initialize_population(num_individuals, num_features):
    print("initialize_population method has started")
    population=[random.sample(range(num_features), k=random.randint(1, num_features)) for _ in range(num_individuals)]
    print("initialize_population method has ended")
    return population

In [132]:
def fitness_function(features, X_train, y_train, X_val, y_val, generation, num_generations):
    print("fitness_function method has started")
    if len(features)==0:
        print("No features were selected. Thus, returning fitness=0")
        return 0

    try:
        print(f"Features selected for computing the fitness: {features}")
        X_train_selected=X_train[:, features]
        X_val_selected=X_val[:, features]
        model=KNeighborsClassifier()
        model.fit(X_train_selected, y_train)
        y_pred=model.predict(X_val_selected)
        recall=recall_score(y_val, y_pred, average='macro')
        penalty=(len(features)/X_train.shape[1])*(generation/num_generations)*2
        print("generation: ",generation)
        print("num_generations: ",num_generations)
        print("len(features): ",len(features))
        print("X_train.shape[1]",X_train.shape[1])
        adjusted_recall=recall-penalty
        print(f"Recall for selected features: {recall}, Adjusted recall for selected features: {adjusted_recall}")
        print("fitness_function method has ended")
        return adjusted_recall
    except Exception as e:
        print(f"Error in fitness_function with features {features}: {e}")
        print("fitness_function method has ended")
        return 0

In [133]:
def employed_bees_phase(population, fitness_values, X_train, y_train, X_val, y_val, generation, num_generations):
    print("employed_bees_phase method has started")
    global trial_counters
    new_population=[]
    for i, individual in enumerate(population):
        new_individual=list(individual)
        #Generating a new solution using partner solution
        print("Generating a new solution using partner solution.")
        if len(individual)>0:
            k=random.randint(0, len(individual)-1)
            new_individual[k]=random.choice(range(len(individual)))
            new_fitness=fitness_function(new_individual, X_train, y_train, X_val, y_val, generation, num_generations)
            print("new_fitness: ",new_fitness)
            print("fitness_values[i]: ",fitness_values[i])
            if new_fitness>fitness_values[i]:
                new_population.append(new_individual)
                print("i :",i,", trial_counters[i]: ",trial_counters[i])
                print("trial_counters: ",trial_counters)
                trial_counters[i]=0
            else:
                new_population.append(individual)
                print("i :",i,", trial_counters[i]: ",trial_counters[i])
                print("trial_counters: ",trial_counters)
                trial_counters[i]=trial_counters[i]+1
        #Ensusing population is not empty
        if len(new_population)==0:
            new_population=population
        print("employed_bees_phase method has ended")
        return new_population

In [134]:
def onlooker_bees_phase(population, fitness_values, X_train, y_train, X_val, y_val, generation, num_generations):
    print("onlooker_bees_phase method has started")
    global trial_counters
    new_population=[]
    for i, individual in enumerate(population):
        if random.random()<fitness_values[i]/sum(fitness_values):
            new_individual=list(individual)
            if len(individual)>0:
                k=random.randint(0, len(individual)-1)
                new_individual[k]=random.choice(range(len(individual)))
                new_fitness=fitness_function(new_individual, X_train, y_train, X_val, y_val, generation, num_generations)
                if new_fitness>fitness_values[i]:
                    new_population.append(new_individual)
                    print("i :",i,", trial_counters[i]: ",trial_counters[i])
                    print("trial_counters: ",trial_counters)
                    trial_counters[i]=0
                else:
                    new_population.append(individual)
                    print("i :",i,", trial_counters[i]: ",trial_counters[i])
                    print("trial_counters: ",trial_counters)
                    trial_counters[i]=trial_counters[i]+1
    if len(new_population)==0:
        new_population=population
    print("onlooker_bees_phase method has ended")
    return new_population

In [135]:
'''
def scout_bees_phase(population, trial_counters, limit, num_features):
    print("scout_bees_phase method has started")
    for i, trials in enumerate(trial_counters):
        if trials>limit:
            population[i]=random.sample(range(num_features), k=random.randint(1, num_features))
            print("Resetting trial counter for new individual to 0")
            trial_counters[i]=0
    print("scout_bees_phase method has ended")
    return population
'''

'\ndef scout_bees_phase(population, trial_counters, limit, num_features):\n    print("scout_bees_phase method has started")\n    for i, trials in enumerate(trial_counters):\n        if trials>limit:\n            population[i]=random.sample(range(num_features), k=random.randint(1, num_features))\n            print("Resetting trial counter for new individual to 0")\n            trial_counters[i]=0\n    print("scout_bees_phase method has ended")\n    return population\n'

In [136]:
def scout_bees_phase(population, limit, num_features):
    print("scout_bees_phase method has started")
    global trial_counters
    print("trial_counters before for loop: ",trial_counters)
    for i, trials in enumerate(trial_counters):
        if trials>limit:
            population[i]=random.sample(range(num_features), k=random.randint(1, num_features))
            print("Resetting trial counter for new individual to 0")
            trial_counters[i]=0
    print("trial_counters after for loop: ",trial_counters)
    print("scout_bees_phase method has ended")
    return population

In [137]:
'''
def update_trial_counters(population, new_population, trial_counters):
    print("update_trial_counters method has started")
    
    for i, individual in enumerate(population):
        if new_population[i]==individual:
            print("Incrementing trial counter since there is no improvement")
            trial_counters[i]=trial_counters[i]+1
        else:
            print("Resetting trial counter to 0 since there is improvement")
            trial_counters[i]=0
    
    print("population: ",population)
    print("type(population): ",type(population))
    print("new_population: ",new_population)
    print("type(new_population): ",type(new_population))
    print("trial_counters: ",trial_counters)
    print("type(trial_counters): ",type(trial_counters))
    print("update_trial_counters method has ended")
    return trial_counters
'''

'\ndef update_trial_counters(population, new_population, trial_counters):\n    print("update_trial_counters method has started")\n    \n    for i, individual in enumerate(population):\n        if new_population[i]==individual:\n            print("Incrementing trial counter since there is no improvement")\n            trial_counters[i]=trial_counters[i]+1\n        else:\n            print("Resetting trial counter to 0 since there is improvement")\n            trial_counters[i]=0\n    \n    print("population: ",population)\n    print("type(population): ",type(population))\n    print("new_population: ",new_population)\n    print("type(new_population): ",type(new_population))\n    print("trial_counters: ",trial_counters)\n    print("type(trial_counters): ",type(trial_counters))\n    print("update_trial_counters method has ended")\n    return trial_counters\n'

In [138]:
def abc_feature_selection(X, y, num_generations=10, num_individuals=20, num_best=5, limit=5):
    print("abc_feature_selection method has started")
    num_features=X.shape[1]
    
    #Getting the initial population
    population=initialize_population(num_individuals, num_features)
    print("Initial population: \n",population) 
    
    #Initializing the trial counters
    global trial_counters
    trial_counters=[0]*num_individuals
    print("Trial counters after initialization: \n",trial_counters)

    #Splitting the dataset into training and validation set
    X_train, X_val, y_train, y_val=train_test_split(X, y, test_size=0.2, random_state=42)

    for generation in range(num_generations):
        print("Generation number: ", generation)
        fitness_values=[]

        for individual in population:
            try:
                fitness=fitness_function(individual, X_train, y_train, X_val, y_val, generation, num_generations)
                fitness_values.append(fitness)
            except Exception as e:
                print(f"Error calculating fitness for individual {individual}: {e}")

        print("Fitness values are: ",fitness_values)

        if len(fitness_values)==0:
            print("Fitness values list is empty")
            break

        #Employed Bees Phase
        old_population=population.copy()
        population=employed_bees_phase(population, fitness_values, X_train, y_train, X_val, y_val, generation, num_generations)
        #trial_counters=update_trial_counters(old_population, population, trial_counters)
        print("Population after employed bees phase: ", population)
        
        #Onlooker Bees Phase
        old_population=population.copy()
        population=onlooker_bees_phase(population, fitness_values, X_train, y_train, X_val, y_val, generation, num_generations)
        #trial_counters=update_trial_counters(old_population, population, trial_counters)
        print("Population after onlooker bees phase: ", population)
        
        #Scout Bees Phase
        #population=scout_bees_phase(population, trial_counters, limit, num_features)
        population=scout_bees_phase(population, limit, num_features)
        print("Population after scout bees phase: ", population)

    print("abc_feature_selection method has ended")
    #Returning the best feature subset fetched
    return max(population, key=lambda x: fitness_function(x, X_train, y_train, X_val, y_val, generation, num_generations))

In [139]:
#Reading the processed dataset
cic_df=pd.read_parquet("processed_dataset.parquet")

In [140]:
#Fetching the first 5 rows of the dataset
cic_df.head()

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Fwd_Packets_Length_Total,Bwd_Packets_Length_Total,Fwd_Packet_Length_Max,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,...,Avg_Bwd_Segment_Size,Subflow_Fwd_Packets,Subflow_Fwd_Bytes,Subflow_Bwd_Packets,Subflow_Bwd_Bytes,Init_Fwd_Win_Bytes,Fwd_Act_Data_Packets,Fwd_Seg_Size_Min,isMalicious,attack_id
5968290,3813760.0,5.0,3.0,935.0,397.0,935.0,187.0,418.144714,397.0,132.333328,...,132.333328,5.0,935.0,3.0,397.0,219.0,1.0,32.0,0,0
8285216,396839.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,63326.0,0.0,20.0,0,0
8349977,1914354.0,8.0,7.0,1144.0,1581.0,677.0,143.0,227.969925,1173.0,225.857147,...,225.857147,8.0,1144.0,7.0,1581.0,8192.0,5.0,20.0,0,0
7180832,4002.0,6.0,0.0,2064.0,0.0,440.0,44.0,148.722565,0.0,0.0,...,0.0,6.0,2064.0,0.0,0.0,8192.0,5.0,8.0,1,3
2324438,5368715.0,8.0,6.0,355.0,232.0,198.0,44.375,75.864426,1460.0,108.0,...,108.0,8.0,355.0,6.0,232.0,8192.0,3.0,20.0,0,0


In [141]:
#Computing shape of the dataset
cic_df.shape

(1734343, 47)

In [142]:
#We shall perform feature selection and model training for multi-class classification.
#Thus, we will drop the feature: isMalicious
cic_df=cic_df.drop('isMalicious', axis=1)
cic_df.shape

(1734343, 46)

In [143]:
#Fetching distribution of different attack ids in the dataset
cic_df['attack_id'].value_counts()

attack_id
0    1437467
3     246982
1      29348
2      20546
Name: count, dtype: int64

Due to imbalanced nature of the dataset, we will perform Undersampling. 

Advantages: -
1. This will help us avoid creating duplication.
2. It will ensure all records have equal number of coounts, thus, reducing bias.

Disadvantages: -
1. It will lead to loss of data from the training dataset.

In [144]:
attack_id_counts=cic_df['attack_id'].value_counts()
target_count=attack_id_counts.min()
print("Minimum count of attack_id: ", target_count)

Minimum count of attack_id:  20546


In [145]:
#Performing undersampling on the dataset
undersampled_cic_df=pd.concat([
    cic_df[cic_df['attack_id']==0].sample(target_count, replace=False),
    cic_df[cic_df['attack_id']==1].sample(target_count, replace=False),
    cic_df[cic_df['attack_id']==2],
    cic_df[cic_df['attack_id']==3].sample(target_count, replace=False)], axis=0)

#Shuffling the dataset after undersampling
undersampled_cic_df=undersampled_cic_df.sample(frac=1).reset_index(drop=True)

In [146]:
#Fetching distribution of attack_id after performing undersampling
undersampled_cic_df['attack_id'].value_counts()

attack_id
0    20546
1    20546
2    20546
3    20546
Name: count, dtype: int64

In [147]:
#Dividing the dataset into Independent and Dependent features
X=undersampled_cic_df.iloc[:, 0:-1]
y=undersampled_cic_df.iloc[:, -1]

In [148]:
X

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Fwd_Packets_Length_Total,Bwd_Packets_Length_Total,Fwd_Packet_Length_Max,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,...,Avg_Packet_Size,Avg_Fwd_Segment_Size,Avg_Bwd_Segment_Size,Subflow_Fwd_Packets,Subflow_Fwd_Bytes,Subflow_Bwd_Packets,Subflow_Bwd_Bytes,Init_Fwd_Win_Bytes,Fwd_Act_Data_Packets,Fwd_Seg_Size_Min
0,396839.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.00000,2.0,0.0,0.0,0.0,62559.0,0.0,20.0
1,10053.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
2,363942.0,3.0,2.0,1944.0,232.0,640.0,88.363640,137.869003,976.0,133.25000,...,109.738098,88.363640,133.25000,3.0,1944.0,2.0,232.0,26883.0,1.0,32.0
3,395609.0,3.0,2.0,1976.0,232.0,640.0,79.040001,132.510529,976.0,133.25000,...,103.133331,79.040001,133.25000,3.0,1976.0,2.0,232.0,26883.0,1.0,32.0
4,396839.0,3.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.00000,3.0,0.0,0.0,0.0,8192.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82179,11169.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
82180,321178.0,3.0,2.0,1944.0,232.0,640.0,88.363640,137.869003,976.0,121.13636,...,104.750000,88.363640,121.13636,3.0,1944.0,2.0,232.0,26883.0,1.0,32.0
82181,384895.0,3.0,2.0,1928.0,232.0,640.0,80.333336,133.958679,976.0,133.25000,...,104.386360,80.333336,133.25000,3.0,1928.0,2.0,232.0,26883.0,1.0,32.0
82182,24244.0,2.0,2.0,86.0,194.0,43.0,43.000000,0.000000,97.0,97.00000,...,80.750000,43.000000,97.00000,2.0,86.0,2.0,194.0,8192.0,1.0,20.0


In [149]:
y

0        0
1        1
2        2
3        2
4        1
        ..
82179    1
82180    2
82181    2
82182    0
82183    1
Name: attack_id, Length: 82184, dtype: int32

In [150]:
#Standardizing distribution of independent features using Standard Scaler
scaler=StandardScaler()
model=scaler.fit(X)
scaled_data=model.transform(X)
print(scaled_data)
#Declaring a global variable
trial_counters=[]

[[-0.25051252 -0.69748778 -1.41895005 ...  3.04446304 -0.87283623
  -0.40736053]
 [-0.43252924 -0.20455355  0.72171723 ... -0.47609135 -0.22966244
  -0.40736053]
 [-0.26599345 -0.20455355 -0.34861641 ...  0.73425099 -0.22966244
   1.36460684]
 ...
 [-0.25613322 -0.20455355 -0.34861641 ...  0.73425099 -0.22966244
   1.36460684]
 [-0.42585113 -0.69748778 -0.34861641 ... -0.47609135 -0.22966244
  -0.40736053]
 [-0.4326883  -0.20455355  0.72171723 ... -0.47609135 -0.22966244
  -0.40736053]]


In [152]:
from sklearn.tree import DecisionTreeClassifier
def training_DecisionTreeClassifier(X_train, y_train, features_selected):
    X_train_selected=X_train[:, features_selected]
    dtree_model=DecisionTreeClassifier(max_depth=5).fit(X_train_selected, y_train)
    return dtree_model

In [225]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score
def model_evaluation(y_test, y_pred):
    recall=recall_score(y_test, y_pred, average='macro')
    accuracy=accuracy_score(y_test, y_pred)
    precision=precision_score(y_test, y_pred, average='macro')
    f1=f1_score(y_test, y_pred, average='macro')
    conf_matrix=confusion_matrix(y_test, y_pred)
    balanced_acc=balanced_accuracy_score(y_test, y_pred)
    mcc=matthews_corrcoef(y_test, y_pred)
    '''
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    npv = tn / (tn + fn)
    fdr = fp / (fp + tp)
    '''
    kappa = cohen_kappa_score(y_test, y_pred)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-Score: ",f1)
    print("Balanced accuracy: ", balanced_acc)
    print("Matthews Correlation Coefficient: ",mcc)
    #print("Negative Predictive value: ", npv)
    #print("False Discovery rate: ", fdr)
    print("Cohen's Kappa: ", kappa)
    '''
    fig, ax = plt.subplots(figsize=(10, 7))
    cax = ax.matshow(conf_matrix, cmap='Blues')

    # Add colorbar
    fig.colorbar(cax)

    # Add labels to the axes
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    # Set tick marks
    ax.set_xticks(range(len(set(y_test))))
    ax.set_yticks(range(len(set(y_test))))

    # Set tick labels
    ax.set_xticklabels(range(len(set(y_test))))
    ax.set_yticklabels(range(len(set(y_test))))

    # Loop over data dimensions and create text annotations
    for i in range(len(set(y_test))):
        for j in range(len(set(y_test))):
            ax.text(j, i, conf_matrix[i, j], ha='center', va='center', color='black')

    plt.title('Confusion Matrix')
    plt.show()
    '''

In [230]:
def testing_kNNClassifier(X_test, y_test, features_selected, dt_model):
    print("testing_kNNClassifier method started")
    print("features_selected: ",features_selected)
    X_test_selected=X_test[:, features_selected]
    print("X_test_selected: \n",X_test_selected)
    y_pred=dt_model.predict(X_test_selected)
    model_evaluation(y_test, y_pred)

In [221]:
def testing_DecisionTreeClassifier(X_test, y_test, features_selected, dt_model):
    print("testing_DecisionTreeClassifier method started")
    print("features_selected: ",features_selected)
    X_test_selected=X_test[:, features_selected]
    print("X_test_selected: \n",X_test_selected)
    y_pred=dt_model.predict(X_test_selected)
    model_evaluation(y_test, y_pred)

In [228]:
def training_kNNClassifier(X_train, y_train, features_selected):
    X_train_selected=X_train[:, features_selected]
    knn_model=KNeighborsClassifier()
    knn_model.fit(X_train_selected, y_train)
    return knn_model

In [151]:
best_features=abc_feature_selection(scaled_data, y)
print(f"Selected features: {best_features}")

abc_feature_selection method has started
initialize_population method has started
initialize_population method has ended
Initial population: 
 [[10, 40, 35, 19, 27, 30, 37, 42, 23, 7, 6, 1, 36, 31, 25, 28, 39, 12, 29, 43, 41, 15, 13, 16, 22, 24, 2, 8, 33, 32, 20, 14, 17], [23, 9, 34, 37, 18, 28, 40, 41, 38, 31, 11, 32, 16], [34, 18, 41, 42], [13, 1, 43, 31, 9, 38, 36], [42, 18, 25, 17, 29, 9, 3, 41, 0, 11, 2, 31, 10, 1, 13, 27, 44, 14, 23, 21, 33, 34, 43, 35, 39, 24, 22, 12, 4, 40, 32, 19, 15, 36, 26, 30, 37, 7, 28, 6], [4, 2, 44], [11, 33, 13, 39, 27, 19, 23, 28, 7, 8, 36, 37, 40, 15, 9, 6, 32, 17, 41, 12, 25, 30, 26, 29, 3, 10, 2, 22, 38, 14, 4, 34, 18, 0, 16], [2, 44, 1, 41, 13, 33, 19, 30, 9, 18, 42, 40, 17, 15, 43, 20, 39, 37, 34, 7, 21, 35, 36, 11, 5, 24, 12, 22, 38, 0, 28, 6, 27], [16, 7, 35, 29, 21, 14, 24, 18, 28, 39, 5, 43, 11, 33, 40, 32, 1, 3, 26, 10, 31, 0, 44, 17, 27, 23, 13, 25, 19], [8, 13, 16, 31, 6, 18, 15, 26, 11, 33, 37, 10, 43, 19, 22, 29, 14, 38, 2, 44, 25, 17, 35

In [153]:
dt_model=training_DecisionTreeClassifier(scaled_data, y, best_features)

In [229]:
knn_model=training_kNNClassifier(scaled_data, y, best_features)

In [154]:
orig_cic_df=pd.read_parquet("..//cic/cic-collection.parquet")
sample_size=int(0.2*len(orig_cic_df))
test_cic_df=orig_cic_df.sample(n=sample_size, replace=False, random_state=30)
test_cic_df.shape

(1833516, 59)

In [156]:
test_cic_df['ClassLabel'].value_counts()

ClassLabel
Benign          1437177
DDoS             247082
DoS               79319
Botnet            29163
Bruteforce        20692
Infiltration      19042
Webattack           595
Portscan            446
Name: count, dtype: int64

In [157]:
test_cic_df['Label'].value_counts()

Label
Benign                  1437177
DDoS-LOIC-HTTP           115243
DoS-Hulk                  63645
DDoS-HOIC                 39799
Botnet                    29163
DDoS                      25653
DDoS-NTP                  24135
DDoS-TFTP                 19772
Bruteforce-SSH            19538
Infiltration              19042
DoS-Goldeneye             10399
DDoS-Syn                   9574
DDoS-UDP                   5775
DoS-Slowloris              3077
DDoS-MSSQL                 2382
DDoS-UDPLag                1651
Bruteforce-FTP             1154
DDoS-Ddossim               1040
DoS-Slowhttptest           1023
DDoS-DNS                    762
DoS-Slowread                570
Portscan                    446
DDoS-LDAP                   438
Webattack-bruteforce        398
DDoS-SNMP                   366
DDoS-Slowloris              357
DoS-Slowheaders             304
DoS-Rudy                    171
Webattack-XSS               169
DDoS-NetBIOS                135
DoS-Slowbody                126
We

In [158]:
test_cic_df[test_cic_df.duplicated()]

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,ClassLabel
7192593,3,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7037489,1,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
6761877,21,1,1,31.0,0.0,31.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7040000,1,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7041858,3,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
6890373,49,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7006200,47,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7057763,2,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7312367,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
7363220,2,2,0,12.0,0.0,6.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign


In [159]:
test_cic_df[test_cic_df.duplicated()].shape

(13, 59)

In [160]:
#Removing the duplicate records
test_cic_df.drop_duplicates(inplace=True)

In [161]:
test_cic_df.shape

(1833503, 59)

In [165]:
columnsList=list(test_cic_df.columns)
print(columnsList)

['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min', 'Active Me

In [166]:
type(columnsList)

list

In [167]:
columnsList.remove('Label')
columnsList.remove('ClassLabel')
print(columnsList)

['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min', 'Active Me

In [168]:
#Fetching count and proportion of negative values in the test dataset
negative_proportion=[]
for feature in columnsList:
    negative_count=(test_cic_df[feature]<0).sum()
    negative_Proportion=negative_count*100/1833503
    negative_proportion.append((feature, negative_count, negative_Proportion))

negative_proportion_df=pd.DataFrame(negative_proportion, columns=["Feature name","Number of negative values", "Percentage of negative values"])
print(negative_proportion_df)

                Feature name  Number of negative values  \
0              Flow Duration                         15   
1          Total Fwd Packets                          0   
2     Total Backward Packets                          0   
3   Fwd Packets Length Total                          0   
4   Bwd Packets Length Total                          0   
5      Fwd Packet Length Max                          0   
6     Fwd Packet Length Mean                          0   
7      Fwd Packet Length Std                          0   
8      Bwd Packet Length Max                          0   
9     Bwd Packet Length Mean                          0   
10     Bwd Packet Length Std                          0   
11              Flow Bytes/s                          9   
12            Flow Packets/s                         15   
13             Flow IAT Mean                         15   
14              Flow IAT Std                          0   
15              Flow IAT Max                         14 

In [169]:
#Replacing all negative values with respective median values
for c in columnsList: 
    median_value = test_cic_df[c][test_cic_df[c] >= 0].median() 
    test_cic_df[c] = test_cic_df[c].apply(lambda x: median_value if x < 0 else x)

In [170]:
#Creating a new feature: isMalicious : Yes=1 , No=0
test_cic_df['isMalicious']=np.where(test_cic_df['ClassLabel']!='Benign', 1, 0)
test_cic_df.head(10)

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,ClassLabel,isMalicious
7131407,400.0,20,0,8368.0,0.0,440.0,418.399994,66.483398,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS-NTP,DDoS,1
5064597,61175828.0,20,19,2391.0,9249.0,584.0,119.550003,186.248688,1460.0,486.789459,...,103384.320312,264966.0,11564.0,9995573.0,20157.779297,10003973.0,9954428.0,Benign,Benign,0
5116882,5750464.0,4,4,97.0,232.0,97.0,24.25,48.5,232.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
201601,61133.0,1,1,48.0,110.0,48.0,48.0,0.0,110.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
2619988,1113.0,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
5065839,5362237.0,4,4,97.0,232.0,97.0,24.25,48.5,232.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
1676786,80.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
2619503,5924298.0,19,51,1294.0,5758.0,240.0,68.105263,54.407818,784.0,112.901962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
5649418,15214.0,1,1,39.0,96.0,39.0,39.0,0.0,96.0,96.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
802070,2462.0,1,1,33.0,129.0,33.0,33.0,0.0,129.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0


In [173]:
#Identifying outliers in test dataset
independent_features=test_cic_df.copy()
independent_features=independent_features.drop(['ClassLabel','isMalicious','Label'],axis=1)
q1=independent_features.quantile(0.25)
q3=independent_features.quantile(0.75)
iqr=q3-q1
outlier=(independent_features<(q1-1.5*iqr))|(independent_features>(q3+1.5*iqr))
outlier_count=outlier.sum()
outlier_percentage=round(outlier.mean() * 100, 2)
outlier_stats=pd.concat([outlier_count, outlier_percentage], axis=1)
outlier_stats.columns = ['Outlier Count', 'Outlier Percentage']
print(outlier_stats)

                          Outlier Count  Outlier Percentage
Flow Duration                    362382               19.76
Total Fwd Packets                168185                9.17
Total Backward Packets           177445                9.68
Fwd Packets Length Total          71579                3.90
Bwd Packets Length Total         266621               14.54
Fwd Packet Length Max             24337                1.33
Fwd Packet Length Mean            74220                4.05
Fwd Packet Length Std             21084                1.15
Bwd Packet Length Max             70144                3.83
Bwd Packet Length Mean           140567                7.67
Bwd Packet Length Std             56365                3.07
Flow Bytes/s                     377556               20.59
Flow Packets/s                   380157               20.73
Flow IAT Mean                    346961               18.92
Flow IAT Std                     285341               15.56
Flow IAT Max                     255976 

In [174]:
#Fetching outliers grouped by isMalicious
outlier_counts = {}
for i in independent_features:
    for attack_type in test_cic_df['isMalicious'].unique():
        attack_data = test_cic_df[i][test_cic_df['isMalicious'] == attack_type]
        q1, q3 = np.percentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

for i in independent_features:
  print(f'Feature: {i}')
  for attack_type in test_cic_df['isMalicious'].unique():
    num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
    print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
  print()

Feature: Flow Duration
- 1: 69554 (17.55%)
- 0: 271944 (18.92%)

Feature: Total Fwd Packets
- 1: 49088 (12.39%)
- 0: 83110 (5.78%)

Feature: Total Backward Packets
- 1: 23676 (5.97%)
- 0: 87467 (6.09%)

Feature: Fwd Packets Length Total
- 1: 79909 (20.16%)
- 0: 39906 (2.78%)

Feature: Bwd Packets Length Total
- 1: 72110 (18.19%)
- 0: 176980 (12.31%)

Feature: Fwd Packet Length Max
- 1: 2262 (0.57%)
- 0: 10863 (0.76%)

Feature: Fwd Packet Length Mean
- 1: 54026 (13.63%)
- 0: 28969 (2.02%)

Feature: Fwd Packet Length Std
- 1: 505 (0.13%)
- 0: 14950 (1.04%)

Feature: Bwd Packet Length Max
- 1: 48763 (12.30%)
- 0: 27190 (1.89%)

Feature: Bwd Packet Length Mean
- 1: 49972 (12.61%)
- 0: 120687 (8.40%)

Feature: Bwd Packet Length Std
- 1: 48777 (12.31%)
- 0: 39834 (2.77%)

Feature: Flow Bytes/s
- 1: 57912 (14.61%)
- 0: 319030 (22.20%)

Feature: Flow Packets/s
- 1: 51548 (13.01%)
- 0: 300358 (20.90%)

Feature: Flow IAT Mean
- 1: 51659 (13.03%)
- 0: 258885 (18.01%)

Feature: Flow IAT Std
- 1: 8

In [175]:
#Fetching outliers grouped by ClassLabel
outlier_counts = {}
for i in independent_features:
    for attack_type in test_cic_df['ClassLabel'].unique():
        attack_data = test_cic_df[i][test_cic_df['ClassLabel'] == attack_type]
        q1, q3 = np.percentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

for i in independent_features:
  print(f'Feature: {i}')
  for attack_type in test_cic_df['ClassLabel'].unique():
    num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
    print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
  print()

Feature: Flow Duration
- DDoS: 39589 (16.02%)
- Benign: 271944 (18.92%)
- Botnet: 3179 (10.90%)
- DoS: 0 (0.00%)
- Bruteforce: 2522 (12.19%)
- Infiltration: 4036 (21.20%)
- Webattack: 198 (33.28%)
- Portscan: 99 (22.20%)

Feature: Total Fwd Packets
- DDoS: 35571 (14.40%)
- Benign: 83110 (5.78%)
- Botnet: 1494 (5.12%)
- DoS: 1559 (1.97%)
- Bruteforce: 8175 (39.51%)
- Infiltration: 1089 (5.72%)
- Webattack: 111 (18.66%)
- Portscan: 42 (9.42%)

Feature: Total Backward Packets
- DDoS: 203 (0.08%)
- Benign: 87467 (6.09%)
- Botnet: 1601 (5.49%)
- DoS: 64 (0.08%)
- Bruteforce: 1850 (8.94%)
- Infiltration: 1197 (6.29%)
- Webattack: 221 (37.14%)
- Portscan: 177 (39.69%)

Feature: Fwd Packets Length Total
- DDoS: 53695 (21.73%)
- Benign: 39906 (2.78%)
- Botnet: 1979 (6.79%)
- DoS: 3224 (4.06%)
- Bruteforce: 2451 (11.85%)
- Infiltration: 619 (3.25%)
- Webattack: 87 (14.62%)
- Portscan: 76 (17.04%)

Feature: Bwd Packets Length Total
- DDoS: 16545 (6.70%)
- Benign: 176980 (12.31%)
- Botnet: 1645 (5

In [178]:
features_to_impute=test_cic_df.columns.tolist()
print(features_to_impute)
features_to_impute.remove('Init Fwd Win Bytes')
features_to_impute.remove('Init Bwd Win Bytes')
features_to_impute.remove('Fwd Seg Size Min')
features_to_impute.remove('Bwd IAT Mean')
features_to_impute.remove('ClassLabel')
features_to_impute.remove('isMalicious')
features_to_impute.remove('Label')

['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min', 'Active Me

In [179]:
print(features_to_impute)

['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Fwd Act Data Packets', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Ma

In [180]:
#Imputation of outliers with median
for col in features_to_impute:
    Q1 = test_cic_df[col].quantile(0.25)
    Q3 = test_cic_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Replacing outliers with the median
    median_value = test_cic_df[col].median()
    test_cic_df[col] = np.where((test_cic_df[col] < lower_bound) | (test_cic_df[col] > upper_bound), median_value, test_cic_df[col])

In [182]:
#Winsorization of outliers
features_to_cap=['Init Fwd Win Bytes','Init Bwd Win Bytes','Fwd Seg Size Min','Bwd IAT Mean']
for col in features_to_cap:
    upper_limit=test_cic_df[col].quantile(0.95)
    lower_limit=test_cic_df[col].quantile(0.05)
    test_cic_df[col]=np.where(test_cic_df[col]>=upper_limit,
                upper_limit,
               np.where(test_cic_df[col]<=lower_limit,
               lower_limit,
               test_cic_df[col]))

In [183]:
test_cic_df.shape

(1833503, 60)

In [184]:
test_cic_df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,ClassLabel,isMalicious
7131407,400.0,3.0,0.0,97.0,0.0,440.0,44.0,66.483398,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS-NTP,DDoS,1
5064597,396502.0,3.0,2.0,97.0,232.0,584.0,119.550003,186.248688,1460.0,486.789459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
5116882,5750464.0,4.0,4.0,97.0,232.0,97.0,24.25,48.5,232.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
201601,61133.0,1.0,1.0,48.0,110.0,48.0,48.0,0.0,110.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0
2619988,1113.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign,0


In [185]:
test_cic_df.ClassLabel.value_counts()

ClassLabel
Benign          1437164
DDoS             247082
DoS               79319
Botnet            29163
Bruteforce        20692
Infiltration      19042
Webattack           595
Portscan            446
Name: count, dtype: int64

In [186]:
test_cic_df.drop_duplicates(subset=test_cic_df.columns[:-1], keep='first')
test_cic_df.shape

(1833503, 60)

In [187]:
#List of labels to keep
labels_to_keep=['Benign','DDoS','Botnet','Bruteforce']

#Filtering the sampled dataset to only keep rows with the above set of labels
test_cic_df=test_cic_df[test_cic_df['ClassLabel'].isin(labels_to_keep)]

In [188]:
test_cic_df.ClassLabel.value_counts()

ClassLabel
Benign        1437164
DDoS           247082
Botnet          29163
Bruteforce      20692
Name: count, dtype: int64

In [190]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
test_cic_df["attack_id"]=le.fit_transform(test_cic_df["ClassLabel"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_))) 
print("Attack id of each distinct value in field ClassLabel:", label_mapping)

Attack id of each distinct value in field ClassLabel: {'Benign': 0, 'Botnet': 1, 'Bruteforce': 2, 'DDoS': 3}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_cic_df["attack_id"]=le.fit_transform(test_cic_df["ClassLabel"])


In [191]:
test_cic_df.shape

(1734101, 61)

In [192]:
test_cic_df.columns

Index(['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Fwd Packets Length Total', 'Bwd Packets Length Total',
       'Fwd Packet Length Max', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count',
       'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size',
       'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes',
       'Subflow Bwd Packets', 'Subflow Bwd By

In [194]:
test_cic_df=test_cic_df.drop(['Label','ClassLabel','Fwd PSH Flags','SYN Flag Count','URG Flag Count','Active Mean',
                              'Active Std','Active Max','Active Min','Idle Mean','Idle Std','Idle Max','Idle Min',
                              'Init Bwd Win Bytes'], axis=1)

In [195]:
test_cic_df.shape

(1734101, 47)

In [196]:
#Renaming columns to remove space and replace it with underscore (_)
test_cic_df.columns=[col.replace(' ','_') for col in test_cic_df.columns]

In [197]:
test_cic_df.columns

Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
       'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
       'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
       'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
       'Fwd_Act_Data_Packets', '

In [198]:
test_cic_df.head()

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Fwd_Packets_Length_Total,Bwd_Packets_Length_Total,Fwd_Packet_Length_Max,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,...,Avg_Bwd_Segment_Size,Subflow_Fwd_Packets,Subflow_Fwd_Bytes,Subflow_Bwd_Packets,Subflow_Bwd_Bytes,Init_Fwd_Win_Bytes,Fwd_Act_Data_Packets,Fwd_Seg_Size_Min,isMalicious,attack_id
7131407,400.0,3.0,0.0,97.0,0.0,440.0,44.0,66.483398,0.0,0.0,...,0.0,3.0,97.0,0.0,0.0,8192.0,1.0,8.0,1,3
5064597,396502.0,3.0,2.0,97.0,232.0,584.0,119.550003,186.248688,1460.0,486.789459,...,486.789459,3.0,97.0,2.0,232.0,8192.0,1.0,20.0,0,0
5116882,5750464.0,4.0,4.0,97.0,232.0,97.0,24.25,48.5,232.0,58.0,...,58.0,4.0,97.0,4.0,232.0,8192.0,1.0,20.0,0,0
201601,61133.0,1.0,1.0,48.0,110.0,48.0,48.0,0.0,110.0,110.0,...,110.0,1.0,48.0,1.0,110.0,8192.0,0.0,32.0,0,0
2619988,1113.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,4.0,0.0,8192.0,0.0,8.0,0,0


In [199]:
test_cic_df['attack_id'].value_counts()

attack_id
0    1437164
3     247082
1      29163
2      20692
Name: count, dtype: int64

In [201]:
#Performing undersampling on the test dataset
attack_id_counts=test_cic_df['attack_id'].value_counts()
target_count=attack_id_counts.min()
undersampled_test_cic_df=pd.concat([
    test_cic_df[test_cic_df['attack_id']==0].sample(target_count, replace=False),
    test_cic_df[test_cic_df['attack_id']==1].sample(target_count, replace=False),
    test_cic_df[test_cic_df['attack_id']==2],
    test_cic_df[test_cic_df['attack_id']==3].sample(target_count, replace=False)], axis=0)

#Shuffling the dataset after undersampling
undersampled_test_cic_df=undersampled_test_cic_df.sample(frac=1).reset_index(drop=True)

In [202]:
undersampled_test_cic_df['attack_id'].value_counts()

attack_id
3    20692
1    20692
2    20692
0    20692
Name: count, dtype: int64

In [203]:
test_cic_df['attack_id'].value_counts()

attack_id
0    1437164
3     247082
1      29163
2      20692
Name: count, dtype: int64

In [204]:
test_cic_df=test_cic_df.drop('isMalicious', axis=1)
undersampled_test_cic_df=undersampled_test_cic_df.drop('isMalicious', axis=1)

In [205]:
test_cic_df.shape

(1734101, 46)

In [206]:
undersampled_test_cic_df.shape

(82768, 46)

Thus, we have two forms of datasets: -

1. Balanced dataset: undersampled_test_cic_df
2. Imbalanced dataset: test_cic_df

In [209]:
#Dividing the balanced dataset into Independent and Dependent features
X_balanced_test=undersampled_test_cic_df.iloc[:, 0:-1]
y_balanced_test=undersampled_test_cic_df.iloc[:, -1]

In [210]:
model_balanced=scaler.fit(X_balanced_test)
balanced_scaled_data=model_balanced.transform(X_balanced_test)
print(balanced_scaled_data)

[[-0.24981042 -0.69492236 -1.42097595 ... -0.87535378 -0.87486524
  -0.41195097]
 [ 0.26779428 -0.20796603  0.71305664 ... -0.47700355 -0.23179632
  -0.41195097]
 [-0.42992636 -0.20796603  0.71305664 ... -0.47700355 -0.23179632
  -0.41195097]
 ...
 [ 1.9346583   2.71377194 -0.35395966 ... -0.47700355  1.69741042
  -0.41195097]
 [-0.4302246  -0.20796603  0.71305664 ... -0.47700355 -0.23179632
  -0.41195097]
 [-0.43478884 -0.20796603  0.71305664 ... -0.47700355 -0.23179632
  -0.41195097]]


In [211]:
#Dividing the imbalanced dataset into Independent and Dependent features
X_imbalanced_test=test_cic_df.iloc[:, 0:-1]
y_imbalanced_test=test_cic_df.iloc[:, -1]

In [212]:
model_imbalanced=scaler.fit(X_imbalanced_test)
imbalanced_scaled_data=model_imbalanced.transform(X_imbalanced_test)
print(imbalanced_scaled_data)

[[-0.56854838 -0.30500688 -1.07963986 ... -0.28203268 -0.34004088
  -1.6987043 ]
 [-0.3841412  -0.30500688 -0.27528921 ... -0.28203268 -0.34004088
  -0.00274217]
 [ 2.10842133  0.02058327  0.52906144 ... -0.28203268 -0.34004088
  -0.00274217]
 ...
 [ 2.04039633  1.32294388  0.52906144 ... -0.28203268  2.39336023
  -0.00274217]
 [-0.54815848 -0.30500688  0.52906144 ... -0.28203268  0.11552597
  -0.00274217]
 [-0.56861402 -0.63059703 -0.27528921 ... -0.28203268 -0.34004088
   1.69321996]]


In [216]:
type(best_features)

list

In [226]:
print("Results of Decision Tree classifier on Balanced test dataset")
testing_DecisionTreeClassifier(balanced_scaled_data, y_balanced_test, best_features, dt_model)

Results of Decision Tree classifier on Balanced test dataset
testing_DecisionTreeClassifier method started
features_selected:  [6, 9, 1, 7, 17, 10, 6, 23, 13, 10, 4, 25]
X_test_selected: 
 [[-1.46128626 -1.06236182 -0.69492236 ... -0.91258253 -0.77080718
  -0.43110691]
 [-1.32083363  1.74842063 -0.20796603 ...  1.78368974  1.94494669
   3.48538143]
 [ 0.82809159 -0.68623015 -0.20796603 ... -0.61181254 -0.40739199
  -0.40784254]
 ...
 [ 1.69655701  1.02672851  2.71377194 ...  1.51053944 -0.11722326
  -0.34009022]
 [ 0.82809159 -0.68623015 -0.20796603 ... -0.61181254 -0.40739199
  -0.40589298]
 [-0.33064255  0.3605239  -0.20796603 ...  0.45233538  0.60397279
  -0.42986368]]
Accuracy:  0.8630146916682776
Precision:  0.8717678067449587
Recall:  0.8630146916682776
F1-Score:  0.8663168071963818
Balanced accuracy:  0.8630146916682776
Matthews Correlation Coefficient:  0.8180552275388117
Cohen's Kappa:  0.8173529222243701


In [227]:
print("Results of Decision Tree classifier on Imbalanced test dataset")
testing_DecisionTreeClassifier(imbalanced_scaled_data, y_imbalanced_test, best_features, dt_model)

Results of Decision Tree classifier on Imbalanced test dataset
testing_DecisionTreeClassifier method started
features_selected:  [6, 9, 1, 7, 17, 10, 6, 23, 13, 10, 4, 25]
X_test_selected: 
 [[-0.27857382 -1.05856388 -0.30500688 ... -0.75082586 -0.73138555
  -0.42141723]
 [ 0.99398679  3.61651274 -0.30500688 ...  1.71986889 -0.25675072
  -0.41964346]
 [-0.61124186 -0.50153776  0.02058327 ... -0.23173542 -0.25675072
  -0.41964346]
 ...
 [ 0.26885461 -0.02134283  1.32294388 ...  2.58717368 -0.25675072
   0.05167531]
 [-0.87372677 -0.02134283 -0.30500688 ... -0.75082586 -0.25675072
  -0.35598089]
 [-0.41332543  0.4876638  -0.63059703 ... -0.75082586 -0.07262514
  -0.42141472]]
Accuracy:  0.8687019960198397
Precision:  0.7816760401320895
Recall:  0.7754949214645214
F1-Score:  0.7610265145217597
Balanced accuracy:  0.7754949214645214
Matthews Correlation Coefficient:  0.4956840374512403
Cohen's Kappa:  0.4805801256683381


In [231]:
print("Results of K Nearest Neighbor classifier on Balanced test dataset")
testing_kNNClassifier(balanced_scaled_data, y_balanced_test, best_features, knn_model)

Results of K Nearest Neighbor classifier on Balanced test dataset
testing_kNNClassifier method started
features_selected:  [6, 9, 1, 7, 17, 10, 6, 23, 13, 10, 4, 25]
X_test_selected: 
 [[-1.46128626 -1.06236182 -0.69492236 ... -0.91258253 -0.77080718
  -0.43110691]
 [-1.32083363  1.74842063 -0.20796603 ...  1.78368974  1.94494669
   3.48538143]
 [ 0.82809159 -0.68623015 -0.20796603 ... -0.61181254 -0.40739199
  -0.40784254]
 ...
 [ 1.69655701  1.02672851  2.71377194 ...  1.51053944 -0.11722326
  -0.34009022]
 [ 0.82809159 -0.68623015 -0.20796603 ... -0.61181254 -0.40739199
  -0.40589298]
 [-0.33064255  0.3605239  -0.20796603 ...  0.45233538  0.60397279
  -0.42986368]]
Accuracy:  0.9756790063792771
Precision:  0.9763530144508412
Recall:  0.975679006379277
F1-Score:  0.9756688888638373
Balanced accuracy:  0.975679006379277
Matthews Correlation Coefficient:  0.9678082900996628
Cohen's Kappa:  0.9675720085057027


In [232]:
print("Results of K Nearest Neighbor classifier on Imbalanced test dataset")
testing_kNNClassifier(imbalanced_scaled_data, y_imbalanced_test, best_features, knn_model)

Results of K Nearest Neighbor classifier on Imbalanced test dataset
testing_kNNClassifier method started
features_selected:  [6, 9, 1, 7, 17, 10, 6, 23, 13, 10, 4, 25]
X_test_selected: 
 [[-0.27857382 -1.05856388 -0.30500688 ... -0.75082586 -0.73138555
  -0.42141723]
 [ 0.99398679  3.61651274 -0.30500688 ...  1.71986889 -0.25675072
  -0.41964346]
 [-0.61124186 -0.50153776  0.02058327 ... -0.23173542 -0.25675072
  -0.41964346]
 ...
 [ 0.26885461 -0.02134283  1.32294388 ...  2.58717368 -0.25675072
   0.05167531]
 [-0.87372677 -0.02134283 -0.30500688 ... -0.75082586 -0.25675072
  -0.35598089]
 [-0.41332543  0.4876638  -0.63059703 ... -0.75082586 -0.07262514
  -0.42141472]]
Accuracy:  0.9168289505628565
Precision:  0.8460712440369914
Recall:  0.8861777100739439
F1-Score:  0.8631140201867274
Balanced accuracy:  0.8861777100739439
Matthews Correlation Coefficient:  0.7077856995157721
Cohen's Kappa:  0.7067790772416432
