In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score
import random

In [2]:
def initialize_population(num_features, n):
    '''
    Build the starting population for the search.

    Every flower is a binary mask of length num_features:
    1 marks a feature that belongs to the subset,
    0 marks a feature that is left out.
    The bits of a flower are drawn with random.choices([0, 1], ...),
    and n such flowers are gathered into a list, which is the population.

    Returns a list containing n lists of 0/1 integers.
    '''
    print("initialize_population method has started")
    print("Here we initialize a population of n flowers with random solutions")
    #One random.choices call per flower, in order, so the draws follow the global random state
    flowers=[random.choices([0, 1], k=num_features) for _ in range(n)]
    print("initialize_population method has ended")
    return flowers

In [3]:
def fitness_function(features, X_train, y_train, X_val, y_val, generation, num_generations):
    '''
    Score a feature subset by the recall of a KNN classifier, minus a size penalty.

    features: binary mask (list or 1-D array); position i set to 1 selects column i.
    X_train, X_val: 2-D arrays that are indexed as X[:, selected_columns].
    y_train, y_val: labels for the training and validation rows.
    generation, num_generations: current iteration and total iterations; their
    ratio scales the penalty so that it grows as the search progresses.

    The fitness is
        recall - (selected / total_features) * (generation / num_generations) * 2
    where recall is computed on the validation set with average='binary'.

    Returns the adjusted recall. Returns 0 when the mask selects no feature,
    and also returns 0 when any error occurs while fitting or scoring
    (the error is printed, not raised).
    '''
    print("fitness_function method has started.")
    print("Features: ",features)
    selected_features=[i for i in range(0, len(features)) if features[i]==1]
    print("selected_features: ",selected_features)

    #An empty subset cannot be fitted; answer directly instead of relying on
    #the classifier to fail and the except branch to translate that into 0
    if len(selected_features)==0:
        print("No features were selected. Thus, returning fitness=0")
        print("fitness_function method has ended")
        return 0

    try:
        X_train_selected=X_train[:, selected_features]
        X_val_selected=X_val[:, selected_features]
        model=KNeighborsClassifier()
        model.fit(X_train_selected, y_train)
        y_pred=model.predict(X_val_selected)
        recall=recall_score(y_val, y_pred, average='binary')
        #The penalty must depend on how many features are selected. Using
        #len(features) here would always give a ratio of 1, because the mask
        #has one entry per column.
        penalty=(len(selected_features)/X_train.shape[1])*(generation/num_generations)*2
        print("generation: ",generation)
        print("num_generations: ",num_generations)
        print("len(selected_feaures): ",len(selected_features))
        #Final fitness value
        adjusted_recall=recall-penalty
        print(f"Recall for selected features: {recall}, Adjusted recall for selected features: {adjusted_recall}")
        print("fitness_function method has ended")
        return adjusted_recall
    except Exception as e:
        #Deliberate best-effort: a failing candidate is reported and scored 0
        #so that one bad subset does not abort the whole search
        print(f"Error in fitness_function with features {features}: {e}")
        print("fitness_function method has ended")
        return 0

In [4]:
def levy_flight(Lambda, num_features):
    '''
    Draw num_features Levy-flight step sizes with Mantegna's algorithm.

    Lambda: stability index of the Levy distribution; must satisfy 0 < Lambda <= 2.
    num_features: number of independent steps to draw.

    The steps are s = u / |v|**(1/Lambda) with u ~ N(0, sigma^2), v ~ N(0, 1) and
        sigma = [ gamma(1+Lambda) * sin(pi*Lambda/2)
                  / (Lambda * gamma((1+Lambda)/2) * 2**((Lambda-1)/2)) ] ** (1/Lambda)

    Returns a 1-D numpy array of length num_features.
    Raises ValueError when Lambda is outside (0, 2].
    Draws from the global numpy random state.
    '''
    print("levy_flight method has started.")
    if not 0<Lambda<=2:
        raise ValueError(f"Lambda must be in the interval (0, 2], got {Lambda}")
    numerator=math.gamma(1+Lambda)*np.sin((np.pi*Lambda)/2)
    denominator=Lambda*math.gamma((1+Lambda)/2)*2**((Lambda-1)/2)
    #Mantegna's scale carries a 1/Lambda exponent
    sigma=(numerator/denominator)**(1/Lambda)
    print("sigma: ",sigma)
    u=np.random.normal(0,sigma,size=num_features)
    v=np.random.normal(0,1,size=num_features)
    #Parentheses are required: ** binds tighter than /, so |v|**1/Lambda
    #would be evaluated as (|v|**1)/Lambda
    s=u/np.abs(v)**(1/Lambda)
    print("Step: ",s)
    print("levy_flight method has ended.")
    return s

In [5]:
def fpa_feature_selection(X, y, max_generations=100, population_size=None, p=0.8, Lambda=1.5, gamma=0.1, seed=None, verbose=True):
    '''
    Select a feature subset with a binary Flower Pollination Algorithm.

    X: 2-D feature matrix (converted with np.asarray so that column indexing works).
    y: labels, one per row of X.
    max_generations: number of iterations over the whole population.
    population_size: number of flowers; defaults to int(num_features/2) and is
        never allowed below 2, because local pollination needs two distinct flowers.
    p: switching probability; a random number below p triggers global pollination,
        otherwise local pollination is used.
    Lambda: Levy-flight parameter forwarded to levy_flight.
    gamma: scaling factor applied to the Levy step during global pollination.
    seed: when given, seeds both the random module and numpy's global generator.
    verbose: when False, the progress prints of this function are suppressed
        (the helper functions it calls still print on their own).

    The data is split 80/20 into training and validation sets (random_state=42).
    Each candidate flower is clipped to [0, 1], rounded to a 0/1 mask and scored
    with fitness_function. A candidate that beats the flower it was derived from
    replaces that flower; a candidate that beats the best fitness seen so far
    becomes the new best flower.

    Note: fitness_function applies a penalty that grows with the generation, so
    fitness values from different generations are compared as they are, without
    re-scoring older flowers.

    Returns the best mask found as a numpy array of 0/1 integers of length num_features.
    '''
    def log(*args):
        #Progress output is optional so long runs need not flood the notebook
        if verbose:
            print(*args)

    log("fpa_feature_selection method has started.")
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    #Setting parameters for Flower Pollination Algorithm
    X=np.asarray(X)
    num_features=X.shape[1] #Number of independent features
    if population_size is None:
        population_size=int(num_features/2) #Number of subset of features
    population_size=max(2, int(population_size))
    #Guards the generation/num_generations ratio when max_generations is 0
    num_generations=max(1, max_generations)

    log("num_features: ",num_features)
    log("max_generations: ",max_generations)
    log("population_size: ",population_size)
    log("Switch probability p: ",p)

    #Splitting the dataset into training and validation set
    X_train, X_val, y_train, y_val=train_test_split(X, y, test_size=0.2, random_state=42)

    #Generating the initial population
    initial_population=initialize_population(num_features, population_size)
    log("Initial population: ",initial_population)
    #Stored as an integer array so flowers can be replaced and subtracted directly
    population=np.array(initial_population, dtype=int)

    #Evaluating initial population with the same penalty as generation 0,
    #so that the initial flowers and the first candidates are comparable
    fitness_values=list()
    for flower in initial_population:
        fitness=fitness_function(flower, X_train, y_train, X_val, y_val, 0, num_generations)
        fitness_values.append(fitness)

    log("Fitness values after evaluating initial population: ",fitness_values)
    best_fitness=max(fitness_values)
    log("Best fitness in the initial population is: ",best_fitness)
    best_fitness_index=fitness_values.index(best_fitness)
    best_flower=population[best_fitness_index].copy()
    log("Best flower in the initial population: ",best_flower)
    log("Best fitness value lies at index: ",best_fitness_index)

    for generation in range(max_generations):
        for i in range(population_size):
            #Generating a random number
            r=random.random()
            log("Random number generated is: ",r)
            if r<p:
                log("Global pollination")
                L=levy_flight(Lambda, num_features)
                #Move along the vector towards the best flower; a scalar norm
                #would discard the direction of that move
                new_flower=population[i]+gamma*L*(best_flower-population[i])
                log("new_flower= ",new_flower)
            else:
                log("Local pollination")
                epsilon=np.random.uniform(0,1,num_features)
                log("epsilon: ",epsilon)
                j, k = np.random.choice(population_size, 2, replace=False)
                xj=population[j]
                xk=population[k]
                log("Pollens selected from two randomly selected flowers: j=",xj," and xk=",xk)
                new_flower=population[i]+epsilon*(xj-xk)
                log("new_flower: ",new_flower)

            log("type(new_flower): ",type(new_flower))

            #Map the continuous position back to a 0/1 mask
            new_flower=np.clip(new_flower, 0, 1)
            new_flower=np.round(new_flower).astype(int)

            log("new_flower: ",new_flower)

            new_fitness=fitness_function(new_flower, X_train, y_train, X_val, y_val, generation, num_generations)
            log("New fitness is: ", new_fitness)
            log("Current best_fitness= ",best_fitness)

            #Keep the candidate in the population when it improves on its parent;
            #without this the search would keep perturbing the initial flowers
            if new_fitness>fitness_values[i]:
                population[i]=new_flower
                fitness_values[i]=new_fitness

            if new_fitness>best_fitness:
                log("new_fitness > best_fitness")
                best_fitness=new_fitness
                best_flower=new_flower.copy()
            else:
                log("new_fitness < best_fitness")

    return best_flower

In [6]:
#Reading the processed dataset from a parquet file; the path is relative,
#so it is resolved against the notebook's working directory
cic_df=pd.read_parquet("binary_training_data.parquet")

In [7]:
#Fetching the first 5 rows of the dataset to check the loaded columns and values
cic_df.head()

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Fwd_Packets_Length_Total,Bwd_Packets_Length_Total,Fwd_Packet_Length_Max,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,...,Avg_Fwd_Segment_Size,Avg_Bwd_Segment_Size,Subflow_Fwd_Packets,Subflow_Fwd_Bytes,Subflow_Bwd_Packets,Subflow_Bwd_Bytes,Init_Fwd_Win_Bytes,Fwd_Act_Data_Packets,Fwd_Seg_Size_Min,isMalicious
0,2999994.0,4.0,0.0,2064.0,0.0,516.0,44.0,0.0,0.0,0.0,...,44.0,0.0,4.0,2064.0,0.0,0.0,8192.0,3.0,20.0,1
1,11487.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25,...,108.666664,32.25,3.0,326.0,4.0,129.0,8192.0,1.0,20.0,1
2,9818.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25,...,108.666664,32.25,3.0,326.0,4.0,129.0,8192.0,1.0,20.0,1
3,10138.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25,...,108.666664,32.25,3.0,326.0,4.0,129.0,8192.0,1.0,20.0,1
4,109.0,3.0,1.0,53.0,0.0,53.0,17.666666,30.599564,0.0,0.0,...,17.666666,0.0,3.0,53.0,1.0,0.0,1080.0,1.0,20.0,0


In [8]:
#Computing shape of the dataset as (number of rows, number of columns)
cic_df.shape

(82184, 46)

In [9]:
#Dividing the dataset into Independent and Dependent features:
#every column except the last one goes into X, the last column becomes y
X=cic_df.iloc[:, 0:-1]
y=cic_df.iloc[:, -1]

In [10]:
#Displaying the independent features
X

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Fwd_Packets_Length_Total,Bwd_Packets_Length_Total,Fwd_Packet_Length_Max,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,...,Avg_Packet_Size,Avg_Fwd_Segment_Size,Avg_Bwd_Segment_Size,Subflow_Fwd_Packets,Subflow_Fwd_Bytes,Subflow_Bwd_Packets,Subflow_Bwd_Bytes,Init_Fwd_Win_Bytes,Fwd_Act_Data_Packets,Fwd_Seg_Size_Min
0,2999994.0,4.0,0.0,2064.0,0.0,516.0,44.000000,0.000000,0.0,0.00000,...,99.500000,44.000000,0.00000,4.0,2064.0,0.0,0.0,8192.0,3.0,20.0
1,11487.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
2,9818.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
3,10138.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
4,109.0,3.0,1.0,53.0,0.0,53.0,17.666666,30.599564,0.0,0.00000,...,13.250000,17.666666,0.00000,3.0,53.0,1.0,0.0,1080.0,1.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82179,9749.0,3.0,4.0,326.0,129.0,326.0,108.666664,188.216187,112.0,32.25000,...,65.000000,108.666664,32.25000,3.0,326.0,4.0,129.0,8192.0,1.0,20.0
82180,343659.0,3.0,2.0,1912.0,232.0,640.0,86.909088,137.688019,976.0,121.13636,...,104.022728,86.909088,121.13636,3.0,1912.0,2.0,232.0,26883.0,1.0,32.0
82181,5401.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.00000,2.0,0.0,0.0,0.0,32738.0,0.0,20.0
82182,412081.0,3.0,2.0,1912.0,232.0,640.0,83.130432,135.737488,976.0,121.13636,...,101.711113,83.130432,121.13636,3.0,1912.0,2.0,232.0,26883.0,1.0,32.0


In [11]:
#Displaying the dependent feature
y

0        1
1        1
2        1
3        1
4        0
        ..
82179    1
82180    1
82181    1
82182    1
82183    1
Name: isMalicious, Length: 82184, dtype: int32

In [12]:
#Standardizing distribution of independent features using Standard Scaler
#Note: despite its name, `model` is the fitted scaler returned by fit(), not a classifier
#NOTE(review): the scaler is fitted on all rows of X, while fpa_feature_selection
#later splits the scaled data into training and validation sets, so the validation
#rows contribute to the scaling statistics; confirm this is acceptable
scaler=StandardScaler()
model=scaler.fit(X)
scaled_data=model.transform(X)
print(scaled_data)

[[ 0.97032358  0.28641171 -1.41438795 ... -0.47471168  1.04736257
  -0.40746592]
 [-0.43014952 -0.20728475  0.72432392 ... -0.47471168 -0.23363223
  -0.40746592]
 [-0.43093165 -0.20728475  0.72432392 ... -0.47471168 -0.23363223
  -0.40746592]
 ...
 [-0.43300154 -0.70098122 -1.41438795 ...  1.1193544  -0.87412963
  -0.40746592]
 [-0.2424233  -0.20728475 -0.34503202 ...  0.73911905 -0.23363223
   1.36375676]
 [-0.29255435  4.23598345 -1.41438795 ... -0.47471168 -0.23363223
  -0.40746592]]


In [13]:
#Running the Flower Pollination Algorithm on the standardized features
best_features=fpa_feature_selection(scaled_data, y)

print(f"Selected features: {best_features}")
#Counting the positions of the returned mask that are set to 1
counter=0
for i, flag in enumerate(best_features):
    if flag==1:
        counter+=1
print(f"Total number of selected features: {counter}")

fpa_feature_selection method has started.
num_features:  45
max_generations:  100
population_size:  22
Switch probability p:  0.8
initialize_population method has started
Here we initialize a population of n flowers with random solutions
initialize_population method has ended
Initial population:  [[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0], [1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1], [0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0], [0, 1, 1, 0, 1, 