## Anomaly Detection with Isolation Forest

In [1]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the flow records from the CSV export into a DataFrame.
# NOTE(review): the path is relative to the working directory - confirm the
# file location before running this cell.
CICIDS_data = pd.read_csv('../Wednesday-workingHours.pcap_ISCX.csv')

In [None]:
# helper that lists duplicated columns (they are dropped in the preprocessing step)
def find_duplicated_columns(df):
    """Return the names of columns that repeat an earlier column's contents.

    Columns are compared with ``equals``. Within every group of identical
    columns the first one (in column order) is kept as the original and the
    remaining ones are reported, so dropping the returned names leaves one
    copy of each distinct column.

    Args:
        df: DataFrame whose columns are compared. It is not modified.

    Returns:
        list: Names of the redundant columns, grouped by their original and
        in column order within each group. Empty when nothing is duplicated.
    """
    columns = list(df.columns)
    redundant = []
    classified = set()
    for position, col in enumerate(columns):
        if col in classified:
            continue
        # Fetch the reference column once instead of once per comparison.
        reference = df[col]
        # Only later, still-unclassified columns can be new copies: an earlier
        # or already-classified match would have put `col` in `classified`.
        copies = [
            other
            for other in columns[position + 1:]
            if other != col
            and other not in classified
            and reference.equals(df[other])
        ]
        redundant.extend(copies)
        classified.add(col)
        classified.update(copies)
    return redundant

In [None]:
# data preprocessing
# 1) strip surrounding whitespace from every column name
CICIDS_data.columns = CICIDS_data.columns.str.strip()
# 2) remove the columns whose every value equals 0
CICIDS_data.drop(
    columns=CICIDS_data.columns[(CICIDS_data == 0).all(axis=0)],
    inplace=True,
)
# 3) remove the columns that merely repeat another column
CICIDS_data.drop(columns=find_duplicated_columns(CICIDS_data), inplace=True)

In [None]:
# encoding the labels to be 1 for BENIGN and -1 for all else
# (IsolationForest.predict uses the same convention: 1 = inlier, -1 = outlier)
# A vectorised comparison replaces the per-row Python lambda.
CICIDS_data['Label_encoded'] = np.where(CICIDS_data['Label'] == 'BENIGN', 1, -1)

In [None]:
# separate the encoded target (y) from the model inputs (X);
# 'Flow Bytes/s' and 'Flow Packets/s' are left out because they contain inf values
y = CICIDS_data['Label_encoded'].copy()
X = CICIDS_data.drop(
    columns=['Label', 'Flow Bytes/s', 'Flow Packets/s', 'Label_encoded'],
).copy()

In [None]:
# replace missing values (NaN) in the features with the sentinel -1
X = X.fillna(value=-1)

In [None]:
# scale the features with RobustScaler: fit it on X, then transform that same data
scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# calculating the anomaly contamination
# Share of rows labelled -1 (anomalous). The mean of the boolean mask is the
# count of matches divided by len(y); unlike value_counts()[-1] it cannot be
# misread as positional indexing and does not raise KeyError when no row is
# labelled -1.
# NOTE: IsolationForest only accepts a contamination in (0, 0.5].
contamination = (y == -1).mean()
contamination

In [None]:
# building the model
# contamination is the expected share of outliers in the data and sets the
# score threshold used by predict; random_state=42 fixes the random seed so
# repeated runs build the same forest
model = IsolationForest(contamination=contamination, random_state=42)

In [None]:
# fit the forest on the scaled features, then label those same rows
# (fit returns the fitted estimator, so the two calls can be chained)
predictions = model.fit(X_scaled).predict(X_scaled)

In [None]:
# compare the predictions with the encoded labels and print the per-class report
print("Classification Report:", classification_report(y, predictions), sep="\n")

In [None]:
# Confusion Matrix
# rows are the true labels, columns the predicted labels (sorted order: -1, then 1)
cm = confusion_matrix(y_true=y, y_pred=predictions)