### Overview of the dataset

Dataset Characteristics: The CIC-IDS2017 dataset contains network traffic data for the development and evaluation of intrusion detection systems. It is designed to be representative of modern network traffic and includes more than 2.8 million network flow records captured over five working days (Monday to Friday) in a real network environment. The dataset includes normal traffic and seven different attack scenarios: Brute Force, Heartbleed, Botnet, DoS, DDoS, Web Attack and Infiltration. It consists of 2830743 rows and 79 columns: 78 numerical feature columns and one categorical 'Label' column.

Data [link](https://csciitd-my.sharepoint.com/:f:/g/personal/tmangla_iitd_ac_in/EoNOWmPMifxDsInZccN-pugB_MQYztP4UWuyHrlV_N5LJQ?e=52ticB)

### Data characteristics 

In [None]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score, roc_curve
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

### Step 1: Data exploration

In [None]:
def read_data(fname):
    """Load a CSV file and strip surrounding whitespace from its column names.

    Args:
        fname: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame whose column labels have no leading/trailing whitespace.
    """
    raw = pd.read_csv(fname)
    # ``str.strip`` is applied to every column label
    return raw.rename(columns=str.strip)
    
def pre_process(df):
    """Return a cleaned copy of ``df`` without its last column.

    The last column is dropped by position, missing values become 0, and
    positive/negative infinity become -1. The input frame is left untouched.

    Args:
        df: Input DataFrame; its last column is discarded.

    Returns:
        A new DataFrame containing all but the last column, with NaN -> 0
        and +/-inf -> -1.
    """
    kept_columns = list(df.columns[:-1])
    # Fill NaN first so that only genuine infinities are mapped to -1
    return df[kept_columns].fillna(0).replace([np.inf, -np.inf], -1)

In [None]:
# Read one CSV per capture file; data1 is Monday, data2 Tuesday, data3 Wednesday,
# data4/data5 the two Thursday files and data6 the Friday port-scan file.
data1, data2, data3, data4, data5, data6 = map(read_data, [
    'data/CIC-IDS-2017/Monday-WorkingHours.pcap_ISCX.csv',
    'data/CIC-IDS-2017/Tuesday-WorkingHours.pcap_ISCX.csv',
    'data/CIC-IDS-2017//Wednesday-workingHours.pcap_ISCX.csv',
    'data/CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'data/CIC-IDS-2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'data/CIC-IDS-2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
])

In [None]:
# Show the (whitespace-stripped) column names of the Monday file
data1.columns

In [None]:
# Stack all loaded files into one frame with a fresh 0..n-1 row index.
# ignore_index=True renumbers the rows without adding the old per-file row
# numbers as an extra 'index' column (which a bare reset_index() would do).
data = pd.concat([data1, data2, data3, data4, data5, data6], ignore_index=True)

In [None]:
# Print dtype and non-null count for every column of the Monday file
data1.info()

In [None]:
# Allow up to 80 rows so the transposed summary (one row per column) is not truncated
pd.set_option('display.max_rows', 80)

print('Overview of Columns:')
# Summary statistics of the Monday file, one row per column
data1.describe().T

In [None]:
# Allow up to 80 columns so the frame is displayed without column truncation
pd.set_option('display.max_columns', 80)
data1

#### Labels

In [None]:
# Distinct labels in the Monday file, followed by the row count per label
print(data1['Label'].unique(), data1['Label'].value_counts(), sep='\n')

In [None]:
# Distinct labels in the combined frame, followed by the row count per label
print(data['Label'].unique(), data['Label'].value_counts(), sep='\n')

In [None]:
# Group the fine-grained flow labels into broader attack families
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    **dict.fromkeys(
        ['DoS Hulk', 'DoS GoldenEye', 'DoS slowloris', 'DoS Slowhttptest'],
        'DoS'),
    'PortScan': 'Port Scan',
    **dict.fromkeys(['FTP-Patator', 'SSH-Patator'], 'Brute Force'),
    'Bot': 'Bot',
    **dict.fromkeys(
        ['Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection'],
        'Web Attack'),
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed',
}
label_col = "Attack Type"
# Add the 'Attack Type' column; labels missing from attack_map become NaN
data[label_col] = data['Label'].map(attack_map)
data[label_col].value_counts()

### Step 2: Unsupervised learning

**Exercise**: Train an anomaly detection model using benign data from Monday and test it on the remaining days. Report the TPR, FPR and AUC for each day

In [None]:
# Alternative kept for reference: evaluate on all remaining days at once
#data = pd.concat([data2, data3, data4, data5, data6]).reset_index()
# Evaluation set for this exercise. NOTE: this binds `data` to the same
# DataFrame object as `data2`; no copy is made.
data = data2 ## Tuesday data 

In [None]:
## remove nan, inf, -inf val
def preprocess(df):
    """Return a copy of ``df`` without rows containing NaN or +/-inf.

    Infinite values are treated as missing and every row with at least one
    missing value is dropped. The input frame is not modified, so frames
    that share the same object (e.g. ``data`` and ``data2``) are not
    altered as a side effect of the call.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame holding only the fully finite, non-null rows; the
        original row index labels are kept.
    """
    return df.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Clean the evaluation frame and the Monday (training) frame in one step
data, data1 = preprocess(data), preprocess(data1)

In [None]:
# Row count per original label in the evaluation frame
print(data["Label"].value_counts())

In [None]:
# Group the fine-grained flow labels into broader attack families
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    **dict.fromkeys(
        ['DoS Hulk', 'DoS GoldenEye', 'DoS slowloris', 'DoS Slowhttptest'],
        'DoS'),
    'PortScan': 'Port Scan',
    **dict.fromkeys(['FTP-Patator', 'SSH-Patator'], 'Brute Force'),
    'Bot': 'Bot',
    **dict.fromkeys(
        ['Web Attack � Brute Force', 'Web Attack � XSS', 'Web Attack � Sql Injection'],
        'Web Attack'),
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed',
}
label_col = "Attack Type"

# Evaluation frame: attack family, then a binary BENIGN / ATTACK column.
# Anything that is not exactly "BENIGN" (including unmapped labels) is ATTACK.
data['Attack Type'] = data['Label'].map(attack_map)
data['Type'] = data['Attack Type'].where(data['Attack Type'] == "BENIGN", "ATTACK")

# Monday (training) frame: same two columns
data1['Attack Type'] = data1['Label'].map(attack_map)
data1['Type'] = data1['Attack Type'].where(data1['Attack Type'] == "BENIGN", "ATTACK")


In [None]:
# BENIGN / ATTACK counts: Monday (training) frame first, then the evaluation frame
print(data1['Type'].value_counts(), data['Type'].value_counts(), sep='\n')

In [None]:
label_col = 'Type'

# Feature columns used by the models. Defined in this cell so that Step 2 runs
# top-to-bottom without depending on the (optional) Step 3 cells.
features_col = ['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min']

# Separate features and labels
X_train = data1[features_col]  # Training features (Monday file)
y_train = data1[label_col]     # Training labels
X_test = data[features_col]    # Testing features (evaluation frame)
y_test = data[label_col]       # Testing labels

In [None]:
## Unsupervised learning models
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest


In [None]:
# Encode the labels for evaluation only (BENIGN = 0, ATTACK = 1).
# An explicit comparison is used instead of LabelEncoder: LabelEncoder assigns
# codes in sorted class order, which would give ATTACK = 0 and BENIGN = 1 and
# therefore invert any ROC/AUC computed against anomaly scores.
y_test_encoded = (y_test != "BENIGN").astype(int).to_numpy()

# Normalize the data: fit the scaler on the training features only, then
# apply the same transformation to the test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train your model

## CODE HERE




In [None]:
# Get the decision function values (anomaly scores).
# NOTE(review): assumes `model` follows the convention used below, where a
# negative score means "anomaly" — confirm for the model you trained.
y_pred_scores = model.decision_function(X_test_scaled)

# Convert scores to binary labels (negative score -> 1 anomaly, otherwise 0 normal)
y_pred = np.where(y_pred_scores < 0, 1, 0)

# TPR / FPR at the threshold of 0 (y_test_encoded: 1 = attack, 0 = benign)
is_attack = np.asarray(y_test_encoded) == 1
tpr_at_zero = y_pred[is_attack].mean() if is_attack.any() else float("nan")
fpr_at_zero = y_pred[~is_attack].mean() if (~is_attack).any() else float("nan")
print(f"TPR at threshold 0: {tpr_at_zero:.4f}")
print(f"FPR at threshold 0: {fpr_at_zero:.4f}")

# ROC curve and AUC from the continuous scores; the scores are negated so
# that a larger value means "more anomalous"
fpr, tpr, _ = roc_curve(y_test_encoded, -y_pred_scores)
roc_auc = roc_auc_score(y_test_encoded, -y_pred_scores)

print("AUC Score:", roc_auc)

# Plot ROC Curve
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Random classifier line
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve for Anomaly Detection")
plt.legend(loc="lower right")
plt.show()

**Next Step**: Can you improve the accuracy of detection?

### Step 3: Supervised Learning [Ignore for now]

In [None]:
# Turn +/-inf into NaN directly in `data`, then keep only the complete rows
data.replace([np.inf, -np.inf], np.nan, inplace=True)
new_data = data.dropna()

In [None]:
# Columns used as model inputs, grouped by the kind of statistic they hold
features_col = (
    # Flow basics
    ['Destination Port', 'Flow Duration']
    # Packet counts and total lengths per direction
    + ['Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets']
    # Forward packet length statistics
    + ['Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std']
    # Backward packet length statistics
    + ['Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std']
    # Flow rates
    + ['Flow Bytes/s', 'Flow Packets/s']
    # Inter-arrival time (IAT) statistics: whole flow, forward, backward
    + ['Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min']
    + ['Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min']
    + ['Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min']
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the attack-type classes, then store their integer codes as a new column
le = LabelEncoder().fit(new_data['Attack Type'])
new_data['Attack Number'] = le.transform(new_data['Attack Type'])

print(new_data['Attack Number'].unique())

In [None]:
# Model inputs (selected feature columns) and integer-encoded targets
features, labels = new_data[features_col], new_data['Attack Number']

In [None]:
# Standardizing the dataset: fit the scaler on all selected features,
# then transform those same features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    roc_auc_score,
    roc_curve,
    auc,
    precision_recall_curve,
)
from sklearn.model_selection import cross_val_score

# Hold out 25% of the rows for testing; note that the unscaled `features`
# frame is what gets split here
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small forest: 10 trees, depth capped at 6, all features considered per split
rf1 = RandomForestClassifier(
    n_estimators=10,
    max_depth=6,
    max_features=None,
    random_state=0,
    n_jobs=-1,
)
rf1.fit(X_train, y_train)

# 5-fold cross-validation on the training split
cv_rf1 = cross_val_score(rf1, X_train, y_train, cv=5)
print('Random Forest Model 1')
print('\nCross-validation scores:', ', '.join(str(score) for score in cv_rf1))
print(f'\nMean cross-validation score: {cv_rf1.mean():.2f}')

In [None]:
# Compute the confusion matrix with rows/columns pinned to the classifier's
# class order, so the tick labels below always line up with the matrix axes
y_pred_rf1 = rf1.predict(X_test)
conf_matrix_model1 = confusion_matrix(y_test, y_pred_rf1, labels=rf1.classes_)

# Compute support (true count for each class) = row sums
support = conf_matrix_model1.sum(axis=1)

# Normalize each row by its support; a class with no test samples keeps an
# all-zero row instead of producing NaN from a division by zero
conf_matrix_normalized = np.divide(
    conf_matrix_model1,
    support[:, np.newaxis],
    out=np.zeros(conf_matrix_model1.shape, dtype=float),
    where=support[:, np.newaxis] != 0,
)

# Modify class labels to include support count
class_labels = [f"{cls} (n={count})" for cls, count in zip(rf1.classes_, support)]

# Plot
fig, ax = plt.subplots(figsize=(8, 6))  # Adjust size if needed
sns.heatmap(conf_matrix_normalized, annot=True, cmap='Blues', ax=ax, fmt=".2f",
            xticklabels=rf1.classes_, yticklabels=class_labels)

# Labels
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Normalized Confusion Matrix with Support')

plt.show()