In [None]:
# ## Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import ast

In [None]:
# ## Step 2: Load the Dataset
# The path is relative to the directory the notebook is run from.
file_path = "../data/VNAT_Dataframe_release_1.csv"  # Update with the correct path
# Read the whole CSV into the DataFrame `data` that the following cells work on.
data = pd.read_csv(file_path)

### Preprocessing: convert string-encoded arrays to float arrays

In [None]:
def get_float(str_array):
    """Parse a string-encoded list such as ``"[1.0, 2.5, 3]"`` into a float array.

    Parameters
    ----------
    str_array : str or array-like
        Text of a bracketed, comma-separated list of numbers. A value that is
        already a list, tuple or NumPy array is converted directly, so the
        conversion cell can be re-run without failing.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``float``; empty when the list has no elements
        (e.g. ``"[]"``).

    Raises
    ------
    ValueError
        If an element cannot be interpreted as a float.
    """
    # Already parsed (e.g. the cell was executed twice): only normalise the dtype.
    if isinstance(str_array, (np.ndarray, list, tuple)):
        return np.asarray(str_array, dtype=float)

    # Remove surrounding whitespace first, otherwise strip("[]") stops at it
    # and leaves a bracket behind.
    cleaned_string = str_array.strip().strip("[]")

    # "".split(",") yields [""], which cannot be cast to float; dropping blank
    # tokens makes an empty list parse to an empty array.
    string_list = [token.strip() for token in cleaned_string.split(",") if token.strip()]

    # Convert to a NumPy array of floats
    return np.array(string_list, dtype=float)

# Replace each string-encoded list column with its parsed float arrays,
# one column at a time.
list_columns = ("timestamps", "sizes", "directions")
for field in list_columns:
    print(field)  # progress marker: the column being converted
    parsed_column = data[field].apply(get_float)
    data[field] = parsed_column
    

## Description
The dataset contains detailed logs of network connections extracted from a PCAP file. Each row represents a unique connection with the following columns:

- Unnamed: 0: Index of the row (IGNORE)
- connection: A tuple describing the connection (source IP, source port, destination IP, destination port, protocol).
- timestamps: A list of timestamps indicating when packets for the connection were captured.
- sizes: A list of packet sizes (in bytes) for the connection.
- directions: A list indicating packet directions (1 for source-to-destination, 0 for destination-to-source).
- file_names: The name of the PCAP file from which the data was extracted.

In [None]:
## Label extraction: Extract label from the filename
# The label is the first two "_"-separated tokens of the PCAP file name.
label_col = "label"
data[label_col] = data["file_names"].apply(lambda x: '_'.join(x.split('_')[:2]))

# ### Drop Classes with Less Than 5 Instances
min_class_size = 5
class_counts = data[label_col].value_counts()
# `>=` keeps classes with exactly `min_class_size` rows; the previous `>` also
# dropped those, contradicting the stated rule.
kept_labels = class_counts[class_counts >= min_class_size].index
# .copy() gives an independent frame, so the column assignment below does not
# write to a slice of the original (SettingWithCopyWarning).
data = data[data[label_col].isin(kept_labels)].copy()

# Encode the string labels as integer class ids; `le` is kept to map ids back
# to names when reporting results.
le = LabelEncoder()
data["label_encoded"] = le.fit_transform(data[label_col])

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Display the first two rows as a quick sanity check of the current columns.
data.head(2)

## Feature Extraction

In [None]:
def get_avg(x):
    """Return the arithmetic mean of the values in ``x``, computed with ``np.mean``."""
    mean_value = np.mean(x)
    return mean_value

def get_std(x):
    """Return the standard deviation of ``x``, computed with ``np.std`` defaults."""
    std_value = np.std(x)
    return std_value

# Per-connection mean packet size, computed from each row's "sizes" values.
packet_sizes = data["sizes"]
data["avg_pkt_size"] = packet_sizes.apply(get_avg)

## Get other features
### Packet-level: avg, std packet sizes
### Flow duration, total data, total pkts
### Intra-flow: mean and std of inter-arrival time or relative variance




In [None]:
#### Feature Selection
# Columns fed to the model. The list must not be empty: a zero-column feature
# matrix makes the scaler and classifier fail. Append further engineered
# feature columns here once they have been added to `data`.
feature_cols = ["avg_pkt_size"]  # Adjust as needed

### Drop values with no data
# Only rows missing a selected feature or the label are unusable; a NaN in an
# unrelated column should not discard a sample.
data_tmp = data.dropna(subset=feature_cols + ["label_encoded"])
features = data_tmp[feature_cols]
labels = data_tmp["label_encoded"]


In [None]:
# ## Step 4: Split the Dataset
# Stratify on the label so that every class is represented in both the
# training and the test partition in roughly its original proportion; with
# imbalanced classes a plain random split can leave a rare class out of one side.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42, stratify=labels
)

# ## Step 5: Standardize the Features
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
### ## Step 6: Train a Model
# Random Forest baseline: 100 trees with a fixed random_state.
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)


In [None]:
# ## Step 7: Evaluate the Model
y_pred = clf.predict(X_test)

# Classification Report
# Pass every encoded class id explicitly. Without `labels`, scikit-learn infers
# the classes from y_test / y_pred and raises a ValueError when their number
# differs from len(target_names), e.g. when a rare class is absent from the
# test split.
class_ids = np.arange(len(le.classes_))
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=le.classes_,
        zero_division=0,  # report 0 instead of warning for classes with no samples/predictions
    )
)

In [None]:
# Confusion Matrix
# Fix the row/column order to the encoder's class ids so the matrix always has
# one row and one column per entry of le.classes_. Without `labels`, a class
# missing from both y_test and y_pred shrinks the matrix and the tick labels
# no longer line up with the cells.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ## Step 8: Feature Importance
# Horizontal bar chart with one bar per input feature, sized by the fitted
# classifier's feature_importances_.
importances = clf.feature_importances_
feature_names = features.columns

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(feature_names, importances, color='skyblue')
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance")
plt.show()


In [None]:
## Can you think of methods to improve the accuracy? 