# Dynamic Graph Embeddings for Real-Time Anomaly Detection in Network Traffic

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import networkx as nx

### Data Acquisition

#### Load the KDD Cup 1999 dataset

In [17]:
gzipped_data = "data/kddcup.data.gz"

df = pd.read_csv(gzipped_data, compression='gzip')
df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,238,1282,0,0,0,0,...,5,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,normal.


### Data Preprocessing

#### Data Cleaning

In [18]:
df = df.drop_duplicates()
df = df.dropna()

#### Encode categorical variables using one-hot encoding

In [19]:
categorical_cols = ['protocol_type', 'service', 'flag']
df_encoded = pd.get_dummies(df, columns=categorical_cols)

#### Normalize or scale numeric features

In [23]:
numeric_cols = [
    'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate'
]

scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])


#### Encode labels as binary (1 for attacks, 0 for normal)

In [24]:
label_encoder = LabelEncoder()
df_encoded['label'] = label_encoder.fit_transform(df_encoded['label'])

###  Data Splitting

In [38]:
X = df_encoded.drop('label', axis=1)
y = df_encoded['label']

#### Split the dataset into training, validation and testing sets

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

#### Creating a graph representation using NetworkX

In [30]:
G = nx.Graph()

#### Define nodes based on unique src_bytes and dst_bytes values

In [31]:
all_nodes = np.concatenate((df_encoded['src_bytes'].unique(), df_encoded['dst_bytes'].unique()))
G.add_nodes_from(all_nodes)

#### Define edges based on connections

In [32]:
edges = [(row['src_bytes'], row['dst_bytes']) for _, row in df_encoded.iterrows()]
G.add_edges_from(edges)

#### calculating degree centrality 

In [37]:
degree_centrality = nx.degree_centrality(G)
df_encoded['dg_src_bytes'] = df_encoded['src_bytes'].map(degree_centrality)
df_encoded['dg_dst_bytes'] = df_encoded['dst_bytes'].map(degree_centrality)

In [34]:
df_encoded['degree_centrality']

0          0.017046
1          0.058877
2          0.062572
3          0.050511
4          0.053822
             ...   
4898425    0.062781
4898426    0.067836
4898427    0.066929
4898428    0.067836
4898429    0.067836
Name: degree_centrality, Length: 1074991, dtype: float64