# CICIDS2017 Data Exploration and Graph Construction
## Phase 1: Understanding the Data and Building Initial Graphs

This notebook demonstrates:
1. Loading and exploring the CICIDS2017 dataset
2. Preprocessing network flow data
3. Constructing graph representations
4. Visualizing network structures

In [None]:
# Import required libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import torch

# Import custom modules (make sure these files are in src/ directory)
from preprocessing.cicids_loader import CICIDS2017Loader
from preprocessing.graph_constructor import CyberGraphConstructor

# Plot defaults applied to every figure created later in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirm the imports worked and record which PyTorch build is in use
print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")

## Step 1: Load and Explore Data

**Note**: Update the `data_dir` argument in the cell below so that it points to the directory containing your CICIDS2017 CSV files.

In [None]:
# Location of the raw data and the number of rows requested from the loader
raw_data_dir = "../data/raw/CICIDS2017"
n_sample_rows = 50000  # testing-sized sample; drop the sample_size argument for the full data

# Create the loader, then run its preprocessing pipeline on the sample
loader = CICIDS2017Loader(data_dir=raw_data_dir)
df = loader.preprocess_pipeline(sample_size=n_sample_rows)

# Report the frame's dimensions and column names, then preview the first rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Ask the loader for its statistics mapping
stats = loader.get_statistics()

# Print every entry except the per-label breakdown, which gets its own section
print("=== Dataset Statistics ===")
for key, value in stats.items():
    if key == 'label_distribution':
        continue
    print(f"{key}: {value}")

# Per-label counts, one indented line per label
print("\nLabel Distribution:")
label_distribution = stats['label_distribution']
for label, count in label_distribution.items():
    print(f"  {label}: {count}")

## Step 2: Visualize Data Distribution

In [None]:
# Plot attack vs benign distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Binary classification.
# Count each class explicitly instead of relying on value_counts(): that method
# orders its result by frequency, so whenever attacks outnumber benign flows the
# fixed ['Benign', 'Attack'] labels and green/red colours would be attached to
# the wrong bars, and a sample containing only one class would make bar() fail
# on a length mismatch. The 0 = benign / 1 = attack encoding matches how
# `is_attack` is filtered elsewhere in this notebook.
attack_counts = pd.Series(
    [int((df['is_attack'] == 0).sum()), int((df['is_attack'] == 1).sum())],
    index=['Benign', 'Attack'],
)
axes[0].bar(list(attack_counts.index), attack_counts.values, color=['green', 'red'], alpha=0.7)
axes[0].set_title('Binary Classification Distribution')
axes[0].set_ylabel('Count')
axes[0].set_yscale('log')

# Multi-class distribution (top 10)
# NOTE(review): this ranks every value of `label_encoded` over the whole frame,
# so it presumably includes the benign class and shows encoded ids rather than
# readable names - confirm the title and tick labels are what is intended.
top_labels = df['label_encoded'].value_counts().head(10)
axes[1].barh(range(len(top_labels)), top_labels.values)
axes[1].set_yticks(range(len(top_labels)))
axes[1].set_yticklabels(top_labels.index)
axes[1].set_title('Top 10 Attack Types')
axes[1].set_xlabel('Count (log scale)')
axes[1].set_xscale('log')

plt.tight_layout()
plt.savefig('../results/visualizations/data_distribution.png', dpi=300)
plt.show()

In [None]:
# Analyze flow characteristics: per-column histograms comparing benign and
# attack flows, drawn only for the candidate columns that exist in `df`.
numeric_cols = ['flow_duration', 'fwd_packets', 'bwd_packets', 'flow_bytes_per_sec']
available_cols = [col for col in numeric_cols if col in df.columns]

if available_cols:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.ravel()

    # The class masks do not depend on the column, so build them once
    benign_mask = df['is_attack'] == 0
    attack_mask = df['is_attack'] == 1

    # zip() stops at the shorter input, so at most one column per panel is drawn
    for ax, col in zip(axes, available_cols):
        # Compare benign vs attack
        benign = df.loc[benign_mask, col].dropna()
        attack = df.loc[attack_mask, col].dropna()

        ax.hist(benign, bins=50, alpha=0.5, label='Benign', color='green', density=True)
        ax.hist(attack, bins=50, alpha=0.5, label='Attack', color='red', density=True)
        ax.set_xlabel(col)
        ax.set_ylabel('Density')
        ax.set_title(f'Distribution of {col}')
        ax.legend()
        ax.set_yscale('log')

    # Hide panels that received no column so the saved figure has no blank frames
    for unused_ax in axes[len(available_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig('../results/visualizations/flow_characteristics.png', dpi=300)
    plt.show()

## Step 3: Construct Network Graph

In [None]:
# Initialize graph constructor
# NOTE(review): time_window=300 is presumably in seconds - confirm against CyberGraphConstructor
constructor = CyberGraphConstructor(time_window=300)

# Build NetworkX graph
print("Building graph from network flows...")
G = constructor.build_graph_from_flows(df, directed=True, aggregate=True)

# Summarise the graph. The average degree is guarded so that an empty graph
# (no nodes) reports 0.00 instead of raising ZeroDivisionError.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
total_degree = sum(degree for _, degree in G.degree())
avg_degree = total_degree / n_nodes if n_nodes else 0.0

print("\n=== Graph Summary ===")
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
print(f"Average degree: {avg_degree:.2f}")
print(f"Graph density: {nx.density(G):.4f}")

In [None]:
# Per-node quantities read from the graph: degree and the 'attack_ratio' attribute
degrees = dict(G.degree())
attack_ratios = nx.get_node_attributes(G, 'attack_ratio')

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_degree, ax_ratio, ax_scatter = axes

# Panel 1: histogram of node degrees, with a log-scaled count axis
ax_degree.hist(list(degrees.values()), bins=50, edgecolor='black', alpha=0.7)
ax_degree.set(xlabel='Degree', ylabel='Count', title='Node Degree Distribution', yscale='log')

# Panel 2: histogram of the attack_ratio attribute
ax_ratio.hist(list(attack_ratios.values()), bins=50, edgecolor='black', alpha=0.7, color='red')
ax_ratio.set(xlabel='Attack Ratio', ylabel='Count', title='Node Attack Ratio Distribution')

# Panel 3: degree against attack ratio, restricted to nodes that carry the attribute
ratio_nodes = list(attack_ratios)
deg_vals = [degrees[node] for node in ratio_nodes]
att_vals = [attack_ratios[node] for node in ratio_nodes]
ax_scatter.scatter(deg_vals, att_vals, alpha=0.3)
ax_scatter.set(xlabel='Node Degree', ylabel='Attack Ratio',
               title='Degree vs Attack Involvement', xscale='log')

plt.tight_layout()
plt.savefig('../results/visualizations/graph_properties.png', dpi=300)
plt.show()

## Step 4: Convert to PyTorch Geometric Format

In [None]:
# Convert to PyTorch Geometric Data object
pyg_data = constructor.networkx_to_pyg(G)

# Count each label once; the values are reused for the ratio below
n_benign = (pyg_data.y == 0).sum().item()
n_malicious = (pyg_data.y == 1).sum().item()

print("=== PyTorch Geometric Data ===")
print(f"Number of nodes: {pyg_data.num_nodes}")
print(f"Number of edges: {pyg_data.num_edges}")
print(f"Node feature matrix shape: {pyg_data.x.shape}")
print(f"Edge feature matrix shape: {pyg_data.edge_attr.shape}")
print(f"Edge index shape: {pyg_data.edge_index.shape}")
print("\nLabel distribution:")
print(f"  Benign nodes: {n_benign}")
print(f"  Malicious nodes: {n_malicious}")
# The ratio is malicious / benign; without benign nodes it is undefined, so say
# so rather than letting the division raise ZeroDivisionError.
if n_benign:
    print(f"  Class imbalance ratio: {n_malicious / n_benign:.4f}")
else:
    print("  Class imbalance ratio: undefined (no benign nodes)")

## Step 5: Visualize Graph Structure (Small Sample)

In [None]:
# Draw the whole graph when it has at most 50 nodes; otherwise draw only the
# subgraph induced by the 50 highest-degree nodes. `degrees` is the node -> degree
# mapping built in the graph-properties cell, so that cell must have run first.
if G.number_of_nodes() <= 50:
    G_sub = G
else:
    # Rank (node, degree) pairs from highest to lowest degree and keep the first 50 nodes
    ranked_nodes = sorted(degrees.items(), key=lambda pair: pair[1], reverse=True)
    top_nodes = [node for node, _ in ranked_nodes[:50]]
    G_sub = G.subgraph(top_nodes)

constructor.visualize_graph(
    G_sub,
    output_path='../results/visualizations/network_graph_sample.png',
    highlight_attacks=True,
)

print("Graph visualization saved!")

## Step 6: Save Processed Data and Graphs

In [None]:
# Save processed DataFrame
df.to_csv('../data/processed/cicids2017_processed.csv', index=False)
print("Processed data saved to: data/processed/cicids2017_processed.csv")

# Save NetworkX graph
# NOTE(review): the .gpickle extension suggests pickle serialisation, but
# save_graph's implementation is not visible here - confirm the format.
constructor.save_graph(G, '../data/graphs/cicids_graph.gpickle')
print("NetworkX graph saved to: data/graphs/cicids_graph.gpickle")

# Save PyTorch Geometric data
# NOTE(review): this stores the whole object rather than plain tensors; recent
# PyTorch releases may need torch.load(..., weights_only=False) to read it back
# - confirm against the installed version.
torch.save(pyg_data, '../data/graphs/cicids_pyg_data.pt')
print("PyTorch Geometric data saved to: data/graphs/cicids_pyg_data.pt")

# The check mark was previously stored as mojibake (UTF-8 bytes decoded as cp1252)
print("\n✅ All data saved successfully!")

## Step 7: Summary Statistics and Next Steps

In [None]:
# Recap what the notebook produced; the record, node and edge counts are read
# from the live `df` and `G` objects. The status symbols were previously stored
# as mojibake (UTF-8 bytes decoded as cp1252) and are restored here.
print("=== Project Status Summary ===")
print(f"✅ Dataset loaded and preprocessed: {len(df)} records")
print(f"✅ Graph constructed: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print("✅ PyTorch Geometric format ready for GNN training")
print("✅ Visualizations created")

# Planned follow-up work for the next phase
print("\n=== Next Steps ===")
print("1. Design GNN architecture (GCN, GAT, or GraphSAGE)")
print("2. Implement training pipeline")
print("3. Define evaluation metrics")
print("4. Train initial baseline model")
print("5. Hyperparameter tuning")
print("\nReady to move to Phase 2: Model Development! 🚀")