Installation et Versions 

In [9]:
# =====================================================
# SPECTRAL CLUSTERING ON KDDCUP99 DATASET - COMPLETE PROJECT
# Global RNG seeds are pinned below so every run starts from the same state
# =====================================================

# Standard library + numpy are needed before anything else so seeding can happen first
import random               # Python's built-in pseudo-random generator
import warnings             # Controls which warnings get displayed
import numpy as np          # Array maths

# A single seed value is reused by every stochastic step of the notebook
seed = 42
random.seed(seed)           # Seeds the `random` module
np.random.seed(seed)        # Seeds numpy's legacy global generator

# Installed BEFORE the imports below so that their import-time warnings are hidden too.
# NOTE(review): a blanket 'ignore' also hides convergence/deprecation warnings;
# consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Libraries imported here mainly so their versions can be reported
import matplotlib           # Plotting library
import seaborn as sns       # Statistical data visualization
import sklearn              # Machine learning library

# Record the environment next to the results
print(f"scikit-learn: {sklearn.__version__}")
print(f"matplotlib: {matplotlib.__version__}")
print(f"seaborn: {sns.__version__}")


scikit-learn: 1.7.2
matplotlib: 3.10.7
seaborn: 0.13.2


Chargement et Exploration des Données

In [10]:
# Dataset loader, preprocessing helpers and pandas
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Fetch the KDDCup99 network-intrusion data.
#   subset='SA'       -> mostly normal traffic plus a small share of attacks
#                        (the recorded output of this cell shows 97,278 'normal.'
#                        rows out of 100,655), NOT an attacks-only subset
#   percent10=True    -> work from the 10% version of the dataset (lighter to download/process)
#   random_state=seed -> same subset selection on every run
data = fetch_kddcup99(
    subset='SA',
    percent10=True,
    random_state=seed,
)

# Feature matrix and ground-truth labels (labels are only used for evaluation later)
X = data.data
y = data.target

# Quick look at what was loaded
print(f"Dataset dimensions: {X.shape}")              # (n_samples, n_features)
print(f"Unique classes and counts: {np.unique(y, return_counts=True)}")  # Class distribution
print(f"First 5 feature names: {data.feature_names[:5]}...")  # Leading feature names


Dataset dimensions: (100655, 41)
Unique classes and counts: (array([b'back.', b'buffer_overflow.', b'ipsweep.', b'land.', b'neptune.',
       b'normal.', b'pod.', b'portsweep.', b'satan.', b'smurf.',
       b'teardrop.', b'warezclient.'], dtype=object), array([   15,     1,    10,     1,   898, 97278,     3,     8,    15,
        2409,     9,     8]))
First 5 feature names: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes']...


Prétraitement Complet

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# fetch_kddcup99 hands back an object-dtype array, so a DataFrame built from it
# reports EVERY column as `object`. Detecting categoricals with
# select_dtypes(include=['object']) would therefore one-hot encode all 41
# features (continuous ones included) and leave nothing for StandardScaler.
# The symbolic features are named explicitly instead.
CATEGORICAL_FEATURES = ['protocol_type', 'service', 'flag']

raw_df = pd.DataFrame(X, columns=data.feature_names)

missing_features = [name for name in CATEGORICAL_FEATURES if name not in raw_df.columns]
if missing_features:
    raise KeyError(f"Expected categorical features not found: {missing_features}")

# Kept as pandas Index objects, in the dataset's original column order
categorical_cols = pd.Index([c for c in raw_df.columns if c in CATEGORICAL_FEATURES])
numeric_cols = pd.Index([c for c in raw_df.columns if c not in CATEGORICAL_FEATURES])


def _decode_bytes(column):
    """Turn b'tcp'-style byte strings into plain str; leave any other value untouched."""
    return column.map(lambda v: v.decode('utf-8') if isinstance(v, bytes) else v)


# Build a properly typed frame without mutating raw_df:
#   - numeric columns are cast from object to int/float (raises if a value is not numeric)
#   - categorical columns are decoded from bytes to str
df = pd.concat(
    [
        raw_df[numeric_cols].apply(pd.to_numeric),
        raw_df[categorical_cols].apply(_decode_bytes),
    ],
    axis=1,
)[raw_df.columns]  # restore the original column order

print("First few rows (shows string columns):")
print(df.head())
print("\nCategorical columns detected:")
print(categorical_cols.tolist())

print(f"Categorical: {categorical_cols.tolist()}")
print(f"Numeric: {len(numeric_cols)} columns")

# Scale the numeric features, one-hot encode the symbolic ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_cols),
    ])

# Integer-encode the ground-truth labels (used only for evaluation later)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply preprocessing to features
X_processed = preprocessor.fit_transform(df)

# Sanity checks: one row per label, and no NaN introduced by the casts
assert X_processed.shape[0] == len(y_encoded), "feature/label row count mismatch"
assert not np.isnan(X_processed).any(), "NaN values present after preprocessing"

print(f"Original shape: {X.shape}")
print(f"Processed shape: {X_processed.shape}")
print("Preprocessing COMPLETE - No more string errors!")
print(f"Label range: {y_encoded.min()} to {y_encoded.max()}")

First few rows (shows string columns):
  duration protocol_type  service   flag src_bytes dst_bytes land  \
0        0        b'tcp'  b'http'  b'SF'       181      5450    0   
1        0        b'tcp'  b'http'  b'SF'       239       486    0   
2        0        b'tcp'  b'http'  b'SF'       235      1337    0   
3        0        b'tcp'  b'http'  b'SF'       219      1337    0   
4        0        b'tcp'  b'http'  b'SF'       217      2032    0   

  wrong_fragment urgent hot  ... dst_host_count dst_host_srv_count  \
0              0      0   0  ...              9                  9   
1              0      0   0  ...             19                 19   
2              0      0   0  ...             29                 29   
3              0      0   0  ...             39                 39   
4              0      0   0  ...             49                 49   

  dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate  \
0                    1.0                    0.

Baseline K-Means

In [None]:
# TODO: add a short explanation of how K-means works before sharing this notebook.

# Clustering algorithm and evaluation metrics
from sklearn.cluster import KMeans                    # Standard K-means algorithm
from sklearn.metrics import silhouette_score, adjusted_rand_score  # Clustering metrics

# Use the number of ground-truth classes as the number of clusters
n_clusters = len(np.unique(y_encoded))
print(f"Number of clusters: {n_clusters}")

# Train the K-means baseline (n_init=10 restarts from 10 different initialisations)
kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
kmeans_labels = kmeans.fit_predict(X_processed)  # Fit model AND get cluster assignments

# The silhouette needs pairwise distances between all scored points, which is
# quadratic in the number of rows. On the full dataset (~100k rows) that is
# impractically slow, so it is estimated on a fixed-seed random sample instead.
SILHOUETTE_SAMPLE_SIZE = 10_000
silhouette_kmeans = silhouette_score(
    X_processed,
    kmeans_labels,
    sample_size=min(SILHOUETTE_SAMPLE_SIZE, X_processed.shape[0]),
    random_state=seed,
)

# Agreement with the ground-truth classes, computed on every row
ari_kmeans = adjusted_rand_score(y_encoded, kmeans_labels)

print(f"Baseline K-means - Silhouette: {silhouette_kmeans:.3f}, ARI: {ari_kmeans:.3f}")


Number of clusters: 12


Implémentation du Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering     # Spectral clustering algorithm
from sklearn.neighbors import KNeighborsClassifier # Used to extend labels to rows outside the fit sample

# affinity='rbf' builds a dense n x n similarity matrix. For the ~100k rows of
# this dataset that is roughly 100655**2 * 8 bytes (about 81 GB), so the model is
# fitted on a fixed-seed random subsample and the labels are then propagated to
# every row with a 1-nearest-neighbour lookup.
SPECTRAL_SAMPLE_SIZE = 3_000
n_rows = X_processed.shape[0]
spectral_rng = np.random.default_rng(seed)
spectral_idx = np.sort(
    spectral_rng.choice(n_rows, size=min(SPECTRAL_SAMPLE_SIZE, n_rows), replace=False)
)
X_spectral = X_processed[spectral_idx]

# Spectral Clustering with RBF kernel (Gaussian similarity)
spectral = SpectralClustering(
    n_clusters=n_clusters,           # Same number as baseline
    affinity='rbf',                  # Gaussian kernel
    gamma=0.1,                       # Kernel width; other values are compared in the grid-search section
    random_state=seed,               # Reproducibility
    n_init=10                        # Restarts of the internal K-means step
)
spectral_sample_labels = spectral.fit_predict(X_spectral)

# Give every row the cluster of its closest fitted sample so that
# `spectral_labels` lines up with `y_encoded` and `kmeans_labels`.
label_propagator = KNeighborsClassifier(n_neighbors=1)
label_propagator.fit(X_spectral, spectral_sample_labels)
spectral_labels = label_propagator.predict(X_processed)

# Same metrics as the baseline. The silhouette is estimated on a fixed-seed
# sample because it is quadratic in the number of scored rows; it is undefined
# when only one cluster is present, hence the guard.
if len(np.unique(spectral_labels)) > 1:
    silhouette_spectral = silhouette_score(
        X_processed,
        spectral_labels,
        sample_size=min(10_000, n_rows),
        random_state=seed,
    )
else:
    silhouette_spectral = float('nan')
ari_spectral = adjusted_rand_score(y_encoded, spectral_labels)

print(f"Spectral Clustering - Silhouette: {silhouette_spectral:.3f}, ARI: {ari_spectral:.3f}")


Recherche d'Hyperparamètres (Grid Search)

In [None]:
from sklearn.model_selection import ParameterGrid  # Grid search utility

# Candidate RBF kernel widths
param_grid = {'gamma': [0.01, 0.1, 1.0, 10.0]}

# A dense RBF affinity over the full dataset does not fit in memory, and the
# search repeats the fit once per candidate, so tuning runs on a fixed-seed
# random subsample of the preprocessed features.
GRID_SAMPLE_SIZE = 3_000
grid_rng = np.random.default_rng(seed)
grid_idx = grid_rng.choice(
    X_processed.shape[0],
    size=min(GRID_SAMPLE_SIZE, X_processed.shape[0]),
    replace=False,
)
X_grid = X_processed[grid_idx]

best_score = -np.inf   # Lower than any possible silhouette, so the first valid score always wins
best_gamma = None

for params in ParameterGrid(param_grid):
    gamma = params['gamma']
    sc = SpectralClustering(n_clusters=n_clusters, affinity='rbf',
                            gamma=gamma, random_state=seed, n_init=10)
    labels = sc.fit_predict(X_grid)

    # silhouette_score raises when fewer than two clusters are present
    if len(np.unique(labels)) < 2:
        print(f"Gamma={gamma}: skipped (only one cluster found)")
        continue

    score = silhouette_score(X_grid, labels)
    print(f"Gamma={gamma}: Silhouette={score:.3f}")

    if score > best_score:
        best_score = score
        best_gamma = gamma

if best_gamma is None:
    print("No gamma produced a valid clustering")
else:
    print(f"Best gamma: {best_gamma} (score: {best_score:.3f})")


Visualisation (2D avec PCA)

In [None]:
from sklearn.decomposition import PCA      # Principal Component Analysis
import matplotlib.pyplot as plt            # Plotting library

# Project the preprocessed feature matrix (scaled numerics + one-hot columns) to 2D
pca = PCA(n_components=2, random_state=seed)
X_pca = pca.fit_transform(X_processed)

# One panel per labelling, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

panels = [
    ('Ground Truth', y_encoded),
    (f'K-means (Silhouette: {silhouette_kmeans:.3f})', kmeans_labels),
    (f'Spectral (Silhouette: {silhouette_spectral:.3f})', spectral_labels),
]

for ax, (panel_title, point_labels) in zip(axes, panels):
    ax.scatter(
        X_pca[:, 0], X_pca[:, 1],
        c=point_labels,
        cmap='tab20',            # 'tab10' has only 10 colours, so with 12 label values some would share a colour
        vmin=0, vmax=n_clusters - 1,  # identical colour scale on every panel
        s=4,                     # small markers: there are many overlapping points
        rasterized=True,
    )
    ax.set_title(panel_title)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

plt.tight_layout()  # Fix spacing between plots
plt.show()          # Display the figure


Métriques Détaillées + Matrice de Confusion


In [None]:
# Import additional evaluation tools
from sklearn.metrics import confusion_matrix, classification_report

print("=== DETAILED COMPARISON ===")
print(f"Silhouette Score - K-means: {silhouette_kmeans:.3f} | Spectral: {silhouette_spectral:.3f}")
print(f"Adjusted Rand Index - K-means: {ari_kmeans:.3f} | Spectral: {ari_spectral:.3f}")

# Cluster IDs are arbitrary: cluster 3 has no relation to class 3, so this is a
# class-vs-cluster contingency table and its diagonal carries no special meaning.
# Each row is normalised to sum to 1, i.e. it shows how one true class is spread
# across the clusters.
cluster_ids = np.arange(n_clusters)
cm = confusion_matrix(
    y_encoded,
    spectral_labels,
    labels=cluster_ids,      # pin the matrix shape even if some cluster ID is never assigned
    normalize='true',
)

# Human-readable row labels taken from the label encoder
class_names = [
    name.decode('utf-8') if isinstance(name, bytes) else str(name)
    for name in le.classes_
]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=cluster_ids,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Class vs. Cluster Matrix (row-normalized) - Spectral Clustering')
ax.set_ylabel('True Class')
ax.set_xlabel('Cluster ID (arbitrary numbering)')
plt.show()
