# SPECTRAL CLUSTERING ON KDDCUP99 DATASET

**Nom Prénom :** Saad ait hamida - Taha laaribi - Youssef Oulaha - Dikra zizoune 

**Classe :**  4 iir6

Installation et Versions 

In [2]:


import numpy as np          # Numerical computing library (arrays, math operations)
import random               # Random number generator for Python built-ins
seed = 42                   # Magic number! Fixed seed = reproducible results
np.random.seed(seed)        # Set numpy's random seed to 42
random.seed(seed)           # Set Python's random seed to 42
import warnings             # Warning message control
warnings.filterwarnings('ignore')  # Hide non-critical warnings for clean output

# Print library versions for reproducibility (professors love this!)
import sklearn              # Machine learning library
import matplotlib           # Plotting library
import seaborn as sns       # Statistical data visualization
print(f"scikit-learn: {sklearn.__version__}")     # Show sklearn version
print(f"matplotlib: {matplotlib.__version__}")    # Show matplotlib version
print(f"seaborn: {sns.__version__}")              # Show seaborn version


scikit-learn: 1.7.2
matplotlib: 3.10.7
seaborn: 0.13.2


Chargement et Exploration des Données

In [3]:
# Import dataset loader and preprocessing tools

from sklearn.datasets import fetch_kddcup99

# Load KDDCup99 dataset 
# subset='SA' = Simulated Attacks only (smaller, faster)
# percent10=True = Use only 10% of data (makes it Colab-friendly)
# random_state=seed = Same random subset every time
data = fetch_kddcup99(subset='SA', percent10=True, random_state=42, as_frame=True)

X = data.data        # pandas DataFrame of features
y = data.target      # pandas Series of labels

print(f"Dataset dimensions: {X.shape}")
print(f"Unique classes and counts: {y.value_counts().to_dict()}")
print(f"First 5 rows of features:\n{X.head()}")



Dataset dimensions: (100655, 41)
Unique classes and counts: {b'normal.': 97278, b'smurf.': 2409, b'neptune.': 898, b'back.': 15, b'satan.': 15, b'ipsweep.': 10, b'teardrop.': 9, b'portsweep.': 8, b'warezclient.': 8, b'pod.': 3, b'buffer_overflow.': 1, b'land.': 1}
First 5 rows of features:
  duration protocol_type  service   flag src_bytes dst_bytes land  \
0        0        b'tcp'  b'http'  b'SF'       181      5450    0   
1        0        b'tcp'  b'http'  b'SF'       239       486    0   
2        0        b'tcp'  b'http'  b'SF'       235      1337    0   
3        0        b'tcp'  b'http'  b'SF'       219      1337    0   
4        0        b'tcp'  b'http'  b'SF'       217      2032    0   

  wrong_fragment urgent hot  ... dst_host_count dst_host_srv_count  \
0              0      0   0  ...              9                  9   
1              0      0   0  ...             19                 19   
2              0      0   0  ...             29                 29   
3             

Prétraitement Complet

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_kddcup99
import pandas as pd
import numpy as np

# 1) RELOAD: 10% subset to save memory
print("Loading 10% subset...")
data = fetch_kddcup99(subset="SA", percent10=True, random_state=42)
X, y = data.data, data.target  # X is numpy array now

# 2) Convert to DataFrame
df = pd.DataFrame(X, columns=data.feature_names)

# 3) KEEP ONLY numeric columns (drop categorical: protocol_type, service, flag)
numeric_cols = df.select_dtypes(include=[np.number]).columns
X_numeric = df[numeric_cols].values  # Convert to numpy array

# # 4) Scale
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X_numeric)

# 5) Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Original shape: {X.shape}")
# print(f"Numeric‑only shape: {X_scaled.shape}")  # (9739, 38) - perfect size!
print(f"Labels shape: {y_encoded.shape}")
print("Memory efficient preprocessing COMPLETE!")


Loading 10% subset...
Original shape: (100655, 41)
Labels shape: (100655,)
Memory efficient preprocessing COMPLETE!


Baseline K-Means

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Baseline: K-means avec k optimal estimé
n_clusters = len(np.unique(y))
kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
kmeans_labels = kmeans.fit_predict(X)

silhouette_kmeans = silhouette_score(X, kmeans_labels)
ari_kmeans = adjusted_rand_score(y, kmeans_labels)

print(f"Baseline K-means - Silhouette: {silhouette_kmeans:.3f}, ARI: {ari_kmeans:.3f}")


ValueError: could not convert string to float: b'tcp'

 Implémentation Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering     # Spectral clustering algorithm

# Spectral Clustering with RBF kernel (Gaussian similarity)
# gamma controls how similar points must be to connect
spectral = SpectralClustering(
    n_clusters=n_clusters,           # Same number as baseline
    affinity='rbf',                  # RBF = Gaussian kernel (most common)
    gamma=0.1,                       # Kernel width parameter (tuned later)
    random_state=seed,               # Reproducibility
    n_init=10                        # Multiple K-means restarts internally
)

# Fit and predict (computes Laplacian → eigenvectors → K-means automatically)
spectral_labels = spectral.fit_predict(X)

# Evaluate same metrics as baseline
silhouette_spectral = silhouette_score(X, spectral_labels)
ari_spectral = adjusted_rand_score(y, spectral_labels)

print(f"Spectral Clustering - Silhouette: {silhouette_spectral:.3f}, ARI: {ari_spectral:.3f}")


Recherche Hyperparamètres (Grid Search)

In [None]:
from sklearn.model_selection import ParameterGrid  # Grid search utility

# Test different gamma values (RBF kernel width)
param_grid = {'gamma': [0.01, 0.1, 1.0, 10.0]}    # 4 values to test
best_score = -1                                    # Track best silhouette score
best_gamma = None                                  # Track best parameter

# Loop through each gamma value
for gamma in param_grid['gamma']:
    # Create spectral clustering model with current gamma
    sc = SpectralClustering(n_clusters=n_clusters, affinity='rbf', 
                           gamma=gamma, random_state=seed, n_init=10)
    labels = sc.fit_predict(X)              # Train and predict
    score = silhouette_score(X, labels)     # Calculate quality
    print(f"Gamma={gamma}: Silhouette={score:.3f}") # Show progress
    # Keep track of best performing gamma
    if score > best_score:
        best_score = score
        best_gamma = gamma

print(f"Best gamma: {best_gamma} (score: {best_score:.3f})")


Visualisation (2D avec PCA)

In [None]:
from sklearn.decomposition import PCA      # Principal Component Analysis
import matplotlib.pyplot as plt            # Plotting library

# Reduce 41 dimensions → 2D for visualization
pca = PCA(n_components=2, random_state=seed)  # Keep 2 principal components
X_pca = pca.fit_transform(X)           # Transform data to 2D

# Create 3 subplots side-by-side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))  # 1 row, 3 columns, size 15x4

# Plot 1: Ground truth labels (what we want to recover)
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y_encoded, cmap='tab10')
axes[0].set_title('Ground Truth')             # Chart title
axes[0].set_xlabel('PC1')                     # X-axis label
axes[0].set_ylabel('PC2')                     # Y-axis label

# Plot 2: K-means results
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='tab10')
axes[1].set_title(f'K-means (Silhouette: {silhouette_kmeans:.3f})')

# Plot 3: Spectral clustering results
axes[2].scatter(X_pca[:, 0], X_pca[:, 1], c=spectral_labels, cmap='tab10')
axes[2].set_title(f'Spectral (Silhouette: {silhouette_spectral:.3f})')

plt.tight_layout()  # Fix spacing between plots
plt.show()          # Display the figure


Métriques Détaillées + Matrice de Confusion


In [None]:
# Import additional evaluation tools
from sklearn.metrics import confusion_matrix, classification_report

print("=== DETAILED COMPARISON ===")
print(f"Silhouette Score - K-means: {silhouette_kmeans:.3f} | Spectral: {silhouette_spectral:.3f}")
print(f"Adjusted Rand Index - K-means: {ari_kmeans:.3f} | Spectral: {ari_spectral:.3f}")

# Normalized confusion matrix (rows=truth, columns=prediction)
cm = confusion_matrix(y_encoded, spectral_labels, normalize='true')  # Percentages
sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues')  # Pretty heatmap
plt.title('Confusion Matrix - Spectral Clustering')   # Title
plt.ylabel('True Label')                              # Y-axis
plt.xlabel('Predicted Label')                         # X-axis
plt.show()                                            # Display
