<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [5]</a>'.</span>

# Label Spreading: Graph-Based Semi-Supervised Learning

**Mục tiêu:** So sánh Label Spreading với Self-Training và FlexMatch

**Ưu điểm:**
- Sử dụng manifold structure của dữ liệu
- Không cần iterative pseudo-labeling
- Tự nhiên xử lý class imbalance qua graph structure

**Nhược điểm:**
- Memory intensive (O(n²))
- Chậm hơn self-training cho large datasets

## 1. Setup & Load Data

In [1]:
# Papermill parameters
KERNEL = "rbf"
GAMMA = 20.0
ALPHA = 0.2
SAMPLE_SIZE = 50000
MAX_ITER = 30

In [2]:
# Parameters
KERNEL = "rbf"
GAMMA = 20.0
ALPHA = 0.2
SAMPLE_SIZE = 50000
MAX_ITER = 30


In [3]:
import sys
import json
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Add src to path
# Robustly find project root
current_dir = Path.cwd()
project_root = current_dir
while not (project_root / 'src').exists():
    if project_root.parent == project_root:
        break # Reached file system root
    project_root = project_root.parent

if (project_root / 'src').exists():
    sys.path.insert(0, str(project_root / 'src'))
    print(f"Added {project_root / 'src'} to path")
else:
    print("Warning: could not find src directory")

from semi_supervised_library import (
    SemiDataConfig,
    LabelSpreadingConfig,
    run_label_spreading,
    AQI_CLASSES
)

print("[OK] Libraries imported successfully")

Added D:\daihoc\DataMining\air_guard_mini_project\src to path


[OK] Libraries imported successfully


In [4]:
# Load data
DATA_PATH = Path.cwd().parent / 'data' / 'processed' / 'dataset_for_semi.parquet'
if not DATA_PATH.exists():
    # Try relative path from project root
    DATA_PATH = project_root / 'data' / 'processed' / 'dataset_for_semi.parquet'

df = pd.read_parquet(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:")
print(df['aqi_class'].value_counts().sort_index())

# Check labeled fraction
labeled_frac = df['is_labeled'].mean()
print(f"\nLabeled fraction: {labeled_frac:.2%}")

Dataset shape: (420768, 56)

Class distribution:
aqi_class
Good                               1702
Hazardous                          2741
Moderate                          10065
Unhealthy                         11484
Unhealthy_for_Sensitive_Groups     5302
Very_Unhealthy                     5191
Name: count, dtype: int64

Labeled fraction: 8.67%


## 2. Experiment 1: Label Propagation (Simpler Version)

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [5]:
from sklearn.semi_supervised import LabelPropagation

# Test different gamma values for Label Propagation
label_prop_results = {}

data_cfg = SemiDataConfig()

for gamma_val in [10, 20, 30]:
    print(f"\nTesting Label Propagation with gamma={gamma_val}...")
    
    # Use LabelSpreadingConfig but we'll manually use LabelPropagation
    ls_cfg = LabelSpreadingConfig(
        kernel="rbf",
        gamma=gamma_val,
        alpha=0.2,  # Not used in LabelPropagation
        max_iter=MAX_ITER,
        sample_size=SAMPLE_SIZE
    )
    
    start_time = time.time()
    results = run_label_spreading(df, data_cfg, ls_cfg)
    elapsed_time = time.time() - start_time
    
    label_prop_results[gamma_val] = {
        'results': results,
        'time': elapsed_time
    }
    
    metrics = results['test_metrics']
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  F1-macro: {metrics['f1_macro']:.4f}")
    print(f"  Time: {elapsed_time:.2f}s")
    print(f"  Sampled: {metrics['sampled']}, Size: {metrics['sample_size']}")

print("\n[OK] Label Propagation experiments completed")


Testing Label Propagation with gamma=10...


MemoryError: Unable to allocate 18.6 GiB for an array with shape (50000, 50000) and data type float64

## 3. Experiment 2: Label Spreading with Different Parameters

In [None]:
# Grid search for best parameters
label_spread_results = {}

param_grid = [
    {'gamma': 10, 'alpha': 0.1},
    {'gamma': 20, 'alpha': 0.2},
    {'gamma': 30, 'alpha': 0.3},
]

for params in param_grid:
    gamma_val = params['gamma']
    alpha_val = params['alpha']
    
    print(f"\nTesting Label Spreading with gamma={gamma_val}, alpha={alpha_val}...")
    
    ls_cfg = LabelSpreadingConfig(
        kernel=KERNEL,
        gamma=gamma_val,
        alpha=alpha_val,
        max_iter=MAX_ITER,
        sample_size=SAMPLE_SIZE
    )
    
    start_time = time.time()
    results = run_label_spreading(df, data_cfg, ls_cfg)
    elapsed_time = time.time() - start_time
    
    key = f"gamma={gamma_val}, alpha={alpha_val}"
    label_spread_results[key] = {
        'results': results,
        'time': elapsed_time,
        'params': params
    }
    
    metrics = results['test_metrics']
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  F1-macro: {metrics['f1_macro']:.4f}")
    print(f"  Time: {elapsed_time:.2f}s")

print("\n[OK] Label Spreading experiments completed")

## 4. Find Best Configuration

In [None]:
# Find best config based on F1-macro
best_key = None
best_f1 = 0

for key, data in label_spread_results.items():
    f1 = data['results']['test_metrics']['f1_macro']
    if f1 > best_f1:
        best_f1 = f1
        best_key = key

print(f"\n{'='*60}")
print("BEST LABEL SPREADING CONFIGURATION")
print(f"{'='*60}")
print(f"Config: {best_key}")
print(f"F1-macro: {best_f1:.4f}")

best_results = label_spread_results[best_key]['results']
best_metrics = best_results['test_metrics']

print(f"\nTest Accuracy: {best_metrics['accuracy']:.4f}")
print(f"Test F1-macro: {best_metrics['f1_macro']:.4f}")
print(f"Training Time: {label_spread_results[best_key]['time']:.2f}s")

## 5. Comparison with Self-Training

In [None]:
# Load self-training results for comparison
ST_METRICS_PATH = Path.cwd().parent / 'data' / 'processed' / 'self_training_experiments' / 'metrics_tau_0_9.json'
if not ST_METRICS_PATH.parent.exists():
    ST_METRICS_PATH = project_root / 'data' / 'processed' / 'self_training_experiments' / 'metrics_tau_0_9.json'

FM_METRICS_PATH = Path.cwd().parent / 'data' / 'processed' / 'flexmatch_experiments' / 'metrics_flexmatch.json'
if not FM_METRICS_PATH.parent.exists():
    FM_METRICS_PATH = project_root / 'data' / 'processed' / 'flexmatch_experiments' / 'metrics_flexmatch.json'

# Load if exists
st_metrics = None
fm_metrics = None

if ST_METRICS_PATH.exists():
    with open(ST_METRICS_PATH, 'r') as f:
        st_metrics = json.load(f)
    print("[OK] Loaded Self-Training metrics")

if FM_METRICS_PATH.exists():
    with open(FM_METRICS_PATH, 'r') as f:
        fm_metrics = json.load(f)
    print("[OK] Loaded FlexMatch metrics")

In [None]:
# Create comparison dataframe
comparison_data = []

if st_metrics:
    comparison_data.append({
        'Method': 'Self-Training (τ=0.9)',
        'Accuracy': st_metrics['accuracy'],
        'F1-macro': st_metrics['f1_macro']
    })

if fm_metrics:
    comparison_data.append({
        'Method': 'FlexMatch',
        'Accuracy': fm_metrics['accuracy'],
        'F1-macro': fm_metrics['f1_macro']
    })

comparison_data.append({
    'Method': 'Label Spreading',
    'Accuracy': best_metrics['accuracy'],
    'F1-macro': best_metrics['f1_macro']
})

comparison_df = pd.DataFrame(comparison_data)

print("\n" + "="*60)
print("METHOD COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

## 6. Visualizations

In [None]:
# Create output directory
OUTPUT_DIR = Path.cwd().parent / 'data' / 'processed' / 'label_spreading_experiments'
if not OUTPUT_DIR.parent.exists():
    OUTPUT_DIR = project_root / 'data' / 'processed' / 'label_spreading_experiments'

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {OUTPUT_DIR}")

### 6.1. Method Comparison Chart

In [None]:
# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

colors = ['#3498db', '#e74c3c', '#2ecc71']

# Accuracy
axes[0].bar(comparison_df['Method'], comparison_df['Accuracy'], color=colors[:len(comparison_df)])
axes[0].set_ylabel('Accuracy', fontsize=12)
axes[0].set_title('Test Accuracy Comparison', fontsize=14, fontweight='bold')
axes[0].set_ylim([0.5, 0.65])
axes[0].tick_params(axis='x', rotation=15)

for i, v in enumerate(comparison_df['Accuracy']):
    axes[0].text(i, v + 0.005, f'{v:.4f}', ha='center', fontweight='bold')

# F1-macro
axes[1].bar(comparison_df['Method'], comparison_df['F1-macro'], color=colors[:len(comparison_df)])
axes[1].set_ylabel('F1-macro', fontsize=12)
axes[1].set_title('Test F1-macro Comparison', fontsize=14, fontweight='bold')
axes[1].set_ylim([0.45, 0.60])
axes[1].tick_params(axis='x', rotation=15)

for i, v in enumerate(comparison_df['F1-macro']):
    axes[1].text(i, v + 0.005, f'{v:.4f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'method_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] Comparison chart saved")

### 6.2. Per-Class F1-Score

In [None]:
# Extract per-class F1 scores
ls_f1_per_class = [best_metrics['report'][c]['f1-score'] for c in AQI_CLASSES]

# Plot
plt.figure(figsize=(12, 6))
x = np.arange(len(AQI_CLASSES))

plt.bar(x, ls_f1_per_class, color='#2ecc71', alpha=0.8, label='Label Spreading')

if st_metrics:
    st_f1_per_class = [st_metrics['report'][c]['f1-score'] for c in AQI_CLASSES]
    plt.plot(x, st_f1_per_class, 'o-', color='#3498db', linewidth=2, 
             markersize=8, label='Self-Training', alpha=0.7)

plt.ylabel('F1-Score', fontsize=12)
plt.title('Per-Class F1-Score: Label Spreading', fontsize=14, fontweight='bold')
plt.xticks(x, [c.replace('_', ' ') for c in AQI_CLASSES], rotation=15, ha='right')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'per_class_f1.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] Per-class F1 chart saved")

### 6.3. Training Time Comparison

In [None]:
# Compare training times
time_data = []

for key, data in label_spread_results.items():
    time_data.append({
        'Config': key,
        'Time (s)': data['time'],
        'F1-macro': data['results']['test_metrics']['f1_macro']
    })

time_df = pd.DataFrame(time_data)

print("\n" + "="*60)
print("TRAINING TIME ANALYSIS")
print("="*60)
print(time_df.to_string(index=False))

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(time_df['Time (s)'], time_df['F1-macro'], 
                     s=200, c=time_df['F1-macro'], cmap='viridis', alpha=0.7)

for i, row in time_df.iterrows():
    ax.annotate(row['Config'], (row['Time (s)'], row['F1-macro']), 
                xytext=(5, 5), textcoords='offset points', fontsize=9)

ax.set_xlabel('Training Time (seconds)', fontsize=12)
ax.set_ylabel('F1-macro', fontsize=12)
ax.set_title('Training Time vs Performance', fontsize=14, fontweight='bold')
plt.colorbar(scatter, label='F1-macro')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'time_vs_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n[OK] Training time chart saved")

## 7. Save Results

In [None]:
# Save best metrics
with open(OUTPUT_DIR / 'metrics_label_spreading.json', 'w') as f:
    json.dump(best_metrics, f, indent=2)

# Save comparison
comparison_df.to_csv(OUTPUT_DIR / 'method_comparison.csv', index=False)

# Save summary
summary = {
    'best_config': label_spread_results[best_key]['params'],
    'best_metrics': {
        'accuracy': best_metrics['accuracy'],
        'f1_macro': best_metrics['f1_macro']
    },
    'training_time': label_spread_results[best_key]['time'],
    'sampled': best_metrics['sampled'],
    'sample_size': best_metrics['sample_size'],
    'all_configs': [
        {
            'config': data['params'],
            'accuracy': data['results']['test_metrics']['accuracy'],
            'f1_macro': data['results']['test_metrics']['f1_macro'],
            'time': data['time']
        }
        for key, data in label_spread_results.items()
    ]
}

with open(OUTPUT_DIR / 'label_spreading_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(json.dumps(summary, indent=2))
print("\n[OK] All results saved to:", OUTPUT_DIR)

## 8. Analysis & Conclusion

**Label Spreading Characteristics:**
- Uses manifold structure → better for clustered data
- Single optimization → faster than iterative methods
- Memory intensive → requires sampling for large datasets

**When to use Label Spreading:**
- Data has clear manifold structure
- Dataset size manageable (< 100K samples)
- Want deterministic results (no randomness in pseudo-labeling)

**When to use Self-Training/FlexMatch:**
- Large datasets (> 100K samples)
- Need iterative refinement
- Want to control pseudo-label selection explicitly