# Air Guard Advanced Semi-Supervised Learning
## FlexMatch-lite và Label Spreading

Notebook này triển khai và kiểm thử hai phương pháp học bán giám sát (semi-supervised) nâng cao:

### 1. FlexMatch-lite
- **Dynamic threshold**: τ_c = AvgConf_c × τ_base
- **Focal loss** để xử lý class imbalance
- **Bias correction** cho các lớp hiếm

### 2. Label Spreading
- **Graph-based propagation** dựa trên feature similarity
- Tránh **confirmation bias** bằng cách sử dụng global structure
- Tự động xử lý **class imbalance** qua neighbor weights

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Import our custom libraries
import sys
sys.path.append('../src')

from semi_supervised_library import (
    SemiDataConfig, FlexMatchConfig, LabelSpreadingConfig,
    FlexMatchAQIClassifier, LabelSpreadingAQIClassifier,
    run_flexmatch, run_label_spreading,
    mask_labels_time_aware, AQI_CLASSES
)

In [None]:
# Read the feature-engineered dataset from the processed-data folder
DATA_PATH = Path("../data/processed")
features_csv = DATA_PATH / "features_with_aqi.csv"

df_features = pd.read_csv(features_csv)
# Convert the timestamp column so min/max and later time comparisons use real datetimes
df_features['datetime'] = pd.to_datetime(df_features['datetime'])

# Sanity summary: table size, covered period and class balance
first_ts = df_features['datetime'].min()
last_ts = df_features['datetime'].max()
print(f"Dataset shape: {df_features.shape}")
print(f"Date range: {first_ts} to {last_ts}")
print("\nAQI Class distribution:")
print(df_features['aqi_class'].value_counts())

# Final expression of the cell: the notebook renders it as a preview table
df_features.head()

In [None]:
# Experiment configuration: data split plus the two semi-supervised methods.

# Data settings; `cutoff` is the timestamp later used to separate the
# training period from the rest of the data.
data_cfg = SemiDataConfig(target_col="aqi_class", cutoff="2017-01-01", random_state=42)

# FlexMatch-lite hyper-parameters
flexmatch_params = dict(
    tau_base=0.60,        # base threshold (tau_base in tau_c = AvgConf_c x tau_base)
    max_iter=10,
    min_new_per_iter=20,
    focal_alpha=0.25,     # focal-loss parameters
    focal_gamma=2.0,
    threshold_warmup=3,
)
flexmatch_cfg = FlexMatchConfig(**flexmatch_params)

# Label Spreading hyper-parameters
label_spreading_params = dict(
    kernel="rbf",
    gamma=20,
    alpha=0.2,
    max_iter=30,
    n_neighbors=7,
)
label_spreading_cfg = LabelSpreadingConfig(**label_spreading_params)

print("Configurations setup complete!")

In [None]:
# Build the semi-supervised dataset: request that 95% of the labels be masked
df_semi = mask_labels_time_aware(df_features, cfg=data_cfg, missing_fraction=0.95)

# Rows before the cutoff make up the training period.
# NOTE(review): `is_labeled` is presumably True for rows that kept their label - confirm in the library.
train_mask = df_semi['datetime'] < pd.Timestamp(data_cfg.cutoff)
labeled_mask = df_semi['is_labeled']

# Count each subset once and reuse the numbers in the report below
n_train = train_mask.sum()
n_labeled = (train_mask & labeled_mask).sum()
n_unlabeled = (train_mask & ~labeled_mask).sum()
labeled_pct = n_labeled / n_train * 100

print(f"Training set size: {n_train:,}")
print(f"Labeled training samples: {n_labeled:,} ({labeled_pct:.1f}%)")
print(f"Unlabeled training samples: {n_unlabeled:,}")

# Class balance among the training rows that are still labeled
print("\nLabeled training class distribution:")
labeled_train = df_semi[train_mask & labeled_mask]
print(labeled_train['aqi_class'].value_counts())

## FlexMatch-lite Training

In [None]:
# Fit FlexMatch-lite on the masked dataset and report its test metrics
print("Training FlexMatch-lite model...")
flexmatch_results = run_flexmatch(df_semi, data_cfg, flexmatch_cfg)

print("\nFlexMatch Test Results:")
fm_test_metrics = flexmatch_results['test_metrics']
print(f"Accuracy: {fm_test_metrics['accuracy']:.4f}")
print(f"F1-macro: {fm_test_metrics['f1_macro']:.4f}")

# Per-iteration history as a table; only the summary columns are printed
fm_history = pd.DataFrame(flexmatch_results['history'])
fm_history_columns = ['iter', 'val_accuracy', 'val_f1_macro', 'new_pseudo', 'unlabeled_pool']
print("\nFlexMatch Training History:")
print(fm_history[fm_history_columns])

In [None]:
# Visualize the FlexMatch-lite run as a 2x2 dashboard:
# per-class thresholds, validation scores, pseudo labels per iteration,
# and the size of the remaining unlabeled pool.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('FlexMatch-lite Training Analysis', fontsize=16, fontweight='bold')

# Plot 1: dynamic thresholds over iterations
ax1 = axes[0, 0]
threshold_data = []
if 'class_thresholds' in fm_history.columns:
    # Walk the two needed columns in lockstep instead of materialising every row
    for iteration, thresholds in zip(fm_history['iter'], fm_history['class_thresholds']):
        if not isinstance(thresholds, dict):
            # Iterations without a recorded threshold dict are skipped
            continue
        threshold_data.extend(
            {'iteration': iteration, 'class': cls, 'threshold': threshold}
            for cls, threshold in thresholds.items()
        )

# Title the panel even when there is nothing to draw, so the grid stays readable
ax1.set_title('Dynamic Thresholds by Class')
if threshold_data:
    threshold_df = pd.DataFrame(threshold_data)
    for cls in AQI_CLASSES:
        cls_data = threshold_df[threshold_df['class'] == cls]
        if len(cls_data) > 0:
            ax1.plot(cls_data['iteration'], cls_data['threshold'],
                     marker='o', label=cls, linewidth=2)
    ax1.set_xlabel('Iteration')
    ax1.set_ylabel('Threshold')
    # Only build a legend when at least one labelled line was drawn
    if ax1.get_legend_handles_labels()[0]:
        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax1.grid(True, alpha=0.3)
else:
    ax1.text(0.5, 0.5, 'No class thresholds recorded',
             ha='center', va='center', transform=ax1.transAxes)

# Plot 2: validation performance
ax2 = axes[0, 1]
ax2.plot(fm_history['iter'], fm_history['val_accuracy'], 'b-o', label='Accuracy', linewidth=2)
ax2.plot(fm_history['iter'], fm_history['val_f1_macro'], 'r-s', label='F1-macro', linewidth=2)
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Score')
ax2.set_title('Validation Performance')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Plot 3: pseudo labels added per iteration
ax3 = axes[1, 0]
ax3.bar(fm_history['iter'], fm_history['new_pseudo'], alpha=0.7, color='green')
ax3.set_xlabel('Iteration')
ax3.set_ylabel('New Pseudo Labels')
ax3.set_title('Pseudo Labels Added per Iteration')
ax3.grid(True, alpha=0.3)

# Plot 4: unlabeled pool size
ax4 = axes[1, 1]
ax4.plot(fm_history['iter'], fm_history['unlabeled_pool'], 'purple', marker='d', linewidth=2)
ax4.set_xlabel('Iteration')
ax4.set_ylabel('Unlabeled Pool Size')
ax4.set_title('Remaining Unlabeled Samples')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Label Spreading Training

In [None]:
# Fit Label Spreading on the masked dataset and report its test metrics
print("Training Label Spreading model...")
label_spreading_results = run_label_spreading(df_semi, data_cfg, label_spreading_cfg)

print("\nLabel Spreading Test Results:")
ls_test_metrics = label_spreading_results['test_metrics']
print(f"Accuracy: {ls_test_metrics['accuracy']:.4f}")
print(f"F1-macro: {ls_test_metrics['f1_macro']:.4f}")

# Propagation summary as a table; only the summary columns are printed
ls_history = pd.DataFrame(label_spreading_results['history'])
ls_history_columns = ['method', 'val_accuracy', 'val_f1_macro',
                      'originally_unlabeled', 'labels_propagated']
print("\nLabel Spreading Results:")
print(ls_history[ls_history_columns])

## Comparison with Baseline Methods

In [None]:
# Load previously saved results of the reference methods, if they exist.
# All three files are read into a temporary dict first, so the result
# variables are assigned only when every file is present: a missing later
# file can no longer leave a half-populated set of baselines behind.
baseline_files = {
    'baseline': DATA_PATH / 'baseline_results.json',
    'self_training': DATA_PATH / 'self_training_results.json',
    'co_training': DATA_PATH / 'co_training_results.json',
}

loaded_baselines = {}
try:
    for baseline_key, baseline_path in baseline_files.items():
        with open(baseline_path, 'r', encoding='utf-8') as f:
            loaded_baselines[baseline_key] = json.load(f)
except FileNotFoundError as err:
    # Best-effort: the comparison simply runs without the reference methods
    print("Baseline results not found. Running comparison with FlexMatch and Label Spreading only.")
    print(f"  Missing file: {err.filename}")
    baseline_loaded = False
else:
    baseline_results = loaded_baselines['baseline']
    self_training_results = loaded_baselines['self_training']
    co_training_results = loaded_baselines['co_training']
    baseline_loaded = True
    print("Baseline results loaded successfully!")

In [None]:
# Build one comparison table over every method with available results.


def _comparison_row(method, method_type, results, key_feature):
    """Return one comparison-table row built from a results dict.

    `results` must hold a 'test_metrics' mapping with 'accuracy' and
    'f1_macro' entries; a missing key raises KeyError.
    """
    metrics = results['test_metrics']
    return {
        'Method': method,
        'Type': method_type,
        'Accuracy': metrics['accuracy'],
        'F1-Macro': metrics['f1_macro'],
        'Key Feature': key_feature,
    }


# The two advanced methods trained in this notebook are always present
comparison_results = [
    _comparison_row('FlexMatch-lite', 'Advanced Semi-Supervised',
                    flexmatch_results, 'Dynamic thresholds + Focal loss'),
    _comparison_row('Label Spreading', 'Advanced Semi-Supervised',
                    label_spreading_results, 'Graph-based propagation'),
]

# Reference methods are added only when their saved results were loaded
if baseline_loaded:
    comparison_results.extend([
        _comparison_row('Supervised Baseline', 'Supervised',
                        baseline_results, 'Traditional supervised learning'),
        _comparison_row('Self-Training', 'Basic Semi-Supervised',
                        self_training_results, 'Fixed threshold pseudo-labeling'),
        _comparison_row('Co-Training', 'Basic Semi-Supervised',
                        co_training_results, 'Two-view collaboration'),
    ])

# Best F1-macro first
comparison_df = pd.DataFrame(comparison_results)
comparison_df = comparison_df.sort_values('F1-Macro', ascending=False)

print("\n" + "="*80)
print("COMPREHENSIVE METHOD COMPARISON")
print("="*80)
# `to_string` documents float_format as a one-parameter function, so pass a
# callable rather than a '%'-style format string.
print(comparison_df.to_string(index=False, float_format='{:.4f}'.format))
print("="*80)

In [None]:
# Bar charts comparing test accuracy and F1-macro across all methods.
from matplotlib.patches import Patch

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Advanced Semi-Supervised Methods Comparison', fontsize=16, fontweight='bold')

# One colour per method family, computed once and shared by both panels
bar_colors = ['red' if 'Advanced' in t else 'blue' if 'Basic' in t else 'green'
              for t in comparison_df['Type']]

# (axes, metric column / y-label, panel title)
panels = [
    (axes[0], 'Accuracy', 'Test Accuracy Comparison'),
    (axes[1], 'F1-Macro', 'F1-Macro Score Comparison'),
]

for ax, metric, title in panels:
    bars = ax.bar(comparison_df['Method'], comparison_df[metric],
                  color=bar_colors, alpha=0.7)
    ax.set_title(title)
    ax.set_ylabel(metric)
    # Restyle the existing tick labels instead of overwriting them with
    # set_xticklabels, which depends on the current tick positions.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

    # Value label just above each bar
    for bar, value in zip(bars, comparison_df[metric]):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                f'{value:.4f}', ha='center', va='bottom', fontweight='bold')

# Shared legend explaining the colour coding
legend_elements = [
    Patch(facecolor='red', alpha=0.7, label='Advanced Semi-Supervised'),
    Patch(facecolor='blue', alpha=0.7, label='Basic Semi-Supervised'),
    Patch(facecolor='green', alpha=0.7, label='Supervised')
]
# Anchor the legend inside the bottom edge of the figure ...
fig.legend(handles=legend_elements, loc='lower center', bbox_to_anchor=(0.5, 0.0), ncol=3)

# ... and keep the bottom strip free so the axes do not overlap it
plt.tight_layout(rect=(0, 0.06, 1, 1))
plt.show()

## Detailed Analysis

In [None]:
# Per-class precision / recall / F1 for both advanced methods.
from sklearn.metrics import classification_report


def _class_report(pred_df):
    """Return the classification report for a frame with 'y_true' and 'y_pred'.

    When every observed label is one of AQI_CLASSES, the report is pinned to
    that class list via `labels=`. Without it, `target_names` is matched to
    the sorted observed labels, which can attach names to the wrong class and
    raises ValueError when a class is absent from the data.
    Otherwise (e.g. integer-encoded labels) the original call is kept.
    """
    y_true = pred_df['y_true']
    y_pred = pred_df['y_pred']
    observed = set(y_true) | set(y_pred)
    if observed <= set(AQI_CLASSES):
        return classification_report(y_true, y_pred, labels=list(AQI_CLASSES))
    # NOTE(review): labels are not AQI class names here; this assumes the
    # sorted label order lines up with AQI_CLASSES - confirm in the library.
    return classification_report(y_true, y_pred, target_names=AQI_CLASSES)


print("\n" + "="*60)
print("CLASS-WISE PERFORMANCE ANALYSIS")
print("="*60)

print("\nFlexMatch-lite Classification Report:")
print(_class_report(flexmatch_results['pred_df']))

print("\nLabel Spreading Classification Report:")
print(_class_report(label_spreading_results['pred_df']))

In [None]:
# Side-by-side confusion matrices for the two advanced methods
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Confusion Matrix Comparison: Advanced Methods', fontsize=16, fontweight='bold')

# Stored matrices converted to arrays for the heatmaps
fm_cm = np.array(flexmatch_results['test_metrics']['confusion_matrix'])
ls_cm = np.array(label_spreading_results['test_metrics']['confusion_matrix'])

# (axes, matrix, colour map, panel title)
heatmap_panels = [
    (axes[0], fm_cm, 'Blues', 'FlexMatch-lite'),
    (axes[1], ls_cm, 'Oranges', 'Label Spreading'),
]

for panel_ax, matrix, color_map, panel_title in heatmap_panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=color_map,
                xticklabels=AQI_CLASSES, yticklabels=AQI_CLASSES, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('True Label')
    panel_ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## Key Findings and Insights

In [None]:
# Text summary of the run: per-method statistics, a head-to-head comparison
# and a fixed list of qualitative notes.
heavy_rule = "=" * 80
light_rule = "-" * 40


def _section(title):
    """Print a numbered section heading followed by a dashed rule."""
    print("\n" + title)
    print(light_rule)


print("\n" + heavy_rule)
print("KEY FINDINGS AND INSIGHTS")
print(heavy_rule)

_section("1. FLEXMATCH-LITE ANALYSIS:")
if len(fm_history):
    total_pseudo = fm_history['new_pseudo'].sum()
    # Validation F1-macro of the last recorded iteration
    final_val_f1 = fm_history['val_f1_macro'].iloc[-1]
    print(f"   • Total pseudo labels generated: {total_pseudo:,}")
    print(f"   • Final validation F1-macro: {final_val_f1:.4f}")
    print(f"   • Training iterations: {len(fm_history)}")
    print("   • Dynamic thresholds adapted based on class confidence")
    print("   • Addressed class imbalance through lower thresholds for rare classes")

_section("2. LABEL SPREADING ANALYSIS:")
if len(ls_history):
    # Only the first history record is summarised here
    ls_info = ls_history.iloc[0]
    print(f"   • Originally unlabeled: {ls_info['originally_unlabeled']:,}")
    print(f"   • Labels propagated: {ls_info['labels_propagated']:,}")
    print(f"   • Kernel: {ls_info['kernel']} (gamma={ls_info['gamma']})")
    print(f"   • Alpha parameter: {ls_info['alpha']} (regularization)")
    print("   • Avoided confirmation bias through graph-based approach")

_section("3. PERFORMANCE COMPARISON:")
fm_metrics = flexmatch_results['test_metrics']
ls_metrics = label_spreading_results['test_metrics']
fm_acc, fm_f1 = fm_metrics['accuracy'], fm_metrics['f1_macro']
ls_acc, ls_f1 = ls_metrics['accuracy'], ls_metrics['f1_macro']

# Ranked by test F1-macro; a tie goes to Label Spreading
if fm_f1 > ls_f1:
    better_method = "FlexMatch-lite"
else:
    better_method = "Label Spreading"
print(f"   • Best performing method: {better_method}")
print(f"   • FlexMatch - Accuracy: {fm_acc:.4f}, F1-macro: {fm_f1:.4f}")
print(f"   • Label Spreading - Accuracy: {ls_acc:.4f}, F1-macro: {ls_f1:.4f}")

_section("4. ADVANTAGES OBSERVED:")
# Static notes, emitted as a single newline-joined block
print("\n".join([
    "   FlexMatch-lite:",
    "   • Dynamic thresholds improved recall for rare AQI classes",
    "   • Iterative learning allowed gradual confidence building",
    "   • Focal loss helped balance class representation",
    "\n   Label Spreading:",
    "   • Global graph structure reduced local bias",
    "   • One-shot propagation more computationally efficient",
    "   • Natural smoothness assumptions for time-series data",
]))

print("\n" + heavy_rule)

In [None]:
# Persist configs, metrics and histories of both advanced methods, plus the
# comparison table, under the processed-data folder.
results_dir = Path("../data/processed/advanced_semi_results/")
# parents=True: also create missing parent folders instead of failing
results_dir.mkdir(parents=True, exist_ok=True)


def _json_default(obj):
    """Convert values the json module cannot serialise natively.

    NumPy scalars become Python scalars and NumPy arrays become lists, so
    numbers stay numbers in the output file. Anything else falls back to
    its string form.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


# Save FlexMatch results
flexmatch_save = {
    'config': {
        'tau_base': flexmatch_cfg.tau_base,
        'max_iter': flexmatch_cfg.max_iter,
        'focal_alpha': flexmatch_cfg.focal_alpha,
        'focal_gamma': flexmatch_cfg.focal_gamma
    },
    'test_metrics': flexmatch_results['test_metrics'],
    'history': flexmatch_results['history']
}

with open(results_dir / "flexmatch_results.json", 'w', encoding='utf-8') as f:
    json.dump(flexmatch_save, f, indent=2, default=_json_default)

# Save Label Spreading results
label_spreading_save = {
    'config': {
        'kernel': label_spreading_cfg.kernel,
        'gamma': label_spreading_cfg.gamma,
        'alpha': label_spreading_cfg.alpha,
        'n_neighbors': label_spreading_cfg.n_neighbors
    },
    'test_metrics': label_spreading_results['test_metrics'],
    'history': label_spreading_results['history']
}

with open(results_dir / "label_spreading_results.json", 'w', encoding='utf-8') as f:
    json.dump(label_spreading_save, f, indent=2, default=_json_default)

# Save comparison results
comparison_df.to_csv(results_dir / "method_comparison.csv", index=False)

print(f"\nResults saved to {results_dir}")
print("Files created:")
print("  • flexmatch_results.json")
print("  • label_spreading_results.json")
print("  • method_comparison.csv")