In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from collections import defaultdict
import time

In [8]:
df = pd.read_parquet(r"C:\Users\ziyad\Downloads\final_air_quality_data.parquet")

In [None]:
lat_min, lat_max = df['lat'].min(), df['lat'].max()
lon_min, lon_max = df['lon'].min(), df['lon'].max()
grid_size = 20
lat_bins = np.linspace(lat_min, lat_max, grid_size + 1)
lon_bins = np.linspace(lon_min, lon_max, grid_size + 1)

lat_positions = pd.cut(df['lat'], bins=lat_bins, labels=False, include_lowest=True)
lon_positions = pd.cut(df['lon'], bins=lon_bins, labels=False, include_lowest=True)

df['location'] = lat_positions * grid_size + lon_positions

df['location'] = df['location'].fillna(0).astype(int)
df['class'] = df['class'].isin(['Good', 'Moderate']).astype(int)
df['time'] = pd.to_datetime(df['time'])

df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day
df['hour'] = df['time'].dt.hour
df.drop(columns=['lat', 'lon', 'source_file', 'time'], inplace=True)


In [15]:
def load_and_prepare_data(df, year):
    df = df[(df['year'] == year)]
    print(f"Dataset shape: {df.shape}")    
    le_location = LabelEncoder()
    df['location_encoded'] = le_location.fit_transform(df['location'])
    df = df.sort_values(['location_encoded', 'month', 'day', 'hour'])
    feature_columns = [col for col in df.columns if col not in [
        'class', 'location', 'month', 'day', 'hour', 'location_encoded'
    ]]
    feature_columns.append('location_encoded')
    
    print(f"Feature columns: {feature_columns}")
    print(f"Number of features: {len(feature_columns)}")
    
    return df, feature_columns, le_location

In [10]:
def plot_training_history(history):
    """Plot training history for multi-step forecasting"""
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    plt.plot(history['loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(history['accuracy'], label='Training Accuracy')
    plt.plot(history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (per timestep)')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('classical_training_history_multistep.png', dpi=300, bbox_inches='tight')
    plt.show()

def plot_example_predictions(model, X_test, y_test, num_examples=3):
    """Plot example predictions vs actual values"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    with torch.no_grad():
        sample_indices = np.random.choice(len(X_test), num_examples, replace=False)
        
        plt.figure(figsize=(15, 5 * num_examples))
        
        for i, idx in enumerate(sample_indices):
            X_sample = torch.tensor(X_test[idx:idx+1]).to(device)
            y_true = y_test[idx]
            y_pred = model(X_sample).cpu().numpy()[0]
            
            plt.subplot(num_examples, 1, i+1)
            hours = np.arange(1, OUTPUT_SEQUENCE_LENGTH + 1)
            plt.plot(hours, y_true, 'bo-', label='Actual')
            plt.plot(hours, y_pred, 'ro--', label='Predicted')
            plt.title(f'Example {i+1}: Air Quality Prediction (Next {OUTPUT_SEQUENCE_LENGTH} Hours)')
            plt.xlabel('Hours Ahead')
            plt.ylabel('Air Quality Class')
            plt.yticks([0, 1], ['Good', 'Poor'])
            plt.legend()
            plt.grid(True)
        
        plt.tight_layout()
        plt.savefig('example_predictions.png', dpi=300, bbox_inches='tight')
        plt.show()

In [11]:
INPUT_SEQUENCE_LENGTH = 168
OUTPUT_SEQUENCE_LENGTH = 72 

In [12]:
class ClassicalLSTMModel(nn.Module):
    """
    Builds a purely classical LSTM model for multi-step time-series forecasting.
    Predicts the next 3 days (72 hours) of air quality classes.
    """
    def __init__(self, n_features, n_lstm_units=4, output_len=OUTPUT_SEQUENCE_LENGTH):
        super(ClassicalLSTMModel, self).__init__()
        
        # 1. Standard LSTM Layer
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_lstm_units,
            num_layers=1,
            batch_first=True,
            dropout=0.2
        )
        
        # 2. Feed-Forward Classifier for multi-step prediction
        self.classifier = nn.Linear(n_lstm_units, output_len)
        
    def forward(self, x):        
        # 1. Pass data through the classical LSTM
        lstm_out, _ = self.lstm(x)
        
        # 2. Extract features from the last timestep
        final_lstm_output = lstm_out[:, -1, :]
        
        # 3. Pass through classifier to get predictions for all timesteps
        output = self.classifier(final_lstm_output)
        
        # 4. Apply sigmoid activation to get probabilities for each timestep
        return torch.sigmoid(output)

In [None]:
def train_model_pytorch(model, train_loader, val_loader, optimizer, epochs, initial_epoch=0, patience=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    history = {
        'loss': [],
        'val_loss': [],
        'accuracy': [],
        'val_accuracy': []
    }

    criterion = torch.nn.BCELoss() 
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(initial_epoch, epochs):
        # Training
        model.train()
        running_loss, correct, total = 0.0, 0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * X_batch.size(0)
            preds = (outputs > 0.5).float()
            correct += (preds == y_batch).sum().item()
            total += y_batch.numel()

        train_loss = running_loss / len(train_loader.dataset)
        train_acc = correct / total

        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch.float())
                val_loss += loss.item() * X_batch.size(0)
                preds = (outputs > 0.5).float()
                val_correct += (preds == y_batch).sum().item()
                val_total += y_batch.numel()

        val_loss /= len(val_loader.dataset)
        val_acc = val_correct / val_total

        # Append to history
        history['loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['accuracy'].append(train_acc)
        history['val_accuracy'].append(val_acc)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Acc: {train_acc:.4f} Val Acc: {val_acc:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    return model, history, optimizer


In [18]:
def create_sequences_memory_efficient(df, feature_columns, 
                                    input_len=INPUT_SEQUENCE_LENGTH,
                                    output_len=OUTPUT_SEQUENCE_LENGTH,
                                    stride=24):
    """
    Create sequences for multi-step forecasting.
    Input: `input_len` hours of data
    Output: `output_len` hours of predictions
    """
    print(f"Creating sequences with input length={input_len}, output length={output_len}...")
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[feature_columns])
    X_scaled = pd.DataFrame(X_scaled, columns=feature_columns, index=df.index)
    
    X_sequences, y_sequences, location_indices = [], [], []
    unique_locations = df['location_encoded'].unique()
    
    for i, loc in enumerate(unique_locations):
        loc_df = df[df['location_encoded'] == loc]
        loc_X = X_scaled.loc[loc_df.index]
        loc_y = loc_df['class'].values
        
        # Calculate valid range to ensure we have enough data for output sequence
        max_start_idx = len(loc_df) - input_len - output_len
        
        for j in range(0, max_start_idx, stride):
            X_seq = loc_X.iloc[j : j + input_len].values
            # Get the next `output_len` hours of class data
            y_target = loc_y[j + input_len : j + input_len + output_len]
            
            X_sequences.append(X_seq)
            y_sequences.append(y_target)
            location_indices.append(loc)
        
        if (i+1) % 100 == 0:
            print(f"Processed location {i+1}/{len(unique_locations)}")
    
    X_sequences = np.array(X_sequences, dtype=np.float32)
    y_sequences = np.array(y_sequences, dtype=np.float32)
    location_indices = np.array(location_indices)
    
    print(f"Total sequences: {X_sequences.shape[0]}")
    print(f"Input sequence shape: {X_sequences.shape}")
    print(f"Output sequence shape: {y_sequences.shape}")
    
    return X_sequences, y_sequences, location_indices, scaler

In [None]:
model = None
optimizer = None

for idx, year in enumerate(years):
    print(f"\nProcessing year: {year}")
    
    df_year, feature_columns, le_location = load_and_prepare_data(df, year)
    
    if model is None:  # First year, build the model
        n_lstm_units = 8
        model = ClassicalLSTMModel(
            n_features=len(feature_columns), 
            n_lstm_units=n_lstm_units,  
            output_len=OUTPUT_SEQUENCE_LENGTH
        )
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    unique_locs_subset = df_year['location_encoded'].unique()[:400]
    df_subset = df_year[df_year['location_encoded'].isin(unique_locs_subset)]
    
    # Create sequences
    X_sequences, y_sequences, location_indices, scaler = create_sequences_memory_efficient(
        df_subset, 
        feature_columns, 
        input_len=INPUT_SEQUENCE_LENGTH,
        output_len=OUTPUT_SEQUENCE_LENGTH,
        stride=24
    )
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_sequences, y_sequences, test_size=0.2, random_state=42, stratify=location_indices
    )
    
    # Create DataLoaders
    train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
    test_dataset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Free memory
    del X_sequences, y_sequences, location_indices, X_train, X_test, y_train, y_test
    gc.collect()
    
    # Train for this year
    epochs_this_year = 20
    model, history, optimizer = train_model_pytorch(
        model, train_loader, val_loader, optimizer,
        epochs=total_epochs_done + epochs_this_year,
        initial_epoch=total_epochs_done,
        patience=5
    )
    plot_training_history(history)
    total_epochs_done += epochs_this_year
    
    # Merge history
    for key in full_history.keys():
        full_history[key].extend(history[key])



Processing year: 2021
Dataset shape: (2035200, 58)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location_encoded'] = le_location.fit_transform(df['location'])


Feature columns: ['DUEXTTAU', 'BCFLUXU', 'OCFLUXV', 'BCANGSTR', 'SUFLUXV', 'SSSMASS25', 'SSSMASS', 'OCSMASS', 'BCCMASS', 'BCSMASS', 'SO4CMASS', 'SSFLUXU', 'DUCMASS', 'SSEXTTAU', 'SO2CMASS', 'DUSCATAU', 'OCANGSTR', 'OCCMASS', 'TOTEXTTAU', 'DUSCAT25', 'TOTANGSTR', 'DMSCMASS', 'SSEXTT25', 'DUANGSTR', 'DMSSMASS', 'BCEXTTAU', 'SSSCATAU', 'DUFLUXV', 'DUFLUXU', 'SUEXTTAU', 'SSFLUXV', 'BCSCATAU', 'DUCMASS25', 'OCEXTTAU', 'SUANGSTR', 'SSSCAT25', 'SSCMASS25', 'SO4SMASS', 'DUSMASS', 'SUFLUXU', 'BCFLUXV', 'DUSMASS25', 'SSCMASS', 'SUSCATAU', 'SO2SMASS', 'SSANGSTR', 'DUEXTT25', 'OCFLUXU', 'OCSCATAU', 'TOTSCATAU', 'PM25_MERRA2', 'PM25_ug_m3', 'year', 'location_encoded']
Number of features: 54
Creating sequences with input length=168, output length=72...




Processed location 100/400
Processed location 200/400
Processed location 300/400
Processed location 400/400
Total sequences: 80800
Input sequence shape: (80800, 168, 54)
Output sequence shape: (80800, 72)


KeyboardInterrupt: 

In [None]:
plot_training_history(full_history)


X_test_plot = []
y_test_plot = []
for X_batch, y_batch in val_loader:
    X_test_plot.append(X_batch)
    y_test_plot.append(y_batch)
X_test_plot = torch.cat(X_test_plot, dim=0).numpy()
y_test_plot = torch.cat(y_test_plot, dim=0).numpy()

plot_example_predictions(model, X_test_plot, y_test_plot, num_examples=3)