In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from collections import defaultdict
import time
import gc
import pickle
import joblib
import matplotlib.pyplot as plt

In [1]:
train_data_path = 'air_quality_data.parquet'

In [2]:
def transform_data(file_path, lat_bins=None, lon_bins=None, le_location=None):
    """Transform raw data into features for model training/testing"""
    df = pd.read_parquet(file_path)
    
    # Use provided bins or compute from data
    if lat_bins is None or lon_bins is None:
        lat_min, lat_max = df['lat'].min(), df['lat'].max()
        lon_min, lon_max = df['lon'].min(), df['lon'].max()
        grid_size = 20
        lat_bins = np.linspace(lat_min, lat_max, grid_size + 1)
        lon_bins = np.linspace(lon_min, lon_max, grid_size + 1)
    
    # Create grid positions
    lat_positions = pd.cut(df['lat'], bins=lat_bins, labels=False, include_lowest=True)
    lon_positions = pd.cut(df['lon'], bins=lon_bins, labels=False, include_lowest=True)
    df['location'] = lat_positions * 20 + lon_positions
    df['location'] = df['location'].fillna(0).astype(int)
    
    # Process class and time features
    df['class'] = df['class'].isin(['Good', 'Moderate']).astype(int)
    df['time'] = pd.to_datetime(df['time'])
    df['year'] = df['time'].dt.year
    df['month'] = df['time'].dt.month
    df['day'] = df['time'].dt.day
    df['hour'] = df['time'].dt.hour
    
    # Select columns
    list_of_columns = ['class', 'PM25_MERRA2', 'DUCMASS', 'TOTANGSTR', 'DUFLUXV', 'SSFLUXV', 'DUFLUXU', 'BCCMASS', 'SSSMASS25', 'location']
    selected_columns = list_of_columns + ['year', 'month', 'day', 'hour']
    df = df[selected_columns]
    
    # Encode location
    if le_location is None:
        le_location = LabelEncoder()
        df['location_encoded'] = le_location.fit_transform(df['location'])
    else:
        # Handle unseen locations in test data
        df['location_encoded'] = df['location'].map(
            lambda x: le_location.transform([x])[0] if x in le_location.classes_ else -1
        )
    
    df = df.sort_values(['location_encoded', 'year', 'month', 'day', 'hour'])
    
    # Define feature columns
    feature_columns = [col for col in df.columns if col not in [
        'class', 'location', 'year', 'month', 'day', 'hour', 'location_encoded'
    ]]
    df.drop(columns='location', inplace=True)
    feature_columns.append('location_encoded')
    
    return df, feature_columns, lat_bins, lon_bins, le_location

In [3]:
def create_sequences_memory_efficient(df, feature_columns, scaler=None, 
                                    input_len=168, output_len=72, stride=24):
    """Create sequences for multi-step forecasting"""
    print(f"Creating sequences with input length={input_len}, output length={output_len}...")
    
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df[feature_columns])
    else:
        X_scaled = scaler.transform(df[feature_columns])
    
    X_scaled = pd.DataFrame(X_scaled, columns=feature_columns, index=df.index)
    X_sequences, y_sequences, location_indices = [], [], []
    unique_locations = df['location_encoded'].unique()

    for i, loc in enumerate(unique_locations):
        loc_df = df[df['location_encoded'] == loc]
        loc_X = X_scaled.loc[loc_df.index]
        loc_y = loc_df['class'].values

        max_start_idx = len(loc_df) - input_len - output_len

        for j in range(0, max_start_idx, stride):
            X_seq = loc_X.iloc[j : j + input_len].values
            y_target = loc_y[j + input_len : j + input_len + output_len]

            X_sequences.append(X_seq)
            y_sequences.append(y_target)
            location_indices.append(loc)

        if (i+1) % 100 == 0:
            print(f"Processed location {i+1}/{len(unique_locations)}")

    X_sequences = np.array(X_sequences, dtype=np.float32)
    y_sequences = np.array(y_sequences, dtype=np.float32)
    location_indices = np.array(location_indices)

    print(f"Total sequences: {X_sequences.shape[0]}")
    print(f"Input sequence shape: {X_sequences.shape}")
    print(f"Output sequence shape: {y_sequences.shape}")

    return X_sequences, y_sequences, location_indices, scaler

In [4]:
class ClassicalLSTMModel(nn.Module):
    """LSTM model for multi-step time-series forecasting"""
    def __init__(self, n_features, n_lstm_units=128, num_layers=4, output_len=72):
        super(ClassicalLSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_lstm_units,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.classifier = nn.Linear(n_lstm_units, output_len)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        final_lstm_output = lstm_out[:, -1, :]
        output = self.classifier(final_lstm_output)
        return torch.sigmoid(output)

In [5]:
def train_model_pytorch(model, train_loader, val_loader, epochs=25, patience=5):
    """Train the PyTorch model"""
    print("Starting training with PyTorch...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

    history = defaultdict(list)
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        epoch_start_time = time.time()
        model.train()
        train_loss, train_correct, train_samples = 0, 0, 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * X_batch.size(0)
            preds = (outputs > 0.5).float()
            train_correct += (preds == y_batch).sum().item()
            train_samples += y_batch.size(0) * y_batch.size(1)

        train_loss /= train_samples
        train_acc = train_correct / train_samples
        history['loss'].append(train_loss)
        history['accuracy'].append(train_acc)

        # Validation
        model.eval()
        val_loss, val_correct, val_samples = 0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

                val_loss += loss.item() * X_batch.size(0)
                preds = (outputs > 0.5).float()
                val_correct += (preds == y_batch).sum().item()
                val_samples += y_batch.size(0) * y_batch.size(1)

        val_loss /= val_samples
        val_acc = val_correct / val_samples
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)

        scheduler.step(val_loss)

        epoch_duration = time.time() - epoch_start_time
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Loss: {train_loss:.4f}, Acc: {train_acc:.4f} - "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f} - "
              f"Duration: {epoch_duration:.2f}s")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_classical_lstm_model.pth')
            patience_counter = 0
            print("  -> Validation loss improved. Saving model.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("  -> Early stopping triggered.")
                break

    model.load_state_dict(torch.load('best_classical_lstm_model.pth'))
    return model, history

In [6]:
def plot_training_history(history):
    """Plot training history"""
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history['loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history['accuracy'], label='Training Accuracy')
    plt.plot(history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (per timestep)')
    plt.legend()

    plt.tight_layout()
    plt.savefig('classical_training_history_multistep.png', dpi=300, bbox_inches='tight')
    plt.show()

In [7]:
if torch.cuda.is_available():
    print(f"✅ Found GPU: {torch.cuda.get_device_name(0)}. Using CUDA.")
else:
    print("❌ No GPU found. The script will run on the CPU.")

✅ Found GPU: NVIDIA GeForce RTX 3050 Laptop GPU. Using CUDA.


In [None]:
df, feature_columns, lat_bins, lon_bins, le_location = transform_data(train_data_path)

# Create sequences
X_sequences, y_sequences, location_indices, scaler = create_sequences_memory_efficient(
    df, feature_columns, input_len=168, output_len=72, stride=24
)

# Split data
X_train, X_val, y_train, y_val, loc_train, loc_val = train_test_split(
    X_sequences, y_sequences, location_indices,
    test_size=0.2, random_state=42, stratify=location_indices
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Validation target shape: {y_val.shape}")

# Clean up memory
del X_sequences, y_sequences, location_indices
gc.collect()

Creating sequences with input length=168, output length=72...
Processed location 100/400
Processed location 200/400
Processed location 300/400
Processed location 400/400
Total sequences: 579600
Input sequence shape: (579600, 168, 9)
Output sequence shape: (579600, 72)
Training set shape: (463680, 168, 9)
Validation set shape: (115920, 168, 9)
Training target shape: (463680, 72)
Validation target shape: (115920, 72)


0

In [9]:
batch_size = 512
train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_dataset = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
n_lstm_units = 128
num_layers = 4
model = ClassicalLSTMModel(
    n_features=len(feature_columns),
    n_lstm_units=n_lstm_units,
    num_layers=num_layers,
    output_len=72
)

In [12]:
model, history = train_model_pytorch(model, train_loader, val_loader, epochs=30, patience=5)

Starting training with PyTorch...


KeyboardInterrupt: 

In [None]:
plot_training_history(history)

In [13]:
preprocessing_state = {
    'lat_bins': lat_bins,
    'lon_bins': lon_bins,
    'le_location': le_location,
    'feature_columns': feature_columns,
    'scaler': scaler
}

with open('preprocessing_state.pkl', 'wb') as f:
    pickle.dump(preprocessing_state, f)

joblib.dump(scaler, 'scaler.pkl')

print("Training complete. Model and preprocessing objects saved.")

Training complete. Model and preprocessing objects saved.
