# =============================================================================
# BASELINE MODEL TRAINING NOTEBOOK
# =============================================================================
## Purpose:
    - Load and preprocess the raw weather dataset.
    - Define the Gated Recurrent Unit (GRU) model architecture.
    - Train the baseline GRU model using specified hyperparameters and save its state.
    - Save the preprocessed and split data tensors for subsequent use.
# =============================================================================

# === Clone Repository & Install Dependencies ===

In [None]:
!rm -rf Sustainable_AI_Agent_Project
!git clone https://github.com/trongjhuongwr/Sustainable_AI_Agent_Project.git
%cd Sustainable_AI_Agent_Project

In [None]:
!pip install -q --extra-index-url https://download.pytorch.org/whl/cu121 -r /kaggle/working/Sustainable_AI_Agent_Project/requirements.txt

# 1. Import Libraries and Configuration

In [None]:
import os
import warnings
import logging
import copy
import random

# Suppress warnings for cleaner output.
# NOTE(review): gymnasium is not imported directly anywhere in this notebook; presumably
# it is pulled in by a project dependency -- confirm these gymnasium settings are still needed.
os.environ["GYM_DISABLE_WARNINGS"] = "true"  # presumably read by gym/gymnasium when it is imported -- TODO confirm
warnings.filterwarnings("ignore", module="gymnasium")  # drop warnings raised from modules whose name starts with "gymnasium"
warnings.filterwarnings("ignore", category=UserWarning)  # drop every UserWarning, regardless of which package raises it
logging.getLogger("gymnasium").setLevel(logging.ERROR)  # the "gymnasium" logger only emits ERROR and above

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR
from builtins import print as builtin_print

print("Libraries imported successfully.")

# 2. Configuration Class

In [None]:
# Defines hyperparameters, file paths, and other parameters used throughout the baseline training process.
class Config:
    """Single source of truth for paths, preprocessing settings and hyperparameters.

    Every value is a class attribute, so the class itself (not an instance) is
    passed around and read as ``Config.NAME``.
    """

    # --- Data Paths ---
    # Raw dataset CSV that is read as input.
    DATA_PATH = '/kaggle/input/seattle-weather/seattle-weather.csv'
    # Destination of the dictionary holding the train/val/test tensors.
    PROCESSED_DATA_SAVE_PATH = '/kaggle/working/processed_data.pt'
    # Destination of the trained baseline model's state dictionary.
    BASELINE_MODEL_SAVE_PATH = '/kaggle/working/baseline_model.pth'

    # --- Data Preprocessing Parameters ---
    # Length of each input window: this many consecutive rows predict the next row's label.
    SEQUENCE_LENGTH = 30
    # Fraction of all sequences held out as the final test set.
    TEST_SIZE = 0.2
    # Fraction of the non-test remainder that becomes the validation set.
    VAL_SIZE_FROM_TEMP = 0.1
    # Seed shared by the data splits and every random number generator.
    SEED = 42

    # --- Model Architecture Parameters ---
    # One input per scaled feature: precipitation, temp_max, temp_min, wind.
    INPUT_DIM = 4
    # Width of the GRU hidden state.
    HIDDEN_DIM = 256
    # Number of stacked GRU layers.
    N_LAYERS = 2
    # Single output unit: probability of the positive (rain) class.
    OUTPUT_DIM = 1
    # Dropout probability applied between stacked GRU layers.
    DROPOUT = 0.2

    # --- Training Hyperparameters ---
    BATCH_SIZE = 64
    # Number of full passes over the training set.
    EPOCHS = 500
    LEARNING_RATE = 1e-4
    # Weight decay handed to the AdamW optimizer.
    WEIGHT_DECAY = 1e-4
    # T_max for CosineAnnealingLR: number of scheduler steps for the learning rate to
    # anneal from its initial value down to the minimum (half of a full cosine period).
    SCHEDULER_T_MAX = 50
    # Lowest learning rate the scheduler anneals to.
    SCHEDULER_ETA_MIN = 1e-6

    # --- Computation Device ---
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed the PyTorch, NumPy and Python random number generators from the one configured value.
torch.manual_seed(Config.SEED)
np.random.seed(Config.SEED)
random.seed(Config.SEED)
if torch.cuda.is_available():
    # Seed the current CUDA device first, then every CUDA device.
    torch.cuda.manual_seed(Config.SEED)
    torch.cuda.manual_seed_all(Config.SEED)
    # Optional: uncomment for deterministic cuDNN algorithms (may slow training down).
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

# Echo the effective settings, one per line.
print(
    f"Configuration loaded. Using device: {Config.DEVICE}",
    f"Seed set to: {Config.SEED}",
    f"Processed data will be saved to: {Config.PROCESSED_DATA_SAVE_PATH}",
    f"Baseline model will be saved to: {Config.BASELINE_MODEL_SAVE_PATH}",
    sep="\n",
)

# 3. Data Loading and Preprocessing

In [None]:
# Loads the dataset, performs feature engineering, scales features, creates time sequences,
# splits data into training, validation, and test sets, and converts them to PyTorch tensors.

def create_sequences(input_data, target_data, sequence_length):
    """
    Builds sliding windows over a feature array and pairs each with the next target.

    Window ``i`` covers rows ``i`` .. ``i + sequence_length - 1`` of ``input_data``
    and is paired with ``target_data[i + sequence_length]``, i.e. the value that
    immediately follows the window.
    Args:
        input_data (np.ndarray): Array of input features, indexed by time step.
        target_data (np.ndarray): Array of target values, aligned with ``input_data``.
        sequence_length (int): Number of consecutive rows in each window.
    Returns:
        tuple: (np.ndarray, np.ndarray) with the stacked windows and their targets.
            Both arrays are empty when the data is not longer than ``sequence_length``.
    """
    # One window per start offset that still leaves a following row to use as the target.
    window_starts = range(len(input_data) - sequence_length)
    windows = [input_data[start:start + sequence_length] for start in window_starts]
    next_targets = [target_data[start + sequence_length] for start in window_starts]
    return np.array(windows), np.array(next_targets)

# Read the raw CSV. A missing file is reported and then re-raised so the notebook stops here.
try:
    df = pd.read_csv(Config.DATA_PATH)
except FileNotFoundError:
    builtin_print(f"Error: Dataset file not found at {Config.DATA_PATH}. Please ensure the dataset is correctly added.")
    raise
else:
    builtin_print(f"Dataset loaded successfully from {Config.DATA_PATH}. Shape: {df.shape}")

# Binary target: 1 when the weather label is 'rain' or 'drizzle', 0 for anything else.
df['weather_numeric'] = df['weather'].isin(['rain', 'drizzle']).astype('int64')
# The raw date and weather columns are not used past this point.
df = df.drop(columns=['date', 'weather'])

# Min-max scale the four input features into [0, 1]; the target is left untouched.
# NOTE(review): the scaler is fit on the full dataset before any split, and the
# one-step-shifted windows built below are shuffled across train/val/test, so the
# splits share information. Confirm this is acceptable for the baseline.
features_to_scale = ['precipitation', 'temp_max', 'temp_min', 'wind']
scaler = MinMaxScaler()
scaled_features_np = scaler.fit_transform(df[features_to_scale])
target_np = df['weather_numeric'].values
builtin_print("Input features scaled using MinMaxScaler.")

# Turn the scaled rows into fixed-length windows, each paired with the following day's target.
X_np, y_np = create_sequences(scaled_features_np, target_np, Config.SEQUENCE_LENGTH)
builtin_print(f"Sequences created with length {Config.SEQUENCE_LENGTH}. Shape X: {X_np.shape}, Shape y: {y_np.shape}")

# Stratified two-stage split.
# Stage 1: carve off the test set; stratifying keeps the class ratio of y_np.
X_temp, X_test_np, y_temp, y_test_np = train_test_split(
    X_np,
    y_np,
    test_size=Config.TEST_SIZE,
    random_state=Config.SEED,
    stratify=y_np,
)
# Stage 2: divide what is left into training and validation, again stratified.
X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
    X_temp,
    y_temp,
    test_size=Config.VAL_SIZE_FROM_TEMP,
    random_state=Config.SEED,
    stratify=y_temp,
)
builtin_print(f"Data split completed: Train={len(X_train_np)}, Validation={len(X_val_np)}, Test={len(X_test_np)}")


def _as_float_tensor(array, as_column=False):
    """Converts a NumPy array to float32; with ``as_column`` a dimension of size 1 is inserted at position 1."""
    tensor = torch.tensor(array, dtype=torch.float32)
    return tensor.unsqueeze(1) if as_column else tensor


# Targets become column tensors so they line up with the model's (batch, 1) output in BCELoss.
X_train_tensor = _as_float_tensor(X_train_np)
y_train_tensor = _as_float_tensor(y_train_np, as_column=True)
X_val_tensor = _as_float_tensor(X_val_np)
y_val_tensor = _as_float_tensor(y_val_np, as_column=True)
X_test_tensor = _as_float_tensor(X_test_np)
y_test_tensor = _as_float_tensor(y_test_np, as_column=True)
builtin_print("Data successfully converted to PyTorch tensors.")

# Bundle all six tensors into one file so other notebooks can reuse the exact same split.
processed_data = {
    'X_train': X_train_tensor,
    'y_train': y_train_tensor,
    'X_val': X_val_tensor,
    'y_val': y_val_tensor,
    'X_test': X_test_tensor,
    'y_test': y_test_tensor,
}
# Saving is best-effort: a failure is reported but does not stop the rest of the notebook.
try:
    torch.save(processed_data, Config.PROCESSED_DATA_SAVE_PATH)
except Exception as e:
    builtin_print(f"Error saving processed data: {e}")
else:
    builtin_print(f"Processed data tensors saved to {Config.PROCESSED_DATA_SAVE_PATH}")

# 4. GRU Model Definition

In [None]:
# Defines the architecture of the Gated Recurrent Unit network used for weather prediction.

class WeatherGRU(nn.Module):
    """
    Stacked GRU followed by a linear head and a sigmoid, for binary weather prediction.

    Only the GRU output at the final time step of each sequence is fed to the head.
    Args:
        input_dim (int): Number of features per time step.
        hidden_dim (int): Size of the GRU hidden state.
        n_layers (int): Number of stacked GRU layers.
        output_dim (int): Number of output units (1 for binary classification).
        dropout (float): Dropout probability between GRU layers; ignored when
            ``n_layers`` is 1.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, output_dim, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Dropout only acts between stacked layers, so it is switched off for a single layer.
        inter_layer_dropout = dropout if n_layers > 1 else 0

        # batch_first=True: inputs and outputs are laid out as (batch, seq_len, features).
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        # Linear head projecting a hidden vector onto the output units.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        # Squashes the head's output into (0, 1) so it can be read as a probability.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Runs a batch of sequences through the network.
        Args:
            x (torch.Tensor): Input tensor of shape (batch, sequence_length, input_dim).
        Returns:
            torch.Tensor: Probabilities of shape (batch, output_dim).
        """
        # The GRU's second return value (final hidden state) is not needed here.
        sequence_output, _ = self.gru(x)
        # (batch, seq_len, hidden_dim) -> (batch, hidden_dim): keep only the last time step.
        final_step = sequence_output[:, -1, :]
        return self.sigmoid(self.fc(final_step))


print("WeatherGRU model class defined.")

# 5. Baseline Model Training Function

In [None]:
# Encapsulates the training loop, including loss calculation, optimization,
# learning rate scheduling, validation, and saving the best model state based on validation loss.

def train_baseline_model(model, train_loader, val_loader, config):
    """
    Trains the baseline GRU model and restores the best checkpoint seen on validation.

    Each epoch runs one optimisation pass over ``train_loader`` and one gradient-free
    pass over ``val_loader``. The weights with the lowest mean validation loss are
    kept in memory and loaded back into the model once all epochs have run.
    Args:
        model (nn.Module): The WeatherGRU model instance. It is moved to
            ``config.DEVICE`` and trained in place.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        config (Config): Configuration object; reads LEARNING_RATE, WEIGHT_DECAY,
            SCHEDULER_T_MAX, SCHEDULER_ETA_MIN, DEVICE and EPOCHS.
    Returns:
        nn.Module: The same model object, loaded with the best state observed during
            validation. When at least one epoch ran it is left in evaluation mode.
    Raises:
        ValueError: If either loader contains no batches.
    """
    # The per-epoch averages divide by the number of batches, so an empty loader would
    # otherwise only surface as a ZeroDivisionError part-way through the first epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch.")

    criterion = nn.BCELoss()  # The model already outputs probabilities (sigmoid), hence BCELoss
    # AdamW optimizer with the configured learning rate and weight decay
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY)
    # Cosine annealing learning rate scheduler, stepped once per epoch below
    scheduler = CosineAnnealingLR(optimizer, T_max=config.SCHEDULER_T_MAX, eta_min=config.SCHEDULER_ETA_MIN)

    model.to(config.DEVICE)  # Move model to the configured device (GPU or CPU)
    best_val_loss = float('inf')
    best_model_state = None  # state_dict snapshot of the best epoch; None until one is recorded

    print("\n--- Starting Baseline Model Training ---")
    for epoch in range(config.EPOCHS):
        # --- Training Phase ---
        model.train()  # Enables dropout between GRU layers
        total_train_loss = 0.0

        # Per-epoch progress bar; leave=False removes it once the epoch is done
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS} [Train]", leave=False)
        for inputs, labels in train_bar:
            inputs, labels = inputs.to(config.DEVICE), labels.to(config.DEVICE)

            optimizer.zero_grad()  # Clear gradients left over from the previous batch
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Batch-mean binary cross-entropy
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            total_train_loss += loss.item()
            train_bar.set_postfix(loss=f"{loss.item():.4f}")  # Show the latest batch loss

        # Mean of the per-batch losses (every batch counts equally, even a smaller last one)
        avg_train_loss = total_train_loss / len(train_loader)

        # --- Validation Phase ---
        model.eval()  # Disables dropout
        total_val_loss = 0.0
        with torch.no_grad():  # No gradients are needed for validation
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(config.DEVICE), labels.to(config.DEVICE)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        # Read before scheduler.step() so the logged value is the rate used during this epoch
        current_lr = optimizer.param_groups[0]['lr']

        builtin_print(f"Epoch {epoch+1}/{config.EPOCHS}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, LR={current_lr:.6f}")

        # --- Learning Rate Scheduler Step ---
        scheduler.step()

        # --- Save Best Model State ---
        # Keep the weights that give the lowest validation loss so far
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # deepcopy detaches the snapshot from the live parameters, which keep training
            best_model_state = copy.deepcopy(model.state_dict())
            builtin_print(f"  New best validation loss: {best_val_loss:.4f}. Saving model state.")

    # --- Load Best Model State ---
    # Compare against None explicitly: "was a snapshot taken" must not depend on the
    # truthiness of the state dict itself.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        builtin_print(f"\n--- Best model state loaded (Validation Loss: {best_val_loss:.4f}) ---")
    else:
        # Reached when no epoch improved on the initial infinite loss (e.g. the validation
        # loss was NaN every epoch) or when EPOCHS is 0.
        builtin_print("\n--- Warning: No best model state was saved. Check validation loss behavior. ---")

    print("--- Baseline Model Training Finished ---")
    return model

print("Baseline model training function defined.")

# 6. Execute Baseline Training

In [None]:
# Initializes the DataLoaders, instantiates the WeatherGRU model, trains it using the
# `train_baseline_model` function, and saves the final trained model state dictionary.

# Create DataLoaders: training batches are reshuffled every epoch, validation order stays fixed
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)
builtin_print("Train and Validation DataLoaders created.")

# Initialize the WeatherGRU model from the architecture settings in Config
baseline_model = WeatherGRU(
    input_dim=Config.INPUT_DIM,
    hidden_dim=Config.HIDDEN_DIM,
    n_layers=Config.N_LAYERS,
    output_dim=Config.OUTPUT_DIM,
    dropout=Config.DROPOUT
)
# Count the trainable parameters inline: this notebook neither defines nor imports a
# `count_parameters` helper, so calling one here would raise NameError.
n_trainable_params = sum(p.numel() for p in baseline_model.parameters() if p.requires_grad)
builtin_print(f"Baseline WeatherGRU model initialized with {n_trainable_params:,} parameters.")

# Train the model; the returned model carries the best-validation weights
baseline_model_trained = train_baseline_model(
    model=baseline_model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=Config # The class itself is passed; its attributes are read as config.NAME
)

# Save the state dictionary of the trained baseline model.
# Saving is best-effort: a failure is reported but does not raise.
try:
    # Only the state_dict is saved, so loading requires re-creating WeatherGRU first
    torch.save(baseline_model_trained.state_dict(), Config.BASELINE_MODEL_SAVE_PATH)
    builtin_print(f"\nBaseline model state dictionary saved successfully to {Config.BASELINE_MODEL_SAVE_PATH}")
except Exception as e:
    builtin_print(f"\nError saving baseline model state dictionary: {e}")