In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
from torch.nn.init import orthogonal_
import torch.linalg
from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress convergence warnings from scikit-learn
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
pip install sktime


Collecting sktime
  Downloading sktime-0.40.1-py3-none-any.whl.metadata (33 kB)
Collecting scikit-base<0.14.0,>=0.6.1 (from sktime)
  Downloading scikit_base-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Downloading sktime-0.40.1-py3-none-any.whl (36.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_base-0.13.0-py3-none-any.whl (151 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.5/151.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-base, sktime
Successfully installed scikit-base-0.13.0 sktime-0.40.1
Note: you may need to restart the kernel to use updated packages.


## First we implement all the algorithms in code: NW-RNN, NW-ESN, RNN and ESN, then run these algorithms on 2 different datasets: FloodModeling1, FloodModeling2

## NW Cell Network

In [3]:
class NWCell(nn.Module):
    """
    Implements a single step of the Nanowire Neural Network cell.
    Based on Equations 7-11 from the paper.
    """
    def __init__(self, input_size, hidden_size, hp):
        super(NWCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # --- Weights and Biases ---
        # W_x * x(t+1) [cite: 85]
        self.W_x = nn.Linear(input_size, hidden_size, bias=True) 
        # W_h * h(t) [cite: 85]
        # Note: The paper adds the bias 'b' to the sum[cite: 85],
        # so we only add bias to one of the linear layers (W_x here).
        self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)

        # --- Nanowire Hyperparameters [cite: 93, 128] ---
        self.Kp0 = hp['Kp0']
        self.Kd0 = hp['Kd0']
        self.eta_p = hp['eta_p']
        self.eta_d = hp['eta_d']
        self.delta_t = hp['delta_t']
        self.gamma = hp['gamma']
        
        # --- RESCALE Hyperparameters (Eq. 11) [cite: 102, 103] ---
        self.rescale_a = hp.get('rescale_a', 0.35)
        self.rescale_b = hp.get('rescale_b', 1.15)
        self.rescale_s = hp['rescale_s']

    def rescale(self, y):
        """
        Implements the RESCALE function (Eq. 11) [cite: 100-103].
        Uses torch.sigmoid(y*s) which is equivalent to 1 / (1 + exp(-y*s)).
        """
        return (self.rescale_b - self.rescale_a) * torch.sigmoid(y * self.rescale_s) + self.rescale_a

    def forward(self, input, hx):
        """
        Performs one time step.
        :param input: x(t+1), tensor of shape (batch_size, input_size)
        :param hx: h(t), tensor of shape (batch_size, hidden_size)
        :return: h(t+1), tensor of shape (batch_size, hidden_size)
        """
        
        # 1. Calculate z(t+1) = W_h*h(t) + W_x*x(t+1) + b
        #    (Eq. 7) [cite: 85]
        z_t1_affine = self.W_h(hx) + self.W_x(input)
        
        # 2. Apply RESCALE
        #    z(t+1) = RESCALE(...) (Eq. 7) [cite: 85, 98]
        z_t1 = self.rescale(z_t1_affine)
        
        # 3. Calculate potentiation and depression rates
        #    (Eq. 5, 6) [cite: 58, 59]
        Kp = self.Kp0 * torch.exp(self.eta_p * z_t1)
        Kd = self.Kd0 * torch.exp(-self.eta_d * z_t1)

        # 4. Calculate r(t+1)
        #    This combines Eq. 8 and 9 [cite: 86, 87]
        #    r(t+1) = Kp(z) - q(t+1)
        #    q(t+1) = (Kp(z) + Kd(z)) * h(t)
        #    So: r(t+1) = Kp(z) - (Kp(z) + Kd(z)) * h(t)
        #    r(t+1) = Kp(z) * (1 - h(t)) - Kd(z) * h(t)
        #    This matches the continuous form in Eq. 4 [cite: 57]
        r_t1 = Kp * (1.0 - hx) - Kd * hx
        
        # 5. Calculate the next hidden state h(t+1)
        #    (Eq. 10) [cite: 88]
        h_t1 = self.gamma * hx + r_t1 * self.delta_t
        
        # Ensure state h stays within [0, 1] as mentioned in [cite: 48, 62]
        # This is a practical constraint to maintain stability
        h_t1_clipped = torch.clamp(h_t1, 0.0, 1.0)

        return h_t1_clipped


In [4]:
class NW_Network(nn.Module):
    """
    A full Nanowire Network model that processes sequences.
    This can be configured as an NW-RNN (fully trainable) or
    an NW-ESN (fixed reservoir).
    """
    def __init__(self, input_size, hidden_size, output_size, hp):
        super(NW_Network, self).__init__()
        self.hidden_size = hidden_size
        self.cell = NWCell(input_size, hidden_size, hp)
        
        # Readout layer applied to the final state 
        self.readout = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0=None):
        """
        Processes a batch of sequences.
        :param x: Input tensor of shape (batch_size, seq_length, input_size)
        :param h0: (Optional) Initial hidden state
        :return: Output tensor of shape (batch_size, output_size)
        """
        batch_size = x.size(0)
        seq_length = x.size(1)

        # Initialize hidden state h(0)
        if h0 is None:
            h = torch.full((batch_size, self.hidden_size), 0.5, device=x.device)
        else:
            h = h0

        # Loop over the sequence
        for t in range(seq_length):
            h = self.cell(x[:, t, :], h)
            
        output = self.readout(h)
        
        return output

In [5]:
def count_parameters(model):
    """Counts the total and trainable parameters of a PyTorch model."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

## RNN model

In [6]:
import torch
import torch.nn as nn

class StandardRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,
                 rnn_type="rnn", dropout=0.2, use_layernorm=True):
        """
        Generic RNN/GRU model for regression or classification.
        - output_size = 1 for regression
        - output_size = num_classes for classification
        """
        super().__init__()
        if rnn_type.lower() == "gru":
            self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.RNN(input_size, hidden_size, nonlinearity="tanh", batch_first=True)

        self.use_layernorm = use_layernorm
        self.ln = nn.LayerNorm(hidden_size) if use_layernorm else nn.Identity()
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, output_size)  # renamed from num_classes

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(-1)  # (B, T) -> (B, T, 1)
        out, _ = self.rnn(x)
        last = out[:, -1, :]
        last = self.ln(last)
        last = self.dropout(last)
        return self.fc(last)

## ESN Network

In [7]:
import torch.nn as nn
from torch.nn.init import orthogonal_
import torch

class ESNCell(nn.Module):
    """Leaky-Integrator ESN: h(t+1) = (1-a)h(t) + a*tanh(W_h h(t) + W_x x(t+1) + b)"""
    def __init__(self, input_size, hidden_size, hp):
        super().__init__()
        self.hidden_size = hidden_size
        self.leaking_rate = hp['leaking_rate']
        self.W_x = nn.Linear(input_size, hidden_size, bias=True)
        self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, x_t, h_t):
        pre = self.W_h(h_t) + self.W_x(x_t)
        return (1.0 - self.leaking_rate) * h_t + self.leaking_rate * torch.tanh(pre)

class ESN_Network(nn.Module):
    """ESN wrapper with a linear readout"""
    def __init__(self, input_size, hidden_size, output_size, hp):
        super().__init__()
        self.hidden_size = hidden_size
        self.cell = ESNCell(input_size, hidden_size, hp)
        self.readout = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0=None):
        if x.dim() == 2:
            x = x.unsqueeze(-1)  # (B,T) -> (B,T,1)
        B, T, _ = x.shape
        h = torch.zeros(B, self.hidden_size, device=x.device) if h0 is None else h0
        for t in range(T):
            h = self.cell(x[:, t, :], h)
        return self.readout(h)  # (B, O)

## Training ESN, NW-ESN network

In [8]:
import numpy as np
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

@torch.no_grad()
def _collect_final_states(model: nn.Module, loader, device):
    """Run the reservoir and return the final hidden state for each sample."""
    model.eval()
    X_states_list, y_list = [], []
    for Xb, yb in loader:
        Xb = Xb.to(device)

        # bypass readout to get final state
        original = model.readout
        model.readout = nn.Identity()
        h_final = model(Xb)                 # (B, H)
        model.readout = original

        X_states_list.append(h_final.cpu().numpy())
        y_list.append(yb.cpu().numpy())
    X_states = np.concatenate(X_states_list, axis=0)
    y = np.concatenate(y_list, axis=0)
    if y.ndim == 1:
        y = y[:, None]                      # (N,1)
    return X_states, y                      # (N,H), (N,1 or N,O)

def train_reservoir_readout_regression(
    model: nn.Module,
    train_loader,
    val_loader,
    device,
    alpha: float = 1.0,
    verbose: bool = False,
    return_sklearn_pipeline: bool = False
):
    """
    One-shot training of ESN/NW-ESN readout for REGRESSION using Ridge (with StandardScaler).
    Copies weights back to model.readout (unscaled-state space).
    Returns: (model, {'mse','mae','r2'}, pipeline_or_None)
    """
    if verbose:
        print(f"[One-shot Ridge] alpha={alpha}")

    # 1) collect final states & targets
    X_states, y = train_reservoir_readout_regression._cache \
                  if hasattr(train_reservoir_readout_regression, "_cache") \
                  else _collect_final_states(model, train_loader, device)

    # Cache states within this process to speed up grid-search (optional)
    train_reservoir_readout_regression._cache = (X_states, y)

    # 2) fit scaler + ridge
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('ridge',  Ridge(alpha=alpha))
    ])
    pipe.fit(X_states, y)

    scaler = pipe.named_steps['scaler']
    ridge  = pipe.named_steps['ridge']

    W = ridge.coef_.astype(np.float32)              # (O,H)
    b = ridge.intercept_.astype(np.float32)         # (O,)
    mean = scaler.mean_.astype(np.float32)          # (H,)
    std  = scaler.scale_.astype(np.float32)         # (H,)
    std[std == 0.0] = 1.0

    # Map to unscaled-state readout: y = (W/std) h + (b - sum(W*mean/std))
    W_torch = torch.tensor(W / std[None, :], device=device)
    b_torch = torch.tensor(b - (W * (mean[None, :] / std[None, :])).sum(axis=1), device=device)

    with torch.no_grad():
        model.readout.weight.data = W_torch
        model.readout.bias.data   = b_torch

    # 3) validate
    model.eval()
    preds, targs = [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            yhat = model(Xb)                     # (B, O)
            preds.append(yhat.cpu().numpy())
            targs.append(yb.cpu().numpy())

    yp = np.concatenate(preds, axis=0).squeeze()
    yt = np.concatenate(targs, axis=0).squeeze()

    def _agg(metric):
        if yp.ndim == 1:
            return metric(yt, yp)
        vals = [metric(yt[:, i], yp[:, i]) for i in range(yp.shape[1])]
        return float(np.mean(vals))

    mse = _agg(mean_squared_error)
    mae = _agg(mean_absolute_error)
    r2  = _agg(r2_score)
    if verbose:
        print(f"[Val] MSE={mse:.6f} | MAE={mae:.6f} | R²={r2:.4f}")

    return model, {'mse': mse, 'mae': mae, 'r2': r2}, (pipe if return_sklearn_pipeline else None)

## Preprocessing the data

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

def per_series_znorm(X):
    """Per-series z-normalization for univariate (N, T)."""
    mu = X.mean(axis=1, keepdims=True)
    sd = X.std(axis=1, keepdims=True) + 1e-8
    return (X - mu) / sd

def read_ts_univariate_label_last_colon(path):
    """
    Reads lines after @data for a .ts file with:
      - @univariate true
      - @timestamps false
      - label appended to the *last* token as 'last_value:label'
    Returns: X (N, T), y (N,)
    """
    X_list, y_list = [], []
    in_data = False
    with open(path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('#'):
                continue
            if line.lower() == '@data':
                in_data = True
                continue
            if not in_data or line.startswith('@'):
                continue

            tokens = [t.strip() for t in line.split(',') if t.strip() != '']
            # Expect last token to be "value:label"
            last_tok = tokens[-1]
            if ':' not in last_tok:
                raise ValueError("Expected 'value:label' in the last token, got: " + last_tok)

            left, label_str = last_tok.rsplit(':', 1)
            # Replace last token with the left part (the actual last series value)
            tokens[-1] = left

            # Parse series + label
            try:
                series = [float(t) for t in tokens]
                label = float(label_str)
            except ValueError as e:
                raise ValueError(f"Failed to parse line: {line}") from e

            X_list.append(series)
            y_list.append(label)

    X = np.asarray(X_list, dtype=np.float32)
    y = np.asarray(y_list, dtype=np.float32)
    return X, y

def load_and_preprocess_ucr_ts(train_path, test_path, validation_split=0.2, random_state=42):
    """
    Wrapper matching your earlier signature. Uses the special reader above.
    """
    X_train_full, y_train_full = read_ts_univariate_label_last_colon(train_path)
    X_test, y_test = read_ts_univariate_label_last_colon(test_path)

    # Per-series z-normalization (regression-safe)
    X_train_full = per_series_znorm(X_train_full)
    X_test = per_series_znorm(X_test)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full,
        test_size=validation_split,
        random_state=random_state
    )
    print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
    return X_train, y_train, X_val, y_val, X_test, y_test


## Create dataloaders

In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

def create_dataloaders(X, y, batch_size=64, is_classification=True):
    """
    Converts numpy arrays (X, y) into PyTorch DataLoader.
    Handles both classification and regression.
    Returns:
      dataloader, (input_size, output_size)
    """
    # Ensure 3D shape: (N, T, F)
    X_tensor = torch.tensor(X, dtype=torch.float32)
    if X_tensor.ndim == 2:
        X_tensor = X_tensor.unsqueeze(-1)

    if is_classification:
        # Encode categorical labels
        unique_labels = sorted(list(np.unique(y)))
        label_map = {label: i for i, label in enumerate(unique_labels)}
        y_int = np.array([label_map[label] for label in y])
        y_tensor = torch.tensor(y_int, dtype=torch.long)
        output_size = len(unique_labels)
    else:
        # Regression → continuous float targets
        y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(-1)
        output_size = 1

    dataset = TensorDataset(X_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    input_size = X_tensor.shape[2]
    return dataloader, (input_size, output_size)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
N_NEURONS = 100

In [12]:
import random
# ---- seeds for reproducibility ----
def run_rnn_experiments(TRAIN_FILE_PATH, TEST_FILE_PATH):
    seed = 42
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    N_NEURONS = 100
    # --- Load and Process Data ---
    try:
    
        X_train, y_train, X_val, y_val, X_test, y_test = load_and_preprocess_ucr_ts(
            TRAIN_FILE_PATH, TEST_FILE_PATH
        )
    
    except FileNotFoundError:
        print("Error: Download FloodModeling1_TRAIN.ts and FloodModeling1_TEST.ts from the UCR Archive")
        print("and place them in the same directory as this notebook.")
    
    # --- Create DataLoaders for Classification ---
    train_loader, (input_size, output_size) = create_dataloaders(
        X_train, y_train, batch_size=64, is_classification=False
    )
    val_loader, _ = create_dataloaders(X_val, y_val, batch_size=64, is_classification=False)
    test_loader, _ = create_dataloaders(X_test, y_test, batch_size=64, is_classification=False)
    from tqdm.auto import tqdm
    print(f"\nData ready for PyTorch:")
    print(f"Input size: {input_size}")
    print(f"Output dimension: {output_size}")
    
    # ---- model ----
    N_NEURONS = 100
    std_rnn_regressor = StandardRNN(
        input_size=input_size,
        hidden_size=N_NEURONS,
        output_size=1,          
        rnn_type="rnn",         
        dropout=0.2,
        use_layernorm=True
    ).to(device)
    
    # ---- training setup ----
    learning_rate = 5e-4          
    epochs = 200                  
    patience = 10
    criterion = nn.MSELoss()  
    optimizer = torch.optim.AdamW(std_rnn_regressor.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=3, min_lr=1e-6, verbose=True
    )
    
    rnn_train_loss_history, rnn_val_loss_history = [], []
    best_val_loss = float('inf')
    patience_counter = 0
    
    print(f"\nStarting Standard RNN regression training for up to {epochs} epochs (patience={patience})...")
    
    
    for epoch in range(epochs):
        
        std_rnn_regressor.train()
        total_train_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Train)", leave=False)
        
        for X_batch, y_batch in progress_bar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            output = std_rnn_regressor(X_batch)
            # ensure same shape: (B, 1)
            y_batch = y_batch.view_as(output)
            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(std_rnn_regressor.parameters(), max_norm=1.0)
            optimizer.step()
            
            total_train_loss += loss.item()
            progress_bar.set_postfix(loss=f"{loss.item():.6f}")
    
        avg_train_loss = total_train_loss / len(train_loader)
        rnn_train_loss_history.append(avg_train_loss) 
    
        # --- VALIDATION ---
        std_rnn_regressor.eval()
        total_val_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                output = std_rnn_regressor(X_batch)
                y_batch = y_batch.view_as(output)
                loss = criterion(output, y_batch)
                total_val_loss += loss.item()
    
        avg_val_loss = total_val_loss / len(val_loader)
        rnn_val_loss_history.append(avg_val_loss)
        scheduler.step(avg_val_loss)
    
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")
    
        # --- Early stopping ---
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            best_model_state = std_rnn_regressor.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping triggered after {epoch+1} epochs "
                      f"(no val improvement for {patience} epochs).")
                break
    
    std_rnn_regressor.load_state_dict(best_model_state)
    print("Training complete.")
    
    
    # --- Evaluation ---
    std_rnn_regressor.eval()
    preds, targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in tqdm(test_loader, desc="Evaluating RNN (regression)"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = std_rnn_regressor(X_batch)
            preds.append(output.cpu().numpy())
            targets.append(y_batch.cpu().numpy())
    
    preds = np.concatenate(preds).flatten()
    targets = np.concatenate(targets).flatten()
    
    # --- Compute Regression Metrics ---
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(targets, preds)
    mae = mean_absolute_error(targets, preds)
    r2 = r2_score(targets, preds)
    
    print(f"\nTest Results:")
    print(f"  MSE: {mse:.6f}")
    print(f"  MAE: {mae:.6f}")
    print(f"  R² : {r2:.4f}")

    print("Training NW-RNN model:")
    print(f"\nData ready for PyTorch:")
    print(f"Input size: {input_size}")
    print(f"Output size: 1 (regression target)")
    
    # --- Initialize NW-RNN Model for Regression ---
    hp_rnn = {
        'Kp0': 0.0001, 'Kd0': 0.5, 'eta_p': 10.0, 'eta_d': 1.0,
        'gamma': 0.95, 'rescale_s': 5.0, 'delta_t': 0.1
    }
    N_NEURONS = 100 
    
    nw_rnn_regressor = NW_Network(
        input_size=input_size,
        hidden_size=N_NEURONS,
        output_size=1,         
        hp=hp_rnn
    ).to(device)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nw_rnn_regressor.to(device)
    print(f"Using device: {device}")
    
    # --- Training Setup ---
    learning_rate = 5e-4
    epochs = 200
    patience = 10
    criterion = nn.MSELoss() 
    optimizer = torch.optim.Adam(nw_rnn_regressor.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=3, verbose=True, min_lr=1e-6
    )
    
    print(f"\nStarting NW-RNN Regression Training for {epochs} epochs...")
    
    # --- Training Loop ---
    nw_rnn_train_loss_history, nw_rnn_val_loss_history = [], []
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(epochs):
        # --- TRAIN PHASE ---
        nw_rnn_regressor.train()
        total_train_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Train)", leave=False)
        
        for X_batch, y_batch in progress_bar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = nw_rnn_regressor(X_batch)
            y_batch = y_batch.view_as(output)
            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(nw_rnn_regressor.parameters(), max_norm=1.0)
            optimizer.step()
            total_train_loss += loss.item()
            progress_bar.set_postfix(loss=f"{loss.item():.6f}")
    
        avg_train_loss = total_train_loss / len(train_loader)
        nw_rnn_train_loss_history.append(avg_train_loss)
    
        # --- VALIDATION PHASE ---
        nw_rnn_regressor.eval()
        total_val_loss = 0.0
    
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                output = nw_rnn_regressor(X_batch)
                y_batch = y_batch.view_as(output)
                loss = criterion(output, y_batch)
                total_val_loss += loss.item()
    
        avg_val_loss = total_val_loss / len(val_loader)
        nw_rnn_val_loss_history.append(avg_val_loss)
        scheduler.step(avg_val_loss)
    
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")
    
        # --- EARLY STOPPING ---
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            best_model_state = nw_rnn_regressor.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping triggered at epoch {epoch+1} "
                      f"(no val improvement for {patience} epochs).")
                break
    
    nw_rnn_regressor.load_state_dict(best_model_state)
    print("Training complete.")
    
    # --- EVALUATION ---
    nw_rnn_regressor.eval()
    preds, targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in tqdm(test_loader, desc="Evaluating NW-RNN (Regression)"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = nw_rnn_regressor(X_batch)
            preds.append(output.cpu().numpy())
            targets.append(y_batch.cpu().numpy())
    
    preds = np.concatenate(preds).flatten()
    targets = np.concatenate(targets).flatten()
    
    # --- Regression Metrics ---
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(targets, preds)
    mae = mean_absolute_error(targets, preds)
    r2 = r2_score(targets, preds)
    
    print(f"\nTest Results:")
    print(f"  MSE: {mse:.6f}")
    print(f"  MAE: {mae:.6f}")
    print(f"  R² : {r2:.4f}")
    return train_loader, test_loader, val_loader, input_size, std_rnn_regressor, nw_rnn_regressor

In [13]:
TRAIN_FILE_PATH = '/kaggle/input/floodmodeling1/FloodModeling1_TRAIN.ts'
TEST_FILE_PATH = '/kaggle/input/floodmodeling1/FloodModeling1_TEST.ts'
train_loader, test_loader, val_loader, input_size, std_rnn_regressor, nw_rnn_regressor = run_rnn_experiments(TRAIN_FILE_PATH, TEST_FILE_PATH)

Train: (376, 266), Val: (95, 266), Test: (202, 266)

Data ready for PyTorch:
Input size: 1
Output dimension: 1

Starting Standard RNN regression training for up to 200 epochs (patience=10)...




Epoch 1/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1/200 | Train Loss: 0.200196 | Val Loss: 0.017542


Epoch 2/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2/200 | Train Loss: 0.091416 | Val Loss: 0.031221


Epoch 3/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3/200 | Train Loss: 0.094320 | Val Loss: 0.018689


Epoch 4/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 4/200 | Train Loss: 0.072271 | Val Loss: 0.010852


Epoch 5/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 5/200 | Train Loss: 0.065018 | Val Loss: 0.003943


Epoch 6/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 6/200 | Train Loss: 0.060411 | Val Loss: 0.001504


Epoch 7/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 7/200 | Train Loss: 0.043118 | Val Loss: 0.002904


Epoch 8/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 8/200 | Train Loss: 0.038965 | Val Loss: 0.003770


Epoch 9/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 9/200 | Train Loss: 0.034652 | Val Loss: 0.000406


Epoch 10/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 10/200 | Train Loss: 0.025105 | Val Loss: 0.000968


Epoch 11/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 11/200 | Train Loss: 0.021402 | Val Loss: 0.000351


Epoch 12/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 12/200 | Train Loss: 0.018065 | Val Loss: 0.002758


Epoch 13/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 13/200 | Train Loss: 0.018885 | Val Loss: 0.000270


Epoch 14/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 14/200 | Train Loss: 0.016375 | Val Loss: 0.000923


Epoch 15/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 15/200 | Train Loss: 0.014288 | Val Loss: 0.000571


Epoch 16/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 16/200 | Train Loss: 0.012346 | Val Loss: 0.002238


Epoch 17/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 17/200 | Train Loss: 0.011401 | Val Loss: 0.000263


Epoch 18/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 18/200 | Train Loss: 0.008210 | Val Loss: 0.000790


Epoch 19/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 19/200 | Train Loss: 0.008803 | Val Loss: 0.000835


Epoch 20/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 20/200 | Train Loss: 0.007049 | Val Loss: 0.000307


Epoch 21/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 21/200 | Train Loss: 0.007093 | Val Loss: 0.000287


Epoch 22/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 22/200 | Train Loss: 0.006439 | Val Loss: 0.000321


Epoch 23/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 23/200 | Train Loss: 0.005911 | Val Loss: 0.000314


Epoch 24/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 24/200 | Train Loss: 0.005211 | Val Loss: 0.000307


Epoch 25/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 25/200 | Train Loss: 0.005620 | Val Loss: 0.000342


Epoch 26/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 26/200 | Train Loss: 0.004921 | Val Loss: 0.000243


Epoch 27/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 27/200 | Train Loss: 0.004676 | Val Loss: 0.000283


Epoch 28/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 28/200 | Train Loss: 0.005229 | Val Loss: 0.000227


Epoch 29/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 29/200 | Train Loss: 0.005104 | Val Loss: 0.000269


Epoch 30/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 30/200 | Train Loss: 0.005121 | Val Loss: 0.000359


Epoch 31/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 31/200 | Train Loss: 0.004807 | Val Loss: 0.000260


Epoch 32/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 32/200 | Train Loss: 0.004685 | Val Loss: 0.000291


Epoch 33/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 33/200 | Train Loss: 0.003884 | Val Loss: 0.000286


Epoch 34/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 34/200 | Train Loss: 0.004662 | Val Loss: 0.000213


Epoch 35/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 35/200 | Train Loss: 0.004297 | Val Loss: 0.000284


Epoch 36/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 36/200 | Train Loss: 0.004202 | Val Loss: 0.000280


Epoch 37/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 37/200 | Train Loss: 0.004395 | Val Loss: 0.000298


Epoch 38/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 38/200 | Train Loss: 0.004329 | Val Loss: 0.000250


Epoch 39/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 39/200 | Train Loss: 0.004252 | Val Loss: 0.000385


Epoch 40/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 40/200 | Train Loss: 0.004504 | Val Loss: 0.000322


Epoch 41/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 41/200 | Train Loss: 0.004308 | Val Loss: 0.000286


Epoch 42/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 42/200 | Train Loss: 0.004561 | Val Loss: 0.000244


Epoch 43/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 43/200 | Train Loss: 0.003980 | Val Loss: 0.000209


Epoch 44/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 44/200 | Train Loss: 0.003955 | Val Loss: 0.000272


Epoch 45/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 45/200 | Train Loss: 0.004015 | Val Loss: 0.000342


Epoch 46/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 46/200 | Train Loss: 0.004351 | Val Loss: 0.000339


Epoch 47/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 47/200 | Train Loss: 0.004410 | Val Loss: 0.000287


Epoch 48/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 48/200 | Train Loss: 0.005022 | Val Loss: 0.000266


Epoch 49/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 49/200 | Train Loss: 0.003231 | Val Loss: 0.000245


Epoch 50/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 50/200 | Train Loss: 0.004121 | Val Loss: 0.000229


Epoch 51/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 51/200 | Train Loss: 0.003582 | Val Loss: 0.000258


Epoch 52/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 52/200 | Train Loss: 0.004435 | Val Loss: 0.000248


Epoch 53/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 53/200 | Train Loss: 0.004123 | Val Loss: 0.000259

Early stopping triggered after 53 epochs (no val improvement for 10 epochs).
Training complete.


Evaluating RNN (regression):   0%|          | 0/4 [00:00<?, ?it/s]


Test Results:
  MSE: 0.000323
  MAE: 0.013027
  R² : 0.0915
Training NW-RNN model:

Data ready for PyTorch:
Input size: 1
Output size: 1 (regression target)
Using device: cuda

Starting NW-RNN Regression Training for 200 epochs...




Epoch 1/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1/200 | Train Loss: 0.266202 | Val Loss: 0.063461


Epoch 2/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2/200 | Train Loss: 0.020618 | Val Loss: 0.014246


Epoch 3/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3/200 | Train Loss: 0.011749 | Val Loss: 0.002552


Epoch 4/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 4/200 | Train Loss: 0.001862 | Val Loss: 0.002907


Epoch 5/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 5/200 | Train Loss: 0.002516 | Val Loss: 0.000317


Epoch 6/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 6/200 | Train Loss: 0.000967 | Val Loss: 0.001586


Epoch 7/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 7/200 | Train Loss: 0.000975 | Val Loss: 0.000267


Epoch 8/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 8/200 | Train Loss: 0.000703 | Val Loss: 0.000356


Epoch 9/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 9/200 | Train Loss: 0.000595 | Val Loss: 0.000375


Epoch 10/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 10/200 | Train Loss: 0.000574 | Val Loss: 0.000355


Epoch 11/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 11/200 | Train Loss: 0.000542 | Val Loss: 0.000238


Epoch 12/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 12/200 | Train Loss: 0.000509 | Val Loss: 0.000257


Epoch 13/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 13/200 | Train Loss: 0.000489 | Val Loss: 0.000315


Epoch 14/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 14/200 | Train Loss: 0.000473 | Val Loss: 0.000270


Epoch 15/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 15/200 | Train Loss: 0.000476 | Val Loss: 0.000248


Epoch 16/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 16/200 | Train Loss: 0.000470 | Val Loss: 0.000254


Epoch 17/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 17/200 | Train Loss: 0.000479 | Val Loss: 0.000292


Epoch 18/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 18/200 | Train Loss: 0.000469 | Val Loss: 0.000276


Epoch 19/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 19/200 | Train Loss: 0.000463 | Val Loss: 0.000244


Epoch 20/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 20/200 | Train Loss: 0.000466 | Val Loss: 0.000259


Epoch 21/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 21/200 | Train Loss: 0.000463 | Val Loss: 0.000284

Early stopping triggered at epoch 21 (no val improvement for 10 epochs).
Training complete.


Evaluating NW-RNN (Regression):   0%|          | 0/4 [00:00<?, ?it/s]


Test Results:
  MSE: 0.000340
  MAE: 0.013329
  R² : 0.0417


## Carrying out Hyperparamter grid search for ESN, NW-ESN networks


In [14]:
import copy
from itertools import product

def _freeze_reservoir_params(model):
    for n, p in model.named_parameters():
        if "readout" not in n:
            p.requires_grad = False

def _init_reservoir_weights(module, spectral_radius, input_scaling, bias_scaling):
    """
    - Orthogonal W_h, rescaled to given spectral_radius
    - Uniform W_x in [-input_scaling, input_scaling]
    - Uniform bias in  [-bias_scaling,  bias_scaling]
    """
    for name, param in module.named_parameters():
        if name.endswith("W_h.weight"):
            orthogonal_(param)
            with torch.no_grad():
                eigvals = torch.linalg.eigvals(param.data)
                sr = torch.max(torch.abs(eigvals))
                if torch.is_complex(sr):
                    sr = sr.real
                if sr > 0:
                    param.data *= (spectral_radius / sr)
        elif name.endswith("W_x.weight"):
            with torch.no_grad():
                param.data.uniform_(-input_scaling, input_scaling)
        elif name.endswith("W_x.bias"):
            with torch.no_grad():
                param.data.uniform_(-bias_scaling, bias_scaling)

def run_reservoir_experiments_regression(train_loader, val_loader, test_loader,
                                         input_size, device,
                                         hidden_size=100,
                                         ridge_alphas=(0.1, 1.0, 10.0)):
    """
    Grid-search both NW-ESN and ESN for REGRESSION via one-shot Ridge readout.
    Select best by validation MSE. Returns dict of best models and metrics.
    """
    print(f"Running regression experiments on device: {device}\n")

    best_models = {}

    # ---------- 1) NW-ESN grid ----------
    print("--- 1) NW-ESN (frozen reservoir) grid ---")
    hp_space_nw_esn = {
        'input_scaling':  [1.0, 10.0],
        'bias_scaling':   [0.001, 0.1, 1.0],
        'spectral_radius':[0.8, 0.9, 0.95, 0.99],
        'gamma':          [0.1, 0.5, 0.8, 0.95, 1.0],
        'delta_t':        [0.1, 0.01, 0.001],
        # fixed “nanowire-ish” params if your NW_Network needs them:
        'Kp0': [0.0001], 'Kd0': [0.5], 'eta_p': [10.0], 'eta_d': [1.0], 'rescale_s': [5.0]
    }

    combos_nw = list(product(
        hp_space_nw_esn['input_scaling'],
        hp_space_nw_esn['bias_scaling'],
        hp_space_nw_esn['spectral_radius'],
        hp_space_nw_esn['gamma'],
        hp_space_nw_esn['delta_t'],
        hp_space_nw_esn['Kp0'],
        hp_space_nw_esn['Kd0'],
        hp_space_nw_esn['eta_p'],
        hp_space_nw_esn['eta_d'],
        hp_space_nw_esn['rescale_s'],
    ))
    print(f"Total NW-ESN combos: {len(combos_nw)}")

    best_val_mse = float("inf")
    best_state = None
    best_hp = None
    best_metrics = None

    for (w, beta, p, gamma, dt, Kp0, Kd0, eta_p, eta_d, res_s) in tqdm(combos_nw, desc="NW-ESN Grid"):
        hp = dict(Kp0=Kp0, Kd0=Kd0, eta_p=eta_p, eta_d=eta_d,
                  gamma=gamma, delta_t=dt, rescale_s=res_s)

        # Build reservoir model
        nw_esn = NW_Network(input_size=input_size, hidden_size=hidden_size, output_size=1, hp=hp).to(device)
        _init_reservoir_weights(nw_esn.cell, p, w, beta)
        _freeze_reservoir_params(nw_esn)

        # Try a small alpha grid for Ridge readout
        best_alpha_metrics = None
        best_alpha = None
        best_alpha_state = None
        for alpha in ridge_alphas:
            trained, metrics, _ = train_reservoir_readout_regression(
                model=nw_esn, train_loader=train_loader, val_loader=val_loader,
                device=device, alpha=alpha, verbose=False
            )
            if metrics['mse'] < (best_alpha_metrics['mse'] if best_alpha_metrics else float("inf")):
                best_alpha_metrics = metrics
                best_alpha = alpha
                best_alpha_state = copy.deepcopy(trained.state_dict())

        # keep global best
        if best_alpha_metrics['mse'] < best_val_mse:
            best_val_mse = best_alpha_metrics['mse']
            best_state = best_alpha_state
            best_hp = dict(w=w, beta=beta, p=p, alpha=best_alpha, **hp)
            best_metrics = best_alpha_metrics

    print(f"NW-ESN best: MSE={best_val_mse:.6f} | Params={best_hp}")
    best_models['NW-ESN'] = ('val', best_metrics, best_hp, best_state)

    # ---------- 2) ESN grid ----------
    print("\n--- 2) ESN (frozen reservoir) grid ---")
    '''hp_space_esn = {
        'input_scaling':  [1.0, 10.0],
        'bias_scaling':   [0.001, 0.1, 1.0],
        'spectral_radius':[0.8, 0.9, 0.95, 0.99],
        'leaking_rate':   [0.1, 0.3, 0.5],
    }'''
    hp_space_esn = {
    'input_scaling': [0.1, 0.5, 1.0],
    'bias_scaling': [0.001, 0.01, 0.1],
    'spectral_radius': [0.6, 0.7, 0.8, 0.9],
    'leaking_rate': [0.1, 0.2, 0.3],
    'ridge_alpha': [0.1, 1.0, 10.0, 100.0]
    }

    combos_esn = list(product(
        hp_space_esn['input_scaling'],
        hp_space_esn['bias_scaling'],
        hp_space_esn['spectral_radius'],
        hp_space_esn['leaking_rate'],
    ))
    print(f"Total ESN combos: {len(combos_esn)}")

    best_val_mse_esn = float("inf")
    best_state_esn = None
    best_hp_esn = None
    best_metrics_esn = None

    for (w, beta, p, a) in tqdm(combos_esn, desc="ESN Grid"):
        hp = dict(leaking_rate=a)
        esn = ESN_Network(input_size=input_size, hidden_size=hidden_size, output_size=1, hp=hp).to(device)
        _init_reservoir_weights(esn.cell, p, w, beta)
        _freeze_reservoir_params(esn)

        best_alpha_metrics = None
        best_alpha = None
        best_alpha_state = None
        for alpha in ridge_alphas:
            trained, metrics, _ = train_reservoir_readout_regression(
                model=esn, train_loader=train_loader, val_loader=val_loader,
                device=device, alpha=alpha, verbose=False
            )
            if metrics['mse'] < (best_alpha_metrics['mse'] if best_alpha_metrics else float("inf")):
                best_alpha_metrics = metrics
                best_alpha = alpha
                best_alpha_state = copy.deepcopy(trained.state_dict())

        if best_alpha_metrics['mse'] < best_val_mse_esn:
            best_val_mse_esn = best_alpha_metrics['mse']
            best_state_esn = best_alpha_state
            best_hp_esn = dict(w=w, beta=beta, p=p, a=a, alpha=best_alpha)
            best_metrics_esn = best_alpha_metrics

    print(f"ESN best: MSE={best_val_mse_esn:.6f} | Params={best_hp_esn}")
    best_models['ESN'] = ('val', best_metrics_esn, best_hp_esn, best_state_esn)


    # ---------- 3) Final test evaluation ----------
    print("\n--- 3) Final Test Evaluation ---")
    out = {}
    for model_name, (_, val_metrics, params, state) in best_models.items():
        if model_name == 'NW-ESN':
            hp = {k: v for k, v in params.items() if k in ('Kp0','Kd0','eta_p','eta_d','gamma','delta_t','rescale_s')}
            w, beta, p, alpha = params['w'], params['beta'], params['p'], params['alpha']
            model = NW_Network(input_size=input_size, hidden_size=hidden_size, output_size=1, hp=hp).to(device)
            _init_reservoir_weights(model.cell, p, w, beta)
        else:
            a, w, beta, p, alpha = params['a'], params['w'], params['beta'], params['p'], params['alpha']
            model = ESN_Network(input_size=input_size, hidden_size=hidden_size, output_size=1, hp={'leaking_rate': a}).to(device)
            _init_reservoir_weights(model.cell, p, w, beta)

        _freeze_reservoir_params(model)
        model.load_state_dict(state)  # loaded readout from best alpha

        # Evaluate on test
        model.eval()
        preds, targs = [], []
        with torch.no_grad():
            for Xb, yb in tqdm(test_loader, desc=f"Testing {model_name}"):
                Xb, yb = Xb.to(device), yb.to(device)
                yhat = model(Xb)
                preds.append(yhat.cpu().numpy())
                targs.append(yb.cpu().numpy())

        yp = np.concatenate(preds, axis=0).squeeze()
        yt = np.concatenate(targs, axis=0).squeeze()

        mse = mean_squared_error(yt, yp)
        mae = mean_absolute_error(yt, yp)
        r2  = r2_score(yt, yp)

        print(f"  {model_name}: Test MSE={mse:.6f} | MAE={mae:.6f} | R²={r2:.4f} | Val MSE(best)={val_metrics['mse']:.6f}")
        out[model_name] = dict(test_mse=mse, test_mae=mae, test_r2=r2, best_val=val_metrics, params=params)

    return out

    





In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = run_reservoir_experiments_regression(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    input_size=input_size,
    device=device,
    hidden_size=100,
    ridge_alphas=(0.1, 1.0, 10.0)
)

Running regression experiments on device: cuda

--- 1) NW-ESN (frozen reservoir) grid ---
Total NW-ESN combos: 360


NW-ESN Grid:   0%|          | 0/360 [00:00<?, ?it/s]

NW-ESN best: MSE=0.000237 | Params={'w': 1.0, 'beta': 0.001, 'p': 0.8, 'alpha': 10.0, 'Kp0': 0.0001, 'Kd0': 0.5, 'eta_p': 10.0, 'eta_d': 1.0, 'gamma': 0.1, 'delta_t': 0.1, 'rescale_s': 5.0}

--- 2) ESN (frozen reservoir) grid ---
Total ESN combos: 108


ESN Grid:   0%|          | 0/108 [00:00<?, ?it/s]

ESN best: MSE=0.005702 | Params={'w': 0.1, 'beta': 0.01, 'p': 0.8, 'a': 0.3, 'alpha': 10.0}

--- 3) Final Test Evaluation ---


Testing NW-ESN:   0%|          | 0/4 [00:00<?, ?it/s]

  NW-ESN: Test MSE=0.000314 | MAE=0.012896 | R²=0.1169 | Val MSE(best)=0.000237


Testing ESN:   0%|          | 0/4 [00:00<?, ?it/s]

  ESN: Test MSE=0.010789 | MAE=0.067432 | R²=-29.3703 | Val MSE(best)=0.005702


### Evaluating all the models

In [16]:
import math
import random
import numpy as np
import torch
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("--- Generating Final Model Comparison (REGRESSION) ---")
seed = 42
random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def eval_regression(model, loader, device):
    model.eval()
    preds, targs = [], []
    with torch.no_grad():
        for Xb, yb in tqdm(loader, desc="Evaluating", leave=False):
            Xb, yb = Xb.to(device), yb.to(device)
            yhat = model(Xb)                 # (B,1) or (B,)
            preds.append(yhat.detach().cpu().numpy())
            targs.append(yb.detach().cpu().numpy())
    yp = np.concatenate(preds, axis=0).squeeze()
    yt = np.concatenate(targs, axis=0).squeeze()
    mse = mean_squared_error(yt, yp)
    mae = mean_absolute_error(yt, yp)
    r2  = r2_score(yt, yp)
    return mse, mae, r2

def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

summary_rows = []  # (name, test_mse, test_mae, test_r2, total_p, train_p)

# ---- 1) Standard RNN regressor ----
std_rnn_regressor.eval()
rnn_mse, rnn_mae, rnn_r2 = eval_regression(std_rnn_regressor, test_loader, device)
rnn_total_p, rnn_train_p = count_parameters(std_rnn_regressor)
summary_rows.append(("RNN", rnn_mse, rnn_mae, rnn_r2, rnn_total_p, rnn_train_p))

# ---- 2) NW-RNN regressor ----
nw_rnn_regressor.eval()
nwrnn_mse, nwrnn_mae, nwrnn_r2 = eval_regression(nw_rnn_regressor, test_loader, device)
nwrnn_total_p, nwrnn_train_p = count_parameters(nw_rnn_regressor)
summary_rows.append(("NW-RNN (ours)", nwrnn_mse, nwrnn_mae, nwrnn_r2, nwrnn_total_p, nwrnn_train_p))

if 'results' in globals():
    for model_name in ['ESN', 'NW-ESN']:
        if model_name in results:
            rec = results[model_name]
            test_mse = rec['test_mse']
            test_mae = rec['test_mae']
            test_r2  = rec['test_r2']
            params   = rec['params']

            # Reconstruct model shape to count params
            if model_name == 'ESN':
                a = params['a']
                esn_tmp = ESN_Network(input_size=input_size, hidden_size=N_NEURONS, output_size=1,
                                      hp={'leaking_rate': a}).to(device)
                tot_p, tr_p = count_parameters(esn_tmp)
                summary_rows.append(("ESN", test_mse, test_mae, test_r2, tot_p, tr_p))
                del esn_tmp
            else:  # NW-ESN
                # minimal hp needed to construct NW_Network
                hp = {k: params[k] for k in ('Kp0','Kd0','eta_p','eta_d','gamma','delta_t','rescale_s')}
                nw_esn_tmp = NW_Network(input_size=input_size, hidden_size=N_NEURONS, output_size=1, hp=hp).to(device)
                tot_p, tr_p = count_parameters(nw_esn_tmp)
                summary_rows.append(("NW-ESN (ours)", test_mse, test_mae, test_r2, tot_p, tr_p))
                del nw_esn_tmp

# ---- Pretty print ----
print("\n--- Final Model Comparison (Regression) ---")
print("==========================================================================================")
print(f"{'Model':<16} | {'Test MSE':>10} | {'Test MAE':>9} | {'Test R²':>7} | {'Total Params':>14} | {'Trainable':>10}")
print("------------------------------------------------------------------------------------------")
order = ['ESN', 'NW-ESN (ours)', 'RNN', 'NW-RNN (ours)']
# fallback to whatever is present
present = {name for (name, *_rest) in summary_rows}
for name in order + [n for n in present if n not in order]:
    for row in summary_rows:
        if row[0] == name:
            _, mse, mae, r2, tot_p, tr_p = row
            print(f"{name:<16} | {mse:>10.6f} | {mae:>9.6f} | {r2:>7.4f} | {tot_p:>14,} | {tr_p:>10,}")
print("==========================================================================================")


--- Generating Final Model Comparison (REGRESSION) ---


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


--- Final Model Comparison (Regression) ---
Model            |   Test MSE |  Test MAE | Test R² |   Total Params |  Trainable
------------------------------------------------------------------------------------------
ESN              |   0.010789 |  0.067432 | -29.3703 |         10,301 |     10,301
NW-ESN (ours)    |   0.000314 |  0.012896 |  0.1169 |         10,301 |     10,301
RNN              |   0.000323 |  0.013027 |  0.0915 |         10,601 |     10,601
NW-RNN (ours)    |   0.000340 |  0.013329 |  0.0417 |         10,301 |     10,301


## Training on dataset FloodModeling2


In [17]:
print("Training on dataset FloodModeling2")
TRAIN_FILE_PATH = '/kaggle/input/floodmodeling2/FloodModeling2_TRAIN.ts'
TEST_FILE_PATH = '/kaggle/input/floodmodeling2/FloodModeling2_TEST.ts'
train_loader, test_loader, val_loader, input_size, std_rnn_regressor, nw_rnn_regressor = run_rnn_experiments(TRAIN_FILE_PATH, TEST_FILE_PATH)

Training on dataset FloodModeling2
Train: (372, 266), Val: (94, 266), Test: (201, 266)

Data ready for PyTorch:
Input size: 1
Output dimension: 1

Starting Standard RNN regression training for up to 200 epochs (patience=10)...




Epoch 1/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1/200 | Train Loss: 0.182454 | Val Loss: 0.001708


Epoch 2/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2/200 | Train Loss: 0.078741 | Val Loss: 0.010167


Epoch 3/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3/200 | Train Loss: 0.085819 | Val Loss: 0.002418


Epoch 4/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 4/200 | Train Loss: 0.061458 | Val Loss: 0.000319


Epoch 5/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 5/200 | Train Loss: 0.059781 | Val Loss: 0.003555


Epoch 6/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 6/200 | Train Loss: 0.053650 | Val Loss: 0.007600


Epoch 7/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 7/200 | Train Loss: 0.042668 | Val Loss: 0.007698


Epoch 8/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 8/200 | Train Loss: 0.035326 | Val Loss: 0.007387


Epoch 9/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 9/200 | Train Loss: 0.034165 | Val Loss: 0.001398


Epoch 10/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 10/200 | Train Loss: 0.024670 | Val Loss: 0.000702


Epoch 11/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 11/200 | Train Loss: 0.022812 | Val Loss: 0.002360


Epoch 12/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 12/200 | Train Loss: 0.021096 | Val Loss: 0.000020


Epoch 13/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 13/200 | Train Loss: 0.021522 | Val Loss: 0.000038


Epoch 14/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 14/200 | Train Loss: 0.020795 | Val Loss: 0.000093


Epoch 15/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 15/200 | Train Loss: 0.017547 | Val Loss: 0.000017


Epoch 16/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 16/200 | Train Loss: 0.015990 | Val Loss: 0.000100


Epoch 17/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 17/200 | Train Loss: 0.014884 | Val Loss: 0.000377


Epoch 18/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 18/200 | Train Loss: 0.012806 | Val Loss: 0.000097


Epoch 19/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 19/200 | Train Loss: 0.013919 | Val Loss: 0.000032


Epoch 20/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 20/200 | Train Loss: 0.012439 | Val Loss: 0.000172


Epoch 21/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 21/200 | Train Loss: 0.012231 | Val Loss: 0.000169


Epoch 22/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 22/200 | Train Loss: 0.011513 | Val Loss: 0.000032


Epoch 23/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 23/200 | Train Loss: 0.009798 | Val Loss: 0.000036


Epoch 24/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 24/200 | Train Loss: 0.009514 | Val Loss: 0.000038


Epoch 25/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 25/200 | Train Loss: 0.010668 | Val Loss: 0.000026

Early stopping triggered after 25 epochs (no val improvement for 10 epochs).
Training complete.


Evaluating RNN (regression):   0%|          | 0/4 [00:00<?, ?it/s]


Test Results:
  MSE: 0.000333
  MAE: 0.005695
  R² : 0.0269
Training NW-RNN model:

Data ready for PyTorch:
Input size: 1
Output size: 1 (regression target)
Using device: cuda

Starting NW-RNN Regression Training for 200 epochs...




Epoch 1/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1/200 | Train Loss: 0.004783 | Val Loss: 0.001775


Epoch 2/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2/200 | Train Loss: 0.001459 | Val Loss: 0.001399


Epoch 3/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3/200 | Train Loss: 0.000667 | Val Loss: 0.000891


Epoch 4/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 4/200 | Train Loss: 0.000535 | Val Loss: 0.000250


Epoch 5/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 5/200 | Train Loss: 0.000393 | Val Loss: 0.000058


Epoch 6/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 6/200 | Train Loss: 0.000256 | Val Loss: 0.000100


Epoch 7/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 7/200 | Train Loss: 0.000189 | Val Loss: 0.000102


Epoch 8/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 8/200 | Train Loss: 0.000163 | Val Loss: 0.000095


Epoch 9/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 9/200 | Train Loss: 0.000149 | Val Loss: 0.000067


Epoch 10/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 10/200 | Train Loss: 0.000142 | Val Loss: 0.000060


Epoch 11/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 11/200 | Train Loss: 0.000128 | Val Loss: 0.000070


Epoch 12/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 12/200 | Train Loss: 0.000124 | Val Loss: 0.000053


Epoch 13/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 13/200 | Train Loss: 0.000117 | Val Loss: 0.000051


Epoch 14/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 14/200 | Train Loss: 0.000112 | Val Loss: 0.000069


Epoch 15/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 15/200 | Train Loss: 0.000107 | Val Loss: 0.000063


Epoch 16/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 16/200 | Train Loss: 0.000104 | Val Loss: 0.000049


Epoch 17/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 17/200 | Train Loss: 0.000109 | Val Loss: 0.000064


Epoch 18/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 18/200 | Train Loss: 0.000095 | Val Loss: 0.000062


Epoch 19/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 19/200 | Train Loss: 0.000089 | Val Loss: 0.000070


Epoch 20/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 20/200 | Train Loss: 0.000085 | Val Loss: 0.000083


Epoch 21/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 21/200 | Train Loss: 0.000082 | Val Loss: 0.000067


Epoch 22/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 22/200 | Train Loss: 0.000079 | Val Loss: 0.000067


Epoch 23/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 23/200 | Train Loss: 0.000075 | Val Loss: 0.000073


Epoch 24/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 24/200 | Train Loss: 0.000076 | Val Loss: 0.000083


Epoch 25/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 25/200 | Train Loss: 0.000072 | Val Loss: 0.000091


Epoch 26/200 (Train):   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 26/200 | Train Loss: 0.000070 | Val Loss: 0.000075

Early stopping triggered at epoch 26 (no val improvement for 10 epochs).
Training complete.


Evaluating NW-RNN (Regression):   0%|          | 0/4 [00:00<?, ?it/s]


Test Results:
  MSE: 0.000204
  MAE: 0.005415
  R² : 0.4034


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = run_reservoir_experiments_regression(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    input_size=input_size,
    device=device,
    hidden_size=100,
    ridge_alphas=(0.1, 1.0, 10.0)
)

Running regression experiments on device: cuda

--- 1) NW-ESN (frozen reservoir) grid ---
Total NW-ESN combos: 360


NW-ESN Grid:   0%|          | 0/360 [00:00<?, ?it/s]

NW-ESN best: MSE=0.008916 | Params={'w': 1.0, 'beta': 1.0, 'p': 0.9, 'alpha': 1.0, 'Kp0': 0.0001, 'Kd0': 0.5, 'eta_p': 10.0, 'eta_d': 1.0, 'gamma': 0.8, 'delta_t': 0.001, 'rescale_s': 5.0}

--- 2) ESN (frozen reservoir) grid ---
Total ESN combos: 108


ESN Grid:   0%|          | 0/108 [00:00<?, ?it/s]

ESN best: MSE=0.008622 | Params={'w': 0.1, 'beta': 0.001, 'p': 0.6, 'a': 0.2, 'alpha': 10.0}

--- 3) Final Test Evaluation ---


Testing NW-ESN:   0%|          | 0/4 [00:00<?, ?it/s]

  NW-ESN: Test MSE=0.009204 | MAE=0.078964 | R²=-25.8673 | Val MSE(best)=0.008916


Testing ESN:   0%|          | 0/4 [00:00<?, ?it/s]

  ESN: Test MSE=0.008328 | MAE=0.074244 | R²=-23.3091 | Val MSE(best)=0.008622


In [19]:
import math
import random
import numpy as np
import torch
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("--- Generating Final Model Comparison (REGRESSION) ---")
seed = 42
random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def eval_regression(model, loader, device):
    model.eval()
    preds, targs = [], []
    with torch.no_grad():
        for Xb, yb in tqdm(loader, desc="Evaluating", leave=False):
            Xb, yb = Xb.to(device), yb.to(device)
            yhat = model(Xb)                 # (B,1) or (B,)
            preds.append(yhat.detach().cpu().numpy())
            targs.append(yb.detach().cpu().numpy())
    yp = np.concatenate(preds, axis=0).squeeze()
    yt = np.concatenate(targs, axis=0).squeeze()
    mse = mean_squared_error(yt, yp)
    mae = mean_absolute_error(yt, yp)
    r2  = r2_score(yt, yp)
    return mse, mae, r2

def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

summary_rows = []  # (name, test_mse, test_mae, test_r2, total_p, train_p)

# ---- 1) Standard RNN regressor ----
std_rnn_regressor.eval()
rnn_mse, rnn_mae, rnn_r2 = eval_regression(std_rnn_regressor, test_loader, device)
rnn_total_p, rnn_train_p = count_parameters(std_rnn_regressor)
summary_rows.append(("RNN", rnn_mse, rnn_mae, rnn_r2, rnn_total_p, rnn_train_p))

# ---- 2) NW-RNN regressor ----
nw_rnn_regressor.eval()
nwrnn_mse, nwrnn_mae, nwrnn_r2 = eval_regression(nw_rnn_regressor, test_loader, device)
nwrnn_total_p, nwrnn_train_p = count_parameters(nw_rnn_regressor)
summary_rows.append(("NW-RNN (ours)", nwrnn_mse, nwrnn_mae, nwrnn_r2, nwrnn_total_p, nwrnn_train_p))

if 'results' in globals():
    for model_name in ['ESN', 'NW-ESN']:
        if model_name in results:
            rec = results[model_name]
            test_mse = rec['test_mse']
            test_mae = rec['test_mae']
            test_r2  = rec['test_r2']
            params   = rec['params']

            # Reconstruct model shape to count params
            if model_name == 'ESN':
                a = params['a']
                esn_tmp = ESN_Network(input_size=input_size, hidden_size=N_NEURONS, output_size=1,
                                      hp={'leaking_rate': a}).to(device)
                tot_p, tr_p = count_parameters(esn_tmp)
                summary_rows.append(("ESN", test_mse, test_mae, test_r2, tot_p, tr_p))
                del esn_tmp
            else:  # NW-ESN
                # minimal hp needed to construct NW_Network
                hp = {k: params[k] for k in ('Kp0','Kd0','eta_p','eta_d','gamma','delta_t','rescale_s')}
                nw_esn_tmp = NW_Network(input_size=input_size, hidden_size=N_NEURONS, output_size=1, hp=hp).to(device)
                tot_p, tr_p = count_parameters(nw_esn_tmp)
                summary_rows.append(("NW-ESN (ours)", test_mse, test_mae, test_r2, tot_p, tr_p))
                del nw_esn_tmp

# ---- Pretty print ----
print("\n--- Final Model Comparison (Regression) ---")
print("==========================================================================================")
print(f"{'Model':<16} | {'Test MSE':>10} | {'Test MAE':>9} | {'Test R²':>7} | {'Total Params':>14} | {'Trainable':>10}")
print("------------------------------------------------------------------------------------------")
order = ['ESN', 'NW-ESN (ours)', 'RNN', 'NW-RNN (ours)']
# fallback to whatever is present
present = {name for (name, *_rest) in summary_rows}
for name in order + [n for n in present if n not in order]:
    for row in summary_rows:
        if row[0] == name:
            _, mse, mae, r2, tot_p, tr_p = row
            print(f"{name:<16} | {mse:>10.6f} | {mae:>9.6f} | {r2:>7.4f} | {tot_p:>14,} | {tr_p:>10,}")
print("==========================================================================================")


--- Generating Final Model Comparison (REGRESSION) ---


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


--- Final Model Comparison (Regression) ---
Model            |   Test MSE |  Test MAE | Test R² |   Total Params |  Trainable
------------------------------------------------------------------------------------------
ESN              |   0.008328 |  0.074244 | -23.3091 |         10,301 |     10,301
NW-ESN (ours)    |   0.009204 |  0.078964 | -25.8673 |         10,301 |     10,301
RNN              |   0.000333 |  0.005695 |  0.0269 |         10,601 |     10,601
NW-RNN (ours)    |   0.000204 |  0.005415 |  0.4034 |         10,301 |     10,301
