NOTE: This notebook assumes that you have downloaded the competition data and saved it in the `./data/speed-and-structure-train-data` and `./data/speed-and-structure-train-data-extended` directories. The `1_eda.ipynb` and `2_data_gen.ipynb` notebooks must also be run before this notebook to generate the `fold_info_all_with_synth.csv` file.

# Speed and Structure Competition

## Part 3: Training

---

This notebook runs one fold of training based on the `cfg` from `config.py`. My final submission used 3 folds of training. I only changed the `cfg.holdout_idx` variable to 1, 2, and 3. Therefore, for reproduction, you must run this notebook 3 times. The resulting models are saved in the `./model_checkpoints` directory.

---
### Some notes for reproducibility
- The `config.py` file is the same as in my final submission. You just need to change the `cfg.holdout_idx` variable and rerun.
- Originally, I used batch_size=2 and grad_accum=4, giving an effective batch size of 8. Depending on your GPU RAM, you may want to change this. The model is not affected by batch statistics, so the result should be theoretically equivalent.
- The dataset class loads all the data into memory up front. That is not much of a problem initially, but if you want to generate a lot of new data, you may want to modify this class to load data lazily.
- `cfg.target_len` is the number of input time steps to interpolate down to (e.g. 10001 -> 2048 for the original data or 6461 -> 2048 for the synthetic data). You can increase this value as much as your GPU can handle. I realize now that this naming is terrible, sorry about that :(
- Although the training is seeded, there is still randomness from data augmentation (horizontal flip), and `set_seed` leaves cuDNN in its non-deterministic benchmark mode.

In [None]:
import os
import random
from copy import deepcopy
from typing import Tuple, List

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy.interpolate import interp1d

from config import cfg
from model import Net, ModelEMA

In [None]:
# Directory that receives the best checkpoint written by the training loop.
# It is created up front (no error if it already exists) so saving cannot
# fail on a missing directory.
OUTPUT_DIR = "./model_checkpoints"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to benchmark mode with
    the deterministic flag turned off, so runs are seeded but cuDNN is not
    forced to be deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The individual generators are independent, so they can be seeded in one pass.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = True
    cudnn.deterministic = False

In [None]:
class CustomDataset(Dataset):
    """Fold-aware, fully in-memory dataset of receiver recordings and velocity labels.

    Every sample listed in the fold CSV for the requested split is loaded (and
    optionally resampled) inside ``__init__``; ``__getitem__`` only applies the
    random flip and the normalisations selected in ``cfg``.

    Args:
        mode: ``'train'`` selects the rows whose ``fold`` differs from
            ``cfg.holdout_idx``; any other value selects the held-out rows.
        fold_info_file: CSV with a ``fold`` column, a ``vel_file`` column and
            one ``rec_<id>`` column per receiver id, each holding a ``.npy`` path.
        cfg: Configuration object. This class reads ``holdout_idx``,
            ``target_len``, ``one_channel``, ``horizontal_flip``, ``hflip_prob``,
            ``x_norm``/``x_mean``/``x_std``, ``y_norm``/``y_median``/``y_std`` and
            ``y_min_max_norm``/``y_min``/``y_max`` from it.
    """

    # Number of columns each resampled recording is zero-padded to.
    PADDED_WIDTH = 32

    def __init__(self, mode='train', fold_info_file=None, cfg=None):
        self.mode = mode
        self.data = []
        self.labels = []
        self.receiver_ids = [1, 75, 150, 225, 300]
        self.fold_info_file = fold_info_file
        self.holdout_idx = cfg.holdout_idx
        self.cfg = cfg

        self.load_data()

    def _load_receiver(self, path):
        """Load one receiver recording; resample and pad it when ``cfg.target_len`` is set."""
        rec = np.load(path)  # original author's note: rec.shape = (10001, 31)

        target_len = self.cfg.target_len
        if target_len is None:
            return rec

        x_original = np.linspace(0, 1, rec.shape[0])
        x_new = np.linspace(0, 1, target_len)

        # Interpolate along axis 0 (rows), keeping columns unchanged
        interpolator = interp1d(x_original, rec, axis=0, kind='linear')
        rec = interpolator(x_new).astype(np.float32)

        # NOTE(review): the zero-padding is only applied on the resampling path,
        # exactly as in the original code — confirm that is intended.
        pad_cols = self.PADDED_WIDTH - rec.shape[1]
        return np.pad(rec, ((0, 0), (0, pad_cols)), mode='constant', constant_values=0)

    def load_data(self):
        """Read the fold CSV and load every label and recording of the selected split.

        Uses the ``cfg`` handed to the constructor (not a module-level one), so
        the dataset behaves according to the configuration it was built with.
        """
        fold_df = pd.read_csv(self.fold_info_file)
        is_holdout = fold_df['fold'] == self.holdout_idx
        subset = fold_df[~is_holdout] if self.mode == 'train' else fold_df[is_holdout]
        rec_columns = [f"rec_{rec_id}" for rec_id in self.receiver_ids]

        self.labels.extend(np.load(vel_file) for vel_file in subset['vel_file'].tolist())

        for rec_paths in subset[rec_columns].values.tolist():
            recs = [self._load_receiver(rec_path) for rec_path in rec_paths]
            if self.cfg.one_channel:
                # Receivers side by side along the column axis, with a leading axis of size 1.
                self.data.append(np.expand_dims(np.concatenate(recs, axis=1), 0))
            else:
                # One leading-axis slice per receiver.
                self.data.append(np.stack(recs, axis=0))

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(data, label)`` for sample ``idx`` as freshly allocated arrays.

        In ``'train'`` mode, when ``cfg.horizontal_flip`` is on, the sample is
        flipped with probability ``cfg.hflip_prob``: ``data`` is reversed along
        its first and last axes and ``label`` along its first axis.
        Normalisations are then applied in the order x_norm, y_norm,
        y_min_max_norm.
        """
        # No defensive copy is needed here: slicing yields views, the arithmetic
        # below allocates new arrays, and the return statement copies, so the
        # cached arrays are never modified or handed out.
        data = self.data[idx]
        label = self.labels[idx]

        if self.mode == 'train' and self.cfg.horizontal_flip and np.random.random() < self.cfg.hflip_prob:
            data = data[::-1, :, ::-1]
            label = label[::-1, ...]

        if self.cfg.x_norm:
            data = (data - self.cfg.x_mean) / self.cfg.x_std

        if self.cfg.y_norm:
            label = (label - self.cfg.y_median) / self.cfg.y_std

        if self.cfg.y_min_max_norm:
            label = (label - self.cfg.y_min) / (self.cfg.y_max - self.cfg.y_min)

        # The copy also turns the negatively-strided flipped views into contiguous arrays.
        return data.copy(), label.copy()

### Training Loss
Directly optimizing for MAPE loss seems to give slightly better results. We just need to check if there is any normalization and undo that before calculating the loss.

In [None]:
def mape_loss(y_true, y_pred):
    """Mean absolute percentage error computed on de-normalised values.

    The normalisations enabled in ``cfg`` are undone in the reverse of the
    order the dataset applies them (dataset: ``y_norm`` then
    ``y_min_max_norm``), so the loss stays correct when both, one or neither
    of them is enabled.

    Args:
        y_true: Ground-truth tensor in normalised space.
        y_pred: Predicted tensor in normalised space.

    Returns:
        Scalar tensor ``mean(|label - output| / label)``. There is no epsilon
        in the denominator, so the de-normalised labels are assumed to be
        non-zero.
    """
    output, label = y_pred, y_true

    # Applied last by the dataset, hence undone first.
    if cfg.y_min_max_norm:
        span = cfg.y_max - cfg.y_min
        output = output * span + cfg.y_min
        label = label * span + cfg.y_min

    if cfg.y_norm:
        output = output * cfg.y_std + cfg.y_median
        label = label * cfg.y_std + cfg.y_median

    return torch.mean(torch.abs((label - output) / label))


# MAPE is always reported as the metric, whichever training loss is selected.
criterion_metric = mape_loss

### Model Initialization

In [None]:
# Seed all RNGs before the network is constructed.
set_seed(cfg.seed)

# Collect the constructor arguments from the config first, then build the
# network and move it to the configured device.
net_kwargs = dict(
    backbone=cfg.backbone,
    pretrained=cfg.backbone_pretrained,
    fuse_ch=cfg.fuse_ch,
    one_channel=cfg.one_channel,
    norm_layer=cfg.norm_layer,
    dropout=cfg.dropout,
    y_min_max_norm=cfg.y_min_max_norm,
    horizontal_tta=cfg.horizontal_tta,
)
model = Net(**net_kwargs).to(cfg.device)

In [None]:
# Select the training objective named by cfg.loss; any other name is unsupported.
if cfg.loss not in ('mape', "mae"):
    raise NotImplementedError
criterion = mape_loss if cfg.loss == 'mape' else nn.L1Loss()

In [None]:
# Optionally warm-start the network from a previously saved state dict,
# mapping the stored tensors onto the configured device.
checkpoint_path = cfg.load_model_path
if checkpoint_path is not None:
    state_dict = torch.load(checkpoint_path, map_location=cfg.device)
    model.load_state_dict(state_dict)
    print(f"Loaded model from {cfg.load_model_path}")

### EMA model
An Exponential Moving Average (EMA) of the model weights is a simple, easy-to-plug-in technique. See [here](https://timm.fast.ai/training_modelEMA).

In [None]:
# ema_model stays None when EMA is disabled; the training loop checks for that.
if not cfg.ema:
    ema_model = None
else:
    print("Initializing EMA model..")
    ema_model = ModelEMA(model, decay=cfg.ema_decay, device=cfg.device)

In [None]:
# Build the training split first, then the held-out split, from the same fold file.
train_dataset, val_dataset = (
    CustomDataset(mode=split, fold_info_file=cfg.fold_info_file, cfg=cfg)
    for split in ('train', 'val')
)

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=cfg.num_workers)
train_dl = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dl = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Map the configured optimizer name to its class; unknown names are rejected.
optimizer_classes = {"adam": torch.optim.Adam, "adamw": torch.optim.AdamW}
if cfg.optimizer not in optimizer_classes:
    raise NotImplementedError
optim = optimizer_classes[cfg.optimizer](
    model.parameters(),
    lr=cfg.lr,
    weight_decay=cfg.weight_decay,
)

# Cosine annealing is the only supported schedule. Its horizon is counted in
# batches (batches per epoch times epochs); without it no scheduler is used.
if cfg.scheduler != "cosine":
    scheduler = None
else:
    t_max = len(train_dl) * cfg.n_epochs
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=t_max, eta_min=cfg.eta_min)

### The training loop

In [None]:
# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_val_loss = float('inf')
print(f"Started training on {cfg.device}")
# Epoch 0 skips training and only runs validation, giving a baseline score
# (and an initial checkpoint) before any weight update.
for epoch in range(cfg.n_epochs + 1):
    if epoch != 0:
        print(f"Epoch: {epoch}")
        model.train()
        train_losses = []
        train_mape_losses = []
        # Also clears any gradients left over from an incomplete accumulation
        # group at the end of the previous epoch.
        # NOTE(review): those trailing micro-batches are therefore never applied
        # when len(train_dl) is not a multiple of cfg.grad_accum.
        optim.zero_grad()

        for batch_idx, (data, label) in enumerate(tqdm(train_dl, leave=False)):
            data = data.to(cfg.device)
            label = label.to(cfg.device)

            output = model(data)
            loss = criterion(label, output)
            # NOTE(review): the loss is not divided by cfg.grad_accum, so the
            # accumulated gradient is a sum over micro-batches, not a mean.
            # Left as is to keep the run identical to the original.
            loss.backward()

            # Step the optimizer only once every cfg.grad_accum micro-batches.
            if ( batch_idx + 1 ) % cfg.grad_accum == 0:
                if cfg.grad_clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
                optim.step()
                optim.zero_grad()

            cur_train_loss = loss.item()
            # The scheduler advances once per batch (not per optimizer step).
            if scheduler is not None:
                scheduler.step()
            train_losses.append(cur_train_loss)

            # The EMA copy is refreshed after every batch.
            if ema_model is not None:
                ema_model.update(model)

            # The metric is only logged, so skip building an autograd graph for it.
            with torch.no_grad():
                cur_mape_loss = criterion_metric(label, output)

            train_mape_losses.append(cur_mape_loss.item())
            if batch_idx % cfg.print_every == 0:
                # Fall back to the optimizer's own learning rate when no
                # scheduler is configured, instead of crashing on None.
                if scheduler is not None:
                    current_lr = scheduler.get_last_lr()[0]
                else:
                    current_lr = optim.param_groups[0]["lr"]
                print(f"Epoch {epoch+1}/{cfg.n_epochs+1}, Batch {batch_idx}/{len(train_dl)}, ")
                print(f"Loss: {np.mean(train_losses):.4f}, MAPE Loss: {np.mean(train_mape_losses):.4f}, LR: {current_lr:.3e}")
                # Running averages restart after every report.
                train_losses = []
                train_mape_losses = []
                
    model.eval()
    val_losses = []
    val_mape_losses = []
    with torch.no_grad():
        for batch_idx, (data, label) in enumerate(tqdm(val_dl, leave=False)):
            data = data.to(cfg.device)
            label = label.to(cfg.device)
            # Validate with the EMA weights when EMA is enabled.
            if ema_model is not None:
                output = ema_model.module(data)
            else:
                output = model(data)
            loss = criterion(label, output)
            cur_val_loss = loss.item()
            val_losses.append(cur_val_loss)
            cur_mape_loss = criterion_metric(label, output)

            val_mape_losses.append(cur_mape_loss.item())
    val_mape_loss = np.mean(val_mape_losses)
    val_loss = np.mean(val_losses)
    print(f"Epoch {epoch+1}/{cfg.n_epochs+1}, Val Loss: {val_loss:.4f}")
    print(f"Epoch {epoch+1}/{cfg.n_epochs+1}, Val MAPE Loss: {val_mape_loss:.4f}")

    # Model selection uses the training criterion's validation value, not the MAPE metric.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print(f"Saving best model")
        # NOTE(review): the file name is keyed by cfg.seed only, so runs that
        # differ just in cfg.holdout_idx overwrite each other's checkpoint; the
        # two branches also use different extensions (.pt vs .pth). Left
        # unchanged because other code may depend on these names.
        if ema_model is not None:
            torch.save(ema_model.module.state_dict(), f'{OUTPUT_DIR}/best_model_{cfg.seed}.pt')
        else:
            torch.save(model.state_dict(), f"{OUTPUT_DIR}/best_model_{cfg.seed}.pth")
