# HMS - PyTorch Baseline Training

**Comments welcome!**

One of my goals in this competition is to learn more PyTorch.

This is a **training** notebook; the respective inference notebook is [HMS - PyTorch Baseline Inference](https://www.kaggle.com/code/morodertobias/hms-pytorch-baseline-inference) and the saved models are uploaded as a dataset [HMS - PyTorch Baseline Training Dataset](https://www.kaggle.com/datasets/morodertobias/hms-pytorch-baseline-training-dataset) to keep versions clean.

The current version fine-tunes EfficientNetB0 from noisy-student weights using 5-fold cross-validation. It uses squashed spectrograms, as already done in the reference notebooks. I try to write the code in my own way, but naturally the result is similar to the references.


## References
- [HMS baseline_resnet34d(512*512 Training 5 folds)](https://www.kaggle.com/code/yunsuxiaozi/hms-baseline-resnet34d-512-512-training-5-folds)
- [HMS: Train EfficientNetB0](https://www.kaggle.com/code/andreasbis/hms-train-efficientnetb0/notebook)
- [HMS-HBAC: ResNet34d Baseline [Training]](https://www.kaggle.com/code/ttahara/hms-hbac-resnet34d-baseline-training/)


## Table of Contents
- [Imports](#Imports)
- [Config](#Config)
- [Load data](#Load-data)
- [Data Handling](#Data-Handling)
- [Model](#Model)
- [Training Utils](#Training-Utils)
- [Training](#Training)
- [OOF](#OOF)

# Imports

In [None]:
import os
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import timm

# Config

In [None]:
class CFG:
    """Run settings shared by all cells of this notebook."""

    # --- run switches ---
    seed = 888
    debug = False      # True: the training section keeps only the first 400 rows
    one_fold = False   # True: the fold loop stops after the first fold

    # --- input locations ---
    base_dir = pathlib.Path("/kaggle/input/hms-harmful-brain-activity-classification")
    path_train = base_dir / "train.csv"
    spec_dir = base_dir / "train_spectrograms"

    # --- image preparation ---
    transform = transforms.Resize((512, 512), antialias=False)

    # --- model / checkpoint naming ---
    model_name = "tf_efficientnet_b0_ns"
    ckpt_name = "tf_efficientnet_b0_ns_v1"

    # --- optimisation ---
    n_fold = 5
    epochs = 10
    batch_size = 16
    lr = 5e-4

# Load data
- We classify the spectrogram directly.
- As the label we use the votes aggregated over all slices of a spectrogram, normalized to sum to one.

In [None]:
# Read the training metadata. The location comes from CFG so that the path is
# defined in exactly one place (CFG.path_train == CFG.base_dir / "train.csv").
train_df = pd.read_csv(CFG.path_train)
train_df

In [None]:
# The target columns are exactly those whose name contains "_vote".
label_columns = [col for col in train_df.columns if "_vote" in col]
label_columns

In [None]:
# Add up the votes of all rows that share a spectrogram_id ...
vote_totals = train_df.groupby("spectrogram_id")[label_columns].sum()
# ... and scale every row so that its votes sum to one.
data = vote_totals.div(vote_totals.sum(axis=1), axis=0)
# Location of the parquet file that belongs to each spectrogram_id.
data["path"] = [CFG.spec_dir / f"{spec_id}.parquet" for spec_id in data.index]
# Turn spectrogram_id back into a regular column.
data = data.reset_index()
data

# Data Handling
- The spectrogram file is loaded; it contains spectrograms for 4 different regions.
- All together they are clipped, log-transformed and then standardized.
- Finally the result is resized as an image.
- Note: since the spectrograms sometimes have very different lengths, this creates a certain distortion.

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
def preprocess(x):
    """Clip, log-transform and standardize a spectrogram array.

    Values are limited to the range [exp(-6), exp(10)], the natural log is
    taken, and the result is shifted and scaled by the mean and standard
    deviation computed over the whole array (1e-6 is added to the standard
    deviation to avoid division by zero).
    """
    lower, upper = np.exp(-6), np.exp(10)
    log_spec = np.log(np.clip(x, lower, upper))
    return (log_spec - log_spec.mean()) / (log_spec.std() + 1e-6)


class SpecDataset(Dataset):
    """Dataset yielding one (spectrogram image, vote distribution) pair per row.

    `df` must provide a ``path`` column pointing to a parquet file and the
    columns listed in the module-level ``label_columns``.
    """

    def __init__(self, df, transform=CFG.transform):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        # Input: read the parquet file, replace missing values by -1, drop the
        # first column (presumably the time axis -- TODO confirm) and transpose.
        spec = pd.read_parquet(record.path).fillna(-1).values[:, 1:].T
        # Normalize and add a leading axis of size one in front of the 2-D array.
        image = torch.Tensor(preprocess(spec)[None, :])
        if self.transform:
            image = self.transform(image)

        # Output: the label columns of this row as a float32 tensor.
        votes = np.array(record.loc[label_columns].values, 'float32')
        return image, torch.Tensor(votes)

In [None]:
# Small dataset built from the first 50 rows, used for the sanity checks below.
ds = SpecDataset(df=data.iloc[:50])
ds, len(ds)

In [None]:
# Fetch the first sample and inspect shape and values of input and label.
x, y = ds[0]
x.shape, x, y.shape, y

In [None]:
# Batch the demo dataset; drop_last=True discards a final incomplete batch.
ld = DataLoader(dataset=ds, batch_size=CFG.batch_size, drop_last=True, num_workers=os.cpu_count())
ld, len(ld)

In [None]:
# Take the first batch from the loader; x and y are reused by later cells.
x, y = next(iter(ld))
x.shape, x, y.shape, y

In [None]:
# Display channel 0 of the first sample in the batch.
plt.imshow(x[0, 0])
plt.show()

# Model

In [None]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"DEVICE: {DEVICE}")

In [None]:
# GPU memory reported as allocated before the demo model is created.
torch.cuda.memory_allocated()

In [None]:
# Demo model: pretrained backbone with one input channel and 6 outputs,
# moved to DEVICE; the total number of parameter elements is printed.
model = timm.create_model(model_name=CFG.model_name, pretrained=True, num_classes=6, in_chans=1)
model.to(DEVICE)
num_parameter = sum(x.numel() for x in model.parameters())
print(f"Model has {num_parameter} parameters.")

In [None]:
# Forward the sample batch as a smoke test; y_out is passed to the loss
# functions below as logits.
y_out = model(x.to(DEVICE))
y_out

# Training Utils
- Create Kullback-Leibler Divergence loss from logits.
- Compute loss utility.

In [None]:
def KLDivLoss(logit, target):
    """Kullback-Leibler divergence loss computed from raw logits.

    The logits are converted to log-probabilities along dim 1 and passed to
    ``F.kl_div`` together with `target`, using ``reduction="batchmean"``.
    """
    return F.kl_div(
        input=F.log_softmax(logit, dim=1),
        target=target,
        reduction="batchmean",
    )

In [None]:
# from reference
def KL_loss(p, q):
    """Reference KL divergence: `p` holds target probabilities, `q` raw logits.

    `p` is clipped to [1e-15, 1 - 1e-15] before taking its log, `q` is turned
    into log-probabilities along dim 1, the per-row divergence is summed over
    dim 1 and the mean over rows is returned.
    """
    eps = 10 ** (-15)
    target = torch.clip(p, eps, 1 - eps)
    log_pred = nn.functional.log_softmax(q, dim=1)
    per_sample = (target * (target.log() - log_pred)).sum(dim=1)
    return per_sample.mean()

In [None]:
# Evaluate both loss implementations on the sample batch for comparison.
KLDivLoss(y_out, y.to(DEVICE)), KL_loss(p=y.to(DEVICE), q=y_out)

In [None]:
def compute_loss(model, data_loader):
    """Return the mean KL-divergence loss of `model` over `data_loader`.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating.

    ``KLDivLoss`` returns a per-sample mean for each batch, so every batch
    loss is weighted by its batch size before averaging. A plain mean of the
    batch losses would give a smaller final batch the same weight as a full
    one.

    Returns:
        The per-sample mean loss as a float, or NaN if the loader yields no
        batches.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for x, y in data_loader:
            y_pred = model(x.to(DEVICE))
            loss = KLDivLoss(y_pred, y.to(DEVICE))
            batch_size = len(x)
            total_loss += loss.item() * batch_size
            n_samples += batch_size
    if n_samples == 0:
        return float("nan")
    return total_loss / n_samples

In [None]:
# Smoke test of compute_loss on the small demo loader.
compute_loss(model, ld)

In [None]:
# GPU memory reported as allocated while the demo model and tensors exist.
torch.cuda.memory_allocated()

In [None]:
# Drop the demo model and tensors, then release PyTorch's cached GPU memory.
del model, x, y, y_out
torch.cuda.empty_cache()

In [None]:
# GPU memory reported as allocated after the cleanup above.
torch.cuda.memory_allocated()

# Training
- Uses a plain 5-fold training strategy.
- Runs for all epochs and checkpoints model weights if the validation loss improves.

In [None]:
from sklearn.model_selection import KFold

In [None]:
# In debug mode, train on the first 400 rows only.
if CFG.debug:
    data = data.iloc[:400]

In [None]:
%%time

# K-fold training: for every fold a fresh pretrained model is fine-tuned for
# CFG.epochs epochs, and its weights are saved whenever the validation loss
# improves. The best validation loss of each fold is collected in l_best_loss.
kf = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

l_best_loss = []
for fold, (iloc_train, iloc_valid) in enumerate(kf.split(data)):
    print(f"Fold {fold}:")

    # prepare data
    train_ds = SpecDataset(df=data.iloc[iloc_train])
    valid_ds = SpecDataset(df=data.iloc[iloc_valid])
    train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=CFG.batch_size, num_workers=os.cpu_count(), drop_last=True)
    valid_loader = DataLoader(dataset=valid_ds, batch_size=CFG.batch_size, num_workers=os.cpu_count())

    # init training
    model = timm.create_model(model_name=CFG.model_name, pretrained=True, num_classes=6, in_chans=1)
    model.to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
    scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=CFG.epochs)
    best_loss = float("inf")
    history = []

    # run training
    for epoch in tqdm(range(CFG.epochs)):
        model.train()
        l_loss = []
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            y_pred = model(x)
            loss = KLDivLoss(y_pred, y)
            l_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Advance the cosine schedule once per epoch (T_max is given in
        # epochs); without this call the learning rate never changes.
        scheduler.step()

        train_loss = np.mean(l_loss)
        valid_loss = compute_loss(model, valid_loader)
        history.append((epoch, train_loss, valid_loss))
        print(f"Epoch {epoch}")
        print(f"Train Loss: {train_loss:>10.6f}, Valid Loss: {valid_loss:>10.6f}")
        # checkpoint only when the validation loss improves
        if valid_loss < best_loss:
            print(f"Loss improves from {best_loss:>10.6f} to {valid_loss:>10.6f}")
            torch.save(model.state_dict(), f"{CFG.ckpt_name}__{fold}.pt")
            best_loss = valid_loss
    print(f"\nBest loss Model training with {best_loss}\n")
    l_best_loss.append(best_loss)

    # plot train and validation loss per epoch for this fold
    history = pd.DataFrame(history, columns=["epoch", "loss", "val_loss"]).set_index("epoch")
    history.plot(subplots=True, layout=(1, 2), sharey="row", figsize=(14, 6))
    plt.show()

    if CFG.one_fold:
        break

# OOF

In [None]:
# Best validation loss of every trained fold and their mean.
l_best_loss, np.mean(l_best_loss)