### **ℹ️Update Info(2024/01/23)**

* **Forked from these great original kernels**
    * [Inference] https://www.kaggle.com/code/ttahara/hms-hbac-resnet34d-baseline-inference
    * [Training] https://www.kaggle.com/code/ttahara/hms-hbac-resnet34d-baseline-training/


* **My Train Info**
    * tf_efficientnetv2_s
    * Split: 10-fold (StratifiedGroupKFold)
    * Resize to 512x512
    * CV:0.6753573036121466

## Import

In [None]:
import sys
import os
import gc
import copy
import yaml
import random
import shutil
from time import time
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold

import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp

import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
# Expose only GPU index 0 to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
# --- directory layout (everything hangs off the parent of the working dir) ---
ROOT = Path.cwd().parent
INPUT, OUTPUT, SRC = ROOT / "input", ROOT / "output", ROOT / "src"

# Competition data and the dataset holding the trained fold checkpoints.
DATA = INPUT / "hms-harmful-brain-activity-classification"
TRAIN_SPEC, TEST_SPEC = DATA / "train_spectrograms", DATA / "test_spectrograms"
TRAINED_MODEL = INPUT / "hms-train-a-20111-20240118113811"

# Scratch area for the spectrograms converted to .npy files.
TMP = ROOT / "tmp"
TRAIN_SPEC_SPLIT = TMP / "train_spectrograms_split"
TEST_SPEC_SPLIT = TMP / "test_spectrograms_split"
# TMP must be created first because the other two live inside it.
for _scratch_dir in (TMP, TRAIN_SPEC_SPLIT, TEST_SPEC_SPLIT):
    _scratch_dir.mkdir(exist_ok=True)


# --- run constants ---
# NOTE: the spelling "RANDAM" is kept as-is because it is a module-level name.
RANDAM_SEED = 1086
CLASSES = [
    "seizure_vote",
    "lpd_vote",
    "gpd_vote",
    "lrda_vote",
    "grda_vote",
    "other_vote",
]
N_CLASSES = len(CLASSES)
FOLDS = list(range(10))
N_FOLDS = len(FOLDS)

## Read Data, Convert Spectrograms to Numpy file

In [None]:
# Load the test metadata table; later cells read its "spectrogram_id" and
# "eeg_id" columns.
test = pd.read_csv(DATA / "test.csv")

In [None]:
# Preview the first rows of the test metadata (notebook display only).
test.head()

### convert spectrogram files to numpy files

In [None]:
# Re-save every test spectrogram as a float32 .npy array so the dataset class
# can load it with np.load.
for spec_id in test["spectrogram_id"]:
    parquet_path = TEST_SPEC / f"{spec_id}.parquet"
    npy_path = TEST_SPEC_SPLIT / f"{spec_id}.npy"

    spec = pd.read_parquet(parquet_path)

    # NaNs become 0, the first column is dropped (presumably the time column;
    # verify against the parquet schema), and the table is transposed so the
    # result is laid out as (Hz, Time) = (400, 300).
    values_without_first_col = spec.fillna(0).values[:, 1:]
    spec_arr = values_without_first_col.T.astype("float32")

    np.save(npy_path, spec_arr)

## Definition: Model, Dataset

### model

In [None]:
class HMSHBACSpecModel(nn.Module):
    """Wrap a timm network so it can be used as the spectrogram classifier.

    The wrapped network is stored on ``self.model``; ``forward`` simply
    delegates to it.
    """

    def __init__(
            self,
            model_name: str,
            pretrained: bool,
            in_channels: int,
            num_classes: int,
        ):
        """Build the timm network.

        Args:
            model_name: Name handed to ``timm.create_model``.
            pretrained: Forwarded to timm's ``pretrained`` argument.
            in_channels: Forwarded to timm as ``in_chans``.
            num_classes: Forwarded to timm's ``num_classes`` argument.
        """
        super().__init__()
        backbone = timm.create_model(
            model_name=model_name,
            pretrained=pretrained,
            num_classes=num_classes,
            in_chans=in_channels,
        )
        self.model = backbone

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        return self.model(x)

## dataset

In [None]:
# Aliases used in the dataset signature below.
FilePath = tp.Union[str, Path]
Label = tp.Union[int, float, np.ndarray]


class HMSHBACSpecDataset(torch.utils.data.Dataset):
    """Dataset that loads saved spectrogram arrays and pairs them with labels.

    Each item is a dict ``{"data": <transformed image>, "target": <label>}``.
    """

    def __init__(
        self,
        image_paths: tp.Sequence[FilePath],
        labels: tp.Sequence[Label],
        transform: A.Compose,
    ):
        """Store the paths, the labels and the transform to apply per item."""
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Number of items, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, index: int):
        """Load, log-scale, standardise and transform the item at ``index``."""
        target = self.labels[index]

        # Saved array, laid out as (Hz, Time) = (400, 300).
        spec = np.load(self.image_paths[index])

        # Clip to [e^-4, e^8] first so the logarithm is always finite.
        spec = np.log(np.clip(spec, np.exp(-4), np.exp(8)))

        # Standardise each image on its own; eps guards a zero std.
        eps = 1e-6
        centered = spec - spec.mean(axis=(0, 1))
        spec = centered / (centered.std(axis=(0, 1)) + eps)

        # Append a trailing channel axis: (Hz, Time) -> (Hz, Time, Channel).
        spec = spec[..., None]

        return {"data": self._apply_transform(spec), "target": target}

    def _apply_transform(self, img: np.ndarray):
        """Run ``self.transform`` on ``img`` and return its "image" entry."""
        return self.transform(image=img)["image"]

# Inference Test Data

In [None]:
class CFG:
    """Run settings shared by the inference cells of this notebook."""
    # Model identifier handed to timm.create_model via HMSHBACSpecModel.
    model_name = "tf_efficientnetv2_s.in21k_ft_in1k"
    # Side length used for both height and width of the test-time resize.
    img_size = 512
    # NOTE(review): max_epoch, lr, weight_decay, es_patience, seed,
    # deterministic and enable_amp are not read by the inference code visible
    # here; presumably carried over from the training notebook - confirm.
    max_epoch = 9
    # Batch size of the test DataLoader.
    batch_size = 32
    lr = 1.0e-03
    weight_decay = 1.0e-02
    es_patience =  5
    seed = 1086
    deterministic = True
    enable_amp = True
    # Device string converted with torch.device for model and batches.
    device = "cuda"

In [None]:
def to_device(
    tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]],
    device: torch.device, *args, **kwargs
):
    """Move a tensor, or a container of tensors, to ``device``.

    Args:
        tensors: A single tensor, a tuple/list of tensors, or a dict whose
            values are tensors.
        device: Target device.
        *args, **kwargs: Forwarded unchanged to every ``Tensor.to`` call
            (e.g. ``dtype=...`` or ``non_blocking=True``).

    Returns:
        The same kind of container that was passed in (tuple -> tuple,
        list -> list, dict -> dict with the same keys), or a single tensor.
    """
    if isinstance(tensors, tuple):
        # Materialise a real tuple: a bare generator expression would be
        # exhausted after one pass and supports neither len() nor indexing.
        return tuple(t.to(device, *args, **kwargs) for t in tensors)
    if isinstance(tensors, list):
        return [t.to(device, *args, **kwargs) for t in tensors]
    if isinstance(tensors, dict):
        return {
            k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    return tensors.to(device, *args, **kwargs)

    
def get_test_path_label(test: pd.DataFrame):
    """Collect the .npy path and a placeholder label for every test row.

    Args:
        test: Frame with a "spectrogram_id" column.

    Returns:
        Dict with "image_paths" (one path under TEST_SPEC_SPLIT per row) and
        "labels" (one float32 vector of six -1 values per row).
    """
    spec_ids = test["spectrogram_id"].values
    npy_paths = [TEST_SPEC_SPLIT / f"{sid}.npy" for sid in spec_ids]

    # The test set has no targets, so every row gets the same dummy vector.
    dummy_targets = np.full((len(test), 6), -1, dtype="float32")

    return {"image_paths": npy_paths, "labels": list(dummy_targets)}

def get_test_transforms(CFG):
    """Build the test-time pipeline: resize to a square, then to tensor.

    Args:
        CFG: Settings object; only ``CFG.img_size`` is read.
    """
    resize = A.Resize(p=1.0, height=CFG.img_size, width=CFG.img_size)
    to_tensor = ToTensorV2(p=1.0)
    return A.Compose([resize, to_tensor])

In [None]:
def run_inference_loop(model, loader, device):
    """Run ``model`` over ``loader`` and return softmax probabilities.

    Args:
        model: Network to evaluate; it is moved to ``device`` and put in
            eval mode.
        loader: Iterable of batches, each a mapping with a "data" entry.
        device: Device the model and the inputs are moved to.

    Returns:
        One numpy array holding the per-batch softmax outputs (taken over
        dim 1) concatenated in loader order.
    """
    model.to(device)
    model.eval()

    batch_probs = []
    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = to_device(batch["data"], device)
            probs = model(inputs).softmax(dim=1)
            batch_probs.append(probs.detach().cpu().numpy())

    return np.concatenate(batch_probs)

In [None]:
# One slab of predictions per fold; they are averaged in a later cell.
test_preds_arr = np.zeros((N_FOLDS, len(test), N_CLASSES))

# The loader is built once and reused for every fold's model.
test_path_label = get_test_path_label(test)
test_transform = get_test_transforms(CFG)
test_dataset = HMSHBACSpecDataset(**test_path_label, transform=test_transform)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)

device = torch.device(CFG.device)

# Iterate over FOLDS itself (not range(N_FOLDS)) so the checkpoint file name
# always matches the configured fold ids, even if FOLDS becomes a subset.
for fold_idx, fold_id in enumerate(FOLDS):
    print(f"\n[fold {fold_id}]")

    # Rebuild the architecture without pretrained weights, then load this
    # fold's checkpoint.
    model_path = TRAINED_MODEL / f"best_model_fold{fold_id}.pth"
    model = HMSHBACSpecModel(
        model_name=CFG.model_name, pretrained=False, num_classes=N_CLASSES, in_channels=1)
    model.load_state_dict(torch.load(model_path, map_location=device))

    # Predict the whole test set with this fold's model.
    test_pred = run_inference_loop(model, test_loader, device)
    test_preds_arr[fold_idx] = test_pred

    # Drop the model, collect garbage first, and only then release cached
    # CUDA blocks, so memory freed by the collector is returned as well.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Make Submission

In [None]:
# Ensemble: average the per-fold probabilities over the fold axis.
test_pred = np.mean(test_preds_arr, axis=0)

# One column per class, with the eeg_id column placed in front.
class_prob_df = pd.DataFrame(test_pred, columns=CLASSES)
test_pred_df = pd.concat([test[["eeg_id"]], class_prob_df], axis=1)

In [None]:
# Use the sample submission's eeg_id column to fix the row order, then attach
# the predictions with a left join on eeg_id.
smpl_sub = pd.read_csv(DATA / "sample_submission.csv")
submission_ids = smpl_sub[["eeg_id"]]

sub = submission_ids.merge(test_pred_df, on="eeg_id", how="left")

sub.to_csv("submission.csv", index=False)

# Preview the written table (notebook display only).
sub.head()

# EOF