#### Hello. Thank you for reading my notebook. I am planning to add PyTorch code to this notebook.

# 01 Pathing and EDA

In [None]:
import os
# Collect the path of every CSV file found anywhere under the Kaggle input directory.
directory_path = '/kaggle/input'
csv_file_paths = []

for root, dirs, files in os.walk(directory_path):
    csv_file_paths.extend(
        os.path.join(root, name) for name in files if name.endswith(".csv")
    )

# Show the discovered CSV files, one per line.
print(*csv_file_paths, sep="\n", end="\n" if csv_file_paths else "")

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the competition's metadata tables: training labels, test metadata,
# and the sample submission file.
train_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
sample_submission = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv')

# Last expression of the cell: renders the training table as the cell output.
train_df

## 01.1: Pie Chart for Data Distribution

In [None]:
# Share of each expert_consensus label among the training rows, drawn as a pie chart.
consensus_counts = train_df['expert_consensus'].value_counts()

plt.figure(figsize=(8, 6))
consensus_counts.plot.pie(autopct='%1.1f%%')
plt.title('Distribution of expert_consensus')
plt.xlabel('Expert Consensus')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# List the distinct consensus labels present in the training data.
unique_expert_consensus = train_df['expert_consensus'].unique()
print("Unique values in 'expert_consensus' column:", unique_expert_consensus)

> Check the percentage of each consensus label in the training data. The labels are approximately equally distributed.

## 01.2: Correlation Heat Map for Consensus

In [None]:
# One-hot encoding of expert_consensus. It is kept because later cells may use it,
# but the heatmap below does not: the correlation is computed between the six
# vote-count columns only.
train_df_encoded = pd.get_dummies(train_df, columns=['expert_consensus'], drop_first=True)
correlation_matrix = train_df[['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# The title now describes what is actually plotted (the matrix holds no
# expert_consensus column).
plt.title('Correlation Heatmap between vote columns')
plt.show()

## 01.3 Kernel Density Plot for Label Offset Seconds

In [None]:
# Kernel density estimate of where, within each spectrogram, the labelled windows start.
plt.figure(figsize=(12, 6))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(train_df['spectrogram_label_offset_seconds'], fill=True)
plt.title('Distribution of spectrogram_label_offset_seconds (KDE Plot)')
plt.xlabel('Offset Seconds')
plt.ylabel('Density')
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the six vote-count columns.
vote_columns = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]
vote_stats = train_df.loc[:, vote_columns].describe()
print(vote_stats)


## 01.4 Box Plot

In [None]:
# One figure per vote column: distribution of that vote count within each consensus label.
for column in vote_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=train_df, x='expert_consensus', y=column, ax=ax)
    ax.set_title(f'Distribution of {column} by expert_consensus')
    ax.set_xlabel('expert_consensus')
    ax.set_ylabel(column)
    plt.xticks(rotation=45)
    plt.show()


# [In Progress] 02 PyTorch Modeling

In [None]:
import os
# Collect every file under the Kaggle input directory that is NOT a CSV
# (e.g. the parquet recordings referenced by the metadata tables).
directory_path = '/kaggle/input'
non_csv_file_paths = []

for root, dirs, files in os.walk(directory_path):
    non_csv_file_paths.extend(
        os.path.join(root, name) for name in files if not name.endswith(".csv")
    )


In [None]:
# Report how many items each of the collections loaded so far contains.
for label, collection in (
    ("non_csv_file_paths", non_csv_file_paths),
    ("train_df", train_df),
    ("test_df", test_df),
    ("sample_submission", sample_submission),
):
    print(f"Length of {label}: {len(collection)}")

## 02.1 Data leakage or Data Overlap

In [None]:
# Patient ids of training rows whose patient also appears in the test metadata.
seen_in_test = train_df['patient_id'].isin(test_df['patient_id'])
overlap_ids = train_df.loc[seen_in_test, 'patient_id']

# Training rows restricted to patients that do not appear in the test set.
train = train_df.loc[~train_df['patient_id'].isin(overlap_ids)]

train

> No data leakage or overlap has been found

## 02.2 Data Split

> Audio or image data, such as spectrograms, typically take up a large amount of memory, so loading and processing the entire dataset at once is inefficient.

> Saving the spectrogram data as small pieces lets each piece be loaded and processed independently. Reading one small file per sample is faster and uses far less memory than loading a whole spectrogram every time a sample is needed.

In [None]:
import sys
import os
import gc
import copy
import yaml
import random
import shutil
from time import time
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold

import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp

import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

# Make only CUDA device 0 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Directory layout, resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent
INPUT, OUTPUT, SRC = ROOT / "input", ROOT / "output", ROOT / "src"

# Competition data and its spectrogram sub-directories.
DATA = INPUT / "hms-harmful-brain-activity-classification"
TRAIN_SPEC, TEST_SPEC = DATA / "train_spectrograms", DATA / "test_spectrograms"

# Scratch area that receives the per-label spectrogram pieces.
TMP = ROOT / "tmp"
TRAIN_SPEC_SPLIT = TMP / "train_spectrograms_split"
TEST_SPEC_SPLIT = TMP / "test_spectrograms_split"
for scratch_dir in (TMP, TRAIN_SPEC_SPLIT, TEST_SPEC_SPLIT):
    scratch_dir.mkdir(exist_ok=True)


# Seed used for the cross-validation split.
RANDAM_SEED = 1086

# Vote-count columns of train.csv; each one is a prediction target.
CLASSES = [
    "seizure_vote",
    "lpd_vote",
    "gpd_vote",
    "lrda_vote",
    "grda_vote",
    "other_vote",
]
N_CLASSES = len(CLASSES)

# Cross-validation fold ids.
FOLDS = list(range(5))
N_FOLDS = len(FOLDS)

train = pd.read_csv(DATA / "train.csv")

# Turn raw vote counts into per-row probabilities (each row's votes sum to 1).
vote_totals = train[CLASSES].sum(axis=1).values[:, None]
train[CLASSES] /= vote_totals

# Keep only the first labelled row of every spectrogram.
train = train.groupby("spectrogram_id").head(1).reset_index(drop=True)

# Stratify on the consensus label while keeping each patient inside a single fold.
sgkf = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDAM_SEED)

train["fold"] = -1
fold_splits = sgkf.split(train, y=train["expert_consensus"], groups=train["patient_id"])
for fold_id, (_, val_idx) in enumerate(fold_splits):
    # Rows that form the validation part of this split get the fold id.
    train.loc[val_idx, "fold"] = fold_id

# Per-fold sum of class probabilities (not the cell's last expression, so it is not displayed).
train.groupby("fold")[CLASSES].sum()
# Cut one window out of each training spectrogram per labelled row and store it
# as `<label_id>.npy` under TRAIN_SPEC_SPLIT, so samples can be loaded one by one.
for spec_id, df in tqdm(train.groupby("spectrogram_id")):
    spec = pd.read_parquet(TRAIN_SPEC / f"{spec_id}.parquet")
    
    # NaNs become 0, the first column is dropped, and the table is transposed
    # so that rows are frequency bins and columns are time steps.
    spec_arr = spec.fillna(0).values[:, 1:].T.astype("float32")  # (Hz, Time) = (400, 300)
    
    for spec_offset, label_id in df[
        ["spectrogram_label_offset_seconds", "label_id"]
    ].astype(int).values:
        # Offset in seconds -> column index.
        # NOTE(review): assumes one spectrogram column per 2 seconds — confirm against the data description.
        spec_offset = spec_offset // 2
        # Window of at most 300 time columns starting at the offset.
        split_spec_arr = spec_arr[:, spec_offset: spec_offset + 300]
        np.save(TRAIN_SPEC_SPLIT / f"{label_id}.npy" , split_spec_arr)

# Type aliases used in the dataset class annotations below.
FilePath = tp.Union[str, Path]  # a path given either as a string or as a Path
Label = tp.Union[int, float, np.ndarray]  # a scalar label or an array of per-class values

## 02.3  Dataset Load

In [None]:
class HMSHBACSpecModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a backbone created with ``timm``.

    Args:
        model_name: name handed to ``timm.create_model``.
        pretrained: whether ``timm`` should load pretrained weights.
        in_channels: number of input channels (forwarded as ``in_chans``).
        num_classes: size of the output layer.
    """

    def __init__(
            self,
            model_name: str,
            pretrained: bool,
            in_channels: int,
            num_classes: int,
        ):
        super().__init__()
        backbone_kwargs = dict(
            model_name=model_name,
            pretrained=pretrained,
            num_classes=num_classes,
            in_chans=in_channels,
        )
        self.model = timm.create_model(**backbone_kwargs)

    def forward(self, x):
        """Return the backbone's output for the batch ``x``."""
        return self.model(x)

class HMSHBACSpecDataset(torch.utils.data.Dataset):
    """Dataset that reads saved spectrogram pieces (``.npy``) and their labels.

    Args:
        image_paths: one ``.npy`` path per sample.
        labels: one label per sample, aligned with ``image_paths``.
        transform: albumentations pipeline applied to every loaded image.
    """

    def __init__(
        self,
        image_paths: tp.Sequence[FilePath],
        labels: tp.Sequence[Label],
        transform: A.Compose,
    ):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, index: int):
        """Load, log-scale, standardise and transform the sample at ``index``.

        Returns:
            ``{"data": transformed image, "target": label}``.
        """
        path = self.image_paths[index]
        target = self.labels[index]

        spec = np.load(path)  # shape: (Hz, Time) = (400, 300)

        # Clip into [e^-4, e^8] before taking the log so the log stays finite.
        spec = np.log(np.clip(spec, np.exp(-4), np.exp(8)))

        # Standardise each image on its own: zero mean, (almost) unit std.
        eps = 1e-6
        spec = spec - spec.mean(axis=(0, 1))
        spec = spec / (spec.std(axis=(0, 1)) + eps)

        # Add a trailing channel axis: (Hz, Time) -> (Hz, Time, Channel).
        spec = self._apply_transform(spec[..., None])

        return {"data": spec, "target": target}

    def _apply_transform(self, img: np.ndarray):
        """Run ``self.transform`` on ``img`` and return the transformed image."""
        return self.transform(image=img)["image"]

class CrossEntropyLossWithLogits(nn.CrossEntropyLoss):
    """Cross-entropy between logits and the arg-max class of soft targets."""

    def __init__(self):
        super().__init__()

    def forward(self, y, t):
        """Return the cross-entropy of logits ``y`` against ``argmax(t, dim=1)``."""
        hard_targets = t.argmax(dim=1)
        return super().forward(y, hard_targets)

class CrossEntropyLossWithLogitsForVal(nn.CrossEntropyLoss):
    """Accumulates validation batches and computes one cross-entropy over all of them.

    ``forward`` only stores the batch; ``compute`` returns the loss over
    everything stored so far and then clears the buffers.
    """

    def __init__(self):
        super().__init__()
        self.logits_list = []
        self.label_list = []

    def forward(self, y, t):
        """Store logits ``y`` and the arg-max class of ``t`` as NumPy arrays; returns nothing."""
        self.logits_list.append(y.cpu().numpy())
        hard_targets = torch.argmax(t, dim=1)
        self.label_list.append(hard_targets.cpu().numpy())

    def compute(self):
        """Return the cross-entropy over all stored batches as a float and reset the buffers."""
        all_logits = torch.from_numpy(np.concatenate(self.logits_list, axis=0))
        all_labels = torch.from_numpy(np.concatenate(self.label_list, axis=0))
        final_metric = super().forward(all_logits, all_labels).item()

        # Start from empty buffers for the next evaluation pass.
        self.logits_list, self.label_list = [], []

        return final_metric

class CFG:
    """Training configuration read by ``get_transforms`` and ``train_one_fold``."""

    # Model and input
    model_name = "efficientnet_b0"
    img_size = 512

    # Optimisation
    batch_size = 32
    max_epoch = 9
    lr = 1e-3
    weight_decay = 1e-2
    es_patience = 5

    # Reproducibility and runtime
    seed = 1086
    deterministic = True
    enable_amp = True
    device = "cuda"

def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN determinism.

    Args:
        seed: value used for every random number generator.
        deterministic: stored in ``torch.backends.cudnn.deterministic``. When
            True, the cuDNN autotuner (``cudnn.benchmark``) is also switched
            off, because it may select different algorithms between runs.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = deterministic  # type: ignore
    if deterministic:
        # Benchmark mode is left untouched when determinism is not requested.
        torch.backends.cudnn.benchmark = False  # type: ignore
    
def to_device(
    tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]],
    device: torch.device, *args, **kwargs
):
    """Move a tensor, a tuple of tensors, or a dict of tensors to ``device``.

    Args:
        tensors: a single tensor, a tuple of tensors, or a dict whose values are tensors.
        device: target device.
        *args, **kwargs: forwarded unchanged to ``Tensor.to``.

    Returns:
        The same kind of container that was passed in (tuple -> tuple,
        dict -> dict with the same keys, tensor -> tensor), holding the moved tensors.
    """
    if isinstance(tensors, tuple):
        # Build a real tuple: a bare generator expression could be consumed only
        # once and would support neither indexing nor len().
        return tuple(t.to(device, *args, **kwargs) for t in tensors)
    elif isinstance(tensors, dict):
        return {
            k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    else:
        return tensors.to(device, *args, **kwargs)

def get_path_label(train_all):
    """Randomly split ``train_all`` 80/20 and build dataset inputs for both parts.

    Args:
        train_all: DataFrame with a ``label_id`` column and the ``CLASSES``
            columns. Its index may be arbitrary (e.g. the result of a boolean
            filter), because rows are selected by position.

    Returns:
        ``(train_data, val_data)``: two dicts, each with
        ``"image_paths"`` (list of ``TRAIN_SPEC_SPLIT / "<label_id>.npy"``) and
        ``"labels"`` (float32 array of the ``CLASSES`` columns, one row per sample).
    """
    train_size = int(len(train_all) * 0.8)
    val_size = len(train_all) - train_size

    # Randomly partition the row positions 0..len-1 into a train and a validation subset.
    train_subset, val_subset = torch.utils.data.random_split(
        range(len(train_all)), [train_size, val_size])

    def _paths_and_labels(subset):
        # The subset holds POSITIONS into train_all, so rows are taken with
        # .iloc; label-based .loc breaks as soon as the index is not 0..len-1.
        rows = train_all.iloc[list(subset.indices)]
        return {
            "image_paths": [
                TRAIN_SPEC_SPLIT / f"{label_id}.npy" for label_id in rows["label_id"]
            ],
            "labels": rows[CLASSES].values.astype("float32"),
        }

    train_data = _paths_and_labels(train_subset)
    val_data = _paths_and_labels(val_subset)

    return train_data, val_data



def get_transforms(CFG):
    """Build the (train, validation) albumentations pipelines.

    Both pipelines are the same: resize to ``CFG.img_size`` x ``CFG.img_size``
    and convert to a tensor. They are returned as two separate ``A.Compose`` objects.
    """
    def _resize_to_tensor():
        steps = [
            A.Resize(p=1.0, height=CFG.img_size, width=CFG.img_size),
            ToTensorV2(p=1.0),
        ]
        return A.Compose(steps)

    return _resize_to_tensor(), _resize_to_tensor()

def train_one_fold(CFG, train_all, output_path):
    """Train one model on a random train/validation split of ``train_all``.

    Args:
        CFG: configuration object; the attributes read here are ``seed``,
            ``deterministic``, ``device``, ``batch_size``, ``model_name``, ``lr``,
            ``weight_decay``, ``max_epoch``, ``enable_amp`` and ``es_patience``
            (``img_size`` is read by ``get_transforms``).
        train_all: rows to train on; handed to ``get_path_label``, which performs
            the train/validation split.
        output_path: directory (Path-like supporting ``/``) that receives a
            ``snapshot_epoch_<epoch>.pth`` file whenever the validation loss improves.

    Returns:
        ``(best_epoch, best_val_loss)`` of the epoch with the lowest validation loss.
    """
    # NOTE(review): benchmark mode is enabled right before a deterministic seed
    # setup is requested — confirm the two settings are meant to be combined.
    torch.backends.cudnn.benchmark = True
    set_random_seed(CFG.seed, deterministic=CFG.deterministic)
    device = torch.device(CFG.device)
    
    # Paths/labels for both parts of the split and the matching image pipelines.
    train_path_label, val_path_label = get_path_label(train_all)
    train_transform, val_transform = get_transforms(CFG)
    
    train_dataset = HMSHBACSpecDataset(**train_path_label, transform=train_transform)
    val_dataset = HMSHBACSpecDataset(**val_path_label, transform=val_transform)
    
    # Training batches are shuffled and the last incomplete batch is dropped;
    # validation keeps every sample in order.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)
    
    # Single-channel input, six outputs (hard-coded here rather than taken from N_CLASSES).
    model = HMSHBACSpecModel(
        model_name=CFG.model_name, pretrained=True, num_classes=6, in_channels=1)
    model.to(device)
    
    optimizer = optim.AdamW(params=model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # The scheduler is stepped once per batch, hence steps_per_epoch=len(train_loader).
    scheduler = lr_scheduler.OneCycleLR(
        optimizer=optimizer, epochs=CFG.max_epoch,
        pct_start=0.0, steps_per_epoch=len(train_loader),
        max_lr=CFG.lr, div_factor=25, final_div_factor=4.0e-01
    )
    
    # Training loss works on hard labels (arg-max of the vote probabilities);
    # the validation criterion accumulates batches and is evaluated once per epoch.
    loss_func = nn.CrossEntropyLoss()
    loss_func.to(device)
    loss_func_val = CrossEntropyLossWithLogitsForVal()
    
    use_amp = CFG.enable_amp
    scaler = amp.GradScaler(enabled=use_amp)
    
    best_val_loss = 1.0e+09
    best_epoch = 0
    train_loss = 0
    
    for epoch in range(1, CFG.max_epoch + 1):
        epoch_start = time()
        model.train()
        for batch in train_loader:
            batch = to_device(batch, device)
            x, t = batch["data"], batch["target"]
                
            optimizer.zero_grad()
            with amp.autocast(use_amp):
                y = model(x)
                loss = loss_func(y, torch.argmax(t, dim=1))
            # Scaled backward pass and optimizer step through the GradScaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            train_loss += loss.item()
            
        # Mean of the per-batch training losses of this epoch.
        train_loss /= len(train_loader)
            
        model.eval()
        for batch in val_loader:
            x, t = batch["data"], batch["target"]
            x = to_device(x, device)
            with torch.no_grad(), amp.autocast(use_amp):
                y = model(x)
            # Logits go back to the CPU as float32; targets never left the CPU.
            y = y.detach().cpu().to(torch.float32)
            loss_func_val(y, t)
        # Loss over all validation samples of this epoch; also clears the accumulated batches.
        val_loss = loss_func_val.compute()        
        if val_loss < best_val_loss:
            best_epoch = epoch
            best_val_loss = val_loss
            # print("save model")
            torch.save(model.state_dict(), str(output_path / f'snapshot_epoch_{epoch}.pth'))
        
        elapsed_time = time() - epoch_start
        print(
            f"[epoch {epoch}] train loss: {train_loss: .6f}, val loss: {val_loss: .6f}, elapsed_time: {elapsed_time: .3f}")
        
        # Stop once more than es_patience epochs have passed since the best epoch.
        if epoch - best_epoch > CFG.es_patience:
            print("Early Stopping!")
            break
            
        # Reset the accumulator for the next epoch.
        train_loss = 0
            
    return best_epoch, best_val_loss

## 02.4 Train

In [None]:

# Train one model per fold and collect (best_epoch, best_val_loss) for each.
score_list = []
for fold_id in FOLDS:
    output_path = Path(f"fold{fold_id}")
    output_path.mkdir(exist_ok=True)
    print(f"[fold{fold_id}]")
    # train_one_fold expects the training rows as a DataFrame and performs the
    # train/validation split itself via get_path_label. Passing it the dict that
    # get_path_label returns would fail. The index is reset so the rows are
    # numbered 0..len-1 after the fold filter.
    fold_train = train[train["fold"] != fold_id].reset_index(drop=True)
    score_list.append(train_one_fold(CFG, fold_train, output_path))
