<a href="https://www.kaggle.com/code/debbiechu/hms-image-classification-with-efficientnet?scriptVersionId=174771162" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%matplotlib inline

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.backends.cudnn.deterministic = True 
    torch.backends.cudnn.benchmark = False

set_seed(42)

### Prepare df

In [3]:
# path for train and test
base_path = "/kaggle/input/hms-harmful-brain-activity-classification"
train_csv_path = os.path.join(base_path, 'train.csv')
test_csv_path = os.path.join(base_path, 'test.csv')

# train
df = pd.read_csv(train_csv_path)

# trim data
temp = df[['eeg_id', 'expert_consensus']].drop_duplicates()
df = df.loc[temp.index].reset_index(drop=True)

# add eeg & spec paths
df['eeg_path'] = base_path + '/train_eegs/' + df['eeg_id'].astype(str) + '.parquet'
df['spec_path'] = base_path + '/train_spectrograms/' + df['spectrogram_id'].astype(str) + '.parquet'
df['class_name'] = df['expert_consensus'].copy()

# Define mappings
class_name_to_label = {
    'Seizure': 0,
    'GPD': 2,
    'LRDA': 3,
    'Other': 5,
    'GRDA': 4,
    'LPD': 1
}

# Apply mapping to create 'class_label'
df['class_label'] = df['class_name'].map(class_name_to_label)

# test
test = pd.read_csv(test_csv_path)
test['eeg_path'] = base_path + '/test_eegs/' + test['eeg_id'].astype(str) + '.parquet'
test['spec_path'] = base_path + '/test_spectrograms/' + test['spectrogram_id'].astype(str) + '.parquet'
if 'spectrogram_label_offset_seconds' not in test.columns:
    test['spectrogram_label_offset_seconds'] = 0
if 'class_label' not in test.columns:
    test['class_label'] = 0

# take a few samples as the test set
eeg_ids = []
for class_label in df['class_label'].unique():
    eeg_id = df[df['class_label'] == class_label].iloc[0]['eeg_id']
    eeg_ids.append(eeg_id)

test_df = df[df['eeg_id'].isin(eeg_ids)] # for test
df = df[~df['eeg_id'].isin(eeg_ids)] # train & val

### Import model

In [4]:
# define efficientnet structure
from torchvision import models, transforms
import torch.nn as nn

model = models.efficientnet_v2_l(weights=None)
num_classes = 6
num_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_features, num_classes)

# Load pre-trained weights except for the classifier layer
weights_path = '/kaggle/input/efficientnet-v2-l/efficientnet_v2_l-59c71312.pth'
state_dict = torch.load(weights_path, map_location='cpu')
state_dict = {k: v for k, v in state_dict.items() if not k.startswith('classifier.1')}
model.load_state_dict(state_dict, strict=False)
model = model.to(device)

### Prepare data for EfficientNet

In [5]:
# Get eeg_subset and spec_subset
def read_parquet_subset(parquet_file_path, offset_seconds, length, is_eeg=True):
    offset_seconds = int(offset_seconds)  # Ensure offset_seconds integer, currently it's float
    start_row = int(offset_seconds * 200) if is_eeg else int(offset_seconds / 2) # The starting row
    end_row = start_row + (10000 if is_eeg else 300)  # The ending row
    df = pd.read_parquet(parquet_file_path)
    df_subset = df.iloc[start_row:end_row]  # Select the subset based on start and end row indices
    return df_subset

# Convert from 2d to 3d and replace null with 0
def convert_2d_to_3d(data_2d):
    # Convert the 2D array into a 3D array with shape (height, width, channels)
    data_2d = data_2d.to_numpy()
    data_2d_clipped = np.clip(data_2d, 0, 255) # ensure value falls within 0 and 255
    data_2d_clipped = np.nan_to_num(data_2d_clipped) # fillna with 0 (for numpy)
    data_3d = np.repeat(data_2d_clipped[:, :, np.newaxis], 3, axis=2)
    data_3d_uint8 = data_3d.astype(np.uint8)
    return data_3d_uint8

# generate feature vecotrs and labels
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class SPECTROGRAM_Dataset(Dataset):
    def __init__(self, dataframe, transform):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        spec_data = read_parquet_subset(row['spec_path'], row['spectrogram_label_offset_seconds'], 300, is_eeg=False)
        spec_data.drop('time', axis=1, inplace=True)
        spec_data_3d = convert_2d_to_3d(spec_data)
        data_img = Image.fromarray(spec_data_3d)
        feature_vector = self.transform(data_img)
        label = row['class_label']
        return feature_vector, label
    
# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [6]:
# params
lr=0.001
num_epochs = 100
batch_size=32
num_classes = 6
num_workers=2

In [7]:
# Train Val Split
# use stratifiedgroupkfold to split train and val set to ensure balance in classes
from sklearn.model_selection import StratifiedGroupKFold

labels = df['class_label'].values
groups = df['eeg_id'].values

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in sgkf.split(X=df, y=labels, groups=groups):
    train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
    
# generate feature vectors and labels
train_dataset = SPECTROGRAM_Dataset(train_df, transform)
val_dataset = SPECTROGRAM_Dataset(val_df, transform)
test_dataset = SPECTROGRAM_Dataset(test_df, transform)
test2_dataset = SPECTROGRAM_Dataset(test, transform)

# Initialize DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test2_loader = DataLoader(test2_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

print(f"Size of train: {len(train_dataset)}")
print(f"Size of val: {len(val_dataset)}")
print(f"Size of test: {len(test_dataset)}")
print(f"Size of test2: {len(test2_dataset)}")
print(f"Number of train batches: {len(train_loader)}")
print(f"Number of val batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")
print(f"Number of test2 batches: {len(test2_loader)}")

Size of train: 14394
Size of val: 3612
Size of test: 7
Size of test2: 1
Number of train batches: 450
Number of val batches: 113
Number of test batches: 1
Number of test2 batches: 1


### Training and Validation

In [8]:
import torch.nn.functional as F
from torch.optim import Adam
import time
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.cuda.amp import autocast, GradScaler

# convert integer class labels to one-hot vectors
def to_one_hot(labels, num_classes):
    return torch.eye(num_classes, device=labels.device)[labels]

criterion = torch.nn.KLDivLoss(reduction='batchmean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=0, last_epoch=-1)

In [9]:
best_val_loss = float('inf')
training_losses = []
validation_losses = []
epochs_no_improve = 0
n_patience = 10
epochs_no_improve = 0
accumulation_steps = 4
scaler = GradScaler()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    optimizer.zero_grad()
    batch_count = 0
    for i, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)
            
        with autocast():
            outputs = model(features)
            log_probs = F.log_softmax(outputs, dim=1)# Convert to log probabilities
            one_hot_labels = to_one_hot(labels, num_classes=num_classes) # Convert labels to one-hot for KLDivLoss
            loss = criterion(log_probs, one_hot_labels) / accumulation_steps # Normalize loss to account for accumulation
        
        scaler.scale(loss).backward()
            
        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
                
        train_loss += loss.item() * features.size(0)      
        # Print every 100 batches to check progress
        batch_count += 1
        if batch_count % 100 == 0:
            print(f'Epoch {epoch+1}, Batch {batch_count}, Loss: {loss.item():.4f}')  
            
    # Average training loss for the epoch        
    train_loss /= len(train_loader.dataset)
    training_losses.append(train_loss)
    
        
    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad(), autocast():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features)
            log_probs = F.log_softmax(outputs, dim=1)
            one_hot_labels = to_one_hot(labels, num_classes=num_classes)
            loss = criterion(log_probs, one_hot_labels)
            val_loss += loss.item() * features.size(0)
    
    # Average validation loss for the epoch
    val_loss /= len(val_loader.dataset)
    validation_losses.append(val_loss)
    scheduler.step(val_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    # find smallest val loss to save as best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0 # reset to 0
        torch.save(model.state_dict(), 'best_model.pth')  # Save best model
        print(f"Best model saved with Validation Loss: {val_loss:.4f}")
    else:
        epochs_no_improve += 1
        
    if epochs_no_improve == n_patience:
        print("Early stopping triggered.")
        break

Epoch 1, Batch 100, Loss: 0.2345
Epoch 1, Batch 200, Loss: 0.2164
Epoch 1, Batch 300, Loss: 0.2968
Epoch 1, Batch 400, Loss: 0.2375
Epoch [1/100], Training Loss: 0.2801, Validation Loss: 0.9633
Best model saved with Validation Loss: 0.9633
Epoch 2, Batch 100, Loss: 0.2678
Epoch 2, Batch 200, Loss: 0.2139
Epoch 2, Batch 300, Loss: 0.2271
Epoch 2, Batch 400, Loss: 0.2131
Epoch [2/100], Training Loss: 0.2230, Validation Loss: 1.0063
Epoch 3, Batch 100, Loss: 0.2495
Epoch 3, Batch 200, Loss: 0.2921
Epoch 3, Batch 300, Loss: 0.2246
Epoch 3, Batch 400, Loss: 0.1987
Epoch [3/100], Training Loss: 0.2026, Validation Loss: 1.2775
Epoch 4, Batch 100, Loss: 0.1698
Epoch 4, Batch 200, Loss: 0.1823
Epoch 4, Batch 300, Loss: 0.1794
Epoch 4, Batch 400, Loss: 0.1752
Epoch [4/100], Training Loss: 0.1856, Validation Loss: 0.8845
Best model saved with Validation Loss: 0.8845
Epoch 5, Batch 100, Loss: 0.1868
Epoch 5, Batch 200, Loss: 0.1378
Epoch 5, Batch 300, Loss: 0.1343
Epoch 5, Batch 400, Loss: 0.1699


### Test Performance

In [10]:
# validate on test samples i extracted

model.load_state_dict(torch.load('best_model.pth'))
model.eval()

all_preds = []
all_targets = []
all_probs = []

with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(device).float()
        labels = labels.to(device)
        outputs = model(features)
        
        # store predicted labels, probabilites, and actual labels
        probs = F.softmax(outputs, dim=1)
        _, preds = torch.max(probs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

# classification report        
from sklearn.metrics import classification_report
classification_report(all_targets, all_preds, target_names=["Seizure", "LPD", "GPD", "LRDA", "GRDA", "Other"])

'              precision    recall  f1-score   support\n\n     Seizure       1.00      1.00      1.00         1\n         LPD       1.00      1.00      1.00         1\n         GPD       0.00      0.00      0.00         1\n        LRDA       1.00      1.00      1.00         1\n        GRDA       0.00      0.00      0.00         1\n       Other       1.00      0.50      0.67         2\n\n    accuracy                           0.57         7\n   macro avg       0.67      0.58      0.61         7\nweighted avg       0.71      0.57      0.62         7\n'

In [11]:
print(all_preds)
print(all_targets)

[0, 4, 3, 2, 5, 2, 1]
[0, 2, 3, 5, 5, 4, 1]


### Submission

In [12]:
# get pred prob of test

model.load_state_dict(torch.load('best_model.pth'))
model.eval()

all_preds = []
all_probs = []

with torch.no_grad():
    for features, labels in test2_loader:
        features = features.to(device).float()
        labels = labels.to(device)
        outputs = model(features)
        
        # store predicted labels, probabilites
        probs = F.softmax(outputs, dim=1)
        _, preds = torch.max(probs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())

print(all_preds)
print(all_probs)

[5]
[array([2.0602986e-03, 5.7976600e-02, 2.4985071e-04, 9.3664527e-02,
       3.7953267e-03, 8.4225333e-01], dtype=float32)]


In [13]:
# sample submission
sample_submission_csv_path = os.path.join(base_path, 'sample_submission.csv')
sub = pd.read_csv(sample_submission_csv_path)
sub

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667


In [14]:
submission = pd.DataFrame(all_probs, columns=['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote'])
submission

Unnamed: 0,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,0.00206,0.057977,0.00025,0.093665,0.003795,0.842253


In [15]:
submission['eeg_id'] = test['eeg_id']  
submission

Unnamed: 0,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,eeg_id
0,0.00206,0.057977,0.00025,0.093665,0.003795,0.842253,3911565283


In [16]:
submission.iloc[:,:6].sum(axis=1)

0    1.0
dtype: float32

In [17]:
submission.to_csv('submission.csv', index=False)