# HW4 Speakers Classification

---

# Get Data

## Download Data (These links are no longer working, I finally downloaded these on Kaggle)

In [46]:
# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partaa" -o Dataset.tar.gz.partaa

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partab" -o Dataset.tar.gz.partab

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partac" -o Dataset.tar.gz.partac

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partad" -o Dataset.tar.gz.partad

# !copy /b Dataset.tar.gz.part* Dataset.tar.gz

# !tar -zxvf Dataset.tar.gz


## Manually Unzip

---

# Preparation

## Importing

In [1]:
import os
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer, AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset, random_split

from tqdm.auto import tqdm, trange
import random

from torch.utils.tensorboard import SummaryWriter


## Define Dataset

In [2]:
class MyDataset(Dataset):

    def __init__(self, data_dir, segment_len=256):
        self.data_dir = data_dir
        self.segment_len = segment_len

        mapping_path = Path(data_dir) / "mapping.json"
        mapping = json.load(mapping_path.open())
        self.speaker2id = mapping["speaker2id"]

        metadata_path = Path(data_dir) / "metadata.json"
        metadata = json.load(open(metadata_path))["speakers"]

        self.speaker_num = len(metadata.keys())
        self.data = []
        for speaker in metadata.keys():
            for utterances in metadata[speaker]:
                self.data.append(
                    [utterances["feature_path"], self.speaker2id[speaker]])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        feat_path, speaker_id = self.data[index]
        mel = torch.load(os.path.join(self.data_dir, feat_path), weights_only=True)

        if len(mel) > self.segment_len:
            start = random.randint(0, len(mel) - self.segment_len)
            mel = torch.FloatTensor(mel[start:start + self.segment_len])
        else:
            mel = torch.FloatTensor(mel)

        speaker_id = torch.tensor(speaker_id, dtype=torch.long)
        return mel, speaker_id

    def get_speaker_number(self):
        return self.speaker_num


## Define Dataloader

In [3]:
def collate_batch(batch):
    mel, speaker = zip(*batch)
    mel = pad_sequence(mel, batch_first=True, padding_value=-20)
    speaker = torch.stack(speaker)
    return mel, speaker


def get_dataloader(data_dir_val,
                   batch_size_val,
                   n_workers_val,
                   segment_len_val=128):
    dataset = MyDataset(data_dir_val, segment_len=segment_len_val)
    speaker_num_val = dataset.get_speaker_number()

    train_length = int(0.9 * len(dataset))
    lengths = [train_length, len(dataset) - train_length]
    train_dataset, val_dataset = random_split(dataset, lengths)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size_val,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers_val,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    valid_loader = DataLoader(
        val_dataset,
        batch_size=batch_size_val,
        shuffle=False,
        drop_last=False,
        num_workers=n_workers_val,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    return train_loader, valid_loader, speaker_num_val


## Define Model

In [4]:
class FeedForwardModule(nn.Module):

    def __init__(self, d_model, expansion_factor=4, dropout_rate=0.0):

        super().__init__()
        self.layer_norm = nn.LayerNorm(d_model)
        self.linear1 = nn.Linear(d_model, d_model * expansion_factor)
        self.swish = nn.SiLU()  # Swish activation function
        self.dropout1 = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(d_model * expansion_factor, d_model)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x):

        x_norm = self.layer_norm(x)

        out = self.linear1(x_norm)
        out = self.swish(out)
        out = self.dropout1(out)
        out = self.linear2(out)
        out = self.dropout2(out)
        # The residual connection is handled in the ConformerBlock
        return out


class ConvolutionModule(nn.Module):

    def __init__(self, d_model, kernel_size=15, dropout_rate=0.0):
        super().__init__()
        assert kernel_size % 2 == 1, "Kernel size must be odd for 'same' padding"
        self.layer_norm = nn.LayerNorm(d_model)
        self.pointwise_conv1 = nn.Conv1d(
            d_model,
            2 * d_model,  # Expand to 2*d_model for GLU
            kernel_size=1,
            stride=1,
            padding=0)
        self.glu = nn.GLU(
            dim=1
        )  # Gated Linear Unit, operates on the channel dimension (dim=1 for Conv1d)
        self.depthwise_conv = nn.Conv1d(
            d_model,  # Input channels for depthwise is d_model (after GLU)
            d_model,  # Output channels is also d_model
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,  # 'same' padding
            groups=d_model)  # Depthwise convolution
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.swish = nn.SiLU()
        self.pointwise_conv2 = nn.Conv1d(d_model,
                                         d_model,
                                         kernel_size=1,
                                         stride=1,
                                         padding=0)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):

        x_norm = self.layer_norm(x)  # Pre-LayerNorm
        x_conv = x_norm.transpose(
            1, 2)  # (batch_size, d_model, sequence_length) for Conv1d

        x_conv = self.pointwise_conv1(
            x_conv)  # (batch_size, 2 * d_model, sequence_length)
        x_conv = self.glu(x_conv)  # (batch_size, d_model, sequence_length)
        x_conv = self.depthwise_conv(x_conv)
        x_conv = self.batch_norm(
            x_conv)  # BatchNorm after convolution, before activation
        x_conv = self.swish(x_conv)
        x_conv = self.pointwise_conv2(x_conv)
        x_conv = self.dropout(x_conv)

        x_conv = x_conv.transpose(1, 2)
        # (batch_size, sequence_length, d_model)
        # The residual connection is handled in the ConformerBlock
        return x_conv


class ConformerBlock(nn.Module):

    def __init__(
            self,
            d_model,
            n_head,  # Renamed from num_heads to match MultiheadAttention's nhead
            dim_feedforward_expansion=4,
            conv_kernel_size=15,
            dropout_rate=0.0):
        super().__init__()
        # First FeedForward Module
        self.ffn1 = FeedForwardModule(d_model, dim_feedforward_expansion,
                                      dropout_rate)

        # Multi-Headed Self-Attention Module
        self.attn_layer_norm = nn.LayerNorm(d_model)  # LayerNorm before MHSA
        self.self_attn = nn.MultiheadAttention(d_model,
                                               n_head,
                                               dropout=dropout_rate,
                                               batch_first=True)
        self.attn_dropout = nn.Dropout(dropout_rate)  # Dropout after MHSA

        # Convolution Module
        self.conv_module = ConvolutionModule(d_model, conv_kernel_size,
                                             dropout_rate)
        # Second FeedForward Module
        self.ffn2 = FeedForwardModule(d_model, dim_feedforward_expansion,
                                      dropout_rate)

        self.final_layer_norm = nn.LayerNorm(
            d_model)  # Final LayerNorm for the block output

    def forward(self, x, src_key_padding_mask=None, src_mask=None):

        # 1. First FeedForward Module (FFN1)
        residual = x
        ffn1_output = self.ffn1(x)  # FFN has LayerNorm internally
        x = residual + 0.5 * ffn1_output  # As per Conformer paper, FFNs have 0.5 factor for residual

        # 2. Multi-Headed Self-Attention Module (MHSA)
        residual = x
        x_norm_for_attn = self.attn_layer_norm(x)  # LayerNorm before MHSA
        attn_output, _ = self.self_attn(query=x_norm_for_attn,
                                        key=x_norm_for_attn,
                                        value=x_norm_for_attn,
                                        key_padding_mask=src_key_padding_mask,
                                        attn_mask=src_mask)
        x = residual + self.attn_dropout(attn_output)

        # 3. Convolution Module (Conv)
        residual = x
        conv_output = self.conv_module(
            x)  # Conv module has LayerNorm internally
        x = residual + conv_output

        # 4. Second FeedForward Module (FFN2)
        residual = x
        ffn2_output = self.ffn2(x)  # FFN has LayerNorm internally
        x = residual + 0.5 * ffn2_output

        x = self.final_layer_norm(x)  # Final LayerNorm for the block
        return x


class SinusoidalPositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000, dropout=0.0):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() *
            (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(
            0
        )  # Shape: (1, max_len, d_model) for batch_first=True compatibility
        self.register_buffer('pe',
                             pe)  # Not a model parameter, but part of the state

    def forward(self, x):
        # self.pe shape: (1, max_len, d_model)
        # Add positional encoding to x. Slice pe to match x's seq_len.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class Classifier(nn.Module):

    def __init__(
            self,
            d_model,  # Internal dimension of the Conformer
            num_encoder_layers,  # Number of ConformerBlocks
            num_heads,  # Number of attention heads in ConformerBlock
            dim_feedforward_expansion=4,  # Expansion factor for FFN in ConformerBlock
            conv_kernel_size=15,  # Kernel size for ConvolutionModule in ConformerBlock
            dropout_rate=0.0,
            max_seq_len=5000,  # Maximum sequence length for positional encoding
            input_dim=40,
            num_speakers=600):
        super().__init__()
        # Project the dimension of features from input_dim into d_model.
        self.prenet = nn.Linear(input_dim, d_model)
        self.pos_encoder = SinusoidalPositionalEncoding(d_model, max_seq_len,
                                                        dropout_rate)

        self.conformer_blocks = nn.ModuleList([
            ConformerBlock(
                d_model=d_model,
                n_head=num_heads,  # Pass num_heads as n_head to ConformerBlock
                dim_feedforward_expansion=dim_feedforward_expansion,
                conv_kernel_size=conv_kernel_size,
                dropout_rate=dropout_rate) for _ in range(num_encoder_layers)
        ])

        # Project the dimension of features from d_model into num_speakers.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.SiLU(),  # Using SiLU here as well for consistency, or ReLU
            nn.Dropout(
                dropout_rate),  # Dropout before the final classification layer
            nn.Linear(d_model, num_speakers),
        )

    def forward(self, mels, src_key_padding_mask=None):

        x = self.prenet(mels)  # (batch size, length, d_model)
        x = self.pos_encoder(x)  # Add positional encoding

        # Pass through Conformer blocks
        # Note: src_mask for nn.MultiheadAttention in encoder is typically None unless you have specific masking needs
        # other than padding. If src_key_padding_mask is provided, it will be used by self_attn.
        for block in self.conformer_blocks:
            x = block(x,
                      src_key_padding_mask=src_key_padding_mask,
                      src_mask=None)

        # Mean pooling, considering padding if mask is provided
        if src_key_padding_mask is not None:
            # Invert mask: True for non-padded, False for padded
            active_elements_mask = (~src_key_padding_mask
                                   ).unsqueeze(-1).float()  # (batch, length, 1)
            # Sum only non-padded elements and divide by the count of non-padded elements
            masked_sum = (x * active_elements_mask).sum(dim=1)
            num_active_elements = active_elements_mask.sum(dim=1).clamp(
                min=1e-9)  # Avoid division by zero
            stats = masked_sum / num_active_elements
        else:
            stats = x.mean(dim=1)  # (batch size, d_model)

        out = self.pred_layer(stats)  # (batch size, num_speakers)
        return out


## Fixing seed

In [5]:
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


## Hyperparameters

In [6]:
# general paths and settings
data_dir = "./Dataset"
model_save_path = "./speaker_model.ckpt"
submission_path = "./submission.csv"
tensorboard_log_dir = "./runs/"
seed = 3407

# training parameters
batch_size = 512
num_workers = 0
num_speakers = 600  # will be determined from the dataset

# model parameters (transformer related)
d_model = 256
num_speakers = 600
num_encoder_layers = 8
dim_feedforward = d_model * 4
num_heads = 4
dropout_rate = 0.0

# optimizer and scheduler parameters
learning_rate = 1e-3
num_warmup_steps = 5000
total_steps = 100000

# training control
valid_steps = 2000
early_stop_patience = 10

# apply seed
same_seeds(seed)

# device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


## Training Utilities

### Warmup Scheduler

In [7]:
def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps))
        return max(
            0.0, 0.5 *
            (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


### Validation Process

In [8]:
def validate_speaker_model(valid_loader_val,
                           model_val,
                           criterion_val,
                           device_val,
                           current_step_or_epoch,
                           writer_tb=None,
                           pbar_desc_prefix=""):
    model_val.eval()
    epoch_valid_loss = 0.0
    epoch_valid_corrects = 0
    num_valid_samples = 0

    batch_pbar = tqdm(valid_loader_val,
                      leave=False,
                      desc=f"{pbar_desc_prefix} Validation")
    with torch.no_grad():
        for features, labels in batch_pbar:
            features = features.to(device_val)
            labels = labels.to(device_val)

            outputs = model_val(features)
            loss = criterion_val(outputs, labels)

            preds = outputs.argmax(dim=-1)
            epoch_valid_loss += loss.item() * features.size(0)
            epoch_valid_corrects += (preds == labels).sum().item()
            num_valid_samples += features.size(0)

            batch_pbar.set_postfix(
                loss=f"{loss.item():.4f}",
                acc=f"{(preds == labels).sum().item()/features.size(0):.2%}")

    avg_epoch_valid_loss = epoch_valid_loss / num_valid_samples if num_valid_samples > 0 else 0
    avg_epoch_valid_acc = epoch_valid_corrects / num_valid_samples if num_valid_samples > 0 else 0

    if writer_tb:
        writer_tb.add_scalar('Loss/valid_step', avg_epoch_valid_loss,
                             current_step_or_epoch)
        writer_tb.add_scalar('Accuracy/valid_step', avg_epoch_valid_acc,
                             current_step_or_epoch)

    model_val.train()
    return avg_epoch_valid_loss, avg_epoch_valid_acc


### Training Process

In [9]:
def model_fn_speaker(batch, model, criterion_val, device_val):
    mels, labels = batch
    mels = mels.to(device_val)
    labels = labels.to(device_val)

    outs = model(mels)
    loss = criterion_val(outs, labels)

    preds = outs.argmax(1)
    accuracy = torch.mean((preds == labels).float())

    return loss, accuracy


def train_speaker_model(model_to_train, train_loader_val, valid_loader_val,
                        criterion_val, optimizer_val, scheduler_val,
                        device_val):
    tb_writer = SummaryWriter(log_dir=tensorboard_log_dir)

    best_valid_acc = 0.0
    best_model_state_dict = None
    epochs_no_improve = 0

    train_iterator = iter(train_loader_val)

    epoch_running_loss = 0.0
    epoch_running_corrects = 0
    epoch_num_samples = 0
    completed_data_passes = 0

    pbar_total_steps = tqdm(range(total_steps), desc="Steps")

    for step in pbar_total_steps:
        model_to_train.train()

        try:
            batch = next(train_iterator)
        except StopIteration:
            if epoch_num_samples > 0:
                current_pass_avg_loss = epoch_running_loss / epoch_num_samples
                current_pass_avg_acc = epoch_running_corrects / epoch_num_samples
                if tb_writer:
                    tb_writer.add_scalar('Loss/train_data_pass_avg',
                                         current_pass_avg_loss,
                                         completed_data_passes + 1)
                    tb_writer.add_scalar('Accuracy/train_data_pass_avg',
                                         current_pass_avg_acc,
                                         completed_data_passes + 1)

            epoch_running_loss = 0.0
            epoch_running_corrects = 0
            epoch_num_samples = 0
            completed_data_passes += 1
            train_iterator = iter(train_loader_val)
            batch = next(train_iterator)

        loss, accuracy = model_fn_speaker(batch, model_to_train, criterion_val,
                                          device_val)

        optimizer_val.zero_grad()
        loss.backward()
        optimizer_val.step()
        scheduler_val.step()

        current_lr = optimizer_val.param_groups[0]['lr']

        current_batch_size = batch[0].size(0)
        epoch_running_loss += loss.item() * current_batch_size
        epoch_running_corrects += accuracy.item() * current_batch_size
        epoch_num_samples += current_batch_size

        current_epoch_avg_train_acc = 0
        if epoch_num_samples > 0:
            current_epoch_avg_train_acc = epoch_running_corrects / epoch_num_samples

        if step % 100 == 0:
            tb_writer.add_scalar('Loss/train_batch', loss.item(), step)
            tb_writer.add_scalar('Accuracy/train_batch', accuracy.item(), step)
            tb_writer.add_scalar('LearningRate/step', current_lr, step)

        pbar_total_steps.set_postfix(
            batch_loss=f"{loss.item():.4f}",
            best_acc=f"{best_valid_acc:.2%}",
            run_avg_acc=f"{current_epoch_avg_train_acc:.2%}",
            lr=f"{current_lr:.2e}")

        if (step + 1) % valid_steps == 0:
            avg_valid_loss, avg_valid_acc = validate_speaker_model(
                valid_loader_val,
                model_to_train,
                criterion_val,
                device_val,
                step + 1,
                tb_writer,
                pbar_desc_prefix=f"Step {step+1}")

            pbar_total_steps.write(
                f"Step {step + 1}/{total_steps} - "
                f"Valid Loss: {avg_valid_loss:.4f}, Valid Acc: {avg_valid_acc:.2%}"
            )

            if avg_valid_acc > best_valid_acc:
                best_valid_acc = avg_valid_acc
                best_model_state_dict = model_to_train.state_dict()
                torch.save(best_model_state_dict, model_save_path)
                pbar_total_steps.write(
                    f"Best model updated at step {step + 1}. Accuracy: {best_valid_acc:.2%}. Saved to {model_save_path}"
                )
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= early_stop_patience:
                pbar_total_steps.write(
                    f"Early stopping triggered at step {step + 1} after {epochs_no_improve} validation checks without improvement."
                )
                break

    tb_writer.close()
    print(
        f"\nTraining finished. Best validation accuracy: {best_valid_acc:.4%}")
    if best_valid_acc > 0:
        print(f"Best model saved to {model_save_path}")
    else:
        print(
            "No model was saved as validation accuracy did not improve or training was too short."
        )

    return best_valid_acc


### Train Function

In [10]:
def train():
    # prepare dataloaders
    train_loader, valid_loader, speaker_num_from_data = get_dataloader(
        data_dir_val=data_dir,
        batch_size_val=batch_size,
        n_workers_val=num_workers)

    print(
        f"Train Dataloader: {len(train_loader.dataset)} samples, {len(train_loader)} batches"
    )
    print(
        f"Valid Dataloader: {len(valid_loader.dataset)} samples, {len(valid_loader)} batches"
    )

    # initialize model, criterion, optimizer, and scheduler
    print("Initializing model, criterion, optimizer, and scheduler...")
    speaker_model = Classifier(d_model=d_model,
                               num_encoder_layers=num_encoder_layers,
                               num_heads=num_heads,
                               dropout_rate=dropout_rate).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(speaker_model.parameters(), lr=learning_rate)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps,
                                                total_steps)

    # Run training
    best_accuracy_from_training = train_speaker_model(
        model_to_train=speaker_model,
        train_loader_val=train_loader,
        valid_loader_val=valid_loader,
        criterion_val=criterion,
        optimizer_val=optimizer,
        scheduler_val=scheduler,
        device_val=device)
    print(
        f"Training complete. Best validation accuracy: {best_accuracy_from_training:.2%}"
    )


# Training

In [None]:
train()


Train Dataloader: 50999 samples, 99 batches
Valid Dataloader: 5667 samples, 12 batches
Initializing model, criterion, optimizer, and scheduler...


Steps:   0%|          | 0/100000 [00:00<?, ?it/s]

Step 2000 Validation:   0%|          | 0/12 [00:00<?, ?it/s]

Step 2000/100000 - Valid Loss: 1.3955, Valid Acc: 67.09%
Best model updated at step 2000. Accuracy: 67.09%. Saved to ./speaker_model.ckpt


Step 4000 Validation:   0%|          | 0/12 [00:00<?, ?it/s]

Step 4000/100000 - Valid Loss: 1.3886, Valid Acc: 69.65%
Best model updated at step 4000. Accuracy: 69.65%. Saved to ./speaker_model.ckpt


Step 6000 Validation:   0%|          | 0/12 [00:00<?, ?it/s]

Step 6000/100000 - Valid Loss: 0.9719, Valid Acc: 78.31%
Best model updated at step 6000. Accuracy: 78.31%. Saved to ./speaker_model.ckpt


Step 8000 Validation:   0%|          | 0/12 [00:00<?, ?it/s]

Step 8000/100000 - Valid Loss: 0.7737, Valid Acc: 83.15%
Best model updated at step 8000. Accuracy: 83.15%. Saved to ./speaker_model.ckpt


Step 10000 Validation:   0%|          | 0/12 [00:00<?, ?it/s]

Step 10000/100000 - Valid Loss: 0.6571, Valid Acc: 85.87%
Best model updated at step 10000. Accuracy: 85.87%. Saved to ./speaker_model.ckpt


## Tensorboard

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/  --port 6008


# Test & Predict

In [11]:
# inference Dataset
class InferenceDataset(Dataset):

    def __init__(self, data_dir_val):
        testdata_path = Path(data_dir_val) / "testdata.json"
        metadata = json.load(testdata_path.open())
        self.data_dir = data_dir_val
        self.data = metadata["utterances"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        utterance = self.data[index]
        feat_path = utterance["feature_path"]
        mel = torch.load(os.path.join(self.data_dir, feat_path), weights_only=True)
        mel = torch.FloatTensor(mel)
        return feat_path, mel


# inference collate function
def inference_collate_batch(batch):
    feat_paths, mels = zip(*batch)
    mels_padded = pad_sequence(mels, batch_first=True, padding_value=-20)
    return feat_paths, mels_padded


# create submission file
def create_submission(device_val):
    mapping_path = Path(data_dir) / "mapping.json"
    mapping = json.load(mapping_path.open())

    dataset = InferenceDataset(data_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=inference_collate_batch,
    )
    print(f"[Info]: Finish loading data!", flush=True)

    speaker_num = len(mapping["id2speaker"])
    model = Classifier(d_model=d_model,
                       num_encoder_layers=num_encoder_layers,
                       num_heads=num_heads,
                       dropout_rate=0).to(device)
    model.load_state_dict(torch.load(model_save_path))
    model.eval()
    print(f"[Info]: Finish creating model!", flush=True)

    results = [["Id", "Category"]]
    for feat_paths, mels in tqdm(dataloader):
        with torch.no_grad():
            mels = mels.to(device)
            outs = model(mels)
            preds = outs.argmax(1).cpu().numpy()
            for feat_path, pred in zip(feat_paths, preds):
                results.append([feat_path, mapping["id2speaker"][str(pred)]])

    with open(submission_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(results)


# Run inference
create_submission(device)


[Info]: Finish loading data!
[Info]: Finish creating model!


  model.load_state_dict(torch.load(model_save_path))


  0%|          | 0/8000 [00:00<?, ?it/s]