# HW4 Speakers Classification

---

# Get Data

## Download Data (these links no longer work; I downloaded the dataset from Kaggle instead)

In [59]:
# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partaa" -o Dataset.tar.gz.partaa

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partab" -o Dataset.tar.gz.partab

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partac" -o Dataset.tar.gz.partac

# !curl -L "https://github.com/MachineLearningHW/ML_HW4_Dataset/releases/latest/download/Dataset.tar.gz.partad" -o Dataset.tar.gz.partad

# !copy /b Dataset.tar.gz.part* Dataset.tar.gz

# !tar -zxvf Dataset.tar.gz


## Manually Unzip

---

# Preparation

## Importing

In [60]:
import os
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer, AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset, random_split

from tqdm.auto import tqdm, trange
import random

from torch.utils.tensorboard import SummaryWriter


## Define Dataset

In [61]:
class MyDataset(Dataset):
    """Training dataset of mel-spectrogram utterances labelled with speaker ids.

    On construction it reads ``mapping.json`` (for its ``"speaker2id"`` table)
    and ``metadata.json`` (for its ``"speakers"`` table) from ``data_dir`` and
    flattens them into a list of ``[feature_path, speaker_id]`` pairs, one per
    utterance.

    Args:
        data_dir: Directory holding the two JSON files and the feature files
            referenced by ``feature_path``.
        segment_len: Maximum number of frames returned per item; longer
            utterances are randomly cropped to exactly this many frames.
    """

    def __init__(self, data_dir, segment_len=128):
        self.data_dir = data_dir
        self.segment_len = segment_len

        # Context managers close the JSON files deterministically instead of
        # leaving open handles for the garbage collector to clean up.
        with (Path(data_dir) / "mapping.json").open() as mapping_file:
            self.speaker2id = json.load(mapping_file)["speaker2id"]

        with (Path(data_dir) / "metadata.json").open() as metadata_file:
            metadata = json.load(metadata_file)["speakers"]

        self.speaker_num = len(metadata)
        # One [feature_path, speaker_id] entry per utterance, in metadata order.
        self.data = [
            [utterance["feature_path"], self.speaker2id[speaker]]
            for speaker, utterances in metadata.items()
            for utterance in utterances
        ]

    def __len__(self):
        """Return the total number of utterances."""
        return len(self.data)

    def __getitem__(self, index):
        """Load one utterance and return ``(mel, speaker_id)``.

        Utterances longer than ``segment_len`` are cropped to a window whose
        start is drawn with Python's ``random`` module, so repeated reads of the
        same index can return different segments. Shorter utterances are
        returned whole.
        """
        feat_path, speaker_id = self.data[index]
        mel = torch.load(os.path.join(self.data_dir, feat_path))

        if len(mel) > self.segment_len:
            # randint is inclusive on both ends, so the last valid start is allowed.
            start = random.randint(0, len(mel) - self.segment_len)
            mel = torch.FloatTensor(mel[start:start + self.segment_len])
        else:
            mel = torch.FloatTensor(mel)

        speaker_id = torch.tensor(speaker_id, dtype=torch.long)
        return mel, speaker_id

    def get_speaker_number(self):
        """Return the number of speakers listed in ``metadata.json``."""
        return self.speaker_num


## Define Dataloader

In [62]:
def collate_batch(batch):
    """Merge ``(mel, speaker_id)`` samples into a pair of batch tensors.

    Mel sequences are right-padded with -20 to the longest sequence in the
    batch (batch dimension first); speaker ids are stacked into one tensor.
    """
    mel_seqs, speaker_ids = tuple(zip(*batch))
    return (
        pad_sequence(list(mel_seqs), batch_first=True, padding_value=-20),
        torch.stack(list(speaker_ids)),
    )


def get_dataloader(data_dir_val,
                   batch_size_val,
                   n_workers_val,
                   segment_len_val=128):
    """Build the training and validation loaders from one ``MyDataset``.

    The dataset is split 90% / 10% with ``random_split``. The training loader
    shuffles and drops the last incomplete batch; the validation loader does
    neither. Both use ``collate_batch`` and pinned memory.

    Returns:
        ``(train_loader, valid_loader, speaker_count)`` where ``speaker_count``
        comes from ``MyDataset.get_speaker_number()``.
    """
    full_dataset = MyDataset(data_dir_val, segment_len=segment_len_val)

    n_total = len(full_dataset)
    n_train = int(0.9 * n_total)
    train_subset, valid_subset = random_split(full_dataset,
                                              [n_train, n_total - n_train])

    # Settings shared by both loaders.
    shared_kwargs = dict(
        batch_size=batch_size_val,
        num_workers=n_workers_val,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    train_loader = DataLoader(train_subset,
                              shuffle=True,
                              drop_last=True,
                              **shared_kwargs)
    valid_loader = DataLoader(valid_subset,
                              shuffle=False,
                              drop_last=False,
                              **shared_kwargs)
    return train_loader, valid_loader, full_dataset.get_speaker_number()


## Define Model

In [63]:
class Classifier(nn.Module):
    """Speaker classifier: linear prenet, Transformer encoder, mean pooling, MLP head.

    Input frames are expected to have 40 features (the prenet is
    ``nn.Linear(40, d_model)``).
    """

    def __init__(self,
                 d_model,
                 num_encoder_layers,
                 dim_feedforward,
                 num_heads,
                 dropout_rate,
                 num_speakers=600):
        super().__init__()
        # Lift each 40-dimensional input frame to the model width.
        self.prenet = nn.Linear(40, d_model)
        # TODO:
        #   Change Transformer to Conformer.
        #   https://arxiv.org/abs/2005.08100
        # The prototype layer stays an attribute of the module, so its
        # parameters remain part of the state dict under "encoder_layer.*".
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate)
        self.encoder = nn.TransformerEncoder(self.encoder_layer,
                                             num_layers=num_encoder_layers)

        # Map the pooled d_model-dimensional vector to one logit per speaker.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_speakers),
        )

    def forward(self, mels):
        """Compute speaker logits for a batch of mel sequences.

        Args:
            mels: tensor of shape (batch size, length, 40).

        Returns:
            Tensor of shape (batch size, num_speakers).
        """
        hidden = self.prenet(mels)  # (batch size, length, d_model)
        # The encoder is fed time-major input: (length, batch size, d_model).
        hidden = hidden.transpose(0, 1)
        hidden = self.encoder(hidden)
        hidden = hidden.permute(1, 0, 2)  # back to (batch size, length, d_model)
        # Mean pooling over the time axis.
        pooled = hidden.mean(dim=1)
        return self.pred_layer(pooled)  # (batch size, num_speakers)


## Fixing seed

In [64]:
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module (used by ``MyDataset`` to pick crop
    offsets), NumPy, and PyTorch (CPU and, when available, all CUDA devices),
    then switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Without this the random segment crops differ from run to run even though
    # the torch/numpy generators are fixed.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


## Hyperparameters

In [None]:
# general paths and settings
data_dir = "./Dataset"
model_save_path = "./speaker_model.ckpt"
submission_path = "./submission.csv"
tensorboard_log_dir = "./runs/speaker_classification"
seed = 3407

# training parameters
batch_size = 32
num_workers = 0
# Number of speaker classes; 600 matches the Classifier default.
# (Defined once here; it was previously assigned twice in this cell.)
num_speakers = 600

# model parameters (transformer related)
d_model = 192
num_encoder_layers = 3
dim_feedforward = 768
num_heads = 4
dropout_rate = 0

# optimizer and scheduler parameters
learning_rate = 5e-4
num_warmup_steps = 1000
total_steps = 100000

# training control
valid_steps = 2000  # run validation every this many optimizer steps
early_stop_patience = 10  # validation checks without improvement before stopping

# apply seed
same_seeds(seed)

# device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


## Training Utilities

### Warmup Scheduler

In [66]:
def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """Create a ``LambdaLR`` with linear warmup followed by cosine decay.

    The learning-rate multiplier rises linearly from 0 to 1 over the first
    ``num_warmup_steps`` steps, then follows a cosine curve over the remaining
    steps up to ``num_training_steps``; it is clamped so it never goes negative.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of linear warmup steps.
        num_training_steps: Total number of training steps.
        num_cycles: Number of cosine cycles in the decay phase (0.5 means a
            single half-wave from 1 down to 0).
        last_epoch: Passed through to ``LambdaLR``.

    Returns:
        The configured ``LambdaLR`` scheduler.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def scale_at(step_idx):
        if step_idx >= num_warmup_steps:
            fraction_done = float(step_idx - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * fraction_done)
            return max(0.0, 0.5 * (1.0 + cosine))
        # Still warming up: scale linearly with the step count.
        return float(step_idx) / warmup_span

    return LambdaLR(optimizer, scale_at, last_epoch)


### Validation Process

In [67]:
def validate_speaker_model(valid_loader_val,
                           model_val,
                           criterion_val,
                           device_val,
                           current_step_or_epoch,
                           writer_tb=None,
                           pbar_desc_prefix=""):
    """Evaluate the model on the validation loader.

    Puts the model in eval mode, accumulates sample-weighted loss and the
    number of correct predictions without tracking gradients, optionally logs
    both averages to TensorBoard, and switches the model back to train mode
    before returning.

    Args:
        valid_loader_val: Iterable yielding ``(features, labels)`` batches.
        model_val: Model to evaluate.
        criterion_val: Loss function called as ``criterion(outputs, labels)``.
        device_val: Device the batches are moved to.
        current_step_or_epoch: x-axis value used for the TensorBoard scalars.
        writer_tb: Optional TensorBoard writer; nothing is logged when falsy.
        pbar_desc_prefix: Text prepended to the progress-bar description.

    Returns:
        ``(average_loss, average_accuracy)``; both are 0 when the loader
        yields no samples.
    """
    model_val.eval()
    loss_sum = 0.0
    correct_total = 0
    sample_total = 0

    progress = tqdm(valid_loader_val,
                    leave=False,
                    desc=f"{pbar_desc_prefix} Validation")
    with torch.no_grad():
        for features, labels in progress:
            features, labels = features.to(device_val), labels.to(device_val)

            logits = model_val(features)
            batch_loss = criterion_val(logits, labels).item()
            n_in_batch = features.size(0)
            n_correct = (logits.argmax(dim=-1) == labels).sum().item()

            # Weight the loss by batch size so the final mean is per sample.
            loss_sum += batch_loss * n_in_batch
            correct_total += n_correct
            sample_total += n_in_batch

            progress.set_postfix(loss=f"{batch_loss:.4f}",
                                 acc=f"{n_correct/n_in_batch:.2%}")

    if sample_total > 0:
        mean_loss = loss_sum / sample_total
        mean_acc = correct_total / sample_total
    else:
        mean_loss = 0
        mean_acc = 0

    if writer_tb:
        writer_tb.add_scalar('Loss/valid_step', mean_loss,
                             current_step_or_epoch)
        writer_tb.add_scalar('Accuracy/valid_step', mean_acc,
                             current_step_or_epoch)

    model_val.train()
    return mean_loss, mean_acc


### Training Process

In [68]:
def model_fn_speaker(batch, model, criterion_val, device_val):
    """Run one forward pass and return ``(loss, accuracy)`` for the batch.

    Args:
        batch: ``(mels, labels)`` pair of tensors.
        model: Callable mapping mels to per-class scores.
        criterion_val: Loss function called as ``criterion(outputs, labels)``.
        device_val: Device both tensors are moved to before the forward pass.

    Returns:
        The loss tensor and the batch accuracy as a scalar float tensor.
    """
    inputs, targets = (tensor.to(device_val) for tensor in batch)

    logits = model(inputs)
    hits = logits.argmax(1) == targets

    return criterion_val(logits, targets), hits.float().mean()


def train_speaker_model(model_to_train, train_loader_val, valid_loader_val,
                        criterion_val, optimizer_val, scheduler_val,
                        device_val):
    """Run the step-based training loop with periodic validation.

    Trains for up to ``total_steps`` optimizer steps, restarting the training
    loader whenever it is exhausted. Every ``valid_steps`` steps the model is
    validated; the weights are written to ``model_save_path`` whenever the
    validation accuracy improves, and training stops early after
    ``early_stop_patience`` consecutive validation checks without improvement.
    Scalars are logged to TensorBoard under ``tensorboard_log_dir``.

    Reads the module-level settings ``tensorboard_log_dir``, ``total_steps``,
    ``valid_steps``, ``early_stop_patience`` and ``model_save_path``.

    Args:
        model_to_train: Model to optimize.
        train_loader_val: Loader yielding training batches.
        valid_loader_val: Loader yielding validation batches.
        criterion_val: Loss function.
        optimizer_val: Optimizer stepped once per batch.
        scheduler_val: Learning-rate scheduler stepped once per batch.
        device_val: Device batches are moved to.

    Returns:
        The best validation accuracy observed (0.0 if it never improved).
    """
    tb_writer = SummaryWriter(log_dir=tensorboard_log_dir)

    best_valid_acc = 0.0
    checks_without_improvement = 0

    train_iterator = iter(train_loader_val)

    # Running statistics for the current pass over the training data.
    pass_loss_sum = 0.0
    pass_correct_sum = 0
    pass_sample_count = 0
    completed_data_passes = 0

    pbar_total_steps = tqdm(range(total_steps), desc="Total Training Steps")

    # try/finally guarantees the TensorBoard writer is flushed and closed even
    # when training is interrupted (e.g. the cell is stopped) or raises.
    try:
        for step in pbar_total_steps:
            model_to_train.train()

            try:
                batch = next(train_iterator)
            except StopIteration:
                # The loader is exhausted: report the finished pass, reset the
                # running statistics and start a new pass.
                if pass_sample_count > 0:
                    pass_avg_loss = pass_loss_sum / pass_sample_count
                    pass_avg_acc = pass_correct_sum / pass_sample_count
                    pbar_total_steps.write(
                        f"End of Training Data Pass {completed_data_passes + 1}: "
                        f"Avg Loss: {pass_avg_loss:.4f}, Avg Acc: {pass_avg_acc:.2%}"
                    )
                    tb_writer.add_scalar('Loss/train_data_pass_avg',
                                         pass_avg_loss,
                                         completed_data_passes + 1)
                    tb_writer.add_scalar('Accuracy/train_data_pass_avg',
                                         pass_avg_acc,
                                         completed_data_passes + 1)

                pass_loss_sum = 0.0
                pass_correct_sum = 0
                pass_sample_count = 0
                completed_data_passes += 1
                train_iterator = iter(train_loader_val)
                batch = next(train_iterator)

            loss, accuracy = model_fn_speaker(batch, model_to_train,
                                              criterion_val, device_val)

            optimizer_val.zero_grad()
            loss.backward()
            optimizer_val.step()
            scheduler_val.step()

            current_lr = optimizer_val.param_groups[0]['lr']

            # accuracy is a batch mean, so scale by batch size to get a
            # (fractional) count of correct samples.
            current_batch_size = batch[0].size(0)
            pass_loss_sum += loss.item() * current_batch_size
            pass_correct_sum += accuracy.item() * current_batch_size
            pass_sample_count += current_batch_size

            running_train_acc = 0
            if pass_sample_count > 0:
                running_train_acc = pass_correct_sum / pass_sample_count

            if step % 100 == 0:
                tb_writer.add_scalar('Loss/train_batch', loss.item(), step)
                tb_writer.add_scalar('Accuracy/train_batch', accuracy.item(),
                                     step)
                tb_writer.add_scalar('LearningRate/step', current_lr, step)

            pbar_total_steps.set_postfix(
                batch_loss=f"{loss.item():.4f}",
                batch_acc=f"{accuracy.item():.2%}",
                run_avg_acc=f"{running_train_acc:.2%}",
                lr=f"{current_lr:.2e}")

            if (step + 1) % valid_steps == 0:
                avg_valid_loss, avg_valid_acc = validate_speaker_model(
                    valid_loader_val,
                    model_to_train,
                    criterion_val,
                    device_val,
                    step + 1,
                    tb_writer,
                    pbar_desc_prefix=f"Step {step+1}")

                pbar_total_steps.write(
                    f"Step {step + 1}/{total_steps} - "
                    f"Valid Loss: {avg_valid_loss:.4f}, Valid Acc: {avg_valid_acc:.2%}"
                )

                if avg_valid_acc > best_valid_acc:
                    best_valid_acc = avg_valid_acc
                    # Persist immediately; the checkpoint on disk is the only
                    # copy of the best weights that is kept.
                    torch.save(model_to_train.state_dict(), model_save_path)
                    pbar_total_steps.write(
                        f"Best model updated at step {step + 1}. Accuracy: {best_valid_acc:.2%}. Saved to {model_save_path}"
                    )
                    checks_without_improvement = 0
                else:
                    checks_without_improvement += 1

                if checks_without_improvement >= early_stop_patience:
                    pbar_total_steps.write(
                        f"Early stopping triggered at step {step + 1} after {checks_without_improvement} validation checks without improvement."
                    )
                    break
    finally:
        tb_writer.close()

    print(
        f"\nTraining finished. Best validation accuracy: {best_valid_acc:.4%}")
    if best_valid_acc > 0:
        print(f"Best model saved to {model_save_path}")
    else:
        print(
            "No model was saved as validation accuracy did not improve or training was too short."
        )

    return best_valid_acc


### Train Function

In [69]:
def train():
    """Build the data loaders, model, loss, optimizer and scheduler, then train.

    All settings (paths, batch size, model sizes, learning rate, schedule
    lengths, device) are read from the module-level hyperparameters.
    """
    # prepare dataloaders
    train_loader, valid_loader, speaker_num_from_data = get_dataloader(
        data_dir_val=data_dir,
        batch_size_val=batch_size,
        n_workers_val=num_workers)

    print(
        f"Train Dataloader: {len(train_loader.dataset)} samples, {len(train_loader)} batches"
    )
    print(
        f"Valid Dataloader: {len(valid_loader.dataset)} samples, {len(valid_loader)} batches"
    )

    # initialize model, criterion, optimizer, and scheduler
    print("Initializing model, criterion, optimizer, and scheduler...")
    # Size the output layer from the dataset instead of silently relying on
    # the Classifier default of 600 speakers.
    speaker_model = Classifier(d_model=d_model,
                               num_encoder_layers=num_encoder_layers,
                               dim_feedforward=dim_feedforward,
                               num_heads=num_heads,
                               dropout_rate=dropout_rate,
                               num_speakers=speaker_num_from_data).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(speaker_model.parameters(), lr=learning_rate)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps,
                                                total_steps)

    # Run training
    best_accuracy_from_training = train_speaker_model(
        model_to_train=speaker_model,
        train_loader_val=train_loader,
        valid_loader_val=valid_loader,
        criterion_val=criterion,
        optimizer_val=optimizer,
        scheduler_val=scheduler,
        device_val=device)
    print(
        f"Training complete. Best validation accuracy: {best_accuracy_from_training:.2%}"
    )


# Training

In [70]:
# Start training with the hyperparameters defined in the configuration cell.
train()


Train Dataloader: 50999 samples, 1593 batches
Valid Dataloader: 5667 samples, 178 batches
Initializing model, criterion, optimizer, and scheduler...




Total Training Steps:   0%|          | 0/100000 [00:00<?, ?it/s]

End of Training Data Pass 1: Avg Loss: 4.9440, Avg Acc: 7.93%


Step 2000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 2000/100000 - Valid Loss: 3.6729, Valid Acc: 21.39%
Best model updated at step 2000. Accuracy: 21.39%. Saved to ./speaker_model.ckpt
End of Training Data Pass 2: Avg Loss: 3.4708, Avg Acc: 24.89%


Step 4000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 4000/100000 - Valid Loss: 3.0347, Valid Acc: 32.91%
Best model updated at step 4000. Accuracy: 32.91%. Saved to ./speaker_model.ckpt
End of Training Data Pass 3: Avg Loss: 2.9264, Avg Acc: 34.32%


Step 6000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 6000/100000 - Valid Loss: 2.6511, Valid Acc: 40.55%
Best model updated at step 6000. Accuracy: 40.55%. Saved to ./speaker_model.ckpt
End of Training Data Pass 4: Avg Loss: 2.6454, Avg Acc: 39.60%
End of Training Data Pass 5: Avg Loss: 2.3930, Avg Acc: 44.15%


Step 8000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 8000/100000 - Valid Loss: 2.5079, Valid Acc: 42.88%
Best model updated at step 8000. Accuracy: 42.88%. Saved to ./speaker_model.ckpt
End of Training Data Pass 6: Avg Loss: 2.3007, Avg Acc: 46.20%


Step 10000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 10000/100000 - Valid Loss: 2.4457, Valid Acc: 44.40%
Best model updated at step 10000. Accuracy: 44.40%. Saved to ./speaker_model.ckpt
End of Training Data Pass 7: Avg Loss: 2.1812, Avg Acc: 48.37%


Step 12000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 12000/100000 - Valid Loss: 2.4487, Valid Acc: 45.16%
Best model updated at step 12000. Accuracy: 45.16%. Saved to ./speaker_model.ckpt
End of Training Data Pass 8: Avg Loss: 2.1140, Avg Acc: 49.91%


Step 14000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 14000/100000 - Valid Loss: 2.2280, Valid Acc: 48.77%
Best model updated at step 14000. Accuracy: 48.77%. Saved to ./speaker_model.ckpt
End of Training Data Pass 9: Avg Loss: 1.9704, Avg Acc: 52.63%
End of Training Data Pass 10: Avg Loss: 1.8848, Avg Acc: 54.72%


Step 16000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 16000/100000 - Valid Loss: 2.0797, Valid Acc: 51.79%
Best model updated at step 16000. Accuracy: 51.79%. Saved to ./speaker_model.ckpt
End of Training Data Pass 11: Avg Loss: 1.8324, Avg Acc: 55.63%


Step 18000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 18000/100000 - Valid Loss: 2.0953, Valid Acc: 51.65%
End of Training Data Pass 12: Avg Loss: 1.7853, Avg Acc: 56.69%


Step 20000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 20000/100000 - Valid Loss: 2.0252, Valid Acc: 53.10%
Best model updated at step 20000. Accuracy: 53.10%. Saved to ./speaker_model.ckpt
End of Training Data Pass 13: Avg Loss: 1.7910, Avg Acc: 56.71%


Step 22000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 22000/100000 - Valid Loss: 2.3880, Valid Acc: 44.94%
End of Training Data Pass 14: Avg Loss: 1.9181, Avg Acc: 53.99%
End of Training Data Pass 15: Avg Loss: 2.0642, Avg Acc: 50.80%


Step 24000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 24000/100000 - Valid Loss: 2.3028, Valid Acc: 47.94%
End of Training Data Pass 16: Avg Loss: 1.8430, Avg Acc: 55.71%


Step 26000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 26000/100000 - Valid Loss: 2.1542, Valid Acc: 50.71%
End of Training Data Pass 17: Avg Loss: 1.7194, Avg Acc: 58.04%


Step 28000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 28000/100000 - Valid Loss: 1.9329, Valid Acc: 55.53%
Best model updated at step 28000. Accuracy: 55.53%. Saved to ./speaker_model.ckpt
End of Training Data Pass 18: Avg Loss: 1.5965, Avg Acc: 60.81%


Step 30000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 30000/100000 - Valid Loss: 1.8839, Valid Acc: 55.71%
Best model updated at step 30000. Accuracy: 55.71%. Saved to ./speaker_model.ckpt
End of Training Data Pass 19: Avg Loss: 1.5775, Avg Acc: 61.01%
End of Training Data Pass 20: Avg Loss: 1.5096, Avg Acc: 62.34%


Step 32000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 32000/100000 - Valid Loss: 1.8685, Valid Acc: 57.17%
Best model updated at step 32000. Accuracy: 57.17%. Saved to ./speaker_model.ckpt
End of Training Data Pass 21: Avg Loss: 1.4669, Avg Acc: 63.60%


Step 34000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 34000/100000 - Valid Loss: 1.9790, Valid Acc: 53.50%
End of Training Data Pass 22: Avg Loss: 1.5511, Avg Acc: 61.60%


Step 36000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 36000/100000 - Valid Loss: 1.8147, Valid Acc: 57.54%
Best model updated at step 36000. Accuracy: 57.54%. Saved to ./speaker_model.ckpt
End of Training Data Pass 23: Avg Loss: 1.4084, Avg Acc: 64.80%


Step 38000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 38000/100000 - Valid Loss: 1.7091, Valid Acc: 59.93%
Best model updated at step 38000. Accuracy: 59.93%. Saved to ./speaker_model.ckpt
End of Training Data Pass 24: Avg Loss: 1.3556, Avg Acc: 65.98%
End of Training Data Pass 25: Avg Loss: 1.3359, Avg Acc: 66.19%


Step 40000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 40000/100000 - Valid Loss: 1.6529, Valid Acc: 61.16%
Best model updated at step 40000. Accuracy: 61.16%. Saved to ./speaker_model.ckpt
End of Training Data Pass 26: Avg Loss: 1.2966, Avg Acc: 67.20%


Step 42000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 42000/100000 - Valid Loss: 1.6927, Valid Acc: 60.05%
End of Training Data Pass 27: Avg Loss: 1.2614, Avg Acc: 68.03%


Step 44000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 44000/100000 - Valid Loss: 1.6453, Valid Acc: 62.40%
Best model updated at step 44000. Accuracy: 62.40%. Saved to ./speaker_model.ckpt
End of Training Data Pass 28: Avg Loss: 1.1803, Avg Acc: 69.93%


Step 46000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 46000/100000 - Valid Loss: 1.6353, Valid Acc: 62.31%
End of Training Data Pass 29: Avg Loss: 1.1795, Avg Acc: 70.04%
End of Training Data Pass 30: Avg Loss: 1.1318, Avg Acc: 71.11%


Step 48000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 48000/100000 - Valid Loss: 1.6961, Valid Acc: 61.41%
End of Training Data Pass 31: Avg Loss: 1.1515, Avg Acc: 70.54%


Step 50000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 50000/100000 - Valid Loss: 1.5765, Valid Acc: 64.00%
Best model updated at step 50000. Accuracy: 64.00%. Saved to ./speaker_model.ckpt
End of Training Data Pass 32: Avg Loss: 1.0724, Avg Acc: 72.53%


Step 52000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 52000/100000 - Valid Loss: 1.6783, Valid Acc: 61.90%
End of Training Data Pass 33: Avg Loss: 1.1190, Avg Acc: 71.24%


Step 54000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 54000/100000 - Valid Loss: 1.5736, Valid Acc: 64.55%
Best model updated at step 54000. Accuracy: 64.55%. Saved to ./speaker_model.ckpt
End of Training Data Pass 34: Avg Loss: 1.0839, Avg Acc: 72.30%
End of Training Data Pass 35: Avg Loss: 1.0189, Avg Acc: 73.51%


Step 56000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 56000/100000 - Valid Loss: 1.4881, Valid Acc: 65.64%
Best model updated at step 56000. Accuracy: 65.64%. Saved to ./speaker_model.ckpt
End of Training Data Pass 36: Avg Loss: 0.9599, Avg Acc: 75.11%


Step 58000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 58000/100000 - Valid Loss: 1.5025, Valid Acc: 66.33%
Best model updated at step 58000. Accuracy: 66.33%. Saved to ./speaker_model.ckpt
End of Training Data Pass 37: Avg Loss: 0.9076, Avg Acc: 76.52%


Step 60000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 60000/100000 - Valid Loss: 1.3890, Valid Acc: 67.20%
Best model updated at step 60000. Accuracy: 67.20%. Saved to ./speaker_model.ckpt
End of Training Data Pass 38: Avg Loss: 0.9033, Avg Acc: 76.50%


Step 62000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 62000/100000 - Valid Loss: 1.3704, Valid Acc: 68.29%
Best model updated at step 62000. Accuracy: 68.29%. Saved to ./speaker_model.ckpt
End of Training Data Pass 39: Avg Loss: 0.8779, Avg Acc: 77.11%
End of Training Data Pass 40: Avg Loss: 0.8419, Avg Acc: 78.07%


Step 64000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 64000/100000 - Valid Loss: 1.4338, Valid Acc: 68.52%
Best model updated at step 64000. Accuracy: 68.52%. Saved to ./speaker_model.ckpt
End of Training Data Pass 41: Avg Loss: 0.8270, Avg Acc: 78.26%


Step 66000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 66000/100000 - Valid Loss: 1.3987, Valid Acc: 68.11%
End of Training Data Pass 42: Avg Loss: 0.8298, Avg Acc: 78.30%


Step 68000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 68000/100000 - Valid Loss: 1.3060, Valid Acc: 70.41%
Best model updated at step 68000. Accuracy: 70.41%. Saved to ./speaker_model.ckpt
End of Training Data Pass 43: Avg Loss: 0.7746, Avg Acc: 79.69%


Step 70000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 70000/100000 - Valid Loss: 1.3863, Valid Acc: 69.00%
End of Training Data Pass 44: Avg Loss: 0.7642, Avg Acc: 79.81%
End of Training Data Pass 45: Avg Loss: 0.7395, Avg Acc: 80.54%


Step 72000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 72000/100000 - Valid Loss: 1.2879, Valid Acc: 70.65%
Best model updated at step 72000. Accuracy: 70.65%. Saved to ./speaker_model.ckpt
End of Training Data Pass 46: Avg Loss: 0.7141, Avg Acc: 81.05%


Step 74000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 74000/100000 - Valid Loss: 1.3145, Valid Acc: 70.48%
End of Training Data Pass 47: Avg Loss: 0.6914, Avg Acc: 81.75%


Step 76000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 76000/100000 - Valid Loss: 1.2974, Valid Acc: 71.34%
Best model updated at step 76000. Accuracy: 71.34%. Saved to ./speaker_model.ckpt
End of Training Data Pass 48: Avg Loss: 0.6684, Avg Acc: 82.32%


Step 78000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 78000/100000 - Valid Loss: 1.2908, Valid Acc: 71.84%
Best model updated at step 78000. Accuracy: 71.84%. Saved to ./speaker_model.ckpt
End of Training Data Pass 49: Avg Loss: 0.6629, Avg Acc: 82.34%
End of Training Data Pass 50: Avg Loss: 0.6405, Avg Acc: 83.17%


Step 80000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 80000/100000 - Valid Loss: 1.2544, Valid Acc: 71.54%
End of Training Data Pass 51: Avg Loss: 0.6209, Avg Acc: 83.85%


Step 82000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 82000/100000 - Valid Loss: 1.2514, Valid Acc: 72.38%
Best model updated at step 82000. Accuracy: 72.38%. Saved to ./speaker_model.ckpt
End of Training Data Pass 52: Avg Loss: 0.6047, Avg Acc: 83.95%


Step 84000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 84000/100000 - Valid Loss: 1.2368, Valid Acc: 72.03%
End of Training Data Pass 53: Avg Loss: 0.6022, Avg Acc: 84.03%


Step 86000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 86000/100000 - Valid Loss: 1.2457, Valid Acc: 73.00%
Best model updated at step 86000. Accuracy: 73.00%. Saved to ./speaker_model.ckpt
End of Training Data Pass 54: Avg Loss: 0.5933, Avg Acc: 84.25%
End of Training Data Pass 55: Avg Loss: 0.5803, Avg Acc: 84.73%


Step 88000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 88000/100000 - Valid Loss: 1.2183, Valid Acc: 73.13%
Best model updated at step 88000. Accuracy: 73.13%. Saved to ./speaker_model.ckpt
End of Training Data Pass 56: Avg Loss: 0.5705, Avg Acc: 84.88%


Step 90000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 90000/100000 - Valid Loss: 1.2109, Valid Acc: 73.14%
Best model updated at step 90000. Accuracy: 73.14%. Saved to ./speaker_model.ckpt
End of Training Data Pass 57: Avg Loss: 0.5687, Avg Acc: 85.00%


Step 92000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 92000/100000 - Valid Loss: 1.2154, Valid Acc: 72.93%
End of Training Data Pass 58: Avg Loss: 0.5580, Avg Acc: 85.25%
End of Training Data Pass 59: Avg Loss: 0.5487, Avg Acc: 85.51%


Step 94000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 94000/100000 - Valid Loss: 1.2422, Valid Acc: 72.47%
End of Training Data Pass 60: Avg Loss: 0.5533, Avg Acc: 85.46%


Step 96000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 96000/100000 - Valid Loss: 1.2206, Valid Acc: 72.83%
End of Training Data Pass 61: Avg Loss: 0.5508, Avg Acc: 85.37%


Step 98000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 98000/100000 - Valid Loss: 1.2151, Valid Acc: 73.55%
Best model updated at step 98000. Accuracy: 73.55%. Saved to ./speaker_model.ckpt
End of Training Data Pass 62: Avg Loss: 0.5442, Avg Acc: 85.64%


Step 100000 Validation:   0%|          | 0/178 [00:00<?, ?it/s]

Step 100000/100000 - Valid Loss: 1.2291, Valid Acc: 73.18%

Training finished. Best validation accuracy: 73.5486%
Best model saved to ./speaker_model.ckpt
Training complete. Best validation accuracy: 73.55%


## Tensorboard

In [71]:
# (Re)load the TensorBoard notebook extension, then open the dashboard on the
# ./runs/ log directory using port 6006.
%reload_ext tensorboard
%tensorboard --logdir=./runs/  --port 6006


Reusing TensorBoard on port 6006 (pid 86512), started 3 days, 7:59:04 ago. (Use '!kill 86512' to kill it.)

# Test & Predict

In [72]:
# inference Dataset
class InferenceDataset(Dataset):
    """Test-time dataset yielding ``(feature_path, mel)`` for every utterance.

    The utterance list is read from the ``"utterances"`` entry of
    ``testdata.json`` inside ``data_dir_val``. Utterances are returned at full
    length (no cropping).

    Args:
        data_dir_val: Directory holding ``testdata.json`` and the feature files.
    """

    def __init__(self, data_dir_val):
        # The context manager closes the JSON file deterministically instead of
        # leaving an open handle for the garbage collector.
        with (Path(data_dir_val) / "testdata.json").open() as testdata_file:
            metadata = json.load(testdata_file)
        self.data_dir = data_dir_val
        self.data = metadata["utterances"]

    def __len__(self):
        """Return the number of test utterances."""
        return len(self.data)

    def __getitem__(self, index):
        """Load one utterance and return ``(feature_path, mel)``."""
        utterance = self.data[index]
        feat_path = utterance["feature_path"]
        mel = torch.load(os.path.join(self.data_dir, feat_path))
        mel = torch.FloatTensor(mel)
        return feat_path, mel


# inference collate function
def inference_collate_batch(batch):
    """Collate ``(feature_path, mel)`` samples for inference.

    Returns the feature paths as a tuple, together with the mel sequences
    right-padded with -20 to a common length (batch dimension first).
    """
    paths, mel_seqs = tuple(zip(*batch))
    padded = pad_sequence(list(mel_seqs), batch_first=True, padding_value=-20)
    return paths, padded


# create submission file
def create_submission(device_val):
    """Predict a speaker for every test utterance and write the submission CSV.

    Loads the checkpoint at ``model_save_path``, runs the model over
    ``InferenceDataset(data_dir)`` one utterance at a time, maps each predicted
    class index to a speaker name through ``mapping.json``'s ``"id2speaker"``
    table, and writes ``Id,Category`` rows to ``submission_path``.

    Args:
        device_val: Device used for the model and the input tensors.
    """
    with (Path(data_dir) / "mapping.json").open() as mapping_file:
        id2speaker = json.load(mapping_file)["id2speaker"]

    dataset = InferenceDataset(data_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=inference_collate_batch,
    )
    print("[Info]: Finish loading data!", flush=True)

    # Size the output layer from the mapping rather than relying on the
    # Classifier default, and honour the device the caller asked for.
    model = Classifier(d_model=d_model,
                       num_encoder_layers=num_encoder_layers,
                       dim_feedforward=dim_feedforward,
                       num_heads=num_heads,
                       dropout_rate=0,
                       num_speakers=len(id2speaker)).to(device_val)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_save_path, map_location=device_val))
    model.eval()
    print("[Info]: Finish creating model!", flush=True)

    results = [["Id", "Category"]]
    with torch.no_grad():
        for feat_paths, mels in tqdm(dataloader):
            preds = model(mels.to(device_val)).argmax(1).tolist()
            for feat_path, pred in zip(feat_paths, preds):
                # id2speaker comes from JSON, so its keys are strings.
                results.append([feat_path, id2speaker[str(pred)]])

    with open(submission_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(results)


# Run inference on the configured device; predictions are written to `submission_path`.
create_submission(device)


[Info]: Finish loading data!
[Info]: Finish creating model!


  0%|          | 0/8000 [00:00<?, ?it/s]