In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

ml2023spring_hw2_path = kagglehub.competition_download('ml2023spring-hw2')

print('Data source import complete.')


In [1]:
import sys
import logging

nblog = open("nb.log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


# **Homework 2: Phoneme Classification**


Objectives:
* Solve a classification problem with deep neural networks (DNNs).
* Understand recursive neural networks (RNNs).

If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2023-spring@googlegroups.com

# Some Utility Functions
**Fixes random number generator seeds for reproducibility.**

In [2]:
import numpy as np
import torch
import random

def same_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

**Helper functions to pre-process the training data from raw MFCC features of each utterance.**

A phoneme may span several frames and is dependent to past and future frames. \
Hence we concatenate neighboring phonemes for training to achieve higher accuracy. The **concat_feat** function concatenates past and future k frames (total 2k+1 = n frames), and we predict the center frame.

Feel free to modify the data preprocess functions, but **do not drop any frame** (if you modify the functions, remember to check that the number of frames are the same as mentioned in the slides)

In [3]:
import os
import torch
from tqdm import tqdm

def load_feat(path):
    feat = torch.load(path)
    return feat

def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    assert concat_n % 2 == 1 # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n)
    x = x.view(seq_len, concat_n, feature_dim).permute(1, 0, 2) # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid+1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

# def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=1):
#     class_num = 41 # NOTE: pre-computed, should not need change

#     if split == 'train' or split == 'val':
#         mode = 'train'
#     elif split == 'test':
#         mode = 'test'
#     else:
#         raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

#     label_dict = {}
#     if mode == 'train':
#         for line in open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines():
#             line = line.strip('\n').split(' ')
#             label_dict[line[0]] = [int(p) for p in line[1:]]

#         # split training and validation data
#         usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
#         random.shuffle(usage_list)
#         train_len = int(len(usage_list) * train_ratio)
#         usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]

#     elif mode == 'test':
#         usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()

#     usage_list = [line.strip('\n') for line in usage_list]
#     print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

#     max_len = 3000000
#     X = torch.empty(max_len, 39 * concat_nframes)
#     if mode == 'train':
#         y = torch.empty(max_len, dtype=torch.long)

#     idx = 0
#     for i, fname in tqdm(enumerate(usage_list)):
#         feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
#         cur_len = len(feat)
#         feat = concat_feat(feat, concat_nframes)
#         if mode == 'train':
#           label = torch.LongTensor(label_dict[fname])

#         X[idx: idx + cur_len, :] = feat
#         if mode == 'train':
#           y[idx: idx + cur_len] = label

#         idx += cur_len

#     X = X[:idx, :]
#     if mode == 'train':
#       y = y[:idx]

#     print(f'[INFO] {split} set')
#     print(X.shape)
#     if mode == 'train':
#       print(y.shape)
#       return X, y
#     else:
#       return X
def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=1):
    import random
    class_num = 41

    if split == 'train' or split == 'val':
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid split')

    label_dict = {}
    if mode == 'train':
        with open(os.path.join(phone_path, f'{mode}_labels.txt')) as f:
            for line in f:
                line = line.strip().split(' ')
                label_dict[line[0]] = [int(p) for p in line[1:]]

        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]
    else:
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()

    usage_list = [line.strip() for line in usage_list]
    print(f'[Dataset] {split} - samples: {len(usage_list)}')

    X, y = [], []

    for fname in tqdm(usage_list):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))  # (T, 39)
        feat = concat_feat(feat, concat_nframes)  # (T, 39 * n)

        X.append(feat)
        if mode == 'train':
            label = torch.LongTensor(label_dict[fname])
            assert label.shape[0] == feat.shape[0], f"[{fname}] Label/Feature mismatch: {label.shape[0]} vs {feat.shape[0]}"
            y.append(label)

    print(f'[INFO] {split} set loaded.')
    if mode == 'train':
        return X, y
    else:
        return X

# Dataset

In [4]:
import torch
from torch.utils.data import Dataset

class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            self.label = torch.LongTensor(y)
        else:
            self.label = None

    # def __getitem__(self, idx):
    #     if self.label is not None:
    #         return self.data[idx], self.label[idx]
    #     else:
    #         return self.data[idx]
    def __getitem__(self, idx):
        x = self.data[idx]
        length = x.shape[0]
        if self.label is not None:
            y = self.label[idx]
            return x, y, length
        else:
            return x, length

    def __len__(self):
        return len(self.data)

from torch.utils.data import Dataset

class LibriSequenceDataset(Dataset):
    def __init__(self, data_list, label_list=None):
        self.data = data_list
        self.label = label_list

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx], self.data[idx].shape[0]
        else:
            return self.data[idx], self.data[idx].shape[0]

    def __len__(self):
        return len(self.data)

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    for b in batch:
        if b[1].shape[0] == 0:
            print(f"[collate_fn] Found empty label: x.shape={b[0].shape}, y.shape={b[1].shape}")
    xs, ys, lengths = zip(*batch)  # batch: list of (x, y, length)
    xs_pad = pad_sequence(xs, batch_first=True)          # (B, T_max, 39)
    ys_pad = pad_sequence(ys, batch_first=True, padding_value=-100)  # (B, T_max)
    return xs_pad, ys_pad, torch.tensor(lengths)

# Model
Feel free to modify the structure of the model.

In [5]:
import torch.nn as nn

class BasicBlock(nn.Module):
    def __init__(self, input_dim, output_dim, dropout=0):
        super(BasicBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.BatchNorm1d(output_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        x = self.block(x)
        return x


class Classifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256, dropout=0.3):
        super(Classifier, self).__init__()

        self.fc = nn.Sequential(
            BasicBlock(input_dim, hidden_dim),
            *[BasicBlock(hidden_dim, hidden_dim, dropout) for _ in range(hidden_layers)],
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        x = self.fc(x)
        return x
    
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNNClassifier(nn.Module):
    def __init__(self, feat_dim, hidden_dim=256, num_layers=3, num_classes=41,
                 bidirectional=True, dropout=0.2):
        super().__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim

        self.input_norm = nn.LayerNorm(feat_dim)
        self.input_dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(
            input_size=feat_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )

        self.output_norm = nn.LayerNorm(hidden_dim * (2 if bidirectional else 1))
        self.output_dropout = nn.Dropout(dropout)

        self.output_proj = nn.Linear(
            hidden_dim * (2 if bidirectional else 1),
            num_classes
        )

    def forward(self, x, lengths):  # x: (B, T, D), lengths: list[int]

        x = self.input_norm(x)
        x = self.input_dropout(x)

        packed = pack_padded_sequence(x, lengths=lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)  # (B, T, H)

        out = self.output_norm(out)
        out = self.output_dropout(out)

        logits = self.output_proj(out)  # (B, T, num_classes)
        return logits

class ConvLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes=41,
                 conv_channels=64, bidirectional=True, dropout=0.4):
        super().__init__()
        # Conv1D expects input: (B, D_in, T), so we'll permute before & after
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=conv_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
        )
        self.lstm = nn.LSTM(
            input_size=conv_channels,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.classifier = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):  # x: (B, T, D)
        x = x.permute(0, 2, 1)                # → (B, D, T)
        x = self.conv(x)                      # → (B, C, T)
        x = x.permute(0, 2, 1)                # → (B, T, C)

        out, _ = self.lstm(x)                 # → (B, T, H)
        mid = out.size(1) // 2
        out = out[:, mid, :]                  # 中间帧
        return self.classifier(out)

# Training Loop

In [7]:
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import KFold
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score


config = {
    'concat_nframes': 61,
    'learning_rate': 5e-3,
    'save_path': './model.ckpt',
    'dropout': 0.2,
    'conv_channels': 64,
    'weight_decay': 1e-4
}

# data prarameters
concat_nframes = config['concat_nframes']              # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
k = 5

# training parameters
seed = 5201314                   # random seed
batch_size = 16                # batch size
num_epoch = 300                   # the number of training epoch
early_stop = 20                # the number of early stop
learning_rate = config['learning_rate']           # learning rate
model_path = './model.ckpt'     # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes # the input dim of the model, you should not change the value
hidden_layers = 4               # the number of hidden layers
hidden_dim = 256                # the hidden dim

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1, ignore_index=-100):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.ignore_index = ignore_index
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, target):
        """
        x: (B, C) logits
        target: (B,) int64 labels with possible ignore_index
        """
        log_probs = self.log_softmax(x)  # (B, C)

        # Create mask for valid targets
        mask = (target != self.ignore_index)
        target = target[mask]         # (N,)
        log_probs = log_probs[mask]   # (N, C)

        if target.numel() == 0:
            return torch.tensor(0.0, device=x.device, requires_grad=True)

        # One-hot with smoothing
        true_dist = torch.full_like(log_probs, self.smoothing / (self.cls - 1))
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)

        loss = torch.sum(-true_dist * log_probs, dim=1)  # (N,)
        return loss.mean()
    
fold_accuracies = []

def train_one_fold(fold, model, train_loader, val_loader,
    epochs, lr, device, ckpt_path, writer, weights):

    criterion = nn.CrossEntropyLoss(weight=weights)
    # criterion = LabelSmoothingLoss(classes=41, smoothing=0.1, ignore_index=-100)
    # criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=config['weight_decay'])
    # scheduler = CosineAnnealingLR(
	# 	optimizer,
	# 	T_max=20,     # 前 100 个 epoch 为一个完整退火周期
	# 	eta_min=1e-5
	# )
    # scheduler = StepLR(
    #     optimizer,
    #     step_size=20,     # 每 10 个 epoch 降一次
    #     gamma=0.5         # 学习率变为原来的一半
    # )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.8, patience=5, threshold=0.001)
    best_val_acc = 0.0
    global fold_accuracies
    # loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    for epoch in range(1, epochs + 1):
        print(f"\n=== Epoch {epoch} ===")
        # — Training — 
        model.train()
        train_loss = train_correct = train_total = 0
        train_pbar = tqdm(enumerate(train_loader), 
                      total=len(train_loader), 
                      desc=f"Train Epoch {epoch}", 
                      leave=True)
        for step,(feats, labels, lengths) in train_pbar:
            # feats, labels = feats.to(device), labels.to(device)
            # feat_dim = feats.size(1) // concat_nframes
            # feats = feats.view(-1, concat_nframes, feat_dim).to(device)
            # labels = labels.to(device)
            feats = feats.to(device)
            labels = labels.to(device)
            lengths = lengths.to('cpu')  # 注意 pack_padded_sequence 要用 CPU 上的长度列表
            optimizer.zero_grad()
            out = model(feats, lengths)


            # out = model(feats)
            # loss = criterion(out, labels)
            loss = criterion(out.view(-1, out.size(-1)), labels.view(-1))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

            # pred = out.argmax(1)
            # train_correct += (pred == labels).sum().item()
            # train_total += feats.size(0)
            pred=out.argmax(dim=-1)
            mask = (labels != -100)
            correct = (pred == labels) & mask
            train_correct += correct.sum().item()
            train_total += mask.sum().item()
            train_loss += loss.item() * mask.sum().item()
            train_pbar.set_postfix(loss=loss.item(), acc=train_correct/train_total if train_total > 0 else 0)

        train_loss /= train_total
        train_acc  = train_correct / train_total

        # — Validation —
        model.eval()
        val_loss = val_correct = val_total = 0
        all_preds = []
        all_labels = []
        val_pbar = tqdm(enumerate(val_loader), 
                      total=len(val_loader), 
                      desc=f"Val Epoch {epoch}", 
                      leave=True)
        with torch.no_grad():
            for step,(feats, labels, lengths) in val_pbar:
                # feats, labels = feats.to(device), labels.to(device)
                # feat_dim = feats.size(1) // concat_nframes
                # feats = feats.view(-1, concat_nframes, feat_dim).to(device)
                # labels = labels.to(device)
                feats = feats.to(device)
                labels = labels.to(device)
                lengths = lengths.to('cpu')  # 注意 pack_padded_sequence 要用 CPU 上的长度列表
                out = model(feats, lengths)
                loss = criterion(out.view(-1, out.size(-1)), labels.view(-1))

                # out = model(feats)
                # loss = criterion(out, labels)

                # val_loss += loss.item() * feats.size(0)
                # pred = out.argmax(1)
                # val_correct += (pred == labels).sum().item()
                # val_total += feats.size(0)
                # all_preds.extend(pred.cpu().numpy())
                # all_labels.extend(labels.cpu().numpy())
                pred = out.argmax(dim=-1)            # (B, T)
                mask = (labels != -100)              # padding mask
                correct = ((pred == labels) & mask).sum().item()
                total = mask.sum().item()
                val_correct += correct
                val_total += total
                val_loss += loss.item() * mask.sum().item()
                all_preds.extend(pred[mask].cpu().numpy())
                all_labels.extend(labels[mask].cpu().numpy())
                val_pbar.set_postfix(loss=loss.item(), acc=val_correct/val_total if val_total > 0 else 0)

        val_loss /= val_total
        val_acc  = val_correct / val_total
        val_f1 = f1_score(all_labels, all_preds, average='macro')
        scheduler.step(val_acc)

        # — Log to TensorBoard under fold subdir —
        writer.add_scalar(f"Fold{fold}/Loss/Train", train_loss, epoch)
        writer.add_scalar(f"Fold{fold}/Loss/Val",   val_loss,   epoch)
        writer.add_scalar(f"Fold{fold}/Acc/Train",  train_acc,  epoch)
        writer.add_scalar(f"Fold{fold}/Acc/Val",    val_acc,    epoch)
        writer.add_scalar(f"Fold{fold}/LR", scheduler.get_last_lr()[0], epoch)
        writer.add_scalar(f"Fold{fold}/F1/Val", val_f1, epoch)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if len(fold_accuracies) == 0 or best_val_acc > max(fold_accuracies):
                torch.save(model.state_dict(), f"{ckpt_path}.ckpt")
            early_stop_count = 0
        else:
            early_stop_count += 1
        
        if early_stop_count >= early_stop:
            print(f"Early stopping at epoch {epoch}!")
            break
    writer.add_hparams(
        hparam_dict={
            'concat_nframes': concat_nframes,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        },
        metric_dict={
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc
        }
    )
    return best_val_acc


from torch.utils.data import DataLoader
import gc
import optuna

same_seeds(seed)
train_ratio = 0.8

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {device}')

def objective(trial):
    if trial != None:
        print('\nNew trial here')
        print(f"Trial: {trial.number}")
        # 定义需要调优的超参数空间
        config['concat_nframes'] = trial.suggest_int('concat_nframes', 0, 10) * 2 + 1
    concat_nframes = config['concat_nframes']
    input_dim = 39 * concat_nframes
        # config['betas'][0] = trial.suggest_float('beta1', 0.89, 0.9)
        # config['betas'][1] = trial.suggest_float('beta2', 0.99, 0.9999)
        # config['batch_size'] = trial.suggest_categorical('batch_size', [128])
        # config['k'] = trial.suggest_int('k_feats', 5, 32)
    # 打印所需的超参数
    print(f'''hyper-parameter:
        lr: {config['learning_rate']},
        concat_nframes: {config['concat_nframes']}''')
    
	# preprocess data
    train_X, train_y = preprocess_data(split='train', feat_dir='./mlhw2/libriphone/feat', phone_path='./mlhw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)
    val_X, val_y = preprocess_data(split='val', feat_dir='./mlhw2/libriphone/feat', phone_path='./mlhw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)
    if isinstance(train_y, list):
        y_train = np.concatenate([y.numpy() if isinstance(y, torch.Tensor) else np.array(y) for y in train_y])
    else:
        y_train = train_y.numpy()
    classes = np.arange(41)
    raw_weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
    )
    adjusted_weights = raw_weights ** 0.4  # 平滑惩罚力度
    weights = torch.tensor(adjusted_weights, dtype=torch.float).to(device)

    # get dataset
    train_set = LibriSequenceDataset(train_X, train_y)
    val_set = LibriSequenceDataset(val_X, val_y)
    for i, (x, y) in enumerate(zip(train_X, train_y)):
        if len(y) == 0 or len(x) == 0:
            print(f"Empty sample at index {i}: x.shape={x.shape}, y.shape={y.shape}")

    # remove raw feature to save memory
    del train_X, train_y, val_X, val_y
    gc.collect()

    # get dataloader
    # train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn
    )

    # model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim, dropout=config['dropout']).to(device)
    # — Option B: RNN sequence labeling —
    feat_dim = input_dim // concat_nframes
    model = RNNClassifier(
        input_dim, hidden_dim=hidden_dim, num_layers=hidden_layers,
        num_classes=41, bidirectional=True, dropout=config['dropout']
    ).to(device)
    # model = ConvLSTMClassifier(
    #     input_dim=feat_dim,
    #     hidden_dim=hidden_dim,
    #     num_layers=hidden_layers,
    #     num_classes=41,
    #     conv_channels=config['conv_channels'],
    #     bidirectional=True,
    #     dropout=config['dropout']
    # ).to(device)

    print("Model summary:")
    for name, param in model.named_parameters():
        print(name, param.numel())

    # ── TensorBoard writer per fold ──
    if trial != None:
        writer = SummaryWriter(log_dir=f"./runs/nofold{trial.number}")
    else:
        writer = SummaryWriter()
    global fold_accuracies
    fold_accuracies = []

    if trial != None:
        best_val = train_one_fold(1, model, train_loader, val_loader,
                        epochs=num_epoch, lr=learning_rate, device=device, ckpt_path=f"./model{trial.number}", writer=writer, weights = weights)
    else:
        best_val = train_one_fold(1, model, train_loader, val_loader,
                        epochs=num_epoch, lr=learning_rate, device=device, ckpt_path=f"./model", writer=writer, weights = weights)
    
    writer.close()

    print(f"best val acc: {best_val:.4f}\n")
    avg_acc = best_val

	# # (Re)load your full dataset once:
    # full_X, full_y = preprocess_data(
	# 	split="train",
	# 	feat_dir='./mlhw2/libriphone/feat',
	# 	phone_path='./mlhw2/libriphone',
	# 	concat_nframes=concat_nframes,
	# 	train_ratio=1
	# )
    # full_dataset = LibriDataset(full_X, full_y)

    # y_train = full_y.numpy()
    # classes = np.arange(41)
    # raw_weights = compute_class_weight(
    #     class_weight='balanced',
    #     classes=classes,
    #     y=y_train
    # )
    # adjusted_weights = raw_weights ** 0.3  # 平滑惩罚力度
    # weights = torch.tensor(adjusted_weights, dtype=torch.float).to(device)
    # del full_X, full_y
    # gc.collect()

    # kf = KFold(n_splits=k, shuffle=True, random_state=seed)

    # fold_accuracies = []
    
    # for fold, (train_idx, val_idx) in enumerate(kf.split(full_dataset), 1):
	# 	# ── Prepare subset loaders ──
    #     train_sub = Subset(full_dataset, train_idx)
    #     val_sub   = Subset(full_dataset, val_idx)
    #     train_loader = DataLoader(train_sub, batch_size=batch_size, shuffle=True)
    #     val_loader   = DataLoader(val_sub,   batch_size=batch_size, shuffle=True)
        
		
    #     # model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
	# 	# — Option B: RNN sequence labeling —
    #     feat_dim = input_dim // concat_nframes
	# 	# model = RNNClassifier(
	# 	#     feat_dim, hidden_dim=128, num_layers=2,
	# 	#     num_classes=41, bidirectional=True, dropout=0.3
	# 	# ).to(device)
    #     model = ConvLSTMClassifier(
    #         input_dim=feat_dim,
    #         hidden_dim=hidden_dim,
    #         num_layers=hidden_layers,
    #         num_classes=41,
    #         conv_channels=config['conv_channels'],
    #         bidirectional=True,
    #         dropout=config['dropout']
    #     ).to(device)

	# 	# ── TensorBoard writer per fold ──
    #     writer = SummaryWriter(log_dir=f"runs/exp_kfold/fold{fold}")

    #     print(f"\n=== Starting Fold {fold}/{k} ===")
    #     best_val = train_one_fold(fold, model, train_loader, val_loader,
	# 						epochs=num_epoch, lr=learning_rate, device=device, ckpt_path="./model", writer=writer,weights=weights)
    #     writer.close()

    #     print(f"Fold {fold} best val acc: {best_val:.4f}\n")
    #     fold_accuracies.append(best_val)

    # avg_acc = sum(fold_accuracies) / k
    # print(f"Average CV accuracy over {k} folds: {avg_acc:.4f}")
    return avg_acc


DEVICE: cuda


## Start Train!

In [None]:
AUTO_TUNE_PARAM = False  # Whether to tune parameters automatically

if AUTO_TUNE_PARAM:
    # 使用Optuna库进行超参数搜索
    n_trials = 5  # 设置试验数量
    print(f'n_trials: {n_trials}')
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    optuna.visualization.plot_param_importances(study).show()
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_slice(study).show()

    # 输出最优的超参数组合和性能指标
    print('Best hyperparameters: {}'.format(study.best_params))
    print('Best performance: {:.4f}'.format(study.best_value))
else:
    objective(None)

hyper-parameter:
        lr: 0.005,
        concat_nframes: 61
[Dataset] train - samples: 2743


100%|██████████| 2743/2743 [00:13<00:00, 206.11it/s]


[INFO] train set loaded.
[Dataset] val - samples: 686


100%|██████████| 686/686 [00:08<00:00, 78.65it/s] 


[INFO] val set loaded.
Model summary:
input_norm.weight 2379
input_norm.bias 2379
lstm.weight_ih_l0 2436096
lstm.weight_hh_l0 262144
lstm.bias_ih_l0 1024
lstm.bias_hh_l0 1024
lstm.weight_ih_l0_reverse 2436096
lstm.weight_hh_l0_reverse 262144
lstm.bias_ih_l0_reverse 1024
lstm.bias_hh_l0_reverse 1024
lstm.weight_ih_l1 524288
lstm.weight_hh_l1 262144
lstm.bias_ih_l1 1024
lstm.bias_hh_l1 1024
lstm.weight_ih_l1_reverse 524288
lstm.weight_hh_l1_reverse 262144
lstm.bias_ih_l1_reverse 1024
lstm.bias_hh_l1_reverse 1024
lstm.weight_ih_l2 524288
lstm.weight_hh_l2 262144
lstm.bias_ih_l2 1024
lstm.bias_hh_l2 1024
lstm.weight_ih_l2_reverse 524288
lstm.weight_hh_l2_reverse 262144
lstm.bias_ih_l2_reverse 1024
lstm.bias_hh_l2_reverse 1024
lstm.weight_ih_l3 524288
lstm.weight_hh_l3 262144
lstm.bias_ih_l3 1024
lstm.bias_hh_l3 1024
lstm.weight_ih_l3_reverse 524288
lstm.weight_hh_l3_reverse 262144
lstm.bias_ih_l3_reverse 1024
lstm.bias_hh_l3_reverse 1024
output_norm.weight 512
output_norm.bias 512
output_p

Train Epoch 1: 100%|██████████| 172/172 [02:06<00:00,  1.36it/s, acc=0.396, loss=1.77]
Val Epoch 1: 100%|██████████| 43/43 [00:36<00:00,  1.17it/s, acc=0.534, loss=1.82]



=== Epoch 2 ===


Train Epoch 2: 100%|██████████| 172/172 [01:41<00:00,  1.70it/s, acc=0.605, loss=1.18]
Val Epoch 2: 100%|██████████| 43/43 [00:11<00:00,  3.91it/s, acc=0.656, loss=1.21]



=== Epoch 3 ===


Train Epoch 3: 100%|██████████| 172/172 [01:37<00:00,  1.77it/s, acc=0.663, loss=1.09]
Val Epoch 3: 100%|██████████| 43/43 [00:26<00:00,  1.62it/s, acc=0.694, loss=1.11] 



=== Epoch 4 ===


Train Epoch 4: 100%|██████████| 172/172 [01:46<00:00,  1.62it/s, acc=0.692, loss=0.932]
Val Epoch 4: 100%|██████████| 43/43 [00:18<00:00,  2.27it/s, acc=0.721, loss=0.969]



=== Epoch 5 ===


Train Epoch 5: 100%|██████████| 172/172 [02:51<00:00,  1.00it/s, acc=0.708, loss=0.856]
Val Epoch 5: 100%|██████████| 43/43 [00:20<00:00,  2.13it/s, acc=0.736, loss=1.02] 



=== Epoch 6 ===


Train Epoch 6: 100%|██████████| 172/172 [02:50<00:00,  1.01it/s, acc=0.719, loss=1.04] 
Val Epoch 6: 100%|██████████| 43/43 [00:15<00:00,  2.69it/s, acc=0.739, loss=0.911]



=== Epoch 7 ===


Train Epoch 7: 100%|██████████| 172/172 [02:49<00:00,  1.01it/s, acc=0.724, loss=0.822]
Val Epoch 7: 100%|██████████| 43/43 [00:21<00:00,  1.98it/s, acc=0.738, loss=0.902]



=== Epoch 8 ===


Train Epoch 8: 100%|██████████| 172/172 [02:57<00:00,  1.03s/it, acc=0.731, loss=0.807]
Val Epoch 8: 100%|██████████| 43/43 [00:19<00:00,  2.19it/s, acc=0.753, loss=0.888]



=== Epoch 9 ===


Train Epoch 9: 100%|██████████| 172/172 [02:44<00:00,  1.05it/s, acc=0.74, loss=0.933] 
Val Epoch 9: 100%|██████████| 43/43 [00:21<00:00,  1.99it/s, acc=0.773, loss=0.779]



=== Epoch 10 ===


Train Epoch 10: 100%|██████████| 172/172 [02:43<00:00,  1.05it/s, acc=0.743, loss=1.24] 
Val Epoch 10: 100%|██████████| 43/43 [00:17<00:00,  2.40it/s, acc=0.762, loss=0.8]  



=== Epoch 11 ===


Train Epoch 11: 100%|██████████| 172/172 [03:08<00:00,  1.09s/it, acc=0.747, loss=0.816]
Val Epoch 11: 100%|██████████| 43/43 [00:27<00:00,  1.59it/s, acc=0.767, loss=0.846]



=== Epoch 12 ===


Train Epoch 12: 100%|██████████| 172/172 [03:07<00:00,  1.09s/it, acc=0.751, loss=0.881]
Val Epoch 12: 100%|██████████| 43/43 [00:24<00:00,  1.75it/s, acc=0.773, loss=0.762]



=== Epoch 13 ===


Train Epoch 13: 100%|██████████| 172/172 [03:14<00:00,  1.13s/it, acc=0.753, loss=1.09] 
Val Epoch 13: 100%|██████████| 43/43 [00:26<00:00,  1.63it/s, acc=0.773, loss=0.786]



=== Epoch 14 ===


Train Epoch 14: 100%|██████████| 172/172 [03:10<00:00,  1.11s/it, acc=0.758, loss=1.09] 
Val Epoch 14: 100%|██████████| 43/43 [00:20<00:00,  2.10it/s, acc=0.779, loss=0.754]



=== Epoch 15 ===


Train Epoch 15: 100%|██████████| 172/172 [03:20<00:00,  1.17s/it, acc=0.759, loss=0.953]
Val Epoch 15: 100%|██████████| 43/43 [00:26<00:00,  1.65it/s, acc=0.774, loss=0.785]



=== Epoch 16 ===


Train Epoch 16: 100%|██████████| 172/172 [03:22<00:00,  1.18s/it, acc=0.762, loss=0.926]
Val Epoch 16: 100%|██████████| 43/43 [00:24<00:00,  1.76it/s, acc=0.782, loss=0.738]



=== Epoch 17 ===


Train Epoch 17: 100%|██████████| 172/172 [03:17<00:00,  1.15s/it, acc=0.765, loss=0.937]
Val Epoch 17: 100%|██████████| 43/43 [00:24<00:00,  1.79it/s, acc=0.792, loss=0.749]



=== Epoch 18 ===


Train Epoch 18: 100%|██████████| 172/172 [03:27<00:00,  1.21s/it, acc=0.766, loss=0.742]
Val Epoch 18: 100%|██████████| 43/43 [00:23<00:00,  1.83it/s, acc=0.789, loss=0.746]



=== Epoch 19 ===


Train Epoch 19: 100%|██████████| 172/172 [03:11<00:00,  1.11s/it, acc=0.767, loss=0.801]
Val Epoch 19: 100%|██████████| 43/43 [00:20<00:00,  2.07it/s, acc=0.793, loss=0.68] 



=== Epoch 20 ===


Train Epoch 20: 100%|██████████| 172/172 [03:22<00:00,  1.18s/it, acc=0.767, loss=0.792]
Val Epoch 20: 100%|██████████| 43/43 [00:22<00:00,  1.94it/s, acc=0.789, loss=0.753]



=== Epoch 21 ===


Train Epoch 21: 100%|██████████| 172/172 [04:08<00:00,  1.44s/it, acc=0.769, loss=0.793]
Val Epoch 21: 100%|██████████| 43/43 [00:41<00:00,  1.03it/s, acc=0.795, loss=0.697]



=== Epoch 22 ===


Train Epoch 22: 100%|██████████| 172/172 [03:06<00:00,  1.09s/it, acc=0.769, loss=0.764]
Val Epoch 22: 100%|██████████| 43/43 [00:22<00:00,  1.93it/s, acc=0.791, loss=0.62] 



=== Epoch 23 ===


Train Epoch 23: 100%|██████████| 172/172 [03:03<00:00,  1.06s/it, acc=0.772, loss=0.664]
Val Epoch 23: 100%|██████████| 43/43 [00:26<00:00,  1.61it/s, acc=0.786, loss=0.67] 



=== Epoch 24 ===


Train Epoch 24: 100%|██████████| 172/172 [03:06<00:00,  1.08s/it, acc=0.772, loss=0.689]
Val Epoch 24: 100%|██████████| 43/43 [00:16<00:00,  2.61it/s, acc=0.794, loss=0.685]



=== Epoch 25 ===


Train Epoch 25: 100%|██████████| 172/172 [02:48<00:00,  1.02it/s, acc=0.772, loss=1.11] 
Val Epoch 25: 100%|██████████| 43/43 [00:20<00:00,  2.15it/s, acc=0.792, loss=0.702]



=== Epoch 26 ===


Train Epoch 26: 100%|██████████| 172/172 [03:09<00:00,  1.10s/it, acc=0.775, loss=0.711]
Val Epoch 26: 100%|██████████| 43/43 [00:25<00:00,  1.66it/s, acc=0.788, loss=0.723]



=== Epoch 27 ===


Train Epoch 27: 100%|██████████| 172/172 [03:27<00:00,  1.21s/it, acc=0.777, loss=0.723]
Val Epoch 27: 100%|██████████| 43/43 [00:27<00:00,  1.59it/s, acc=0.797, loss=0.697]



=== Epoch 28 ===


Train Epoch 28: 100%|██████████| 172/172 [03:05<00:00,  1.08s/it, acc=0.776, loss=0.864]
Val Epoch 28: 100%|██████████| 43/43 [00:27<00:00,  1.55it/s, acc=0.795, loss=0.719]



=== Epoch 29 ===


Train Epoch 29: 100%|██████████| 172/172 [03:08<00:00,  1.09s/it, acc=0.772, loss=0.938]
Val Epoch 29: 100%|██████████| 43/43 [00:18<00:00,  2.37it/s, acc=0.791, loss=0.657]



=== Epoch 30 ===


Train Epoch 30: 100%|██████████| 172/172 [03:03<00:00,  1.07s/it, acc=0.776, loss=0.819]
Val Epoch 30: 100%|██████████| 43/43 [00:20<00:00,  2.05it/s, acc=0.799, loss=0.661]



=== Epoch 31 ===


Train Epoch 31: 100%|██████████| 172/172 [03:12<00:00,  1.12s/it, acc=0.777, loss=0.783]
Val Epoch 31: 100%|██████████| 43/43 [00:30<00:00,  1.41it/s, acc=0.8, loss=0.659]  



=== Epoch 32 ===


Train Epoch 33: 100%|██████████| 172/172 [03:20<00:00,  1.17s/it, acc=0.777, loss=0.857]
Val Epoch 33: 100%|██████████| 43/43 [00:29<00:00,  1.47it/s, acc=0.806, loss=0.634]



=== Epoch 34 ===


Train Epoch 34: 100%|██████████| 172/172 [03:18<00:00,  1.15s/it, acc=0.779, loss=0.847]
Val Epoch 34: 100%|██████████| 43/43 [00:26<00:00,  1.60it/s, acc=0.805, loss=0.609]



=== Epoch 35 ===


Train Epoch 35: 100%|██████████| 172/172 [03:14<00:00,  1.13s/it, acc=0.78, loss=0.674] 
Val Epoch 35: 100%|██████████| 43/43 [00:38<00:00,  1.12it/s, acc=0.802, loss=0.618]



=== Epoch 36 ===


Train Epoch 36: 100%|██████████| 172/172 [05:27<00:00,  1.90s/it, acc=0.779, loss=0.783]
Val Epoch 36: 100%|██████████| 43/43 [00:40<00:00,  1.05it/s, acc=0.802, loss=0.684]



=== Epoch 37 ===


Train Epoch 37: 100%|██████████| 172/172 [03:17<00:00,  1.15s/it, acc=0.78, loss=0.9]   
Val Epoch 37: 100%|██████████| 43/43 [00:28<00:00,  1.51it/s, acc=0.807, loss=0.643]



=== Epoch 38 ===


Train Epoch 38: 100%|██████████| 172/172 [03:08<00:00,  1.10s/it, acc=0.78, loss=0.807] 
Val Epoch 38: 100%|██████████| 43/43 [00:21<00:00,  2.02it/s, acc=0.804, loss=0.626]



=== Epoch 39 ===


Train Epoch 39: 100%|██████████| 172/172 [03:31<00:00,  1.23s/it, acc=0.78, loss=0.839] 
Val Epoch 39: 100%|██████████| 43/43 [00:29<00:00,  1.46it/s, acc=0.802, loss=0.614]



=== Epoch 40 ===


Train Epoch 40: 100%|██████████| 172/172 [03:30<00:00,  1.22s/it, acc=0.794, loss=0.792]
Val Epoch 40: 100%|██████████| 43/43 [00:28<00:00,  1.51it/s, acc=0.818, loss=0.583]



=== Epoch 41 ===


Train Epoch 41: 100%|██████████| 172/172 [03:01<00:00,  1.05s/it, acc=0.797, loss=0.666]
Val Epoch 41: 100%|██████████| 43/43 [00:26<00:00,  1.63it/s, acc=0.819, loss=0.607]



=== Epoch 42 ===


Train Epoch 42: 100%|██████████| 172/172 [03:22<00:00,  1.18s/it, acc=0.797, loss=0.841]
Val Epoch 42: 100%|██████████| 43/43 [00:29<00:00,  1.44it/s, acc=0.805, loss=0.605]



=== Epoch 43 ===


Train Epoch 43:  52%|█████▏    | 89/172 [01:25<01:22,  1.01it/s, acc=0.799, loss=0.603]

# Validation


In [9]:
from sklearn.metrics import classification_report

config['concat_nframes'] = 61
input_dim = 39 * config['concat_nframes'] # the input dim of the model, you should not change the value

# preprocess data
train_X, train_y = preprocess_data(split='train', feat_dir='./mlhw2/libriphone/feat', phone_path='./mlhw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)
val_X, val_y = preprocess_data(split='val', feat_dir='./mlhw2/libriphone/feat', phone_path='./mlhw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio)

# get dataset
train_set = LibriSequenceDataset(train_X, train_y)
val_set = LibriSequenceDataset(val_X, val_y)

# remove raw feature to save memory
del train_X, train_y, val_X, val_y
gc.collect()

# get dataloader
# train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    val_set,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

# load model
feat_dim = input_dim // concat_nframes
model = RNNClassifier(
    input_dim, hidden_dim=hidden_dim, num_layers=hidden_layers,
    num_classes=41, bidirectional=True, dropout=config['dropout']
).to(device)
# model = ConvLSTMClassifier(
#     input_dim=feat_dim,
#     hidden_dim=hidden_dim,
#     num_layers=hidden_layers,
#     num_classes=41,
#     conv_channels=config['conv_channels'],
#     bidirectional=True,
#     dropout=config['dropout']
# ).to(device)
model.load_state_dict(torch.load(config['save_path']))

all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for feats, labels, lengths in tqdm(val_loader):  # 或 test_loader
        feat_dim = feats.size(1) // concat_nframes
        feats = feats.to(device)
        labels = labels.to(device)
        lengths = lengths.to('cpu')  # 注意 pack_padded_sequence 要用 CPU 上的长度列表
        out = model(feats, lengths)

        # outputs = model(feats)
        pred = out.argmax(dim=-1)            # (B, T)
        mask = (labels != -100)              # padding mask
        correct = ((pred == labels) & mask).sum().item()
        all_preds.extend(pred[mask].cpu().numpy())
        all_labels.extend(labels[mask].cpu().numpy())

# 打印分类报告
print(classification_report(all_labels, all_preds, digits=3))

[Dataset] train - samples: 2743


100%|██████████| 2743/2743 [00:19<00:00, 138.31it/s]


[INFO] train set loaded.
[Dataset] val - samples: 686


100%|██████████| 686/686 [00:06<00:00, 114.25it/s]


[INFO] val set loaded.


100%|██████████| 43/43 [00:08<00:00,  4.93it/s]


              precision    recall  f1-score   support

           0      0.988     0.967     0.977     72727
           1      0.925     0.720     0.810      2280
           2      0.955     0.958     0.957     22769
           3      0.929     0.940     0.935      5593
           4      0.920     0.911     0.916     19649
           5      0.926     0.909     0.917     19548
           6      0.942     0.951     0.946     10408
           7      0.909     0.929     0.919      2490
           8      0.934     0.942     0.938     10598
           9      0.938     0.945     0.942      6904
          10      0.949     0.969     0.959      9408
          11      0.932     0.968     0.950      4295
          12      0.935     0.943     0.939      6653
          13      0.914     0.955     0.934      4048
          14      0.964     0.967     0.966      8327
          15      0.894     0.942     0.917      4788
          16      0.912     0.962     0.937      2869
          17      0.966    

# Testing
Create a testing dataset, and load model from the saved checkpoint.

In [12]:
config['concat_nframes'] = 61
input_dim = 39 * config['concat_nframes'] # the input dim of the model, you should not change the value

# load model
feat_dim = input_dim // concat_nframes
# model = RNNClassifier(
#     feat_dim, hidden_dim=hidden_dim, num_layers=hidden_layers,
#     num_classes=41, bidirectional=True, dropout=config['dropout']
# ).to(device)

# model = ConvLSTMClassifier(
#     input_dim=feat_dim,
#     hidden_dim=hidden_dim,
#     num_layers=hidden_layers,
#     num_classes=41,
#     conv_channels=config['conv_channels'],
#     bidirectional=True,
#     dropout=config['dropout']
# ).to(device)
model = RNNClassifier(
    input_dim, hidden_dim=hidden_dim, num_layers=hidden_layers,
    num_classes=41, bidirectional=True, dropout=config['dropout']
).to(device)

model.load_state_dict(torch.load(config['save_path']))

from torch.nn.utils.rnn import pad_sequence

def collate_fn_test(batch):
    # batch: list of (x, length)
    xs, lengths = zip(*batch)
    xs_pad = pad_sequence(xs, batch_first=True)
    return xs_pad, torch.tensor(lengths)

# load data
test_X = preprocess_data(split='test', feat_dir='./mlhw2/libriphone/feat', phone_path='./mlhw2/libriphone', concat_nframes=config['concat_nframes'])
test_set = LibriSequenceDataset(test_X, None)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_test)

[Dataset] test - samples: 857


100%|██████████| 857/857 [00:04<00:00, 173.92it/s]

[INFO] test set loaded.





Make prediction.

In [None]:
import numpy as np
print(np.__version__)

In [13]:
pred = np.array([], dtype=np.int32)

model.eval()
with torch.no_grad():
    for feats, lengths in tqdm(test_loader):
        # features = batch
        # features = features.to(device)
        # feat_dim = feats.size(1) // concat_nframes
        # feats = feats.view(-1, concat_nframes, feat_dim).to(device)
        feats = feats.to(device)
        lengths = lengths.to('cpu')  # 注意 pack_padded_sequence 要用 CPU 上的长度列表
        out = model(feats, lengths)
        pred_batch = out.argmax(dim=-1)      # (B, T)
        
        for i in range(pred_batch.size(0)):
            valid_len = lengths[i]
            pred = np.concatenate((pred,pred_batch[i, :valid_len].cpu().numpy()),axis=0)


100%|██████████| 54/54 [00:10<00:00,  5.14it/s]


Write prediction to a CSV file.

After finish running this block, download the file `prediction.csv` from the files section on the left-hand side and submit it to Kaggle.

In [14]:
with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))