# **Homework 2: Phoneme Classification**


Objectives:
* Solve a classification problem with deep neural networks (DNNs).
* Understand recurrent neural networks (RNNs).

If you have any questions, please contact the TAs via TA hours, NTU COOL, or by email at mlta-2023-spring@googlegroups.com

# Import Library

In [223]:
import os
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import random
import numpy as np
from tqdm import tqdm
import gc
import datetime
import math

# Environment Confirmation

In [224]:
# Print the GPU / driver / CUDA status table to confirm a GPU is visible to this runtime.
!nvidia-smi

Fri Nov 17 15:25:44 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.13                 Driver Version: 537.13       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   42C    P5               5W / 120W |   1694MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [225]:
def KaggleColabLocal():
    """Detect which notebook environment the code is running in.

    The checks run in a fixed order (Kaggle, then Colab, then local Jupyter)
    and stop at the first match, so at most one flag is set.

    Returns:
        dict: the keys ``inKaggle``, ``inColab`` and ``inLocal`` mapped to
        booleans. All three are ``False`` when none of the checks match; in
        that case a notice is printed as well.
    """
    env = dict(
        inKaggle=False,
        inColab=False,
        inLocal=False
    )

    # `get_ipython` only exists when IPython/Jupyter injected it. Looking it
    # up defensively keeps this function usable from a plain Python
    # interpreter instead of failing with a NameError.
    try:
        shell_repr = str(get_ipython())
    except NameError:
        shell_repr = ''

    # Kaggle: only a run type of exactly 'Interactive' is recognised here.
    # NOTE(review): other Kaggle run types fall through to the checks below —
    # confirm that this is intended.
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE') == 'Interactive':
        env['inKaggle'] = True
    # Colab: the shell object's string form mentions the google.colab package.
    elif 'google.colab' in shell_repr:
        env['inColab'] = True
    # Local Jupyter: identified by the JPY_PARENT_PID environment variable.
    elif 'JPY_PARENT_PID' in os.environ:
        env['inLocal'] = True
    else:
        print("在其他环境中运行")
    return env


# Probe the environment once; the call prints a notice when the environment
# is not recognised (the returned flags are not used in this cell).
KaggleColabLocal()

# Drop datasets / loaders left over from an earlier run of the notebook so a
# re-run does not keep the old copies alive.
try:
    del train_set, val_set
    del train_loader, val_loader
except NameError:
    # `del` on an undefined name raises NameError: nothing (more) to free.
    # Only that case is handled; any other error is allowed to surface.
    print("free memory: param already deleted.")
gc.collect()

free memory: param already deleted.


0

# Download Data
Download data from google drive, then unzip it.

You should have
- `libriphone/train_split.txt`: training metadata
- `libriphone/train_labels.txt`: training labels
- `libriphone/test_split.txt`: testing metadata
- `libriphone/feat/train/*.pt`: training feature
- `libriphone/feat/test/*.pt`:  testing feature

after running the following block.

> **Notes: if the google drive link is dead, you can download the data directly from [Kaggle](https://www.kaggle.com/c/ml2023spring-hw2/data) and upload it to the workspace.**


In [226]:
Current_Env = KaggleColabLocal() # 获取当前环境

# 如果文件已经下载，那不用重新下载文件
commonPath = './libriphone'
if Current_Env['inKaggle'] == True:
    commonPath = '/kaggle/working/libriphone'

filePath = commonPath + '/feat/train/103-1240-0015.pt';

if os.path.exists(filePath) == False:
    if Current_Env['inKaggle'] or Current_Env['inColab']:
        !pip install --upgrade gdown
        # Main link
        # !gdown --id '1N1eVIDe9hKM5uiNRGmifBlwSDGiVXPJe' --output libriphone.zip
        !gdown --id '1qzCRnywKh30mTbWUEjXuNT2isOCAPdO1' --output libriphone.zip

        !unzip -q libriphone.zip
        !ls libriphone
    elif Current_Env['inLocal']:
        raise Exception('本地环境中文件不存在，需要重新下载，地址：https://www.kaggle.com/c/ml2023spring-hw2/data')
    else:
        raise Exception('获取文件失败，无法判断当前运行环境，也无法找到文件路径，需要重新下载，地址：https://www.kaggle.com/c/ml2023spring-hw2/data')
else:
    print('Data File already exist.Skip!')

Data File already exist.Skip!


# Playground

In [227]:
# a = torch.load('./libriphone/feat/train/103-1240-0015.pt')
# print(len(a))
# print(len(a[0]))
# print(a)

# Some Utility Functions
**Fixes random number generator seeds for reproducibility.**

In [228]:
def same_seeds(seed):
    """Seed every random number generator used here and pin cuDNN settings.

    Seeds Python's ``random``, NumPy and PyTorch (plus the CUDA generators
    when a GPU is available), then turns cuDNN benchmarking off and its
    deterministic mode on.

    Args:
        seed: the value handed to each seeding function.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if torch.cuda.is_available():
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

**Helper functions to pre-process the training data from raw MFCC features of each utterance.**

A phoneme may span several frames and depends on past and future frames. \
Hence we concatenate neighboring frames for training to achieve higher accuracy. The **concat_feat** function concatenates the past and future k frames (2k+1 = n frames in total), and we predict the label of the center frame.

Feel free to modify the data preprocessing functions, but **do not drop any frame** (if you modify the functions, remember to check that the number of frames is the same as mentioned in the slides).

In [229]:
def load_feat(path):
    """Return the object stored at ``path``, deserialized with ``torch.load``."""
    return torch.load(path)


def shift(x, n):
    """Shift the rows of ``x`` by ``n`` positions, padding with the edge row.

    Row ``i`` of the result is row ``i + n`` of ``x``. Indices that fall off
    either end are clamped, so the first row (``n < 0``) or the last row
    (``n > 0``) is repeated as padding. The result always has as many rows
    as ``x``, including when ``abs(n)`` is larger than the number of rows.

    Args:
        x: tensor whose first dimension is the one being shifted.
        n: signed shift; positive values pull later rows forward.

    Returns:
        ``x`` itself when ``n == 0``; otherwise a new tensor with the same
        shape as ``x``.
    """
    if n == 0:
        return x

    num_rows = x.size(0)
    # Source row for every output row; clamping replicates the edge rows and
    # keeps the row count fixed even for shifts longer than the sequence.
    src = torch.arange(num_rows, device=x.device) + n
    src = src.clamp(min=0, max=num_rows - 1)
    return x[src]


def concat_feat(x, concat_n):
    """Concatenate each frame with its neighbouring frames.

    For every frame ``i`` the output row holds frames
    ``i - k, ..., i, ..., i + k`` (``k = concat_n // 2``) side by side.
    Neighbours that would fall before the first or after the last frame are
    replaced by the first / last frame, so no frame is dropped. Utterances
    shorter than ``k`` frames are handled the same way.

    Args:
        x: 2-D tensor of shape ``(seq_len, feature_dim)``.
        concat_n: total number of frames per output row; must be odd.

    Returns:
        ``x`` itself when ``concat_n < 2``; otherwise a new tensor of shape
        ``(seq_len, concat_n * feature_dim)``. ``x`` is never modified.

    Raises:
        AssertionError: if ``concat_n`` is even.
    """
    assert concat_n % 2 == 1  # n must be odd
    if concat_n < 2:
        return x

    seq_len, feature_dim = x.size(0), x.size(1)
    mid = concat_n // 2

    # src[i, j] is the index of the frame placed in slot j of output row i.
    # Clamping to the valid range replicates the first / last frame at the
    # utterance boundaries.
    offsets = torch.arange(-mid, mid + 1, device=x.device)
    frames = torch.arange(seq_len, device=x.device)
    src = (frames.unsqueeze(1) + offsets.unsqueeze(0)).clamp(min=0, max=seq_len - 1)

    # x[src] has shape (seq_len, concat_n, feature_dim); flatten the last two dims.
    return x[src].reshape(seq_len, concat_n * feature_dim)


def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, random_seed=1213):
    """Load one data split into memory as frame-level tensors.

    Every utterance listed for the split is loaded with ``load_feat``,
    expanded with ``concat_feat`` and copied into one preallocated buffer.

    Args:
        split: ``'train'``, ``'val'`` or ``'test'``. ``'train'`` and ``'val'``
            both read the training files and differ only in which part of
            the shuffled utterance list they keep.
        feat_dir: directory holding the ``train/`` and ``test/``
            sub-directories of ``<utterance>.pt`` feature files.
        phone_path: directory holding ``train_labels.txt``,
            ``train_split.txt`` and ``test_split.txt``.
        concat_nframes: number of frames concatenated per sample (passed to
            ``concat_feat``); the buffer width is ``39 * concat_nframes``.
        train_ratio: fraction of the shuffled training utterances kept for
            ``'train'``; the remainder is used for ``'val'``.
        random_seed: seed for the train/val shuffle. Note that this reseeds
            Python's global ``random`` module.

    Returns:
        ``(X, y)`` for ``'train'`` / ``'val'`` and ``X`` alone for
        ``'test'``, where ``X`` holds one row per frame and ``y`` the
        matching labels.

    Raises:
        ValueError: if ``split`` is not one of the three accepted values.
        RuntimeError: if the split holds more frames than the buffer.
    """
    class_num = 41  # NOTE: pre-computed, should not need change

    if split == 'train' or split == 'val':
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError(
            'Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        # Each line is "<utterance id> <label> <label> ...".
        with open(os.path.join(phone_path, f'{mode}_labels.txt')) as label_file:
            for line in label_file:
                line = line.strip('\n').split(' ')
                label_dict[line[0]] = [int(p) for p in line[1:]]

        # split training and validation data
        with open(os.path.join(phone_path, 'train_split.txt')) as split_file:
            usage_list = split_file.readlines()
        # Seeding right before the shuffle makes 'train' and 'val' calls see
        # the same permutation, so the two splits never overlap.
        random.seed(random_seed)
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]

    elif mode == 'test':
        with open(os.path.join(phone_path, 'test_split.txt')) as split_file:
            usage_list = split_file.readlines()

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) +
          ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    # Fixed-size buffer filled in place; trimmed to the used part afterwards.
    max_len = 3000000
    X = torch.empty(max_len, 39 * concat_nframes)
    if mode == 'train':
        y = torch.empty(max_len, dtype=torch.long)

    idx = 0  # next free row in the buffer
    for fname in tqdm(usage_list):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        if idx + cur_len > max_len:
            # Fail with a clear message instead of a shape-mismatch error
            # from the slice assignment below.
            raise RuntimeError(
                f'Frame buffer overflow: split {split!r} holds more than '
                f'{max_len} frames; increase max_len in preprocess_data.')
        feat = concat_feat(feat, concat_nframes)
        if mode == 'train':
            label = torch.LongTensor(label_dict[fname])

        X[idx: idx + cur_len, :] = feat
        if mode == 'train':
            y[idx: idx + cur_len] = label

        idx += cur_len

    # Keep only the rows that were actually filled.
    X = X[:idx, :]
    if mode == 'train':
        y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode == 'train':
        print(y.shape)
        return X, y
    else:
        return X

# Dataset

In [230]:
class LibriDataset(Dataset):
    """Dataset over feature rows, optionally paired with labels.

    With labels, indexing yields ``(feature, label)``; without labels it
    yields the feature alone.
    """

    def __init__(self, X, y=None):
        # Features are stored as given; labels (if any) become a LongTensor.
        self.data = X
        self.label = None if y is None else torch.LongTensor(y)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.label is None:
            return sample
        return sample, self.label[idx]

    def __len__(self):
        # One item per row of the feature container.
        return len(self.data)

# Model
Feel free to modify the structure of the model.

In [232]:
class BasicBlock(nn.Module):
    """One hidden unit of the classifier: Linear -> BatchNorm1d -> ReLU -> Dropout.

    Args:
        input_dim: size of each input sample.
        output_dim: size of each output sample.
        dropout_p: probability passed to ``nn.Dropout``.
    """

    def __init__(self, input_dim, output_dim, dropout_p):
        super().__init__()

        # References:
        #   https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html (batch normalization)
        #   https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html (dropout)
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.BatchNorm1d(output_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)


class Classifier(nn.Module):
    """Feed-forward classifier: a stack of ``BasicBlock``s plus a linear head.

    The stack is one input block (``input_dim -> hidden_dim``), then
    ``hidden_layers`` blocks of ``hidden_dim -> hidden_dim``, then a final
    ``nn.Linear`` mapping to ``output_dim``.

    Args:
        input_dim: size of each input sample.
        output_dim: size of the model output.
        hidden_layers: number of ``hidden_dim -> hidden_dim`` blocks that
            follow the input block.
        hidden_dim: width of every hidden block.
        dropout_p: dropout probability handed to every block.
    """

    def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256, dropout_p=0.5):
        super().__init__()

        stack = [BasicBlock(input_dim, hidden_dim, dropout_p)]
        for _ in range(hidden_layers):
            stack.append(BasicBlock(hidden_dim, hidden_dim, dropout_p))
        stack.append(nn.Linear(hidden_dim, output_dim))

        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        return self.fc(x)

# Hyper-parameters

In [233]:
# ---- data parameters ----
# TODO: change the value of "concat_nframes" for medium baseline
# Number of frames concatenated per sample; n must be odd (total 2k+1 = n frames).
concat_nframes = 21
# Share of the data used for training; the rest is used for validation.
train_ratio = 0.75

# ---- training parameters ----
seed = 19871201              # random seed
batch_size = 512             # batch size
num_epoch = 3                # number of training epochs
learning_rate = 1e-3         # learning rate
model_path = './model.ckpt'  # where the checkpoint is saved

# ---- model parameters ----
# TODO: change the value of "hidden_layers" or "hidden_dim" for medium baseline
# Input dimension of the model; derived from concat_nframes, do not change it by hand.
input_dim = 39 * concat_nframes
hidden_layers = 2            # number of hidden layers
hidden_dim = 1750            # hidden dimension
dropout_p = 0.25             # dropout probability

# ---- environment ----
# Dataset root: absolute path on Kaggle, relative to the working directory elsewhere.
commonPath = '/kaggle/working/libriphone' if Current_Env['inKaggle'] is True else './libriphone'

# Dataloader

In [234]:
# Fix the seeds first so everything below is reproducible.
same_seeds(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {device}')

# Both splits are built from the same arguments; only `split` differs.
split_kwargs = dict(
    feat_dir=commonPath + '/feat',
    phone_path=commonPath,
    concat_nframes=concat_nframes,
    train_ratio=train_ratio,
    random_seed=seed,
)
train_X, train_y = preprocess_data(split='train', **split_kwargs)
val_X, val_y = preprocess_data(split='val', **split_kwargs)

# Wrap the tensors in datasets.
train_set = LibriDataset(train_X, train_y)
val_set = LibriDataset(val_X, val_y)

# Drop the extra names for the raw tensors and collect garbage.
del train_X, train_y, val_X, val_y
gc.collect()

# Only the training data is shuffled.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

DEVICE: cuda
[Dataset] - # phone classes: 41, number of utterances for train: 2571


2571it [00:10, 248.47it/s]


[INFO] train set
torch.Size([1580384, 819])
torch.Size([1580384])
[Dataset] - # phone classes: 41, number of utterances for val: 858


858it [00:03, 256.73it/s]


[INFO] val set
torch.Size([536410, 819])
torch.Size([536410])


# Training

## 测试默认配置下,各显卡的表现
- 本地显卡：168秒
- Kaggle: T100 239秒
- Colab: 200秒以上

结论：本地显卡>Kaggle>Colab

In [235]:
# log train start time
starttime = datetime.datetime.now()

# create model, define a loss function, and optimizer
model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers,
                   hidden_dim=hidden_dim, dropout_p=dropout_p).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Best validation result so far, kept as a count of correct frames
# (it is divided by len(val_set) only when printed).
best_acc = 0.0
for epoch in range(num_epoch):
    # Per-epoch accumulators: *_acc count correct predictions,
    # *_loss sum the per-batch losses.
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train()  # set the model to training mode
    for features, labels in tqdm(train_loader):
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # get the index of the class with the highest score
        _, train_pred = torch.max(outputs, 1)
        train_acc += (train_pred.detach() == labels.detach()).sum().item()
        train_loss += loss.item()

    # validation
    model.eval()  # set the model to evaluation mode
    with torch.no_grad():
        for features, labels in tqdm(val_loader):
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features)

            loss = criterion(outputs, labels)

            # get the index of the class with the highest score; the
            # comparison stays on `device`, only the final count is moved
            # to the host by .item()
            _, val_pred = torch.max(outputs, 1)
            val_acc += (val_pred == labels).sum().item()
            val_loss += loss.item()

    print(f'[{epoch+1:03d}/{num_epoch:03d}] Train Acc: {train_acc/len(train_set):3.5f} Loss: {train_loss/len(train_loader):3.5f} | Val Acc: {val_acc/len(val_set):3.5f} loss: {val_loss/len(val_loader):3.5f}')

    # if the model improves, save a checkpoint at this epoch
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), model_path)
        print(f'saving model with acc {best_acc/len(val_set):.5f}')

# log train cost time
print('total train start at:', starttime)
# The dropout probability is printed too; the label used to be printed
# with no value after it.
print('concat_nframes:', concat_nframes, ',num_epoch:',
      num_epoch, ',learning_rate:', learning_rate, ',Dropout:', dropout_p)
# total_seconds() covers the whole duration; timedelta.seconds alone would
# silently drop whole days from a long run.
totalseconds = int((datetime.datetime.now() - starttime).total_seconds())
print('total train cost: ', totalseconds // 60,
      'minutes', totalseconds % 60, 'seconds (total seconds:', totalseconds, 's).')

100%|██████████| 3087/3087 [00:31<00:00, 96.67it/s] 
100%|██████████| 1048/1048 [00:04<00:00, 211.35it/s]


[001/003] Train Acc: 0.62517 Loss: 1.20737 | Val Acc: 0.67080 loss: 1.04838
saving model with acc 0.67080


100%|██████████| 3087/3087 [00:31<00:00, 99.05it/s] 
100%|██████████| 1048/1048 [00:05<00:00, 199.57it/s]


[002/003] Train Acc: 0.68909 Loss: 0.97679 | Val Acc: 0.69501 loss: 0.97219
saving model with acc 0.69501


100%|██████████| 3087/3087 [00:31<00:00, 98.73it/s] 
100%|██████████| 1048/1048 [00:05<00:00, 185.02it/s]

[003/003] Train Acc: 0.71886 Loss: 0.87231 | Val Acc: 0.70744 loss: 0.93791
saving model with acc 0.70744
concat_nframes: 21 ,num_epoch: 3 ,learning_rate: 0.001 ,Dropout:
total train start at: 2023-11-17 15:26:42.056066 ,cost:  1 minutes 50 seconds (total seconds: 110 s).





In [236]:
# Training is done: remove the datasets and loaders, then collect garbage.
del train_set, val_set, train_loader, val_loader
gc.collect()

0

# Testing
Create a testing dataset, and load model from the saved checkpoint.

In [163]:
# load data
# Use the same dataset root (commonPath) as the train/val preprocessing so
# the test split is read from the correct location in every environment.
test_X = preprocess_data(split='test', feat_dir=commonPath + '/feat',
                         phone_path=commonPath, concat_nframes=concat_nframes)
# No labels for the test split: the dataset yields features only.
test_set = LibriDataset(test_X, None)
# Keep the original order so predictions line up with the frame ids.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

[Dataset] - # phone classes: 41, number of utterances for test: 857


857it [00:03, 239.62it/s]

[INFO] test set
torch.Size([527364, 819])





In [164]:
# load model
# Rebuild the network with the training hyper-parameters so the checkpoint
# keys match.
model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers,
                   hidden_dim=hidden_dim, dropout_p=dropout_p).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

Make prediction.

In [165]:
# Collect per-batch predictions and join them once at the end; growing the
# array inside the loop re-copies everything gathered so far on every batch.
# The empty int32 seed keeps the result defined (and of the original dtype)
# when the loader yields no batches.
pred_chunks = [np.array([], dtype=np.int32)]

model.eval()  # evaluation mode: dropout off, batch-norm uses running stats
with torch.no_grad():
    for features in tqdm(test_loader):
        features = features.to(device)

        outputs = model(features)

        # get the index of the class with the highest score
        _, test_pred = torch.max(outputs, 1)
        pred_chunks.append(test_pred.cpu().numpy())

pred = np.concatenate(pred_chunks, axis=0)

100%|██████████| 1031/1031 [00:05<00:00, 193.31it/s]


Write prediction to a CSV file.

After this block finishes running, download the file `prediction.csv` from the files section on the left-hand side and submit it to Kaggle.

In [None]:
# Write one "<row id>,<predicted class>" line per prediction, after a header row.
with open('prediction.csv', 'w') as csv_file:
    csv_file.write('Id,Class\n')
    csv_file.writelines(
        f'{row_id},{label}\n' for row_id, label in enumerate(pred)
    )