# 作业2：音位分类

学习目标：
* 数据预处理：从原始波形中提取MFCC特征
* 分类：使用预提取的MFCC特征执行逐帧音位分类
* 熟悉并提升 PyTorch 训练技巧，熟悉 PyTorch 的常用模块

In [1]:
# 查看GPU状态
!nvidia-smi

zsh:1: command not found: nvidia-smi


## 准备数据

以下辅助函数用于对每段音频的原始 MFCC 特征进行预处理，生成训练所需的数据。

In [2]:
import os
import random
from numpy import concat
import pandas as pd
import torch
from tqdm import tqdm

def load_feat(path):
    """Load one serialized feature object from ``path``.

    Thin wrapper around ``torch.load``; whatever was stored in the file is
    returned unchanged.
    """
    return torch.load(path)

def shift(x, n):
    """Shift the rows of ``x`` by ``n`` positions, padding with the edge row.

    With ``x`` holding the rows ``1 2 3 4 5``:

        n = -2  ->  1 1 1 2 3   (rows move down, the first row fills the top)
        n =  0  ->  1 2 3 4 5   (``x`` itself is returned, not a copy)
        n =  2  ->  3 4 5 5 5   (rows move up, the last row fills the bottom)

    Args:
        x: 2-D tensor indexed as ``(row, feature)``.
        n: Signed number of rows to shift by. A magnitude larger than the
            number of rows is clamped, so the result is then made entirely
            of copies of the edge row.

    Returns:
        A tensor with the same number of rows as ``x``.
    """
    seq_len = x.size(0)
    if n == 0 or seq_len == 0:
        # Nothing to move; an empty input also has no edge row to pad with.
        return x

    # Without clamping, |n| > seq_len would produce |n| rows instead of
    # seq_len and break callers that write the result back in place.
    n = max(-seq_len, min(n, seq_len))

    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    else:
        left = x[n:]
        right = x[-1].repeat(n, 1)

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    """Append each frame's neighbouring frames along the feature axis.

    For ``concat_n = 11`` every output row holds the 5 previous frames, the
    frame itself and the 5 following frames, in that order. Frames beyond
    either end of the sequence are replaced by the first/last frame (see
    ``shift``).

    Args:
        x: 2-D tensor of shape ``(seq_len, feature_dim)``.
        concat_n: Window size in frames; must be odd.

    Returns:
        ``x`` itself when ``concat_n < 2``, otherwise a new tensor of shape
        ``(seq_len, concat_n * feature_dim)``.
    """
    assert concat_n % 2 == 1  # the window needs a single centre frame
    if concat_n < 2:
        return x

    half = concat_n // 2
    # Column group j of the result is the sequence shifted by (j - half):
    # negative offsets look into the past, positive ones into the future.
    shifted_copies = [shift(x, offset) for offset in range(-half, half + 1)]
    return torch.cat(shifted_copies, dim=1)

def _read_lines(path):
    """Return the lines of the text file at ``path`` without trailing newlines."""
    with open(path) as f:
        return [line.strip("\n") for line in f]


def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, train_val_seed=1337):
    """Build the frame-level feature (and label) tensors for one data split.

    The utterance list is read from ``train_split.txt`` / ``test_split.txt``
    under ``phone_path``. For "train" and "val" the training list is
    shuffled with ``train_val_seed`` and cut at ``train_ratio``; the first
    part is "train", the rest is "val". Each utterance's features are loaded
    from ``<feat_dir>/<train|test>/<name>.pt``, widened with
    ``concat_feat`` and stacked along the frame axis.

    Note: ``random.seed(train_val_seed)`` reseeds Python's global RNG for
    the "train" and "val" splits.

    Args:
        split: One of "train", "val" or "test".
        feat_dir: Directory holding the ``train``/``test`` feature folders.
        phone_path: Directory holding the split and label text files.
        concat_nframes: Odd number of frames concatenated per sample.
        train_ratio: Fraction of the training utterances used for "train".
        train_val_seed: Seed for the train/val shuffle.

    Returns:
        ``(X, y)`` for "train"/"val" and ``X`` alone for "test". ``X`` has
        ``39 * concat_nframes`` columns; ``y`` is a 1-D long tensor with one
        label per row of ``X``.

    Raises:
        ValueError: If ``split`` is not one of the three supported values.
    """
    if split not in ("train", "val", "test"):
        raise ValueError("Invalid 'split' argument for dataset: PhoneDataset!")

    class_num = 41  # fixed by the provided labels; only used in the log line
    feat_dim = 39   # per-frame feature width before concatenation
    # "train" and "val" are both carved out of the training files.
    mode = "test" if split == "test" else "train"
    has_labels = mode != "test"

    label_dict = {}
    if has_labels:
        # Each label line is "<utterance> <label> <label> ...".
        for line in _read_lines(os.path.join(phone_path, "train_labels.txt")):
            fields = line.split(" ")
            label_dict[fields[0]] = [int(p) for p in fields[1:]]

        usage_list = _read_lines(os.path.join(phone_path, "train_split.txt"))
        random.seed(train_val_seed)
        random.shuffle(usage_list)
        cut = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:cut] if split == "train" else usage_list[cut:]
    else:
        usage_list = _read_lines(os.path.join(phone_path, "test_split.txt"))

    print("[Dataset] - # phone classes: " + str(class_num) + ", number of utterances for " + split + ": " + str(len(usage_list)))

    # First pass: load the raw per-utterance features so the output can be
    # allocated at its exact size instead of a fixed upper bound.
    raw_feats = [load_feat(os.path.join(feat_dir, mode, f"{fname}.pt")) for fname in usage_list]
    total_len = sum(len(feat) for feat in raw_feats)

    X = torch.empty(total_len, feat_dim * concat_nframes)
    y = torch.empty(total_len, dtype=torch.long) if has_labels else None

    # Second pass: widen every utterance and copy it into its row range.
    idx = 0
    for fname, feat in tqdm(zip(usage_list, raw_feats), total=len(usage_list)):
        cur_len = len(feat)
        X[idx: idx + cur_len, :] = concat_feat(feat, concat_nframes)
        if has_labels:
            y[idx: idx + cur_len] = torch.LongTensor(label_dict[fname])
        idx += cur_len

    print(f"[INFO] {split} set")
    print(X.shape)
    if has_labels:
        print(y.shape)
        return X, y
    return X
    



## 定义数据集

In [3]:
import torch
# 导入数据集
from torch.utils.data import Dataset
# 导入数据加载工具Dataloader
from torch.utils.data import DataLoader

class LibriDataset(Dataset):
    """Frame-level dataset over a feature container and optional labels.

    With labels (training/validation) an item is ``(features, label)``;
    without labels (testing) an item is just ``features``.
    """

    def __init__(self, X, y=None):
        self.data = X
        # Labels are stored as a LongTensor; ``None`` marks the test set.
        self.label = None if y is None else torch.LongTensor(y)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.label is None:
            return sample
        return sample, self.label[idx]

    def __len__(self):
        return len(self.data)

## 神经网络模型

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicBlock(nn.Module):
    """A fully connected layer followed by a ReLU activation."""

    def __init__(self, input_dim, output_dim):
        super().__init__()

        linear = nn.Linear(input_dim, output_dim)
        activation = nn.ReLU()
        # Kept as ``self.block`` so state-dict keys stay ``block.0.*``.
        self.block = nn.Sequential(linear, activation)

    def forward(self, x):
        return self.block(x)
    
class Classifier(nn.Module):
    """Feed-forward classifier built from ``BasicBlock`` layers.

    The stack is: one input block (``input_dim -> hidden_dim``),
    ``hidden_layers`` further blocks (``hidden_dim -> hidden_dim``) and a
    final linear layer (``hidden_dim -> output_dim``) producing raw scores.
    """

    def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256):
        super().__init__()

        # Layers are created in network order, which also fixes their
        # indices inside ``self.fc``.
        layers = [BasicBlock(input_dim, hidden_dim)]
        layers.extend(BasicBlock(hidden_dim, hidden_dim) for _ in range(hidden_layers))
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        return self.fc(x)

## 超参数定义

In [5]:
# ---- data ----
# Settings consumed by the preprocessing step.
concat_nframes = 1  # number of frames joined into one sample
train_ratio = 0.8   # share of the training utterances kept for training

# ---- training ----
seed = 0
batch_size = 512
num_epoch = 5
learning_rate = 1e-4
model_path = "./model.ckpt"  # where the checkpoint is written

# ---- model ----
input_dim = 39 * concat_nframes  # 39 features per frame, times the window size
hidden_layers = 1
hidden_dim = 256

## 准备数据与模型

In [7]:
# gc is used to trigger a collection once the raw tensors are dropped.
import gc

# Preprocess the training and validation splits.
train_X, train_y = preprocess_data(
    split="train",
    feat_dir="./libriphone/feat",
    phone_path="./libriphone",
    concat_nframes=concat_nframes,
    train_ratio=train_ratio,
)
val_X, val_y = preprocess_data(
    split="val",
    feat_dir="./libriphone/feat",
    phone_path="./libriphone",
    concat_nframes=concat_nframes,
    train_ratio=train_ratio,
)

# Wrap the tensors in datasets.
train_set = LibriDataset(train_X, train_y)
val_set = LibriDataset(val_X, val_y)

# Drop the notebook-level names of the raw tensors, then collect.
del train_X, train_y, val_X, val_y
gc.collect()

# Batch the datasets; only the training data is shuffled.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)


[Dataset] - # phone classes: 41, number of utterances for train: 3428


3428it [00:02, 1584.43it/s]


[INFO] train set
torch.Size([2116368, 39])
torch.Size([2116368])
[Dataset] - # phone classes: 41, number of utterances for val: 858


858it [00:00, 1633.54it/s]

[INFO] val set
torch.Size([527790, 39])
torch.Size([527790])





In [9]:
# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Device: {device}")

Device: cpu


In [10]:
import numpy as np

# Fix the random seeds
def same_seeds(seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import keeps this cell self-contained

    # Python's own generator was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Covers every visible GPU, including the current one.
        torch.cuda.manual_seed_all(seed)
    # Disable the cuDNN autotuner, which may pick different algorithms
    # from run to run.
    torch.backends.cudnn.benchmark = False
    # Ask cuDNN to use deterministic algorithms only.
    torch.backends.cudnn.deterministic = True

In [11]:
# Seed the RNGs before the model is built, so the same seed is in effect
# when its weights are initialised.
same_seeds(seed)

# Build the network, the loss function and the optimizer.
model = Classifier(
    input_dim=input_dim,
    output_dim=41,
    hidden_layers=hidden_layers,
    hidden_dim=hidden_dim,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


## 训练模型

In [13]:
# The model has to sit on the same device the batches are moved to below.
# nn.Module.to() modifies the module in place, so the optimizer created
# earlier keeps pointing at the same parameters.
model.to(device)

best_acc = 0.0
for epoch in range(num_epoch):
    # Running totals for this epoch: correct predictions and summed batch loss.
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # ---- training ----
    model.train()  # switch the model to training mode
    for features, labels in tqdm(train_loader):
        # Tensor.to() returns the moved tensor instead of moving it in
        # place, so the result must be kept.
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, train_pred = torch.max(outputs, dim=1)  # index of the highest-scoring class
        train_acc += (train_pred == labels).sum().item()
        train_loss += loss.item()

    # ---- validation ----
    if len(val_set) > 0:
        model.eval()
        with torch.no_grad():
            for features, labels in tqdm(val_loader):
                features = features.to(device)
                labels = labels.to(device)

                outputs = model(features)
                loss = criterion(outputs, labels)

                _, val_pred = torch.max(outputs, dim=1)
                # .item() already yields a plain Python number, so no
                # explicit .cpu() round trip is needed.
                val_acc += (val_pred == labels).sum().item()
                val_loss += loss.item()

        # Accuracy is averaged per sample, loss per batch.
        print("[{:03d}/{:03d}] Train Acc: {:3.6} Loss: {:3.6} | Val Acc: {:3.6f} loss: {:3.6f}".format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
        ))

        # Keep the checkpoint with the best validation accuracy so far.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print("saving model with acc {:3.6f}".format(best_acc/len(val_set)))
    else:
        print("[{:03d}/{:03d}] Train Acc: {:3.6} Loss: {:3.6}".format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))

        # Without a validation set the checkpoint is overwritten every
        # epoch, so the file ends up holding the last epoch's weights.
        torch.save(model.state_dict(), model_path)
        print("saving model at last epoch")



100%|██████████| 4134/4134 [00:07<00:00, 538.15it/s]
100%|██████████| 1031/1031 [00:01<00:00, 867.59it/s]


[001/005] Train Acc: 0.421902 Loss: 2.08666 | Val Acc: 0.440529
saving model with acc 0.440529


100%|██████████| 4134/4134 [00:07<00:00, 556.69it/s]
100%|██████████| 1031/1031 [00:01<00:00, 871.15it/s]


[002/005] Train Acc: 0.449017 Loss: 1.9345 | Val Acc: 0.449728
saving model with acc 0.449728


100%|██████████| 4134/4134 [00:07<00:00, 558.10it/s]
100%|██████████| 1031/1031 [00:01<00:00, 896.15it/s]


[003/005] Train Acc: 0.455059 Loss: 1.90437 | Val Acc: 0.453906
saving model with acc 0.453906


100%|██████████| 4134/4134 [00:07<00:00, 556.42it/s]
100%|██████████| 1031/1031 [00:01<00:00, 871.82it/s]


[004/005] Train Acc: 0.458414 Loss: 1.88801 | Val Acc: 0.456062
saving model with acc 0.456062


100%|██████████| 4134/4134 [00:07<00:00, 559.59it/s]
100%|██████████| 1031/1031 [00:01<00:00, 848.42it/s]

[005/005] Train Acc: 0.460793 Loss: 1.87644 | Val Acc: 0.457826
saving model with acc 0.457826





In [None]:
# The training-time loaders are no longer needed: drop their names and
# run a garbage collection.
del train_loader
del val_loader
gc.collect()

21

## 测试
创建测试数据集，并从保存的检查点加载模型。

In [7]:
# Preprocess the test split (no labels) and batch it in its original order.
test_X = preprocess_data(
    split="test",
    feat_dir="./libriphone/feat",
    phone_path="./libriphone",
    concat_nframes=concat_nframes,
)
test_set = LibriDataset(test_X, None)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

[Dataset] - # phone classes: 41, number of utterances for test: 1078


1078it [00:00, 1569.57it/s]

[INFO] test set
torch.Size([646268, 39])





In [12]:
# Rebuild the architecture and restore the weights saved during training.
model = Classifier(input_dim=input_dim, output_dim=41, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
# Read the same path training wrote to. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU
# machine also loads on a CPU-only one.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [15]:
# Per-batch predictions are collected here and joined once at the end;
# concatenating inside the loop would re-copy all earlier predictions on
# every batch.
pred_chunks = []

model.eval()
with torch.no_grad():
    for features in tqdm(test_loader):
        features = features.to(device)

        outputs = model(features)

        _, test_pred = torch.max(outputs, dim=1)  # index of the highest-scoring class
        # Move the predictions to the CPU and convert them to NumPy.
        pred_chunks.append(test_pred.cpu().numpy())

# np.concatenate rejects an empty list, so fall back to an empty array.
pred = np.concatenate(pred_chunks, axis=0) if pred_chunks else np.array([], dtype=np.int32)

100%|██████████| 1263/1263 [00:01<00:00, 1223.60it/s]


## 将预测结果写入CSV文件

In [14]:
# Write one "Id,Class" row per predicted frame. There is no space after the
# comma: in CSV a space belongs to the field, so "Id, Class" would name the
# second column " Class".
with open("prediction.csv", "w") as f:
    f.write("Id,Class\n")
    f.writelines("{},{}\n".format(frame_id, label) for frame_id, label in enumerate(pred))