In [23]:
import numpy as np
import random
import os 
from pathlib import Path
import json
import math
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, random_split, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.tensorboard import SummaryWriter

#### Random Seed

In [15]:
def same_seed(config):
    """Seed every random number generator in use so that runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available) and switches cuDNN to deterministic behaviour.

    Args:
        config: Mapping that provides the integer seed under the key ``'seed'``.
    """
    seed = config['seed']

    # Identical networks only give identical results when every framework in
    # play (NumPy, Python, PyTorch) starts from the same seed.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA keeps its own generators, one per device.
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # With benchmarking enabled, cuDNN spends time searching for the fastest
    # convolution algorithm. That pays off only while the network and input
    # shapes stay fixed; when they keep changing the search is repeated every
    # time, so it is turned off here.
    torch.backends.cudnn.benchmark = False

    # Always use the default (deterministic) convolution algorithm so that,
    # given a fixed torch seed, the same input yields the same output.
    torch.backends.cudnn.deterministic = True


#### Dataset

In [16]:
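# mapping.json contains the speaker2id and id2speaker lookup tables.
# metadata.json contains two keys: "speakers" and "n_mels".
# "speakers" holds data like {'id': [{'feature_path': '*.pt', 'mel_len': 430}, ...], ...}.
# Each *.pt file stores a tensor of shape (mel_len, n_mels).
# segment_len limits how many frames are taken from a .pt file; a random segment of that length is picked every time an item is fetched.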

class myDataset(Dataset):
    """Speaker-identification dataset backed by pre-computed feature files.

    ``data_dir`` is expected to contain:

    * ``mapping.json`` with a ``"speaker2id"`` dict (speaker name -> id);
    * ``metadata.json`` with a ``"speakers"`` dict that maps each speaker name
      to a list of utterances, each carrying a ``"feature_path"`` relative to
      ``data_dir``;
    * the referenced ``*.pt`` files (per the notebook notes, tensors of shape
      ``(mel_len, n_mels)``).

    Every item is a ``(mel, speaker)`` pair: ``mel`` is a float tensor with at
    most ``segment_len`` frames and ``speaker`` is a long tensor of shape
    ``(1,)`` holding the speaker id.
    """

    def __init__(self, data_dir, segment_len=128):
        """Index every utterance listed in ``metadata.json``.

        Args:
            data_dir: Directory holding the JSON files and the feature files.
            segment_len: Maximum number of frames returned per item.
        """
        self.data_dir = data_dir
        self.segment_len = segment_len

        root = Path(data_dir)

        # Load the mapping from speaker name to the corresponding id.
        # ``with`` makes sure the file handle is closed again.
        with (root / "mapping.json").open() as mapping_file:
            mapping = json.load(mapping_file)
        self.speaker2id = mapping["speaker2id"]

        # Load the metadata of the training data.
        with (root / "metadata.json").open() as metadata_file:
            metadata = json.load(metadata_file)["speakers"]

        # Total number of speakers that have utterances in the metadata.
        self.speaker_num = len(metadata)

        # One [feature_path, speaker_id] entry per utterance.
        self.data = [
            [utterance["feature_path"], self.speaker2id[speaker]]
            for speaker, utterances in metadata.items()
            for utterance in utterances
        ]

    def __len__(self):
        """Return the number of indexed utterances."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load utterance ``idx`` and return ``(mel, speaker)``.

        If the feature file cannot be loaded, a warning is issued and the
        first sample of the dataset (its features AND its label) is returned
        instead, so one broken file does not abort a training run.

        Raises:
            Whatever ``torch.load`` raises if the fallback sample cannot be
            loaded either.
        """
        import warnings

        feature_path, speaker_id = self.data[idx]
        try:
            mel = torch.load(os.path.join(self.data_dir, feature_path))
        except Exception as err:
            # Best-effort fallback. Feature and label are taken from the SAME
            # entry so the substituted sample is never mislabeled.
            fallback_path, speaker_id = self.data[0]
            warnings.warn(
                f"Could not load '{feature_path}' ({err!r}); "
                f"using '{fallback_path}' instead."
            )
            mel = torch.load(os.path.join(self.data_dir, fallback_path))

        # Segment the mel-spectrogram into "segment_len" frames.
        if len(mel) > self.segment_len:
            # Randomly pick the starting point of the segment
            # (random.randint is inclusive on both ends).
            start = random.randint(0, len(mel) - self.segment_len)
            mel = torch.FloatTensor(mel[start:start + self.segment_len])
        else:
            # Shorter utterances are returned whole; padding happens at
            # batching time.
            mel = torch.FloatTensor(mel)

        # Turn the speaker id into long for computing the loss later.
        speaker = torch.FloatTensor([speaker_id]).long()
        return mel, speaker

    def getSpeakerNum(self):
        """Return the number of speakers found in the metadata."""
        return self.speaker_num

In [17]:
def collate_batch(batch):
    """Collate a list of ``(mel, speaker)`` samples into one padded batch.

    Args:
        batch: List with one ``(mel, speaker)`` pair per sample.

    Returns:
        ``(mel, speaker)`` where ``mel`` is the batch-first stack of all
        features, padded to the longest sample, and ``speaker`` is a long
        tensor with the speaker ids.
    """
    features, speaker_ids = zip(*batch)

    # Training runs batch by batch, so every sample in a batch has to be
    # brought to the same length. The fill value -20 stands for
    # log 10^(-20), i.e. a very small value.
    padded_features = pad_sequence(features, batch_first=True, padding_value=-20)

    # padded_features: (batch size, length, 40)
    labels = torch.FloatTensor(speaker_ids).long()
    return padded_features, labels

#### Model

In [18]:
class Classifier(nn.Module):
    """Transformer-encoder speaker classifier.

    Pipeline: linear pre-net -> one ``nn.TransformerEncoderLayer`` -> mean
    pooling over the time axis -> two-layer MLP producing speaker logits.

    Args:
        d_model: Width of the encoder (and of the hidden MLP layer).
        n_spk: Number of output classes (speakers).
        dropout: Dropout probability used inside the encoder layer.
        input_dim: Size of one input frame; defaults to 40, the value that
            was previously hard-coded.
    """

    def __init__(self, d_model=80, n_spk=600, dropout=0.1, input_dim=40):
        super().__init__()

        # Project the dimension of features from that of input into d_model.
        self.prenet = nn.Linear(input_dim, d_model)

        # `dropout` is forwarded explicitly; before, the argument was accepted
        # but ignored and the layer always used its own default.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, dim_feedforward=256, nhead=2, dropout=dropout
        )

        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spk)
        )

    def forward(self, mels):
        """Compute speaker logits.

        args:
            mels: (batch size, length, input_dim)
        return:
            out: (batch size, n_spks)
        """
        # out: (batch size, length, d_model)
        out = self.prenet(mels)
        # The encoder layer (built without batch_first) expects features in
        # the shape of (length, batch size, d_model).
        out = out.permute(1, 0, 2)
        out = self.encoder_layer(out)
        # out: (batch size, length, d_model)
        out = out.transpose(0, 1)
        # Mean pooling over the time axis.
        # NOTE(review): no padding mask is applied, so padded frames take
        # part in attention and in this average.
        stats = out.mean(dim=1)

        # out: (batch, n_spks)
        out = self.pred_layer(stats)
        return out


#### Learning Rate Schedule

In [19]:
def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1
):
    """
    Build a learning-rate schedule with linear warm-up and cosine decay.

    The multiplier applied to the optimizer's initial lr rises linearly from
    0 to 1 during the warm-up phase and afterwards follows a cosine curve
    down to 0. The lr therefore first grows and then shrinks; the curve is
    implemented as a closure handed to ``LambdaLR``, which keeps the
    scheduler flexible.

    Args:
        optimizer (:class:`torch.optim.Optimizer`):
            The optimizer whose learning rate is scheduled.
        num_warmup_steps (:obj:`int`):
            The number of steps in the warm-up phase.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        num_cycles (:obj:`float`, optional, defaults to 0.5):
            The number of waves in the cosine schedule (the default just
            decreases from the maximum to 0 along a half-cosine).
        last_epoch (:obj:`int`, optional, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            # Decay phase: fraction of the post-warm-up steps already done.
            decay_steps = max(1, num_training_steps - num_warmup_steps)
            progress = float(current_step - num_warmup_steps) / float(decay_steps)
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
            # Clamp at zero so the multiplier never turns negative.
            return max(0.0, 0.5 * (1.0 + cosine))
        # Warm-up phase: linear ramp from 0 towards 1.
        return float(current_step) / float(max(1, num_warmup_steps))

    return LambdaLR(optimizer, lr_lambda, last_epoch)

#### Trainer

In [42]:
def Trainer(model, config, train_loader, valid_loader):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model: The network to optimise; it is moved to ``config['device']``.
        config: Dict providing ``learning_rate``, ``weight_decay``,
            ``num_warmup_steps``, ``num_epochs``, ``device``, ``writerName``,
            ``last_model``, ``best_model`` and ``early_stop``.
        train_loader: Iterable of ``(features, labels)`` training batches.
        valid_loader: Iterable of ``(features, labels)`` validation batches.

    Side effects:
        * logs loss/accuracy to TensorBoard through ``SummaryWriter``;
        * saves the weights after every epoch to ``config['last_model']``;
        * saves the weights to ``config['best_model']`` whenever the mean
          validation accuracy improves;
        * stops once ``config['early_stop']`` consecutive epochs have passed
          without improvement.
    """
    def _mean(values):
        # Average of a list; NaN (instead of ZeroDivisionError) when a loader
        # produced no batches at all.
        return sum(values) / len(values) if values else float('nan')

    device = config['device']

    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    # The scheduler is stepped once per EPOCH, so warm-up and total steps are
    # both counted in epochs.
    # NOTE(review): the schedule's multiplier is 0 at step 0, so with a
    # non-zero warm-up the first epoch runs with lr == 0.
    scheduler = get_cosine_schedule_with_warmup(optimizer, config['num_warmup_steps'], config['num_epochs'])
    criterion = nn.CrossEntropyLoss()
    model.to(device)

    # Create the checkpoint directories up front so a missing folder does not
    # surface only after a whole epoch has been trained.
    for ckpt_path in (config['last_model'], config['best_model']):
        ckpt_dir = os.path.dirname(ckpt_path)
        if ckpt_dir:
            os.makedirs(ckpt_dir, exist_ok=True)

    writer = SummaryWriter()

    best_acc = 0.0
    stop_num = 0  # consecutive epochs without validation improvement

    try:
        for epoch in range(config['num_epochs']):

            # ---------------- training ----------------
            model.train()
            train_loss, train_acc = [], []
            for batch in tqdm(train_loader):
                n_mel, labels = batch
                n_mel, labels = n_mel.to(device), labels.to(device)

                # Clear gradients left over from the previous step.
                optimizer.zero_grad()

                outs = model(n_mel)
                loss = criterion(outs, labels)

                preds = outs.argmax(dim=1)
                acc = torch.mean((preds == labels).float())

                loss.backward()
                optimizer.step()

                train_acc.append(acc.item())
                train_loss.append(loss.item())

            acc0 = _mean(train_acc)
            loss0 = _mean(train_loss)
            scheduler.step()

            writer.add_scalar(config['writerName'] + 'Loss/Train', loss0, epoch)
            writer.add_scalar(config['writerName'] + 'Acc/Train', acc0, epoch)

            # ---------------- validation ----------------
            model.eval()

            valid_loss, valid_acc = [], []
            with torch.no_grad():
                for batch in tqdm(valid_loader):
                    n_mels, labels = batch
                    n_mels, labels = n_mels.to(device), labels.to(device)

                    outs = model(n_mels)
                    loss = criterion(outs, labels)
                    preds = outs.argmax(dim=1)
                    acc = (preds == labels).float().mean()

                    valid_loss.append(loss.item())
                    valid_acc.append(acc.item())

            acc1 = _mean(valid_acc)
            loss1 = _mean(valid_loss)

            writer.add_scalar(config['writerName'] + 'Loss/Valid', loss1, epoch)
            writer.add_scalar(config['writerName'] + 'Acc/Valid', acc1, epoch)

            print(f"epoch {epoch} / {config['num_epochs']}: [ train loss {loss0:.3f} | train acc {acc0:.3f} ]")
            print(f"epoch {epoch} / {config['num_epochs']}: [ valid loss {loss1:.3f} | valid acc {acc1:.3f} ]")

            # ---------------- checkpoint / early stop ----------------
            torch.save(model.state_dict(), config['last_model'])

            # Compare the epoch-level validation accuracy (not the accuracy of
            # the last batch) and remember the best value seen so far.
            if acc1 > best_acc:
                best_acc = acc1
                stop_num = 0
                print('Find a better model!!!')
                torch.save(model.state_dict(), config['best_model'])
            else:
                stop_num += 1
                if stop_num >= config['early_stop']:
                    print('Cannot impove the model~')
                    break
    finally:
        # Flush and release the TensorBoard event file even if training is
        # interrupted or fails.
        writer.close()
        

#### Start Training

In [43]:
# All tunable settings of this run live in one place.
config = {
    # Seed read by same_seed(); without it the run is not reproducible.
    'seed': 42,
    # Maximum number of frames per training sample.
    'segment_len': 128,
    # Share of the data held out for validation.
    'valid_ratio': 0.1,
    'batch_size': 128,
    'learning_rate': 1e-4,
    # NOTE(review): 'gamma' is not read by any code visible in this notebook.
    'gamma': 0.99,
    'weight_decay': 1e-3,
    'best_model': './model/best_model_1001.ckpt',
    'last_model': './model/last_model_1001.ckpt',
    'num_epochs': 700,
    # Number of consecutive non-improving epochs tolerated before stopping.
    'early_stop': 50,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    # Counted in epochs, because the scheduler is stepped once per epoch.
    'num_warmup_steps': 10,
    'writerName': 'HW4'
}

# Fix all random seeds before the data split and the model initialisation so
# that a "Restart & Run All" reproduces the same results.
same_seed(config)

In [44]:
# Build the full dataset, then hold out a random `valid_ratio` share of it for
# validation.
dataset = myDataset('./data/', segment_len=config['segment_len'])
setLen = len(dataset)
validLen = int(setLen*config['valid_ratio'])
train_set, valid_set = random_split(dataset, [setLen-validLen, validLen])

# Training: shuffle, and drop the last incomplete batch to keep batch sizes uniform.
train_loader = DataLoader(train_set, batch_size=config['batch_size'], shuffle=True, drop_last=True, pin_memory=True, collate_fn=collate_batch)

# Validation: keep EVERY sample. With drop_last=True the tail of the
# validation set was never evaluated, and a validation split smaller than one
# batch produced no batches at all.
valid_loader = DataLoader(valid_set, batch_size=config['batch_size'], shuffle=False, drop_last=False, pin_memory=True, collate_fn=collate_batch)

In [45]:
# Build the classifier with its default hyper-parameters
# (d_model=80, n_spk=600, dropout=0.1).
# NOTE(review): n_spk has to cover every speaker id produced by the dataset —
# confirm that 600 matches mapping.json.
model = Classifier()

In [None]:
# Run the training loop; weights are written to config['last_model'] after
# each epoch and to config['best_model'] by the trainer's best-model check.
Trainer(model, config, train_loader, valid_loader)