In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import json
import os
from natsort import os_sorted
import time
import copy
from torchvision import models
from tqdm import tqdm
import torchvision.transforms as transforms
from librosa.util import normalize
from pypinyin import lazy_pinyin
import re
from torch.nn.utils.rnn import pad_sequence
from torch.nn import Transformer

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class MyDataset(Dataset):
    def __init__(self, root_dir,max_seq_length,stn_length):
      self.root_dir = root_dir
      self.data_paths = []
      self.label = []
      self.load_data()
      self.max_seq_length = max_seq_length
      self.stn_length = stn_length

    def __getitem__(self, idx):
        wav, sr = librosa.load(self.data_paths[idx], sr=16000)
        mfcc = librosa.feature.mfcc(y=wav, sr=sr,n_mfcc=128, fmin=20.0, fmax=4000.0, hop_length=150,center=True)
        # delta_mfcc  = librosa.feature.delta(mfccs)
        # delta2_mfcc = librosa.feature.delta(mfccs, order=2)
        # M = np.stack([mfccs, delta_mfcc, delta2_mfcc], axis=0)
        stn = self.label[idx]
        # print(mfcc.shape)
        # padded_mfcc = torch.nn.functional.pad(mfcc, (0, self.max_seq_length - mfcc.shape[1]))
        # mfcc = mfcc.T
        if mfcc.shape[1] < self.max_seq_length:
            padded_mfcc = np.pad(mfcc,((0,0),(0, self.max_seq_length - mfcc.shape[1])))
        else:
            padded_mfcc = mfcc[:,:1000]

        tokens = ['<PAD>', '<SOS>', '<EOS>', 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㄌ', 'ㄍ', 'ㄎ', 'ㄏ', 'ㄐ', 'ㄑ', 'ㄒ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㄙ', 'ㄧ', 'ㄨ', 'ㄩ', 'ㄚ', 'ㄛ', 'ㄜ', 'ㄝ', 'ㄞ', 'ㄟ', 'ㄠ', 'ㄡ', 'ㄢ', 'ㄣ', 'ㄤ', 'ㄥ', 'ㄦ']  # List of tokens
        token_to_idx = {token: idx for idx, token in enumerate(tokens)}
        vocab_size = len(tokens)

        train_targets = []

        # Tokenize and convert sentences to token indices

        tokenized = ['<SOS>'] + [token for token in stn] + ['<EOS>']
        token_indices = [token_to_idx[token] for token in tokenized]
        # train_targets.append(token_indices)
        if len(token_indices) < self.stn_length:
            token_indices = np.pad(token_indices,(0, self.stn_length - len(token_indices)))
        
        return padded_mfcc, token_indices
    
    def __len__(self):
        return len(self.data_paths)
    
    def load_data(self):
        word_dirs = []
        for dirs in os_sorted(os.listdir(self.root_dir)):
            paths = f'{self.root_dir}/{dirs}'
            if os.path.isdir(paths):
                word_dirs.append(paths)

        for word_dir in word_dirs:
                word = word_dir.split('/')[-1]
                if word.find('clip') != -1:
                    continue
                for dirs in os.listdir(word_dir):
                    data_path = f'{self.root_dir}/{word}/{dirs}'
                    if data_path.endswith('.wav'):
                        pinyin = lazy_pinyin(word, style=10)
                        stn = []
                        tmp = ''
                        for char in pinyin:
                            label = re.sub('[_˙ˊˇˋ]', '', char)   
                            tmp += label
                        self.data_paths.append(data_path)
                        self.label.append(tmp)

In [4]:
# Define the Transformer architecture
class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, d_model, nhead, num_encoder_layers, num_decoder_layers):
        super(TransformerSeq2Seq, self).__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead),
            num_layers=num_encoder_layers
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead),
            num_layers=num_decoder_layers,
            
        )
        self.fc_out = nn.Linear(d_model, output_dim)
    
    def forward(self, src, trg):
        encoder_output = self.encoder(src)
        # print(trg.size())
        # trg_mask = nn.Transformer.generate_square_subsequent_mask(self,trg.size(-1)).to(device)
        # print(trg)
        # print(trg_mask.size())
        decoder_input = trg[:, :].unsqueeze(1)
        decoder_input = decoder_input.type(torch.float)
        
        decoder_output = self.decoder(decoder_input, encoder_output)
        output = self.fc_out(decoder_output)
        return output

In [5]:
input_dim = 128
output_dim = 37
d_model = 1000
nhead = 4
num_encoder_layers = 4
num_decoder_layers = 4
learning_rate = 0.001
batch_size = 20
num_epochs = 10
max_seq_length = 1000  # Maximum sequence length after padding
stn_length = 1000
root_dir = '/home/dmcl/yochen/VoiceData/users/msn9110/voice_data/sentence'


In [6]:
train_dataset = MyDataset(root_dir, max_seq_length, stn_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [7]:
# Define tokens and their mapping to indices
tokens = ['<PAD>', '<SOS>', '<EOS>', 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㄌ', 'ㄍ', 'ㄎ', 'ㄏ', 'ㄐ', 'ㄑ', 'ㄒ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㄙ', 'ㄧ', 'ㄨ', 'ㄩ', 'ㄚ', 'ㄛ', 'ㄜ', 'ㄝ', 'ㄞ', 'ㄟ', 'ㄠ', 'ㄡ', 'ㄢ', 'ㄣ', 'ㄤ', 'ㄥ', 'ㄦ']  # List of tokens
token_to_idx = {token: idx for idx, token in enumerate(tokens)}
vocab_size = len(tokens)

# Create model, loss function, and optimizer
model = TransformerSeq2Seq(input_dim, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
criterion = nn.CrossEntropyLoss(ignore_index=token_to_idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model = model.to(device)

In [14]:
# Training loop (same as before)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for mfcc, target_tokens in tqdm(train_loader):
        mfcc = mfcc.to(device)
        target_tokens = target_tokens.to(device)
        optimizer.zero_grad()
        output = model(mfcc, target_tokens[:, :])  # Exclude the last token from target
        print(target_tokens.size())
        output = output.view(-1, vocab_size)
        print(output.size())
        target_tokens = target_tokens[:, 1:].reshape(-1)  # Shift target by one time step
        loss = criterion(output, target_tokens)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss}")

# Save the trained model
torch.save(model.state_dict(), 'transformer_seq2seq_model.pth')

  0%|          | 0/366 [00:00<?, ?it/s]

torch.Size([20, 1000])
torch.Size([1, 800])





ValueError: Expected input batch_size (1) to match target batch_size (19980).