In [2]:
import re
import os
import pandas as pd
import random
from tqdm import tqdm 

file_paths, file_names, emotions, audios = [], [], [], []
emotion_map = {'Neutral': 'neutral', 'Anger': 'angry', 'Happiness': 'happy', 'Sadness': 'sad', 'Fear': 'fear',
              'Disgust': 'disgust'}

main_path = '/home/rl3155/Multilingual-Speech-Emotion-Recognition-System/Spanish/Spanish_New'
DATA_NATURAL = "/home/rl3155/MESD_All"

In [3]:
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

1.12.1
0.12.1+cu113
cuda


In [4]:
# Get the entries with assigned session

dataframe_path = '/home/rl3155/Multilingual-Speech-Emotion-Recognition-System/Spanish/session_entries.csv'
file = pd.read_csv(dataframe_path)[['path', 'name', 'emotion', 'session']]
file.head()

Unnamed: 0,path,name,emotion,session
0,/home/rl3155/MESD_All/Fear_F_A_basta_ya.wav,Fear_F_A_basta_ya.wav,fear,1
1,/home/rl3155/MESD_All/Happiness_M_A_arriba.wav,Happiness_M_A_arriba.wav,happy,1
2,/home/rl3155/MESD_All/Fear_F_B_arana.wav,Fear_F_B_arana.wav,fear,1
3,/home/rl3155/MESD_All/Neutral_M_B_articulo.wav,Neutral_M_B_articulo.wav,neutral,1
4,/home/rl3155/MESD_All/Disgust_F_A_antes.wav,Disgust_F_A_antes.wav,disgust,1


In [5]:
from tqdm import tqdm

bundle = torchaudio.pipelines.WAV2VEC2_BASE
extractor = bundle.get_model()
for i in tqdm(range(len(file['path']))):
    path = file['path'][i]
    wave, sr = torchaudio.load(path)
    if sr != bundle.sample_rate:
        wave = torchaudio.functional.resample(wave, sr, bundle.sample_rate)
    with torch.inference_mode():
        feature, _ = extractor.extract_features(wave)
    feature = [f[0] for f in feature]
    audio = torch.stack(feature)
    audios.append(audio)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 862/862 [00:57<00:00, 15.01it/s]


### Load Data

In [31]:
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, audios, labels, label_transform):
        super(MyDataSet).__init__()
        self.audios = audios
        self.labels = labels
        self.label_transform = label_transform
        
    def __getitem__(self, idx):
        label = self.label_transform[self.labels[idx]]
        audio = self.audios[idx]
        return audio, label
    
    def __len__(self):
        return len(self.labels)

In [32]:
categories = ['neutral', 'angry', 'happy', 'sad', 'fear', 'disgust']
cate_dic = {}
for i, cate in enumerate(categories):
    cate_dic[cate] = i
cate_dic

{'neutral': 0, 'angry': 1, 'happy': 2, 'sad': 3, 'fear': 4, 'disgust': 5}

## Train with Model

In [33]:
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self, vocab_size, hidden_size=128, label_size=6):
        super().__init__()
        self.nn = nn.Linear(in_features = vocab_size, out_features = hidden_size)
        self.linear = nn.Linear(in_features = hidden_size, out_features = label_size)

    def forward(self, x):
        x = self.nn(x)
        logits = self.linear(x)
        return logits

### Model Traning on each layer 

In [38]:
holdout = 1
train_emotions = list(file[file['session'] != holdout]['emotion'])
train_audios = [audios[i] for i in range(len(audios)) if file['session'][i] != holdout]
test_emotions = list(file[file['session'] == holdout]['emotion'])
test_audios = [audios[i] for i in range(len(audios)) if file['session'][i] == holdout]

In [43]:
print(train_audios[0].size(), train_audios[1].size(), train_audios[2].size())

torch.Size([12, 25, 768]) torch.Size([12, 51, 768]) torch.Size([12, 28, 768])


In [41]:
from torch.utils.data import DataLoader
import torch.optim as optim

layers = 12
test_accuracies = []
for layer in range(layers):
    train = torch.stack([x[layer] for x in train_audios])
    test = torch.stack([x[layer] for x in test_audios])
    train_dataset = MyDataSet(train, train_emotions, cate_dic)
    trainloader_args = dict(batch_size=64, shuffle=True)
    train_dataloader = DataLoader(train_dataset, **trainloader_args)
    
    model = NN(768)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    epochs = 50
    
    train_losses = []
    train_accuracies = []
    valid_losses = []
    valid_accuracies = []
    for epoch in tqdm(range(epochs)):
        train_loss = 0
        acc_cnt = 0
        err_cnt = 0
        batch_cnt = 0
        model.train()
        for batch, (x, y) in enumerate(train_dataloader):
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.cpu().item()

            #model outputs
            out_val, out_indices = torch.max(logits, dim=1)
            tar_indices = y

            for i in range(len(out_indices)):
                if out_indices[i] == tar_indices[i]:
                    acc_cnt += 1
                else:
                    err_cnt += 1
            batch_cnt += 1

        train_loss = train_loss/batch_cnt
        train_accuracy = acc_cnt/(acc_cnt+err_cnt)
        train_accuracies.append(train_accuracy)
        train_losses.append(train_loss)
        
    test_dataset = MyDataSet(test, test_emotions, cate_dic)
    testloader_args = dict(batch_size=1, shuffle=True)
    test_dataloader = DataLoader(test_dataset, **testloader_args)

    test_loss = 0
    acc_cnt = 0
    err_cnt = 0
    batch_cnt = 0
    model.eval()

    for x, y in test_dataloader:

        x = x.to(device)
        y = y.to(device)

        logits = model(x)
        loss = criterion(logits, y)
        test_loss += loss.cpu().item()

        out_val, out_indices = torch.max(logits, dim=1)
        tar_indices = y

        for i in range(len(out_indices)):
            if out_indices[i] == tar_indices[i]:
                acc_cnt += 1
            else:
                err_cnt += 1
        batch_cnt += 1

    test_loss = test_loss/batch_cnt
    test_accuracy = acc_cnt/(acc_cnt+err_cnt)
    print(f'layer: {layer}, test accuracy: {test_accuracy}')
    test_accuracies.append(test_accuracy)

RuntimeError: stack expects each tensor to be equal size, but got [25, 768] at entry 0 and [51, 768] at entry 1

### Model Test

In [42]:
test_loss = 0
acc_cnt = 0
err_cnt = 0
batch_cnt = 0
model.eval()

for x, lengths, y in test_dataloader:

    x = x.to(device)
    y = y.to(device)

    logits = model(x, lengths)
    loss = criterion(logits, y)
    test_loss += loss.cpu().item()

    out_val, out_indices = torch.max(logits, dim=1)
    tar_indices = y

    for i in range(len(out_indices)):
        if out_indices[i] == tar_indices[i]:
            acc_cnt += 1
        else:
            err_cnt += 1
    batch_cnt += 1

test_loss = test_loss/batch_cnt
test_accuracy = acc_cnt/(acc_cnt+err_cnt)
print(f'test accuracy: {test_accuracy}')

test accuracy: 0.7183908045977011


In [43]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

aggr.weight tensor([[[ 0.0281],
         [ 0.3368],
         [ 0.2488],
         [-0.0838],
         [-0.0957],
         [-0.1449],
         [ 0.0569],
         [-0.1551],
         [-0.1686],
         [ 0.2004],
         [ 0.1316],
         [-0.0160]]], device='cuda:0')
aggr.bias tensor([0.0301], device='cuda:0')
embed.weight tensor([[-0.0364,  0.0066, -0.0306,  ...,  0.0352,  0.0494,  0.0268],
        [ 0.0091, -0.0249, -0.0081,  ..., -0.0044, -0.0444, -0.0414],
        [ 0.0563, -0.0091, -0.0121,  ...,  0.0061,  0.0057, -0.0120],
        ...,
        [-0.0039, -0.0668,  0.0063,  ..., -0.0048,  0.0309,  0.0290],
        [-0.0453,  0.0236, -0.0065,  ...,  0.0362,  0.0503,  0.0824],
        [ 0.0471,  0.0100, -0.0052,  ..., -0.0499, -0.0303,  0.0015]],
       device='cuda:0')
embed.bias tensor([ 1.2361e-02, -6.1831e-03, -5.7523e-02, -8.4755e-03, -2.6406e-02,
        -1.2137e-02,  2.0382e-02, -5.8890e-03,  3.3558e-02, -2.8190e-02,
        -3.6650e-02, -2.4355e-02, -3.1892e-04, -1.7508e-0

In [44]:
# model_path = main_path + f'/models/wav2vecbase/holdout_{holdout}.pth'

# torch.save({'epoch':epochs,
#             'model_state_dict':model.state_dict(),
#             'optimizer_state_dict':optimizer.state_dict()},
#             model_path)