## Create the cross-validation splits and report per-split statistics

In [6]:
import os
import random
import shutil

if __name__ == '__main__':
    # Distribute the audio files of every emotion evenly over `n_splits`
    # folders (split_0 ... split_{n_splits-1}) below `output_location`.

    n_splits = 5
    dataset_location = 'EmoDB/'
    output_location = 'MESD_Final_Splits/'
    os.makedirs(output_location, exist_ok=True)

    # The character at index 5 of each filename is looked up in this table
    # to obtain the emotion of the recording.
    emotions = {
        'W': 'angry',
        'L': 'bored',
        'E': 'disgust',
        'A': 'anxious',
        'F': 'happy',
        'T': 'sad',
        'N': 'neutral'
    }

    # Group the files by emotion
    emotion_to_audio = {}
    for audiofile in os.listdir(dataset_location):
        if audiofile == 'desktop.ini':
            continue
        emotion = emotions[audiofile[5]]  # get emotion for that file
        emotion_to_audio.setdefault(emotion, []).append(audiofile)

    # Number of files per emotion
    emotion_counter = {emotion: len(files) for emotion, files in emotion_to_audio.items()}

    # Create every split folder once, up front
    split_locations = [output_location + 'split_' + str(i) + '/' for i in range(n_splits)]
    for split_location in split_locations:
        os.makedirs(split_location, exist_ok=True)

    for emotion, files in emotion_to_audio.items():
        print('EMOTION:', emotion)

        # A single shuffle already yields a uniformly random order
        random.shuffle(files)
        split_size = len(files) // n_splits

        # Each split receives an equally sized slice of this emotion
        for i, split_location in enumerate(split_locations):
            for filename in files[i * split_size : (i + 1) * split_size]:
                shutil.copyfile(dataset_location + filename, split_location + filename)

        # Randomly place the remaining (fewer than n_splits) files; the split
        # index is drawn from n_splits rather than a hard-coded list.
        for filename in files[n_splits * split_size :]:
            selected_split = random.randrange(n_splits)
            shutil.copyfile(dataset_location + filename, split_locations[selected_split] + filename)

EMOTION: angry
EMOTION: neutral
EMOTION: bored
EMOTION: happy
EMOTION: disgust
EMOTION: anxious
EMOTION: sad


In [7]:
if __name__ == "__main__":
    # Print, for each of the five split folders, how many files it holds per
    # emotion and in total. Relies on the `emotions` table from the cell above.

    dataset_location = 'MESD_Final_Splits/'

    for split in range(5):
        print('SPLIT:', split)
        split_location = f'{dataset_location}split_{split}'

        # Tally the files of this split by emotion
        emotion_counter = {}
        for audiofile in os.listdir(split_location):
            emotion = emotions[audiofile[5]]
            emotion_counter[emotion] = emotion_counter.get(emotion, 0) + 1

        for emotion, count in emotion_counter.items():
            print('--', emotion, ':', count)

        print('TOTAL:', sum(emotion_counter.values()))

        print('-'*30)

SPLIT: 0
-- angry : 87
-- neutral : 50
-- disgust : 29
-- anxious : 47
-- bored : 55
-- happy : 46
-- sad : 43
TOTAL: 357
------------------------------
SPLIT: 1
-- angry : 81
-- neutral : 52
-- bored : 58
-- happy : 43
-- anxious : 48
-- disgust : 31
-- sad : 39
TOTAL: 352
------------------------------
SPLIT: 2
-- neutral : 50
-- bored : 52
-- happy : 44
-- angry : 85
-- disgust : 31
-- anxious : 44
-- sad : 40
TOTAL: 346
------------------------------
SPLIT: 3
-- neutral : 55
-- happy : 46
-- angry : 84
-- disgust : 29
-- bored : 50
-- sad : 42
-- anxious : 46
TOTAL: 352
------------------------------
SPLIT: 4
-- angry : 84
-- neutral : 51
-- disgust : 32
-- anxious : 49
-- happy : 46
-- sad : 41
-- bored : 51
TOTAL: 354
------------------------------


## Model

In [8]:
class Dense(nn.Module):
    """Classifier over per-layer features of a pretrained speech model.

    The layers are merged with a learned weighted sum (a bias-free 1x1
    convolution over the layer axis), passed through two pointwise
    convolutions, mean-pooled over the valid time steps of every sequence and
    projected to `label_size` logits.
    """

    def __init__(self, model_name, label_size=7):
        """
        model_name: name containing 'BASE' (12 layers, 768 features) or
                    'LARGE' (24 layers, 1024 features)
        label_size: number of output classes

        Raises ValueError if model_name contains neither 'BASE' nor 'LARGE'.
        """
        super().__init__()
        #Input Dim = Batch * num_layers * Seq_Len * feature_dim

        # Use `in` for the test: str.find returns -1 (truthy) when the
        # substring is missing and 0 (falsy) when it is found at the start.
        # NOTE(review): 'WAV2VEC2_BASE_XLSR' contains 'BASE' and therefore
        # gets the 12 x 768 configuration - confirm that this matches the
        # model that name is mapped to.
        if 'BASE' in model_name:
            num_layers = 12
            feature_dim = 768
        elif 'LARGE' in model_name:
            num_layers = 24
            feature_dim = 1024
        else:
            raise ValueError("model_name must contain 'BASE' or 'LARGE', got: " + repr(model_name))

        hidden_dim = 256

        #Weighted sum over the layers
        self.aggr = nn.Conv1d(in_channels=num_layers, out_channels=1, kernel_size=1, bias=False)

        #Input Dim = Batch * Seq_Len * feature_dim
        self.cnn  = nn.Conv1d(in_channels=feature_dim, out_channels=hidden_dim, kernel_size=1)
        self.cnn2 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=1)
        self.dropout = nn.Dropout(0.2)#not used yet

        self.linear = nn.Linear(in_features = hidden_dim, out_features = label_size)


    def forward(self, x, lengths, device):
        """
        x:       (batch, n_layers, seq_len, n_features) padded features
        lengths: number of valid time steps of every sequence in the batch
        device:  device the pooled features are placed on

        Returns the logits with shape (batch, label_size).
        """

        batch_size, n_layers, seq_len, n_features = x.shape[:4]

        #Merge the layers into one with the learned weights
        x = torch.flatten(x, start_dim=2)
        x = self.aggr(x)
        x = torch.reshape(x, (batch_size, seq_len, n_features))

        #Pass through CNN
        x = x.transpose(1,2) #now dimension is batch * n_features * seq_len
        x = F.relu(self.cnn(x))
        x = F.relu(self.cnn2(x))
        x = x.transpose(1,2) #now dimension is batch * seq_len * hidden_dim

        #Global average over the valid (non-padded) time steps of each sequence;
        #stacking once avoids re-copying the result for every batch element
        pooled = [torch.mean(x[i, :lengths[i], :], dim=0) for i in range(batch_size)]
        global_avg = torch.stack(pooled).to(device)

        logits = self.linear(global_avg)

        return logits


class ICASSP3CNN(nn.Module):
    """Layer aggregation, three parallel convolutions and an LSTM classifier.

    The layer axis is collapsed with a 1x1 convolution, every frame is
    linearly embedded, three convolutions with kernel sizes 3, 5 and 7 run in
    parallel over time, and the final hidden state of the LSTM is projected to
    `label_size` logits.
    """

    def __init__(self, vocab_size, dims = 12, embed_size=128, hidden_size=512, num_lstm_layers = 2, bidirectional = False, label_size=7):
        """
        vocab_size:      size of the last axis of the input (input width of the embedding)
        dims:            size of the layer axis that is collapsed by `aggr`
        embed_size:      width of the frame embedding and of each convolution branch
        hidden_size:     hidden size of the LSTM
        num_lstm_layers: number of stacked LSTM layers
        bidirectional:   whether the LSTM runs in both directions
        label_size:      number of output classes
        """
        super().__init__()
        self.n_layers = num_lstm_layers
        self.hidden = hidden_size
        self.bidirectional = bidirectional

        # collapses the layer axis into a single channel
        self.aggr = nn.Conv1d(in_channels=dims, out_channels=1, kernel_size=1)

        self.embed = nn.Linear(in_features=vocab_size, out_features=embed_size)

        # three branches with growing receptive field; the padding keeps the length
        self.cnn = nn.Conv1d(embed_size, embed_size, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(embed_size, embed_size, kernel_size=5, padding=2)
        self.cnn3 = nn.Conv1d(embed_size, embed_size, kernel_size=7, padding=3)

        branch_channels = 3 * embed_size
        self.batchnorm = nn.BatchNorm1d(branch_channels)

        self.lstm = nn.LSTM(input_size=branch_channels,
                            hidden_size=hidden_size,
                            num_layers=num_lstm_layers,
                            bidirectional=bidirectional)

        classifier_inputs = 2 * hidden_size if bidirectional else hidden_size
        self.linear = nn.Linear(in_features=classifier_inputs, out_features=label_size)


    def forward(self, x, lengths):
        """
        x:       (batch, dims, seq_len, vocab_size) padded features
        lengths: number of valid time steps of every sequence in the batch

        Returns the logits with shape (batch, label_size).
        """
        batch_size, seq_len, feature_dim = x.size(0), x.size(2), x.size(3)

        # collapse the layer axis, then embed every frame
        collapsed = self.aggr(torch.flatten(x, start_dim=2))
        frames = self.embed(collapsed.reshape(batch_size, seq_len, feature_dim))

        # (B,T,H) -> (B,H,T) for the convolutions
        channels_first = frames.transpose(1, 2)
        branches = [conv(channels_first) for conv in (self.cnn, self.cnn2, self.cnn3)]
        fused = F.relu(self.batchnorm(torch.cat(branches, dim=1)))

        # back to (B,T,H) and pack so the LSTM skips the padded steps
        packed = nn.utils.rnn.pack_padded_sequence(
            fused.transpose(1, 2), lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)

        if not self.bidirectional:
            return self.linear(hn[-1])

        # last layer: concatenate the final states of both directions
        per_direction = hn.view(self.n_layers, 2, batch_size, self.hidden)
        last_layer = torch.cat([per_direction[-1, 0, :], per_direction[-1, 1, :]], dim=1)
        return self.linear(last_layer)

In [9]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import torch
import torchaudio
import os


class german_dataset(Dataset):
    """Dataset of audio files stored in `split_<k>/` folders.

    Every item is the stack of per-layer features that a pretrained
    torchaudio model extracts from one audio file, together with the emotion
    label read from the filename and the number of feature frames.
    """

    # accepted `model_name` values -> attribute name in torchaudio.pipelines
    _BUNDLE_NAMES = {
        'WAV2VEC2_BASE': 'WAV2VEC2_BASE',
        'WAV2VEC2_LARGE': 'WAV2VEC2_LARGE',
        'WAV2VEC2_BASE_XLSR': 'WAV2VEC2_XLSR53',
        'WAV2VEC2_LARGE_XLSR': 'WAV2VEC2_XLSR_300M',
        'HUBERT_BASE': 'HUBERT_BASE',
        'HUBERT_LARGE': 'HUBERT_LARGE',
        'WAVLM_BASE': 'WAVLM_BASE',
        'WAVLM_LARGE': 'WAVLM_LARGE',
    }

    def __init__(self, dataset_location, data_splits, model_name):
        """
        dataset_location: folder (with trailing '/') containing the split_<k>/ folders
        data_splits:      iterable of split indices to load
        model_name:       one of the keys of `_BUNDLE_NAMES`

        Raises ValueError for an unsupported model_name.
        """
        # fail early and clearly instead of with a missing `self.bundle` later
        if model_name not in self._BUNDLE_NAMES:
            raise ValueError('Unsupported model_name ' + repr(model_name) +
                             '; expected one of ' + ', '.join(sorted(self._BUNDLE_NAMES)))

        self.dataset_location = dataset_location
        self.data_splits = data_splits

        #initialize label mapping
        self.emotion_mapper = {
                            'W': 'angry',
                            'L': 'bored',
                            'E': 'disgust',
                            'A': 'anxious',
                            'F': 'happy',
                            'T': 'sad',
                            'N': 'neutral'}
        self.emotion_to_label = {
                            'neutral':0,
                            'angry':1,
                            'anxious':2,
                            'bored':3,
                            'disgust':4,
                            'happy':5,
                            'sad':6}
        # inverse of emotion_to_label
        self.label_to_emotion = {label: emotion for emotion, label in self.emotion_to_label.items()}

        #get all audiofile locations
        self._get_data_locations()

        #load model (feature extraction always runs on the CPU)
        self.device = torch.device("cpu")
        self.bundle = getattr(torchaudio.pipelines, self._BUNDLE_NAMES[model_name])
        self.model = self.bundle.get_model().to(self.device)

    def _get_data_locations(self):
        """Collect the paths of all files in the selected splits, in random order."""
        self.all_data = []
        for split in self.data_splits:
            split_location = self.dataset_location + 'split_' + str(split) + '/'
            self.all_data.extend(split_location + audiofile for audiofile in os.listdir(split_location))

        random.shuffle(self.all_data)
        self.len = len(self.all_data)

    def __len__(self):
        return self.len


    def __getitem__(self, index):
        """Return (features, emotion_label, seq_length) for the file at `index`.

        features is the concatenation of the extracted per-layer features
        along dim 0; seq_length is its size along dim 1.
        """
        # step 1: read the emotion code (index 5 of the file name) and map it to a label
        audiofile = self.all_data[index]
        emotion_id = audiofile.split('/')[-1][5]
        emotion = self.emotion_mapper[emotion_id]
        emotion_label = self.emotion_to_label[emotion]

        # step 2: load audio, resampling to the rate of the bundle if needed
        wave, sr = torchaudio.load(audiofile)
        wave = wave.to(self.device)
        if sr != self.bundle.sample_rate:
            wave = torchaudio.functional.resample(wave, sr, self.bundle.sample_rate)

        with torch.inference_mode():
            features, _ = self.model.extract_features(wave)

        # step 3: concatenate the features of all layers in one go
        # presumably (n_layers, seq_len, feature_dim) for single-channel audio - TODO confirm
        features_pt = torch.cat([layer.detach().cpu() for layer in features], dim = 0)
        seq_length = features_pt.shape[1]

        return features_pt, emotion_label, seq_length


def my_collate_function(data):
    """Zero-pad a list of (features, label, seq_length) samples into a batch.

    Every features tensor is indexed as (layers, seq_len, feature_dim); the
    layer and feature sizes are taken from the first sample. Returns the
    padded features (batch, layers, max_seq_len, feature_dim), the labels and
    the sequence lengths, the latter two as tensors.
    """
    feature_list, label_list, length_list = zip(*data)

    first = feature_list[0]
    longest = max(length_list)

    # zero-initialised, so positions past the end of a sequence stay zero
    padded = torch.zeros((len(feature_list), first.shape[0], longest, first.shape[2]))
    for row, (sample, n_frames) in enumerate(zip(feature_list, length_list)):
        padded[row, :, :n_frames, :] = sample

    return padded, torch.tensor(label_list), torch.tensor(length_list)



def initialize_data(dataset_location, train_splits, test_splits, model_name):
    """Build the train and test data loaders for one cross-validation fold.

    Both loaders use batches of 32 and `my_collate_function`; only the
    training loader shuffles, and only the test loader uses worker processes
    (4 of them). Returns (train_loader, test_loader).
    """
    training_set = german_dataset(dataset_location, train_splits, model_name)
    testing_set = german_dataset(dataset_location, test_splits, model_name)

    train_loader = DataLoader(
        training_set,
        batch_size=32,
        shuffle=True,
        collate_fn=my_collate_function,
    )
    test_loader = DataLoader(
        testing_set,
        batch_size=32,
        shuffle=False,
        num_workers=4,
        collate_fn=my_collate_function,
    )

    return train_loader, test_loader

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
import time
import os
import numpy as np
import argparse


def train_model(my_dataloader, model, criterion, optimizer, train_flag = True):
    """Run one pass over `my_dataloader` and report the accuracy.

    my_dataloader: yields (data, labels, lengths) batches
    model:         called as model(data, lengths, device)
    criterion:     loss function, only used when train_flag is true
    optimizer:     optimizer, only used when train_flag is true
    train_flag:    True  -> training pass (weights are updated),
                   False -> evaluation pass (no gradients are tracked)

    Prints the accuracy and returns (model, accuracy).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if train_flag:
        model.train()
    else:
        model.eval()

    all_predictions = []
    all_labels = []

    # Gradients are only needed for training; switching them off during
    # evaluation avoids building the autograd graph for every test batch.
    with torch.set_grad_enabled(bool(train_flag)):
        for data, labels, lengths in my_dataloader:
            #send data to device
            data = data.to(device)
            labels = labels.to(device)

            if train_flag:
                optimizer.zero_grad()

            logits = model(data, lengths, device)

            if train_flag:
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

            #store predictions
            all_predictions.extend(torch.argmax(logits, dim = 1).detach().cpu().tolist())
            all_labels.extend(labels.detach().cpu().tolist())

    accuracy = accuracy_score(all_labels, all_predictions)
    print('Train' if train_flag else 'Test', 'Accuracy :', accuracy)

    return model, accuracy


# Pretrained feature extractors to evaluate. Each one gets its own log file
# and a full 5-fold cross-validation run.
args = 'WAV2VEC2_BASE/WAV2VEC2_LARGE/HUBERT_BASE/HUBERT_LARGE'.split('/')
dataset_location = 'MESD_Final_Splits/'

for model_index in range(len(args)):
    model_name = args[model_index]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    logfilename = 'training_logs_' + model_name + '.txt'
    with open(logfilename, 'w'):
        pass  # start with an empty log file

    all_split_accuracies = []
    for test_split in range(5):
        #initialize model
        model = Dense(model_name)
        model.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        #initialize dataset: train on every split except the held-out one
        train_splits = [split for split in range(5) if split != test_split]
        train_loader, test_loader = initialize_data(dataset_location, train_splits, [test_split], model_name)

        # NOTE(review): the best epoch is selected on the test split itself,
        # so the logged accuracies are an optimistic estimate.
        best_test_accuracy = 0
        # Reset per split so the log never reports the weights of a previous
        # split (or hits an unbound name) when no epoch improves on 0.
        agg_weights = ''
        for epoch in range(20):

            #training
            start_time = time.time()
            print('Epoch:', epoch + 1)

            model, _ = train_model(train_loader, model, criterion, optimizer, True)
            _, test_accuracy = train_model(test_loader, model, criterion, optimizer, False)

            print('Time:', (time.time() - start_time) / 60)
            print('-'*30)

            #save model
            if test_accuracy > best_test_accuracy:
                best_test_accuracy = test_accuracy
                model_save_location = 'Models/' + model_name + '/'
                os.makedirs(model_save_location, exist_ok=True)
                model_save_path = model_save_location + 'split_' + str(test_split) + '.pt'
                torch.save(model.state_dict(), model_save_path)
                # learned per-layer aggregation weights of the best checkpoint
                layer_weights = model.aggr.state_dict()['weight'][0].detach().cpu().tolist()
                agg_weights = ' '.join([str(weight[0]) for weight in layer_weights])

        all_split_accuracies.append(best_test_accuracy)
        with open(logfilename, 'a') as f:
            f.write('SPLIT : ' + str(test_split) + '|' + str(best_test_accuracy) + '|' + agg_weights + '\n')

    mean_accuracy = np.mean(np.array(all_split_accuracies))
    std_accuracy = np.std(np.array(all_split_accuracies))

    with open(logfilename, 'a') as f:
        f.write('MEAN : ' + str(mean_accuracy) + '| STD:' + str(std_accuracy) + '\n')


cuda
Epoch: 1
Train Accuracy : 0.4472934472934473
Test Accuracy : 0.6862745098039216
Time: 4.780958616733551
------------------------------
Epoch: 2
Train Accuracy : 0.8155270655270656
Test Accuracy : 0.9215686274509803
Time: 4.7107259432474775
------------------------------
Epoch: 3
