# 11785 Final Project 

Use the MFCC data from the Speech Accent Archive to predict a speaker's native language (using an accent identificaiton vector).

# Libraries

In [18]:
!pip install torchsummaryX wandb --quiet
!pip install librosa

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting librosa
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
     |████████████████████████████████| 214 kB 28.8 MB/s            
[?25hCollecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
     |████████████████████████████████| 56 kB 8.5 MB/s             
Collecting audioread>=2.1.9
  Downloading audioread-3.0.0.tar.gz (377 kB)
     |████████████████████████████████| 377 kB 106.9 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting numba>=0.45.1
  Downloading numba-0.53.1-cp36-cp36m-manylinux2014_x86_64.whl (3.4 MB)
     |████████████████████████████████| 3.4 MB 95.7 MB/s            
Collecting soundfile>=0.10.2
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl (1.3 MB)
     |████████████████████████████████| 1.3 MB 103.2 MB/s            
Collecti

In [95]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
import librosa
import random
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [3]:
### If you are using colab, you can import google drive to save model checkpoints in a folder
#from google.colab import drive
#drive.mount('/content/drive')

In [34]:
### Native Lanuguage List 
TARGET_ACCENTS = [
            "arabic", "german", "polish", "japanese", "english", "greek", "mandarin", "bengali", "korean", "bulgarian" ] # Start with 10 native lanugages

In [89]:
CONFIG = {
    "sample_rate" :  6000, # 'low fidelity'
    "num_mfcc_coefficients": 27, # Same as out HWs
    "cnn_embedding_dimension": 64,
    "max_duration":5, # Chop audio at 5 seconds. 
    'epochs'        : 30,
    'batch_size'    : 1024,
    'context'       : 30,
    'init_lr'       : 0.001,
    'architecture'  : 'medium-cutoff', 
    'weight_decay'  : 1e-2, 
    'dropout_rate': 0.20,
    'patience' :2, 
    
}

# Kaggle

This section contains code that helps you install kaggle's API, creating kaggle.json with you username and API key details. Make sure to input those in the given code to ensure you can download data from the competition successfully.

In [90]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
#!sudo mkdir /root/.kaggle

#with open("/root/.kaggle/kaggle.json", "w+") as f:
    #f.write('{"username":"thomas99talbot","key":"7b4c3ebc2ff2115c12f9c288b9aea9a0"}') 
    # Put your kaggle username & key here

#!chmod 600 /.kaggle/kaggle.json

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting kaggle==1.5.8
  Using cached kaggle-1.5.8-py3-none-any.whl
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.8
    Uninstalling kaggle-1.5.8:
      Successfully uninstalled kaggle-1.5.8
Successfully installed kaggle-1.5.8


In [14]:
# commands to download data from kaggle

!kaggle datasets download -d rtatman/speech-accent-archive
!unzip -qo 'speech-accent-archive.zip' -d '/efs/SpeechAccentArchive'

speech-accent-archive.zip: Skipping, found more recently modified local copy (use --force to force download)
/efs/SpeechAccentArchive/recordings/recordings/polish33.mp3:  write error (disk full?).  Continue? (y/n/^C) ^C


# Dataset

# Load in a mp3 file into MFCC coefficients. 

In [101]:
#fname = "/efs/SpeechAccentArchive/recordings/recordings/english60.mp3"
#y, sr = librosa.load(fname, sr=CONFIG["sample_rate"], duration = CONFIG["max_duration"]) # Chops audio at 5 seconds. 
#mfcc = librosa.feature.mfcc(y=y, sr=CONFIG["sample_rate"], n_mfcc=CONFIG["num_mfcc_coefficients"])
#print(type(mfcc))

from numpy.random import seed
from numpy.random import choice

class AudioDataset(torch.utils.data.Dataset):

    def __init__(self, root, val_partition = False): # Feel free to add more arguments
        # TODO: Load in all files from the root directory that have a native language in the native languages list.
        #y, sr = librosa.load(, sr=SAMPLE_RATE, duration = 5)
        mp3_file_names = os.listdir(root)
        print("The length of mp3 file names is {}".format(len(mp3_file_names)))
        mp3_file_names.sort()
        self.mfccs = []
        self.native_languages = []
    
        for i in range(len(mp3_file_names)):
            if mp3_file_names[i].startswith(tuple(TARGET_ACCENTS)):
                y, sr = librosa.load(root + "/" + mp3_file_names[i],sr=CONFIG["sample_rate"], duration = CONFIG["max_duration"])
                mfcc = librosa.feature.mfcc(y=y, sr=CONFIG["sample_rate"], n_mfcc=CONFIG["num_mfcc_coefficients"])

                # Do Cepstral Normalization :) 
                avg = np.mean(mfcc, axis=0)
                std = np.std(mfcc, axis=0)
                mfcc = (mfcc-avg)/std
                self.mfccs.append(mfcc) # Save the mfcc.

                # Save the corresponding native language. 
                for j in TARGET_ACCENTS:
                    if mp3_file_names[i].find(j) == 0:
                        self.native_languages.append(TARGET_ACCENTS.index(j))
                        #print("I found a file with target accent {}".format(j))

                assert(len(self.mfccs) == len(self.native_languages))


        # TODO: Add some sort of padding based on context size? 

        # Split the mfccs into train/val/test data based on a 70/20/10
        seed(1)
        rand_indices = choice(len(self.mfccs), len(self.mfccs)//5, False) # 20% of the data. 
        if val_partition: 
            # Use 20% random sample.
            val_mfccs = []
            val_native_languages = []
            for ind in rand_indices:
                val_mfccs.append(self.mfccs[ind])
                val_native_languages.append(self.native_languages[ind])

            assert(len(val_mfccs)==len(val_native_languages))
            self.mfccs = val_mfccs
            self.native_languages = val_native_languages
        else:
            # Use the remaining 80% for training data.  
            train_mfccs = []
            train_native_languages = []
            for i in range(len(self.mfccs)):
                if i not in rand_indices:
                    train_mfccs.append(self.mfccs[i])
                    train_native_languages.append(self.native_languages[i])
            
            assert(len(train_mfccs)==len(train_native_languages))
            self.mfccs = train_mfccs
            self.native_languages = train_native_languages

        self.length = len(self.mfccs)
        print("The length of train data is {}".format(self.length))

    def __len__(self):
        return self.length


    def __getitem__(self, ind):
        
        # TODO: Based on context and offset, return a frame at given index with context frames to the left, and right.
        #frames = self.mfccs[ind:ind+2*self.context+1]
        # After slicing, you get an array of shape 2*context+1 x 27. But our MLP needs 1d data and not 2d.
        #frames = frames.flatten() # TODO: Flatten to get 1d data
        frames = self.mfccs[ind]
        frames      = torch.FloatTensor(frames) # Convert to tensors
          

        return frames, self.native_languages[ind]
    



# Create Datasets

In [104]:
# Create a dataset object using the AudioDataset class for the training data 
train_data = AudioDataset('/efs/SpeechAccentArchive/recordings/recordings', val_partition=False)

# Create a dataset object using the AudioDataset class for the validation data 
val_data = AudioDataset('/efs/SpeechAccentArchive/recordings/recordings/', val_partition = True) 
 
# TODO: Create a dataset object using the AudioTestDataset class for the test data 
#test_data = AudioTestDataset('/content/data/11-785-s23-hw1p2', PHONEMES, config['context'], 'test-clean')

The length of mp3 file names is 1564


In [64]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# We shuffle train dataloader but not val & test dataloader. Why?

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 4,
    batch_size  = CONFIG['batch_size'], 
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 2,
    batch_size  = CONFIG['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

'''test_loader = torch.utils.data.DataLoader(
    dataset     = test_data, 
    num_workers = 2, 
    batch_size  = config['batch_size'], 
    pin_memory  = True, 
    shuffle     = False
)
'''

print("Batch size     : ", CONFIG['batch_size'])
print("Output symbols : ", len(TARGET_ACCENTS))

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
#print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
#print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size     :  1024
Context        :  30
Input size     :  1647
Output symbols :  10
Train dataset samples = 938, batches = 1


In [67]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, native_lang = data
    print(frames.shape, len(native_lang))
    break

torch.Size([938, 27, 59]) 938


# Network Architecture


This section defines your network architecture for accent identification. We follow the basic description outlined in ~ https://arxiv.org/pdf/2109.07349.pdf.

In [73]:
# This architecture will make you cross the very low cutoff
# However, you need to run a lot of experiments to cross the medium or high cutoff
class Network(torch.nn.Module):

    def __init__(self, input_size, output_size):
        super(Network, self).__init__()

        self.embedding = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=CONFIG["num_mfcc_coefficients"], out_channels= CONFIG["cnn_embedding_dimension"], kernel_size=3, padding=3//2), 
            torch.nn.BatchNorm1d(num_features=CONFIG["cnn_embedding_dimension"])
        )
        self.transformer = torch.nn.Transformer(d_model=CONFIG["cnn_embedding_dimension"])
        self.fc_layer = torch.nn.Linear(in_features=CONFIG["cnn_embedding_dimension"], out_features=output_size)
        self.apply(self._init_weights)

    def _init_weights(self, m):
      if isinstance(m, torch.nn.Linear):
          torch.nn.init.kaiming_uniform_(m.weight)
          torch.nn.init.zeros_(m.bias)
        
    def forward(self, x):
        embedding_out = self.embedding(x)
        transformer_out = self.transformer(embedding_out)
        out = self.fc_layer(transformer_out)
        return out

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler. 

In [79]:
INPUT_SIZE  = CONFIG["num_mfcc_coefficients"] # Why is this the case?
model       = Network(INPUT_SIZE, len(TARGET_ACCENTS)).to(device)
#summary(model, frames.to(device)) 

In [80]:
# Define loss function.
# The paper (linked above) used a more complicated loss function the "SDC-loss"
criterion = torch.nn.CrossEntropyLoss()   

# Define Optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr= CONFIG['init_lr'], weight_decay=CONFIG['weight_decay']) # Defining Optimizer

# Define Scheduler for Learning Rate. 
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=CONFIG['patience'])

# Training and Validation Functions

This section covers the training, and validation functions for each epoch of running your experiment with a given model architecture. The code has been provided to you, but we recommend going through the comments to understand the workflow to enable you to write these loops for future HWs.

In [81]:
torch.cuda.empty_cache()
gc.collect()

14755

In [82]:
def train(model, dataloader, optimizer, criterion):

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    
    for i, (frames, native_lang) in enumerate(dataloader):
        
        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        native_lang    = native_lang.to(device)

        ### Forward Propagation
        logits  = model(frames)

        ### Loss Calculation
        # Create a one-hot vector for the native language? 
        loss    = criterion(logits, native_lang)

        ### Backward Propagation
        loss.backward() 
        
        ### Gradient Descent
        optimizer.step()      

        tloss   += loss.item()
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == native_lang).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))), 
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, native_lang, logits
        torch.cuda.empty_cache()
  
    batch_bar.close()
    tloss   /= len(train_loader)
    tacc    /= len(train_loader)

    return tloss, tacc

In [83]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode(): 
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]
        
        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))), 
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()
    
        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    vloss   /= len(val_loader)
    vacc    /= len(val_loader)

    return vloss, vacc

# Weights and Biases Setup

This section is to enable logging metrics and files with Weights and Biases. Please refer to wandb documentationa and recitation 0 that covers the use of weights and biases for logging, hyperparameter tuning and monitoring your runs for your homeworks. Using this tool makes it very easy to show results when submitting your code and models for homeworks, and also extremely useful for study groups to organize and run ablations under a single team in wandb. 

We have written code for you to make use of it out of the box, so that you start using wandb for all your HWs from the beginning.

In [84]:
wandb.login(key="1d44e8777f8970f1b0343326de9227cf69eefb36") #API Key is in your wandb account, under settings (wandb.ai/settings)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mttalbot[0m ([33mdeeper_learners[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ubuntu/.netrc


True

In [85]:
# Create your wandb run
run = wandb.init(
    name    = "test-run", ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "ASR_accent_identifciation", ### Project should be created in your wandb account 
    config  = config ### Wandb Config for your run
)

In [86]:
### Save your model architecture as a string with str(model) 
model_arch  = str(model)

### Save it in a txt file 
arch_file   = open("model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

['/home/ubuntu/ASRProject/wandb/run-20230405_220318-nhunocl2/files/model_arch.txt']

# Experiment

Now, it is time to finally run your ablations! Have fun!

In [87]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")
best_val_acc = 0

for epoch in range(CONFIG['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, CONFIG['epochs']))

    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    # Update the learning rate for the next epoch 
    scheduler.step(val_loss)

    ### Log metrics at each epoch in your run 
    # Optionally, you can log at each batch inside train/eval functions 
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss, 
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best
    if val_acc > best_val_acc:
      best_val_acc = val_acc
      torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            #'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': train_loss,
            'val_acc': val_acc}, 
          '/content/drive/MyDrive/Colab Notebooks/model_checkpoint.pth')

### Finish your wandb run
run.finish()


Epoch 1/30


Train:   0%|          | 0/1 [00:00<?, ?it/s]

AttributeError: 'list' object has no attribute 'to'

# Testing and submission to Kaggle

Before we get to the following code, make sure to see the format of submission given in *sample_submission.csv*. Once you have done so, it is time to fill the following function to complete your inference on test data. Refer the eval function from previous cells to get an idea of how to go about completing this function.

In [None]:
def test(model, test_loader):
    ### What you call for model to perform inference?
    model.eval() # TODO train or eval?

    ### List to store predicted phonemes of test data
    test_predictions = []

    ### Which mode do you need to avoid gradients?
    with torch.inference_mode(): # TODO

        for i, mfccs in enumerate(tqdm(test_loader)):

            mfccs   = mfccs.to(device)             
            logits  = model(mfccs)
            
            ### Get most likely predicted phoneme with argmax
            #print(logits.shape)
            predicted_phonemes = torch.argmax(logits, dim=1)

            #print(predicted_phonemes)
            #print(predicted_phonemes.shape)
            #assert(0)

            ### How do you store predicted_phonemes with test_predictions? Hint, look at eval 
            # This could be slow 
            for ind in range(len(predicted_phonemes)):
                pred_phoneme = PHONEMES[predicted_phonemes[ind]]
                test_predictions.append(pred_phoneme)
    
    print("The test predictions are {} of shape {}".format(test_predictions, len(test_predictions)))
    return test_predictions

In [None]:
state_dict = torch.load('/content/drive/MyDrive/Colab Notebooks/model_checkpoint_88.pth')
model.load_state_dict(state_dict['model_state_dict'])
predictions = test(model, test_loader)

  0%|          | 0/1889 [00:00<?, ?it/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
### Create CSV file with predictions
with open("/content/data/submission3.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(predictions)):
        f.write("{},{}\n".format(i, predictions[i]))

In [None]:
### Submit to kaggle competition using kaggle API (Uncomment below to use)
# !kaggle competitions submit -c 11-785-s23-hw1p2 -f ./submission.csv -m "Test Submission"

### However, its always safer to download the csv file and then upload to kaggle