#### Prelude

In [None]:
!pip install datasets==2.14.4 'transformers[torch]' evaluate gcsfs scikit-learn torchinfo torch torchaudio datasets

In [None]:
import torch
import librosa
import os
import numpy as np
import random
import torchaudio
import evaluate
from accelerate import notebook_launcher
from datasets import Audio, load_from_disk, load_dataset
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from typing import List, Dict, Tuple
from torch.utils.data import DataLoader
from torchinfo import summary
from huggingface_hub import login

In [None]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Using GPU" if dev == "cuda:0" else "No GPU")

In [None]:
# Read the Hugging Face token from the HF_TOKEN environment variable so the secret
# never has to be written into the notebook. The placeholder is only the fallback
# used when the variable is unset, in which case the assertion below stops the run.
my_token = os.environ.get("HF_TOKEN", "REDACTED")

assert my_token != "REDACTED", "Please provide a huggingface token."
login(token=my_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Preprocessing

In [None]:
# Accent classes used for classification; a class's integer id is its position in this list.
accents = [
    'United States English',
    'England English',
    'India and South Asia (India, Pakistan, Sri Lanka)',
]
accents_map = dict(zip(accents, range(len(accents))))
# Target sampling rate in Hz and the fixed clip length in seconds.
SAMPLING_RATE = 16_000
SECONDS = 3


def process_dataset(dataset, filter: bool = True):
    """Turn a dataset split into fixed-length, integer-labelled audio clips.

    Steps, in order: keep only the ``audio`` and ``accent`` columns, resample
    to ``SAMPLING_RATE``, drop rows whose accent is not in ``accents``,
    optionally down-sample the US-English class, trim leading/trailing
    silence, drop clips shorter than ``SECONDS`` seconds, crop the remaining
    clips to exactly ``SECONDS`` seconds, encode the accent as its integer id
    and rename that column to ``label``.

    Args:
        dataset: Dataset with at least ``audio`` and ``accent`` columns.
        filter: When true, randomly keep roughly one in three
            "United States English" rows (seeded for repeatability) to
            balance the classes. The name shadows the ``filter`` builtin
            inside this function; it is kept so existing callers still work.

    Returns:
        The processed dataset with an ``audio`` and a ``label`` column.
    """
    def trim_audio(example):
        # librosa also returns the trimmed interval's indices, which are not needed here.
        example['audio']['array'], _ = librosa.effects.trim(y=example['audio']['array'])
        return example

    def crop_audio(example):
        example["audio"]["array"] = example["audio"]["array"][:SECONDS * SAMPLING_RATE]
        return example

    def balance_data(example):
        # Non-US rows always pass; a US row passes with probability 1/3.
        return example["accent"] != "United States English" or random.randint(0, 2) == 0

    def encode_accent(example):
        return {"accent": accents_map[example["accent"]]}

    def is_wanted_accent(accent):
        return accent in accents

    # Remove columns other than audio and accent
    dataset = dataset.select_columns(["audio", "accent"])
    # Fix sampling rate
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

    # Remove unwanted accents. `input_columns` makes the predicate receive the
    # accent string itself; without it the predicate would be handed the whole
    # example dict, which is never a member of `accents`, and every row would
    # be dropped.
    dataset = dataset.filter(is_wanted_accent, input_columns=["accent"], num_proc=8)

    if filter:
        # Keep roughly a third of the US accent samples (drop about two thirds)
        # to balance the data. Seeded and run in a single process so the
        # selection is repeatable.
        random.seed(a=1)
        dataset = dataset.filter(balance_data)

    # Trim silence at beginning and end of clip
    dataset = dataset.map(trim_audio, num_proc=8)

    # Filter out short clips
    dataset = dataset.filter(lambda example: example["audio"]["array"].shape[0] >= SECONDS * SAMPLING_RATE, num_proc=8)

    # Trim anything past SECONDS
    dataset = dataset.map(crop_audio, num_proc=8)

    # Encode accent
    dataset = dataset.map(encode_accent, num_proc=8)

    # wav2vec2 model requires output column to be named "label"
    dataset = dataset.rename_column("accent", "label")

    return dataset

In [None]:
# Feature extraction: load the extractor that matches the wav2vec2-base checkpoint
# and check that it expects the sampling rate the audio is resampled to.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
assert feature_extractor.sampling_rate == SAMPLING_RATE

def preprocess_function(examples):
    """Run the feature extractor over a batch of examples.

    Args:
        examples: Batch whose ``"audio"`` entry is a sequence of dicts, each
            holding the waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, truncated to at most
        ``SECONDS * SAMPLING_RATE`` samples per clip.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(
        waveforms,
        sampling_rate=SAMPLING_RATE,
        max_length=SECONDS * SAMPLING_RATE,
        truncation=True,
    )

In [None]:
# Training split: load Common Voice (English) from Hugging Face, clean and
# balance it, then convert the audio into model inputs.
dataset = process_dataset(
    load_dataset(
        "mozilla-foundation/common_voice_13_0",
        "en",
        split="train",
        cache_dir="cache",
    )
)

# Extract features in batches; the raw audio column is dropped afterwards.
dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=2000,
    num_proc=8,
    remove_columns="audio",
    load_from_cache_file=False,
)
train = dataset

In [None]:
# Validation split: same pipeline as training, but without class balancing
# (filter=False).
val = process_dataset(
    load_dataset(
        "mozilla-foundation/common_voice_13_0",
        "en",
        split="validation",
        cache_dir="cache",
    ),
    filter=False,
)

# Extract features in batches; the raw audio column is dropped afterwards.
val = val.map(
    preprocess_function,
    batched=True,
    batch_size=2000,
    num_proc=8,
    remove_columns="audio",
    load_from_cache_file=False,
)

In [None]:
# Test split: same pipeline as training, but without class balancing
# (filter=False).
test = process_dataset(
    load_dataset(
        "mozilla-foundation/common_voice_13_0",
        "en",
        split="test",
        cache_dir="cache",
    ),
    filter=False,
)

# Extract features in batches; the raw audio column is dropped afterwards.
test = test.map(
    preprocess_function,
    batched=True,
    batch_size=2000,
    num_proc=8,
    remove_columns="audio",
    load_from_cache_file=False,
)

### Training and feature extraction

In [None]:
# Label mappings handed to the model config: accent name -> id, and the inverse.
label2id = accents_map
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
# Accuracy metric from the `evaluate` library; also reused by the results cells.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


def train_model(model, batch_size, lr, output_dir="model_out_6"):
    """Fine-tune ``model`` on ``train``, evaluating on ``val`` after every epoch.

    Args:
        model: Audio-classification model to fine-tune.
        batch_size: Per-device training batch size. Gradients are accumulated
            over 4 steps before each optimiser update.
        lr: Learning rate passed to ``TrainingArguments``.
        output_dir: Directory where checkpoints are written. Defaults to
            ``"model_out_6"``. Pass a different directory per experiment to
            keep the runs' checkpoints apart. NOTE(review): with
            ``push_to_hub=True`` the hub repository name presumably follows
            this directory name; confirm against the transformers docs.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=32,
        # Only the first 4 epochs were analysed in the dissertation
        num_train_epochs=5,
        warmup_ratio=0.1,
        # Keep the checkpoint with the highest validation accuracy.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        # Upload the model to the hub as checkpoints are saved during training
        push_to_hub=True,
        # Load data with one background DataLoader worker
        dataloader_num_workers=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train.with_format("torch"),
        eval_dataset=val.with_format("torch"),
        tokenizer=feature_extractor,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

### Experiment: Batch size and learning rate

Each train batch size and learning rate combination was set to train for 5 epochs, as can be seen in the code of the previous cell. However, during the training of model5 (batch size 16, learning rate 1e-5) there was a connectivity error that caused training to stop before the 5th epoch. Instead of retraining the model from scratch (since training time per 5 epochs was roughly 6 hours), I decided to only compare the first 4 epochs for each model.

#### Train Batch Size 8, learning rate 1e-6

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model1 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.weight', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use accelerate library to setup GPU and run training.
# The launcher needs the function itself plus its arguments: writing
# train_model(...) inline would run training immediately and hand the call's
# return value to the launcher instead of a callable.
notebook_launcher(train_model, args=(model1, 8, 1e-6), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.1934,0.590846,0.781775
2,0.178,0.605535,0.775779
2,0.2407,0.63427,0.780576
4,0.2259,0.625045,0.780576
4,0.1346,0.644285,0.776978


#### Train Batch Size 8, learning rate 1e-5

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model2 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pass the function and its arguments separately so the launcher performs the
# call; train_model(...) inline would train eagerly and pass its return value.
notebook_launcher(train_model, args=(model2, 8, 1e-5), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.1299,0.666345,0.784173
2,0.1226,0.754313,0.784173
2,0.1339,0.881027,0.791367
4,0.073,1.001559,0.794964
4,0.0573,1.165421,0.778177


#### Train Batch Size 8, learning rate 1e-4

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model3 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pass the function and its arguments separately so the launcher performs the
# call; train_model(...) inline would train eagerly and pass its return value.
notebook_launcher(train_model, args=(model3, 8, 1e-4), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.2958,0.828595,0.718225
2,0.2185,0.662201,0.766187
2,0.2424,0.698771,0.769784
4,0.1759,0.797571,0.793765
4,0.1242,0.872529,0.796163


#### Train Batch Size 16, learning rate 1e-6

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model4 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# train_model has three required parameters, so they must be supplied through
# `args`; the values follow this section's heading (batch size 16, lr 1e-6).
notebook_launcher(train_model, args=(model4, 16, 1e-6), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.2156,0.59066,0.779376
1,0.2253,0.570457,0.780576
2,0.2496,0.61357,0.773381
4,0.1237,0.610534,0.776978
4,0.1958,0.601529,0.779376


#### Train Batch Size 16, learning rate 1e-5

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model5 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pass the function and its arguments separately so the launcher performs the
# call; train_model(...) inline would train eagerly and pass its return value.
notebook_launcher(train_model, args=(model5, 16, 1e-5), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.1429,0.637826,0.790168
1,0.1011,0.750809,0.772182
2,0.0926,0.792801,0.770983
4,0.0531,0.868151,0.790168


#### Train Batch Size 16, learning rate 1e-4

In [None]:
# Pretrained wav2vec2-base checkpoint configured for the accent classes.
model6 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(accents),
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pass the function and its arguments separately so the launcher performs the
# call; train_model(...) inline would train eagerly and pass its return value.
notebook_launcher(train_model, args=(model6, 16, 1e-4), num_processes=1)

Launching training on one GPU.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.161,0.666441,0.766187
1,0.1932,0.645268,0.793765
2,0.1654,0.730128,0.808153
4,0.081,0.873766,0.792566
4,0.036,1.055973,0.78777


#### Visualisations

Visualisations were created in a separate file attached.

### Results

In [None]:
# Initial validation accuracy before fine-tuning
model_tmp = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=len(accents), label2id=label2id, id2label=id2label
).to(dev)
# Inference only: make evaluation mode explicit and skip autograd bookkeeping.
model_tmp.eval()
predictions = []
with torch.no_grad():
    for batch in DataLoader(val.with_format("torch", device=dev), batch_size=32):
        outputs = model_tmp(batch["input_values"]).logits
        predictions.append(torch.argmax(outputs, dim=1))

# Last expression of the cell, so the accuracy dict is displayed as the cell output.
accuracy.compute(predictions=torch.cat(predictions), references=val["label"])

{'accuracy': 0.23621103117505995}

In [None]:
# Test set evaluation on best model after training
total_correct = 0

# Move to GPU
test = test.with_format("torch", device=dev)
# Take the size from the dataset itself so the accuracy stays correct if
# preprocessing changes the number of test clips.
total = len(test)

# Inference only: switch off dropout and skip autograd bookkeeping.
model6.eval()
with torch.no_grad():
    for batch in DataLoader(test, batch_size=32):
        outputs = model6(batch["input_values"], labels=batch["label"])
        correct = torch.sum(torch.argmax(outputs.logits, dim=1) == batch["label"])
        total_correct += correct

print(total_correct / total)

tensor(0.7559, device='cuda:0')


In [None]:
# Model architecture: layer-by-layer summary of model6 for a batch of
# fixed-length clips (SECONDS * SAMPLING_RATE samples each).
batch_size = 16
summary(model6, input_size=(batch_size, SECONDS * SAMPLING_RATE), depth=10)

Layer (type:depth-idx)                                  Output Shape              Param #
Wav2Vec2ForSequenceClassification                       [16, 3]                   --
├─Wav2Vec2Model: 1-1                                    [16, 149, 512]            768
│    └─Wav2Vec2FeatureEncoder: 2-1                      [16, 512, 149]            --
│    │    └─ModuleList: 3-1                             --                        --
│    │    │    └─Wav2Vec2GroupNormConvLayer: 4-1        [16, 512, 9599]           --
│    │    │    │    └─Conv1d: 5-1                       [16, 512, 9599]           5,120
│    │    │    │    └─GroupNorm: 5-2                    [16, 512, 9599]           1,024
│    │    │    │    └─GELUActivation: 5-3               [16, 512, 9599]           --
│    │    │    └─Wav2Vec2NoLayerNormConvLayer: 4-2      [16, 512, 4799]           --
│    │    │    │    └─Conv1d: 5-4                       [16, 512, 4799]           786,432
│    │    │    │    └─GELUActivation: 5-5       

### Reference list

Audio Classification, Hugging Face: https://huggingface.co/docs/transformers/tasks/audio_classification

Auto Classes, Hugging Face: https://huggingface.co/docs/transformers/v4.32.1/en/model_doc/auto#transformers.AutoModel

Evaluator, Hugging Face: https://huggingface.co/docs/evaluate/package_reference/evaluator_classes

Model outputs: https://huggingface.co/docs/transformers/main_classes/output

Trainer, Hugging Face: https://huggingface.co/docs/transformers/v4.32.0/en/main_classes/trainer#transformers.TrainingArguments

Transformers, Hugging Face: https://huggingface.co/docs/evaluate/transformers_integrations

Wav2Vec2, Hugging Face: https://huggingface.co/docs/transformers/model_doc/wav2vec2

Wav2Vec2-Base, Hugging Face: https://huggingface.co/facebook/wav2vec2-base