## Task 3

### Task 3a

#### 1. Import the Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import librosa
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, EvalPrediction,  TrainingArguments, Trainer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from tqdm import tqdm
import jiwer
import evaluate

In [3]:
# Report once which device type this session can use.
print(
    "CUDA is available. You can use the GPU."
    if torch.cuda.is_available()
    else "CUDA is not available. Using the CPU."
)

CUDA is not available. Using the CPU.


#### 2. Initialize the Wav2Vec2Processor and model
- `Wav2Vec2Processor` from the Hugging Face `transformers` library handles both audio feature extraction and text tokenization:
  - Feature extraction for audio (converting raw audio into a format compatible with Wav2Vec2)
  - Text tokenization for transcriptions (converting transcriptions into token IDs that the model can process)

In [41]:
# Initialize the Wav2Vec2 processor and the CTC model from the same pretrained
# checkpoint ("facebook/wav2vec2-large-960h").
# NOTE(review): `model` is assigned again in the fine-tuning cell further down,
# which replaces the instance created here.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 3. Audio Preprocessing with Wav2Vec2Processor
- Audio was resampled to 16kHz as wav2vec2 model was pretrained on 16kHz sampled speech audio (target sample rate)
- Audio was normalised to ensure consistency across different files

In [None]:
def preprocess_audio(file_path, target_sr=16000, max_length=16000):
    """Load one audio file and convert it to fixed-length Wav2Vec2 input values.

    Steps: load and resample to ``target_sr``, peak-normalise to [-1, 1],
    truncate or zero-pad to exactly ``max_length`` samples, then run the
    module-level ``processor``.

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sampling rate (Hz) the audio is resampled to while loading.
        max_length: Number of samples kept per clip. With the defaults this is
            16000 samples at 16 kHz, i.e. only the first second of audio.
            NOTE(review): transcriptions are not shortened to match, so raise
            this value if whole utterances should be used for training.

    Returns:
        The processor's ``input_values`` tensor on the CPU (the processor adds
        a leading batch dimension, giving ``[1, max_length]``), or ``None``
        when the decoded audio contains NaN or Inf values.
    """
    # librosa resamples to `target_sr` while loading, so no second resampling
    # pass is needed afterwards.
    audio, _ = librosa.load(file_path, sr=target_sr)

    # Reject corrupt decodes before they can poison a training batch.
    if not np.all(np.isfinite(audio)):
        print(f"Warning: NaN or Inf detected in audio file {file_path}")
        return None

    # Peak-normalise to [-1, 1]. Silent or empty clips have a peak of 0 and are
    # left untouched; dividing by zero would turn the whole clip into NaN.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    # Truncate or zero-pad so every clip has exactly `max_length` samples.
    if len(audio) > max_length:
        audio = audio[:max_length]
    elif len(audio) < max_length:
        audio = np.pad(audio, (0, max_length - len(audio)), mode='constant')

    # The tensor stays on the CPU: device placement is left to the caller
    # (a DataLoader with pinned memory cannot pin tensors that already live on
    # the GPU), and this avoids relying on a global `device` from a later cell.
    return processor(audio, sampling_rate=target_sr, return_tensors="pt").input_values

#### 4. Text Tokenization with Wav2Vec2Processor
- Converting transcription to input IDs (numerical representation of the transcription)
- Padding/truncating the text to a fixed maximum length 

In [44]:
def _vocab_has_only_uppercase_letters(processor):
    """Return True when every single-letter token of the tokenizer is upper-case."""
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None or not hasattr(tokenizer, "get_vocab"):
        return False
    letters = [tok for tok in tokenizer.get_vocab() if len(tok) == 1 and tok.isalpha()]
    return bool(letters) and all(tok.isupper() for tok in letters)


def tokenize_transcription(transcription, processor, max_length=256):
    """Convert a transcription into a fixed-length tensor of token ids.

    Args:
        transcription: Ground-truth text of one clip.
        processor: Wav2Vec2Processor whose tokenizer encodes the text.
        max_length: Length the id sequence is padded or truncated to.

    Returns:
        1-D tensor of ``max_length`` token ids (the batch dimension added by
        ``return_tensors="pt"`` is squeezed away).

    NOTE(review): padded positions keep the tokenizer's pad id. Wav2Vec2ForCTC
    only ignores label positions set to -100, so padding has to be masked
    before the labels reach the loss - confirm where that should happen.
    """
    # The dataset text is lower-case. If the tokenizer's vocabulary only has
    # upper-case letters, lower-case characters would all map to the unknown
    # token, so match the vocabulary's casing first.
    if isinstance(transcription, str) and _vocab_has_only_uppercase_letters(processor):
        transcription = transcription.upper()

    tokenized = processor(
        text=transcription,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenized.input_ids.squeeze(0)

#### 5. Custom Dataset class to load and process the audio data

In [3]:
class CommonVoiceDataset(Dataset):
    """Map-style dataset pairing Common Voice clips with their tokenized text.

    Args:
        dataframe: DataFrame with a 'filename' column (audio paths) and a
            'text' column (transcriptions).
        processor: Wav2Vec2Processor forwarded to ``tokenize_transcription``.
    """

    def __init__(self, dataframe, processor):
        self.dataframe = dataframe
        self.processor = processor
        # Plain arrays so items are looked up by position, independent of the
        # DataFrame index left behind by the train/validation split.
        self.audio_files = dataframe['filename'].values
        self.transcriptions = dataframe['text'].values

    def __len__(self):
        """Number of clips in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``{'input_values', 'labels'}`` for the clip at position ``idx``.

        ``input_values`` is flattened to one dimension so that stacking several
        items yields a ``[batch, num_frames]`` tensor rather than
        ``[batch, 1, num_frames]``. If ``preprocess_audio`` rejects the clip
        and returns ``None``, that ``None`` is passed through unchanged.
        """
        # Preprocess the audio
        input_values = preprocess_audio(self.audio_files[idx])
        if input_values is not None:
            # Drop the leading batch dimension of the single clip.
            input_values = input_values.reshape(-1)

        # Tokenize transcription
        labels = tokenize_transcription(self.transcriptions[idx], self.processor)

        return {
            'input_values': input_values,
            'labels': labels
        }

#### 6. Loading the data

In [46]:
# Root folder of the downloaded "Common Voice" data
cv_folder = 'C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techtest/common_voice/'

# Audio clips (mp3) live under cv-valid-train; their metadata is in the CSV next to it
audio_folder = os.path.join(cv_folder, 'cv-valid-train/')
csv_file = os.path.join(cv_folder, 'cv-valid-train.csv')

# Read the metadata table
cv_df = pd.read_csv(csv_file)

# Prefix every filename with the audio folder so each row holds a full path
cv_df['filename'] = [os.path.join(audio_folder, clip_name) for clip_name in cv_df['filename']]

# Check the first few paths
print(cv_df['filename'].head())

0    C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...
1    C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...
2    C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...
3    C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...
4    C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...
Name: filename, dtype: object


#### 7. Training and Validation Data Split

In [47]:
# Split the dataset into 70% training and 30% validation.
# `random_state=42` fixes the shuffle so the same rows land in each split on every run.
train_df, val_df = train_test_split(cv_df, test_size=0.3, random_state=42)

# Check the split: the two sizes should add up to the number of rows in cv_df
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

Training set size: 137043
Validation set size: 58733


#### 8. Create DataLoaders for training and validation

In [None]:
# Wrap each split in a CommonVoiceDataset that shares the same processor
train_dataset, val_dataset = (
    CommonVoiceDataset(split_df, processor) for split_df in (train_df, val_df)
)

#### 9. Fine-tuning the model


In [12]:
# Load word error rate (WER) metric using the evaluate library for evaluation.
# The cells below call `wer_metric.compute(predictions=..., references=...)` to
# score transcriptions against the ground-truth text.
wer_metric = evaluate.load("wer")

In [None]:
def compute_wer(prediction: EvalPrediction):
    """Compute the word error rate between model predictions and references.

    Args:
        prediction: Object exposing ``predictions`` (logits whose last axis is
            the vocabulary) and ``label_ids`` (reference token ids).

    Returns:
        Whatever ``wer_metric.compute`` returns for the decoded texts.
    """
    # Greedy decoding: most likely token per frame. Decoding with the default
    # grouping collapses repeated frames, which is what CTC output needs.
    pred_ids = prediction.predictions.argmax(axis=-1)
    pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Positions masked with -100 are not valid token ids; map them back to the
    # pad id before decoding (a no-op when the labels contain no -100).
    pad_id = processor.tokenizer.pad_token_id
    label_ids = np.where(prediction.label_ids == -100, pad_id, prediction.label_ids)

    # References are plain character sequences, not frame-level CTC output, so
    # repeated tokens must NOT be grouped - otherwise doubled letters collapse
    # ("hello" -> "helo") and distort the WER.
    ref_text = processor.batch_decode(label_ids, skip_special_tokens=True, group_tokens=False)

    # Compute WER
    return wer_metric.compute(predictions=pred_text, references=ref_text)

In [None]:
# Training loop setup: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the currently loaded model onto that device.
# NOTE(review): the next cell assigns a freshly loaded model to `model`, so this
# move only affects the instance loaded earlier.
model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments

# Load the pre-trained Wav2Vec2 model for CTC (Connectionist Temporal Classification)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)


def collate_ctc_batch(features):
    """Stack dataset items into one ``{'input_values', 'labels'}`` batch.

    - Items whose audio could not be preprocessed (``input_values`` is None)
      are dropped instead of crashing ``torch.stack``.
    - Each clip is flattened so the batch is ``[batch, num_frames]``.
    - Label padding is replaced by -100, the value Wav2Vec2ForCTC ignores when
      computing the loss; pad ids left in place would be counted as targets.
    """
    usable = [item for item in features if item['input_values'] is not None]
    if not usable:
        raise ValueError("Every item in the batch failed audio preprocessing.")
    input_values = torch.stack([item['input_values'].reshape(-1) for item in usable])
    labels = torch.stack([item['labels'] for item in usable])
    labels = labels.masked_fill(labels == processor.tokenizer.pad_token_id, -100)
    return {'input_values': input_values, 'labels': labels}


def compute_metrics(prediction):
    """Adapt ``compute_wer`` to the Trainer, which expects a dict of metrics."""
    # Undo the -100 masking applied by the collator so the labels can be decoded.
    label_ids = np.where(
        prediction.label_ids == -100,
        processor.tokenizer.pad_token_id,
        prediction.label_ids,
    )
    score = compute_wer(EvalPrediction(predictions=prediction.predictions, label_ids=label_ids))
    return score if isinstance(score, dict) else {"wer": score}


# Define training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned",  # Where the model outputs will be saved
    evaluation_strategy="epoch",  # Evaluate after each epoch
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained checkpoint - confirm.
    learning_rate=1e-3,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=2,  # Number of training epochs. Best to include more epochs so model can learn better but was limited by computational resources
    logging_dir="./logs",  # Where logs will be saved
    logging_steps=500,
    gradient_accumulation_steps=2,  # Accumulate gradients over smaller batches
    lr_scheduler_type="cosine",  # Use cosine annealing for learning rate decay
    save_steps=500,  # Save model every 500 steps
    weight_decay=0.01,  # Weight decay for regularization
    gradient_checkpointing=True,
    # Every clip is padded/truncated to the same length, so grouping by length
    # gains nothing; with a plain torch Dataset it would also make the sampler
    # read every item once up front just to measure it.
    group_by_length=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_ctc_batch,
    tokenizer=processor.feature_extractor
)

In [None]:
# Train the model: runs the Trainer built in the previous cell on train_dataset,
# using the training arguments defined there.
trainer.train()

#### Note: I am sorry that I did not manage to finish debugging and fine-tuning the model. I was limited by the GPU; I also ran the code in Google Colab but was limited by the resources available there too. Hence, for the subsequent tasks (3c, 4, 5), which require the fine-tuned model or its transcribed results, I have used the base model ("facebook/wav2vec2-base-960h") as a stand-in for the "fine-tuned" model so that I could continue with the tasks. My apologies for that.

#### 10. Evaluate and Visualize Metrics
Visualize the training and validation loss to check how well the model is performing.

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print(f"Evaluation results: {eval_results}")

# Training loss and validation loss are logged at different moments, so the two
# lists have different lengths. Plot each against the global step recorded with
# it instead of its list index, so both curves share a meaningful x-axis.
history = trainer.state.log_history
train_steps = [entry['step'] for entry in history if 'loss' in entry]
train_loss = [entry['loss'] for entry in history if 'loss' in entry]
eval_steps = [entry['step'] for entry in history if 'eval_loss' in entry]
eval_loss = [entry['eval_loss'] for entry in history if 'eval_loss' in entry]

# Visualize the training and validation loss
fig, ax = plt.subplots()
ax.plot(train_steps, train_loss, label="Train Loss")
ax.plot(eval_steps, eval_loss, marker="o", label="Validation Loss")
ax.set_xlabel("Training step")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss")
ax.legend()
plt.show()

### Task 3b

#### 11. Save the fine-tuned model
Once the model has finished training, save the fine-tuned model and the processor for future use:

In [None]:
# Save the fine-tuned model together with its processor in the same directory,
# so both can later be restored from that one path.
model.save_pretrained("./wav2vec2-large-960h-cv-finetuned")
processor.save_pretrained("./wav2vec2-large-960h-cv-finetuned")

### Task 3c

#### Do note that for subsequent tasks, I have used the base model ("facebook/wav2vec2-base-960h") for the fine-tuned model.

In [7]:
# Load the model and tokenizer
model_name = "facebook/wav2vec2-base-960h"

# Define the device in this section as well: transcribe_audio below moves its
# inputs to `device`, which was otherwise only created in the training section.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Location of the Common Voice test clips (mp3) and their metadata CSV
cv_folder = 'C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techtest/common_voice/'  # Update this to your local path
audio_folder = os.path.join(cv_folder, 'cv-valid-test/')
csv_file = os.path.join(cv_folder, 'cv-valid-test.csv')

# Read the metadata into a dataframe
cv_df = pd.read_csv(csv_file)

# Turn each entry of the 'filename' column into a full path to the audio file
cv_df['filename'] = [os.path.join(audio_folder, clip_name) for clip_name in cv_df['filename']]

# Show the first few rows of the DataFrame
cv_df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text_fine-tuned
0,C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...,without the dataset the article is useless,1,0,,,,,without the dat asset the articles useless
1,C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...,i've got to go to him,1,0,twenties,male,,,ive gat go to him
2,C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...,and you know it,1,0,,,,,and you know it
3,C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,,down below in the darkness were hundreds of pe...
4,C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...,hold your nose to keep the smell from disablin...,2,0,,,,,hold your nose to keep the smell from disablin...


In [9]:
def preprocess_audio(file_path, target_sr=16000):
    """Load a full audio clip and convert it to Wav2Vec2 input values.

    NOTE: this re-defines the ``preprocess_audio`` from the training section
    (no ``max_length`` argument, no truncation or padding); once this cell has
    run, the earlier definition is shadowed.

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sampling rate (Hz) expected by the model; audio recorded at
            another rate is resampled to it.

    Returns:
        The processor's ``input_values`` tensor, including the leading batch
        dimension the processor adds for a single clip.
    """
    # Load at the file's native sampling rate
    audio, sr = librosa.load(file_path, sr=None)

    # Resample audio if needed
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Peak-normalise to [-1, 1]. Silent or empty clips have a peak of 0 and are
    # left untouched; dividing by zero would turn the whole clip into NaN.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    # The batch dimension is kept because the model is called on this tensor directly.
    return processor(audio, sampling_rate=target_sr, return_tensors="pt").input_values

In [10]:
def transcribe_audio(file_path):
    """Transcribe one audio file with the module-level ``model`` and ``processor``.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription string for the clip.
    """
    # Preprocess the audio file
    input_values = preprocess_audio(file_path)

    # Follow the model's own device instead of a global `device` variable, so
    # the inputs always match the model and no earlier cell has to be run first.
    input_values = input_values.to(model.device)

    # Perform inference (model outputs logits); no gradients are needed
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most probable token id per frame
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the token ids to text; the batch holds a single clip
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

##### Generated text stored in cv-valid-test.csv under column "generated_text_fine-tuned"

In [11]:
# Transcribe every clip and store the lower-cased text in a new column
cv_df['generated_text_fine-tuned'] = cv_df['filename'].apply(transcribe_audio).str.lower()

# Print a sample to verify
print(cv_df[['filename', 'generated_text_fine-tuned']].head())

                                            filename  \
0  C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...   
1  C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...   
2  C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...   
3  C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...   
4  C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techte...   

                           generated_text_fine-tuned  
0         without the dat asset the articles useless  
1                                  ive gat go to him  
2                                    and you know it  
3  down below in the darkness were hundreds of pe...  
4  hold your nose to keep the smell from disablin...  


In [13]:
def evaluate_model_generated_text_fine_tuned(df):
    """Score the stored transcriptions against the ground truth.

    Reads the 'generated_text_fine-tuned' (hypothesis) and 'text' (reference)
    columns of ``df`` row by row, showing a progress bar, and returns the
    result of ``wer_metric.compute`` over all rows.
    """
    # Walk both columns in lockstep; tqdm reports progress over the rows
    paired = list(tqdm(zip(df['generated_text_fine-tuned'], df['text']), total=len(df)))

    predictions = [hypothesis for hypothesis, _ in paired]
    references = [reference for _, reference in paired]

    # Compute the Word Error Rate (WER)
    return wer_metric.compute(predictions=predictions, references=references)

In [14]:
wer_score = evaluate_model_generated_text_fine_tuned(cv_df)

# Log the overall performance. cv_df was loaded from cv-valid-test.csv, so the
# message names that set.
print(f"Word Error Rate (WER) on the cv-valid-test set (groundtruth VS fine-tuned): {wer_score * 100:.2f}%")

100%|████████████████████████████████████████████████████████████████████████████| 3995/3995 [00:00<00:00, 4565.84it/s]


Word Error Rate (WER) on the cv_valid_dev set (groundtruth VS fine-tuned): 13.67%


#### Word Error Rate (WER) of "fine-tuned" model on the cv-valid-test set: 13.67%