# Fine-tuning wav2vec for speaker classification

In [None]:
# Make Google Drive visible inside the Colab runtime; the dataset archive is
# read from it in the next cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!unzip "/content/drive/MyDrive/release_in_the_wild.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: release_in_the_wild/550.wav  
  inflating: release_in_the_wild/5500.wav  
  inflating: release_in_the_wild/5501.wav  
  inflating: release_in_the_wild/5502.wav  
  inflating: release_in_the_wild/5503.wav  
  inflating: release_in_the_wild/5504.wav  
  inflating: release_in_the_wild/5505.wav  
  inflating: release_in_the_wild/5506.wav  
  inflating: release_in_the_wild/5507.wav  
  inflating: release_in_the_wild/5508.wav  
  inflating: release_in_the_wild/5509.wav  
  inflating: release_in_the_wild/551.wav  
  inflating: release_in_the_wild/5510.wav  
  inflating: release_in_the_wild/5511.wav  
  inflating: release_in_the_wild/5512.wav  
  inflating: release_in_the_wild/5513.wav  
  inflating: release_in_the_wild/5514.wav  
  inflating: release_in_the_wild/5515.wav  
  inflating: release_in_the_wild/5516.wav  
  inflating: release_in_the_wild/5517.wav  
  inflating: release_in_the_wild/5518.wav  
  inflating: 

## Download and import the necessary libraries

In [None]:
!pip install transformers librosa datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from google.colab import files
import shutil
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
from datasets import Dataset
from torch.utils.data import Dataset, DataLoader


## Initialization

In [None]:
# Paths to audio files and metadata
audio_folder = "/content/release_in_the_wild"
meta_path = "/content/release_in_the_wild/final_updated_meta.csv"

# Load metadata
meta_full = pd.read_csv(meta_path)
print(f"Loaded metadata with {len(meta_full)} entries.")

# Row position at which the metadata is cut into two parts.
SPLIT_INDEX = 19963

# Splitting the metadata.
# Explicit copies are taken so that columns added to either part later on are
# written to an independent DataFrame rather than to a slice of `meta_full`
# (which is what triggers pandas' SettingWithCopyWarning).
meta = meta_full.iloc[:SPLIT_INDEX].copy()    # rows 0 .. SPLIT_INDEX - 1
split2 = meta_full.iloc[SPLIT_INDEX:].copy()  # rows SPLIT_INDEX .. end of file

# Display the sizes of the splits
print(f"Split1 size: {len(meta)} entries.")
print(f"Split2 size: {len(split2)} entries.")

Loaded metadata with 20448 entries.
Split1 size: 19963 entries.
Split2 size: 485 entries.


In [None]:
# Distinct speaker names as a plain list; a speaker's position in the list is
# used as its integer class id.
celebrities = meta["speaker"].unique().tolist()
label_map = dict(zip(celebrities, range(len(celebrities))))

In [None]:
# Hyperparameters configuration

# --- Audio ---
SAMPLE_RATE = 16000
MAX_LENGTH = 16000  # Maximum length of audio (in samples); not referenced by the cells visible in this notebook
MAX_AUDIO_LENGTH = 4 * SAMPLE_RATE  # 4 seconds max duration, expressed in samples

# --- Model / optimisation ---
NUM_LABELS = len(celebrities)  # Number of classes (one per distinct speaker)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 10

## Audio input normalization

The audio normalization consists of the following steps:  
1. Removing duration outliers
2. Removing imbalanced classes

### Remove outliers from data folder and metadata file

In [None]:
def identify_outliers(durations, threshold=1.5):
    """
    Identifies outliers in audio durations using the IQR method.

    A value is an outlier when it lies strictly below
    ``Q1 - threshold * IQR`` or strictly above ``Q3 + threshold * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``durations``.

    Parameters:
    - durations: Sized sequence of durations (in seconds).
    - threshold: Multiplicative factor for IQR to determine outliers.

    Returns:
    - outliers: List of the values from ``durations`` identified as outliers,
      in their original order. Empty when ``durations`` is empty.
    """
    if len(durations) == 0:
        # Percentiles of an empty sample are undefined; there is nothing to flag.
        return []

    q1, q3 = np.percentile(durations, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr

    return [d for d in durations if d < lower_bound or d > upper_bound]


def remove_outliers(meta, column='duration', threshold=1.5):
    """
    Drops every row whose value in ``column`` is flagged by ``identify_outliers``.

    Parameters:
    - meta (pd.DataFrame): DataFrame holding the column to inspect.
    - column (str): Name of the column whose values are tested.
    - threshold (float): IQR multiplier forwarded to ``identify_outliers``.

    Returns:
    - meta (pd.DataFrame): New DataFrame without the flagged rows and with a
      fresh 0..n-1 index. The number of removed rows is printed.
    """
    rows_before = len(meta)
    flagged_values = identify_outliers(meta[column].tolist(), threshold=threshold)

    # A row is discarded when its value equals one of the flagged values.
    is_outlier = meta[column].isin(flagged_values)
    meta = meta.loc[~is_outlier].reset_index(drop=True)

    removed = rows_before - len(meta)
    print(f"Removed {removed} outlier entries.")

    return meta

In [None]:
def calculate_durations(meta, audio_folder, sr=16000):
    """
    Calculates the duration of each audio file and returns the metadata with a new 'duration' column.

    The input DataFrame is left untouched; the column is added to a copy.

    Parameters:
    - meta (pd.DataFrame): DataFrame containing at least a 'file' column.
    - audio_folder (str): Path to the folder containing audio files.
    - sr (int): Sampling rate forwarded to ``librosa.get_duration``.

    Returns:
    - meta (pd.DataFrame): Copy of the input with a 'duration' column. Rows whose
      duration could not be computed are dropped and the index is reset.
    """
    durations = []
    # Only the 'file' column is needed, so iterate over it directly instead of
    # materialising every row with iterrows().
    for file_name in tqdm(meta['file'], total=len(meta), desc="Calculating Durations"):
        file_path = os.path.join(audio_folder, file_name)
        try:
            # `path=` replaces the `filename=` alias, which librosa reports as
            # deprecated and scheduled for removal in version 1.0.
            duration = librosa.get_duration(path=file_path, sr=sr)
            durations.append(duration)
        except Exception as e:
            # Best effort: report the file and keep going; the row is dropped below.
            print(f"Error processing {file_path}: {e}")
            durations.append(None)  # Placeholder for failed files

    # assign() returns a new DataFrame, so a slice passed in by the caller is
    # never written to (avoids pandas' SettingWithCopyWarning).
    meta = meta.assign(duration=durations)

    # Remove entries whose duration could not be computed
    initial_count = len(meta)
    meta = meta.dropna(subset=['duration']).reset_index(drop=True)
    removed = initial_count - len(meta)
    if removed > 0:
        print(f"Removed {removed} entries due to failed duration calculations.")

    return meta

# Attach a 'duration' column to the metadata (rows whose file cannot be read are dropped).
meta = calculate_durations(meta=meta, audio_folder=audio_folder, sr=16000)
print("Metadata now contains 'duration' column.")

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=file_path, sr=sr)
Calculating Durations: 100%|██████████| 19963/19963 [14:12<00:00, 23.42it/s]

Metadata now contains 'duration' column.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['duration'] = durations


In [None]:
# Step 2: discard recordings whose duration falls outside the IQR fences.
meta = remove_outliers(meta=meta, column='duration', threshold=1.5)
print("Outliers have been removed from the DataFrame.")

Removed 1155 outlier entries.
Outliers have been removed from the DataFrame.


### Remove imbalanced classes

In [None]:
def aggregate_audio_data_by_speaker(meta, duration_column='duration'):
    """
    Summarises the recordings of every speaker.

    For each distinct value of the 'speaker' column the result holds:
    - total_files: number of non-null entries in the 'file' column,
    - total_duration: sum of ``duration_column``,
    - average_duration: total_duration divided by total_files.

    Parameters:
    - meta (pd.DataFrame): DataFrame with 'speaker', 'file' and ``duration_column``.
    - duration_column (str): Name of the column holding the audio duration.

    Returns:
    - aggregated_df (pd.DataFrame): One row per speaker, ordered by
      total_duration from largest to smallest, with a fresh 0..n-1 index.
    """
    per_speaker = meta.groupby('speaker').agg(
        total_files=('file', 'count'),
        total_duration=(duration_column, 'sum'),
    )
    per_speaker = per_speaker.reset_index()

    # Mean length of a single recording for each speaker.
    per_speaker['average_duration'] = per_speaker['total_duration'] / per_speaker['total_files']

    # Speakers with the most audio come first.
    ordered = per_speaker.sort_values(by='total_duration', ascending=False)
    return ordered.reset_index(drop=True)


# Per-speaker summary before any rebalancing (duration_column defaults to 'duration').
aggregate_audio_data_by_speaker(meta)

Unnamed: 0,speaker,total_files,total_duration,average_duration
0,Barack Obama,3098,11035.678,3.562194
1,Donald Trump,3147,10230.536313,3.250885
2,Bernie Sanders,1513,5115.920687,3.381309
3,Alec Guinness,1818,4748.248312,2.611798
4,Ayn Rand,1356,4735.859875,3.492522
5,Bill Clinton,1117,3064.43075,2.743447
6,Christopher Hitchens,753,2610.13025,3.466308
7,Ronald Reagan,607,1902.231313,3.133824
8,Winston Churchill,623,1689.100062,2.711236
9,Martin Luther King,490,1364.259375,2.784203


Randomly dropping tracks from extreme majority classes

In [None]:
# Number of recordings to discard for each over-represented speaker.
drop_counts = {
    'Barack Obama': 2000,
    'Donald Trump': 2000,
    'Bernie Sanders': 500,
    'Alec Guinness': 800,
    'Ayn Rand': 300
}

# Fixed seed so that the same recordings are discarded on every run.
drop_rng = np.random.default_rng(42)

# Loop through each speaker and drop the specified number of records
for speaker, drop_count in drop_counts.items():
    speaker_data = meta[meta['speaker'] == speaker]

    # Never request more rows than the speaker has: sampling without
    # replacement would otherwise raise ValueError.
    n_to_drop = min(drop_count, len(speaker_data))

    # Randomly sample the records to drop
    drop_indices = drop_rng.choice(speaker_data.index, size=n_to_drop, replace=False)

    # Drop the selected records from the DataFrame
    meta = meta.drop(drop_indices)

# Reset the index after dropping
meta = meta.reset_index(drop=True)

print(f"After dropping, the dataset contains {len(meta)} records.")

After dropping, the dataset contains 13208 records.


Fully dropping extreme minority classes

In [None]:
# Count the number of recordings per speaker
speaker_counts = meta['speaker'].value_counts()

# Get a list of speakers with fewer than 10 recordings
speakers_to_drop = speaker_counts[speaker_counts < 10].index

# Drop those speakers from the DataFrame
meta = meta[~meta['speaker'].isin(speakers_to_drop)]

# Reset index after dropping
meta.reset_index(drop=True, inplace=True)

print(f"After dropping, the dataset contains {len(meta)} records.")

After dropping, the dataset contains 13180 records.


In [None]:
# Per-speaker summary after rebalancing (duration_column defaults to 'duration').
aggregate_audio_data_by_speaker(meta)

Unnamed: 0,speaker,total_files,total_duration,average_duration
0,Barack Obama,1098,3923.650938,3.573453
1,Ayn Rand,1056,3711.477188,3.514656
2,Donald Trump,1147,3689.890438,3.216993
3,Bernie Sanders,1013,3431.733563,3.387694
4,Bill Clinton,1117,3064.43075,2.743447
5,Alec Guinness,1018,2666.168813,2.619026
6,Christopher Hitchens,753,2610.13025,3.466308
7,Ronald Reagan,607,1902.231313,3.133824
8,Winston Churchill,623,1689.100062,2.711236
9,Martin Luther King,490,1364.259375,2.784203


In [None]:
# Append the rows of `split2` to the filtered rows and renumber the index.
# NOTE(review): `split2` never went through the duration / outlier steps in the
# cells shown here, so its rows presumably carry no 'duration' value — confirm
# this is intended.
frames_to_join = [meta, split2]
meta = pd.concat(frames_to_join, ignore_index=True)

print(f"Loaded metadata with {len(meta)} entries.")

# Persist the processed metadata so later sections can start from this file.
meta.to_csv("/content/processed_meta.csv", index=False)

Loaded metadata with 13665 entries.


## Load pre-processed meta file

In [None]:
import pandas as pd

# Location of the audio files and of the metadata written by the previous section.
audio_folder = "/content/release_in_the_wild"
meta_path = "/content/processed_meta.csv"

meta = pd.read_csv(meta_path)

## Preprocessing and feature extraction

In [None]:
# Load pre-trained Wav2Vec2 feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Create label map
celebrities = meta["speaker"].unique()
NUM_LABELS = len(celebrities)
label_map = {name: idx for idx, name in enumerate(celebrities)}

In [None]:
def preprocess_audio(file_path):
    """
    Loads one audio file and returns a waveform of exactly MAX_AUDIO_LENGTH samples.

    The file is loaded at SAMPLE_RATE, passed through ``librosa.util.normalize``
    (library defaults), then either cut to its first MAX_AUDIO_LENGTH samples or
    padded at the end up to that length.

    Parameters:
    - file_path (str): Path of the audio file to load.

    Returns:
    - 1-D array of MAX_AUDIO_LENGTH samples.
    """
    waveform, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    waveform = librosa.util.normalize(waveform)

    missing_samples = MAX_AUDIO_LENGTH - len(waveform)
    if missing_samples < 0:
        # Longer than the limit: keep only the beginning.
        return waveform[:MAX_AUDIO_LENGTH]

    # At or below the limit: pad at the end (np.pad's default fill).
    return np.pad(waveform, (0, missing_samples))

In [None]:
# Dataset class
class SpeakerRecognitionDataset(Dataset):
    """
    Dataset that turns one metadata row into an ``(input_values, label)`` pair.

    Parameters:
    - metadata: Table indexed positionally via ``.iloc``; each row provides
      'file' and 'speaker'.
    - audio_folder (str): Folder that the 'file' entries are relative to.
    - label_map (dict): Maps a speaker name to its integer class id.
    - preprocess_audio_fn (callable): Takes a file path and returns the waveform.
    - feature_extractor (callable): Called with the waveform; its result must
      expose ``input_values`` with a leading batch dimension.
    """

    def __init__(self, metadata, audio_folder, label_map, preprocess_audio_fn, feature_extractor):
        self.metadata = metadata
        self.audio_folder = audio_folder
        self.label_map = label_map
        self.preprocess_audio_fn = preprocess_audio_fn
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return ``(input_values, label)`` for the row at position ``idx``.

        Raises:
        - ValueError: if the preprocessing function returns None for the file.
        """
        record = self.metadata.iloc[idx]
        wav_path = os.path.join(self.audio_folder, record['file'])
        class_id = self.label_map[record['speaker']]

        # Raw waveform produced by the injected preprocessing function.
        waveform = self.preprocess_audio_fn(wav_path)
        if waveform is None:
            raise ValueError(f"Error processing file: {wav_path}")

        # Run the Wav2Vec2 feature extractor on the single waveform.
        extracted = self.feature_extractor(
            waveform,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        # The extractor output carries a batch dimension of size one; drop it.
        features = extracted.input_values.squeeze(0)

        return features, torch.tensor(class_id, dtype=torch.long)

In [None]:
# Custom collate function for DataLoader
def collate_fn(batch):
    """
    Merges a list of ``(input_values, label)`` samples into two batch tensors.

    Parameters:
    - batch: Sequence of ``(tensor, label)`` pairs; all tensors must share one shape.

    Returns:
    - (inputs, labels): the sample tensors stacked along a new first dimension,
      and the labels as a 1-D ``torch.long`` tensor.
    """
    waveforms, targets = zip(*batch)
    return torch.stack(waveforms), torch.tensor(targets, dtype=torch.long)

In [None]:
# Create dataset
dataset = SpeakerRecognitionDataset(meta, audio_folder, label_map, preprocess_audio, feature_extractor)

# Split dataset: 80 % for training, the remainder for testing
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# A seeded generator makes the train/test membership identical on every run;
# without it the test set changes each time the cell is executed, so results
# are not comparable across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

## Load model and optimizer

In [None]:
# Store the speaker-name <-> class-id mapping in the model config so that a
# saved checkpoint can translate predicted ids back into speaker names.
# Names are converted to str so every key in label2id has the same type.
label2id = {str(name): idx for name, idx in label_map.items()}
id2label = {idx: name for name, idx in label2id.items()}

# Load Wav2Vec2 model and move to device
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=NUM_LABELS,
    label2id=label2id,
    id2label=id2label,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): the training loop shown in this notebook reads the loss from the
# model output, so `loss_fn` appears to be unused there.
loss_fn = torch.nn.CrossEntropyLoss()

## Training

In [None]:
for epoch in range(EPOCHS):
    model.train()

    # Per-epoch running statistics.
    total_loss, correct_predictions, total_samples = 0, 0, 0

    progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{EPOCHS}")
    for input_values, labels in progress_bar:
        input_values, labels = input_values.to(device), labels.to(device)

        # Forward pass: labels are passed in, and the loss is read from the model output.
        outputs = model(input_values=input_values, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backward pass: clear old gradients, back-propagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss and the number of correct predictions in this batch.
        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

    # Epoch summary: mean batch loss and accuracy over all training samples seen.
    train_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {total_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/10: 100%|██████████| 684/684 [10:37<00:00,  1.07it/s]


Epoch 1/10, Loss: 3.3391, Accuracy: 0.1025


Training Epoch 2/10: 100%|██████████| 684/684 [10:11<00:00,  1.12it/s]


Epoch 2/10, Loss: 2.4413, Accuracy: 0.3261


Training Epoch 3/10: 100%|██████████| 684/684 [10:08<00:00,  1.12it/s]


Epoch 3/10, Loss: 1.6433, Accuracy: 0.5393


Training Epoch 4/10: 100%|██████████| 684/684 [10:06<00:00,  1.13it/s]


Epoch 4/10, Loss: 1.2689, Accuracy: 0.6427


Training Epoch 5/10: 100%|██████████| 684/684 [10:03<00:00,  1.13it/s]


Epoch 5/10, Loss: 0.9610, Accuracy: 0.7515


Training Epoch 6/10: 100%|██████████| 684/684 [10:07<00:00,  1.13it/s]


Epoch 6/10, Loss: 0.7096, Accuracy: 0.8230


Training Epoch 7/10: 100%|██████████| 684/684 [10:05<00:00,  1.13it/s]


Epoch 7/10, Loss: 0.5930, Accuracy: 0.8545


Training Epoch 8/10: 100%|██████████| 684/684 [10:07<00:00,  1.13it/s]


Epoch 8/10, Loss: 0.4725, Accuracy: 0.8814


Training Epoch 9/10: 100%|██████████| 684/684 [10:05<00:00,  1.13it/s]


Epoch 9/10, Loss: 0.3988, Accuracy: 0.9004


Training Epoch 10/10:  83%|████████▎ | 567/684 [08:23<01:44,  1.12it/s]

## Evaluation

In [None]:
# Switch the model to evaluation mode and count correct predictions on the test split.
model.eval()
correct_predictions = total_samples = 0

# Gradients are not needed while evaluating.
with torch.no_grad():
    for input_values, labels in tqdm(test_loader, desc="Evaluating"):
        labels = labels.to(device)

        logits = model(input_values=input_values.to(device)).logits
        preds = logits.argmax(dim=1)

        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = correct_predictions / total_samples
print(f"Test Accuracy: {test_accuracy:.4f}")

## Save model

In [None]:
# Save model and feature extractor side by side in the same directory
output_dir = "trained_model_3"
for artifact in (model, feature_extractor):
    artifact.save_pretrained(output_dir)

TypeError: stat: path should be string, bytes, os.PathLike or integer, not Wav2Vec2ForSequenceClassification

In [None]:
import shutil
from google.colab import files

# Zip the saved model directory; the archive is written as "trained_model_3.zip".
shutil.make_archive(base_name="trained_model_3", format='zip', root_dir="trained_model_3")

# Download the zipped model file through the browser
files.download("trained_model_3.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>