<a href="https://colab.research.google.com/github/tmm-ai/affect_whisperer/blob/main/Wav2Vec_intv_helper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install resampy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive so the project folder is reachable as a regular path.
from google.colab import drive
drive.mount('/content/drive')
# To detach the mount again, run: drive.flush_and_unmount()

# Make the project folder on the mounted Drive the current working directory.
%cd /content/drive/MyDrive/Interview_Helper

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Interview_Helper


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import librosa
import os
import numpy as np

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
import os
import glob
import random
import librosa
import numpy as np
from sklearn.model_selection import train_test_split

def load_data_tess(directory):
    """Load TESS-style recordings from ``directory`` with integer emotion labels.

    Files are discovered with the glob pattern ``<emotion>_*.wav`` for every
    emotion in the label map and decoded with ``librosa.load`` at 16 kHz.

    Args:
        directory: Folder that directly contains the ``.wav`` files.

    Returns:
        Tuple ``(data, labels)`` of two parallel lists: the decoded waveforms
        and the label index of each waveform.
    """
    # NOTE(review): these indices are specific to this loader, whereas
    # load_data_ravdess derives its labels from the file name -- confirm the
    # two encodings agree before mixing the datasets.
    label_map = {
        "angry": 0,
        "disgust": 1,
        "fear": 2,
        "happy": 3,
        "neutral": 4,
        "sad": 5,
        "surprise": 6
    }
    data = []
    labels = []
    for emotion, label in label_map.items():
        pattern = os.path.join(directory, f"{emotion}_*.wav")
        # glob returns matches in a filesystem-dependent order; sort them so the
        # sample order (and any seeded split built from it) is reproducible.
        for file in sorted(glob.glob(pattern)):
            audio, _ = librosa.load(file, sr=16000)
            data.append(audio)
            labels.append(label)
    return data, labels

def load_data_ravdess(directory):
    """Load RAVDESS-style recordings found under ``directory/Actor_*/``.

    Each ``.wav`` file is decoded with ``librosa.load`` at 16 kHz. The label is
    taken from the file name: its third ``-``-separated field is parsed as an
    integer and shifted down by one so that labels start at zero.

    Args:
        directory: Folder containing the ``Actor_*`` sub-folders.

    Returns:
        Tuple ``(data, labels)`` of two parallel lists: the decoded waveforms
        and the zero-based label of each waveform.

    Raises:
        IndexError: If a matched file name has fewer than three ``-`` fields.
        ValueError: If the third field is not an integer.
    """
    data = []
    labels = []
    pattern = os.path.join(directory, "Actor_*/*.wav")
    # glob returns matches in a filesystem-dependent order; sort them so the
    # sample order (and any seeded split built from it) is reproducible.
    for file in sorted(glob.glob(pattern)):
        emotion = int(os.path.basename(file).split("-")[2]) - 1
        audio, _ = librosa.load(file, sr=16000)
        data.append(audio)
        labels.append(emotion)
    return data, labels

def preprocess_data(data, labels, sr=16000, duration=3):
    """Center-crop or zero-pad every waveform to ``sr * duration`` samples.

    Waveforms at least as long as the target keep their middle section; shorter
    ones are zero-padded on both sides, with any odd leftover sample going to
    the right.

    Args:
        data: Sequence of 1-D waveforms (sliceable and accepted by ``np.pad``).
        labels: Sequence of labels, indexed in parallel with ``data``.
        sr: Sample rate used to convert ``duration`` into a sample count.
        duration: Target clip length in seconds; fractional values are allowed.

    Returns:
        Tuple ``(preprocessed_data, preprocessed_labels)`` of two lists with one
        entry per input waveform, in the original order.
    """
    # Computed once instead of per item, and rounded to an int so fractional
    # durations (e.g. 2.5 s) still yield valid slice bounds and pad widths.
    target_length = int(round(sr * duration))

    preprocessed_data = []
    preprocessed_labels = []
    for i, audio in enumerate(data):
        length = len(audio)
        if length >= target_length:
            start = (length - target_length) // 2
            preprocessed_data.append(audio[start:start + target_length])
        else:
            missing = target_length - length
            left = missing // 2
            preprocessed_data.append(np.pad(audio, (left, missing - left), mode='constant'))
        preprocessed_labels.append(labels[i])

    return preprocessed_data, preprocessed_labels

# Load and preprocess TESS data (clips are cropped/padded to the preprocess_data defaults).
# NOTE(review): the directory passed to the TESS loader is named after RAVDESS --
# confirm this is really where the TESS files live.
tess_data, tess_labels = load_data_tess("/content/drive/MyDrive/Interview_Helper/RAVDESS_data/Audio_Speech_Actors_01-24")
tess_data, tess_labels = preprocess_data(tess_data, tess_labels)

# Load and preprocess RAVDESS data
# NOTE(review): "RAVDESS" is a relative path, so it resolves against the current
# working directory -- confirm it points at the folder holding the Actor_* sub-folders.
ravdess_data, ravdess_labels = load_data_ravdess("RAVDESS")
ravdess_data, ravdess_labels = preprocess_data(ravdess_data, ravdess_labels)

# Combine TESS and RAVDESS datasets
# NOTE(review): TESS labels come from a fixed name->index map while RAVDESS labels
# are parsed from the file name; verify both use the same index for the same
# emotion before training on the combination.
combined_data = tess_data + ravdess_data
combined_labels = tess_labels + ravdess_labels

# Split data into train and eval sets: 20% held out, fixed seed, and stratified
# on the labels so both parts keep the same class proportions.
train_data, eval_data, train_labels, eval_labels = train_test_split(combined_data, combined_labels, test_size=0.2, random_state=42, stratify=combined_labels)


In [None]:
def load_and_preprocess_data(dataset_path, sample_rate=22050, duration=4):
    """Load every ``.wav`` clip under ``dataset_path/<folder>/`` at a fixed length.

    Each clip is decoded with ``librosa.load`` at ``sample_rate`` (reading at
    most ``duration`` seconds) and center-padded to exactly
    ``sample_rate * duration`` samples. The label is the third ``-``-separated
    field of the file name, parsed as an integer and made zero-based.

    Args:
        dataset_path: Folder whose sub-folders contain the audio files.
        sample_rate: Sample rate the audio is resampled to.
        duration: Clip length in seconds.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per clip,
        ordered by folder name and then by file name.

    Raises:
        IndexError: If a ``.wav`` file name has fewer than three ``-`` fields.
        ValueError: If the third field of a file name is not an integer.
    """
    features = []
    labels = []
    num_samples = int(round(sample_rate * duration))

    # Sorted so the sample order does not depend on the filesystem's listing order.
    for actor_dir in sorted(os.listdir(dataset_path)):
        actor_path = os.path.join(dataset_path, actor_dir)

        # Skip stray files sitting next to the folders (e.g. hidden metadata).
        if not os.path.isdir(actor_path):
            continue

        for audio_file in sorted(os.listdir(actor_path)):
            # Only .wav files are treated as clips, like the glob-based loaders
            # in this notebook; anything else would break decoding or parsing.
            if not audio_file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(actor_path, audio_file)

            # Load the audio file
            audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration, res_type='kaiser_fast')

            # pad_center only pads, so trim a possible resampling overshoot
            # first to make this a true "pad or truncate".
            audio = librosa.util.pad_center(audio[:num_samples], size=num_samples)

            # Get the emotion label from the file name (zero-indexed)
            emotion = int(audio_file.split("-")[2]) - 1

            features.append(audio)
            labels.append(emotion)

    return np.array(features), np.array(labels)


In [None]:
# features = np.load('/content/drive/MyDrive/Interview_Helper/features_Wav2Vec.npy')
# labels = np.load('/content/drive/MyDrive/Interview_Helper/labels_Wav2Vec.npy')

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing each feature entry with its label.

    Entries are converted to tensors lazily, when an item is accessed.
    """

    def __init__(self, features, labels):
        # Keep references to the caller's containers; nothing is copied here.
        self.labels = labels
        self.features = features

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(feature, label)`` as a float32 tensor and an int64 tensor."""
        waveform = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        # squeeze(0) drops the leading axis only when that axis has size 1.
        return waveform.squeeze(0), target


In [None]:
# Load and preprocess data.
# The facebook/wav2vec2-base checkpoint fine-tuned below is pretrained on 16 kHz
# audio, so resample to 16 kHz instead of using the loader's 22.05 kHz default.
features, labels = load_and_preprocess_data(
    '/content/drive/MyDrive/Interview_Helper/RAVDESS_data/Audio_Speech_Actors_01-24',
    sample_rate=16000,
)

# Split data into training and validation sets (20% held out, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create Dataset objects
train_dataset = EmotionDataset(X_train, y_train)
val_dataset = EmotionDataset(X_val, y_val)

# Create DataLoader objects; only the training set is reshuffled each epoch
batch_size = 32  #16,32,64,128,256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Persist the feature and label arrays to Google Drive as .npy files
np.save(file='/content/drive/MyDrive/Interview_Helper/features_Wav2Vec.npy', arr=features)
np.save(file='/content/drive/MyDrive/Interview_Helper/labels_Wav2Vec.npy', arr=labels)

In [None]:
# Pick the compute device first: GPU when CUDA is available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build a Wav2Vec2 sequence classifier from the pretrained base checkpoint with
# num_emotions output labels, then place it on the chosen device.
num_emotions = 8
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_emotions,
).to(device)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_q.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.weight', 'projecto

In [None]:
# Training parameters
lr = 1e-5   #  1e-5, 3e-5, 1e-4, 1e-3, 5e-3, 1e-2, 5e-2
num_epochs = 30  # 10, 20 30, 50, 100, 200

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Early-stopping state: stop after `patience` consecutive epochs without a new
# best validation loss.
patience = 4
best_val_loss = float('inf')
epochs_without_improvement = 0

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    # Batch tensors get their own names so the module-level `features` / `labels`
    # arrays used by other cells are not overwritten.
    for batch_features, batch_labels in train_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        # Forward pass
        outputs = model(batch_features).logits
        loss = criterion(outputs, batch_labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch; weight it by the batch
        # size so that dividing by the dataset size below yields a true
        # per-sample average (also correct for a smaller last batch).
        total_loss += loss.item() * batch_labels.size(0)
        total_correct += (outputs.argmax(1) == batch_labels).sum().item()

    train_loss = total_loss / len(train_dataset)
    train_acc = total_correct / len(train_dataset)

    # Validation
    model.eval()
    total_loss = 0.0
    total_correct = 0
    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_features).logits
            loss = criterion(outputs, batch_labels)

            total_loss += loss.item() * batch_labels.size(0)
            total_correct += (outputs.argmax(1) == batch_labels).sum().item()

    val_loss = total_loss / len(val_dataset)
    val_acc = total_correct / len(val_dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("Early stopping due to no improvement in validation loss for {} epochs.".format(patience))
            break

Epoch 1/30
Train Loss: 0.0643, Train Acc: 0.2535
Val Loss: 0.0630, Val Acc: 0.2917
Epoch 2/30
Train Loss: 0.0606, Train Acc: 0.3446
Val Loss: 0.0576, Val Acc: 0.3333
Epoch 3/30
Train Loss: 0.0545, Train Acc: 0.4627
Val Loss: 0.0509, Val Acc: 0.5486
Epoch 4/30
Train Loss: 0.0480, Train Acc: 0.5773
Val Loss: 0.0449, Val Acc: 0.5833
Epoch 5/30
Train Loss: 0.0416, Train Acc: 0.6519
Val Loss: 0.0402, Val Acc: 0.6215
Epoch 6/30
Train Loss: 0.0348, Train Acc: 0.7665
Val Loss: 0.0359, Val Acc: 0.6562
Epoch 7/30
Train Loss: 0.0301, Train Acc: 0.8021
Val Loss: 0.0320, Val Acc: 0.7222
Epoch 8/30
Train Loss: 0.0252, Train Acc: 0.8585
Val Loss: 0.0302, Val Acc: 0.7153
Epoch 9/30
Train Loss: 0.0211, Train Acc: 0.8889
Val Loss: 0.0276, Val Acc: 0.7361
Epoch 10/30
Train Loss: 0.0184, Train Acc: 0.9036
Val Loss: 0.0246, Val Acc: 0.7882
Epoch 11/30
Train Loss: 0.0160, Train Acc: 0.9201
Val Loss: 0.0240, Val Acc: 0.7917
Epoch 12/30
Train Loss: 0.0140, Train Acc: 0.9288
Val Loss: 0.0276, Val Acc: 0.7361
E

In [None]:
from sklearn.base import BaseEstimator

class PyTorchModelWrapper(BaseEstimator):
    """scikit-learn style estimator that trains and evaluates a PyTorch classifier.

    Args:
        model_class: Zero-argument callable returning a fresh model.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.
        batch_size: Mini-batch size used for both training and prediction.
        learning_rate: Learning rate passed to ``torch.optim.Adam``.
        num_epochs: Number of passes over the training data made by ``fit``.
    """

    def __init__(self, model_class, criterion, device, batch_size=32, learning_rate=1e-3, num_epochs=10):
        self.model_class = model_class
        self.criterion = criterion
        self.device = device
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    @staticmethod
    def _as_logits(outputs):
        """Return the score tensor from a plain tensor or from an object exposing ``.logits``."""
        return getattr(outputs, "logits", outputs)

    def _train_one_epoch(self, model, loader, optimizer):
        """Run one optimisation pass over every batch in ``loader``."""
        model.train()
        for features, labels in loader:
            features = features.to(self.device)
            labels = labels.to(self.device)

            optimizer.zero_grad()
            loss = self.criterion(self._as_logits(model(features)), labels)
            loss.backward()
            optimizer.step()

    def fit(self, X, y):
        """Train a fresh model on ``(X, y)`` and store it as ``self.model_``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Create Datasets and DataLoaders
        train_dataset = EmotionDataset(X, y)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        # Initialize the model, optimizer, and move them to the device
        model = self.model_class().to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)

        # The training step lives in this class so fit does not depend on a
        # module-level train_model helper being defined.
        for _ in range(self.num_epochs):
            self._train_one_epoch(model, train_loader, optimizer)

        self.model_ = model
        return self

    def predict(self, X):
        """Return the model's raw output scores for ``X``, one row per sample.

        Note that these are scores, not class indices; ``score`` applies the
        arg-max itself.
        """
        # EmotionDataset needs labels, so pass placeholders that are never read.
        val_dataset = EmotionDataset(X, np.zeros(len(X)))
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)

        # Set the model to evaluation mode
        self.model_.eval()

        all_outputs = []
        with torch.no_grad():
            for features, _ in val_loader:
                outputs = self._as_logits(self.model_(features.to(self.device)))
                all_outputs.append(outputs.cpu().numpy())

        # Concatenate all outputs
        return np.concatenate(all_outputs, axis=0)

    def score(self, X, y):
        """Return the accuracy of the arg-max prediction on ``(X, y)``."""
        y_pred = self.predict(X)
        return np.mean(np.asarray(y) == np.argmax(y_pred, axis=1))


In [None]:
# Specify the directory where you want to save the model
output_dir = "/content/drive/MyDrive/Interview_Helper/wav2vec2_emotion_model_Ravdess1e5_30epoch_8160"

# Create the directory (and any missing parents). exist_ok makes this a no-op
# when it already exists and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Save the model so it can be reloaded later with from_pretrained(output_dir)
model.save_pretrained(output_dir)

In [None]:
from transformers import Wav2Vec2ForSequenceClassification

# Load the saved model and put it on the same device the batches are moved to,
# mirroring what is done when the model is first created.
model = Wav2Vec2ForSequenceClassification.from_pretrained(output_dir).to(device)
