# 1 Load Dataset

In [None]:
!pip install torch torchaudio transformers datasets librosa



In [None]:
import os
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive and point `folder` at the project directory inside it
from google.colab import drive

drive.mount('/content/drive')
folder = '/'.join(['/content/drive', 'My Drive', 'ITI110'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the RAVDESS archive from Drive into the Colab VM
import os
import zipfile

# Source archive on Google Drive, and the extraction target in Colab's
# virtual space (not on Google Drive)
zip_file_path = f"{folder}/RAVDESS_EmotionalSpeechAudio.zip"
data_folder = '/content/RAVDESS/'

# Unpack every member of the archive under data_folder
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=data_folder)

# The extracted files are now readable from the Colab filesystem
print("Dataset extracted successfully!")

Dataset extracted successfully!


In [None]:
# Explore Dataset Structure
# run the following command to check the folder structure and see how files are organized

#import os

# List files and folders in the extracted dataset
# Walk the extracted tree, previewing at most five files per directory
for current_dir, _subdirs, filenames in os.walk(data_folder):
    print(f"📂 {current_dir}")
    preview = filenames[:5]
    if preview:
        print("\n".join(f"  📄 {name}" for name in preview))
    print("------")


📂 /content/RAVDESS/
------
📂 /content/RAVDESS/Actor_16
  📄 03-01-06-01-01-02-16.wav
  📄 03-01-03-02-02-01-16.wav
  📄 03-01-08-01-02-02-16.wav
  📄 03-01-07-01-02-02-16.wav
  📄 03-01-03-01-01-02-16.wav
------
📂 /content/RAVDESS/Actor_11
  📄 03-01-07-02-02-01-11.wav
  📄 03-01-08-02-01-01-11.wav
  📄 03-01-02-01-02-02-11.wav
  📄 03-01-04-01-01-01-11.wav
  📄 03-01-06-02-01-01-11.wav
------
📂 /content/RAVDESS/Actor_20
  📄 03-01-05-01-02-02-20.wav
  📄 03-01-01-01-01-02-20.wav
  📄 03-01-05-01-01-01-20.wav
  📄 03-01-08-02-02-02-20.wav
  📄 03-01-08-02-02-01-20.wav
------
📂 /content/RAVDESS/Actor_06
  📄 03-01-07-02-02-01-06.wav
  📄 03-01-08-02-01-02-06.wav
  📄 03-01-03-02-02-02-06.wav
  📄 03-01-06-02-02-02-06.wav
  📄 03-01-08-01-02-01-06.wav
------
📂 /content/RAVDESS/Actor_10
  📄 03-01-06-02-01-02-10.wav
  📄 03-01-05-01-01-01-10.wav
  📄 03-01-05-02-02-01-10.wav
  📄 03-01-07-02-02-02-10.wav
  📄 03-01-05-02-02-02-10.wav
------
📂 /content/RAVDESS/Actor_13
  📄 03-01-05-01-01-02-13.wav
  📄 03-01-08-01-

In [None]:
#import os
#import pandas as pd
import glob

# Define dataset folder
data_folder = "/content/RAVDESS/"

# Collect every WAV path under the per-actor folders. glob returns matches in
# a filesystem-dependent order, so sort them: the seeded resampling and the
# train/validation split further down are only reproducible when the row
# order is the same on every run.
file_paths = sorted(glob.glob(os.path.join(data_folder, "Actor_*", "*.wav")))


In [None]:
# Emotion mapping from filename convention
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}


def parse_filename(filepath):
    """Extract the emotion and speaker gender encoded in an audio filename.

    The basename is split on '-'. The third field is looked up in
    ``emotion_map``; the last field (up to the first '.') is read as the
    actor ID, with odd IDs labelled "male" and even IDs "female".

    Args:
        filepath: Path (or bare name) of the audio file.

    Returns:
        Tuple ``(emotion, gender)``. ``emotion`` is "unknown" when the code
        is not in ``emotion_map``; ``gender`` is "unknown" when the actor ID
        is not an integer; both are "unknown" when the name has fewer than
        three '-'-separated fields.
    """
    filename = os.path.basename(filepath)
    parts = filename.split("-")

    if len(parts) <= 2:
        return "unknown", "unknown"

    emotion = emotion_map.get(parts[2], "unknown")

    actor_id = parts[-1].split(".")[0]
    try:
        gender = "male" if int(actor_id) % 2 != 0 else "female"
    except ValueError:
        # A malformed actor field should not abort labelling of the whole
        # dataset; report the gender as unknown instead.
        gender = "unknown"

    return emotion, gender


In [None]:
# Parse each filename once up front. Building the label columns from plain
# lists avoids constructing a pd.Series per row inside DataFrame.apply.
parsed_labels = [parse_filename(path) for path in file_paths]

# One row per audio file: name, full path (for loading), and parsed labels
df = pd.DataFrame({
    "File Name": [os.path.basename(path) for path in file_paths],
    "File Path": file_paths,
    "Emotion": [emotion for emotion, _ in parsed_labels],
    "Gender": [gender for _, gender in parsed_labels],
})

# Display sample
df.head()

Unnamed: 0,File Name,File Path,Emotion,Gender
0,03-01-06-01-01-02-16.wav,/content/RAVDESS/Actor_16/03-01-06-01-01-02-16...,fearful,female
1,03-01-03-02-02-01-16.wav,/content/RAVDESS/Actor_16/03-01-03-02-02-01-16...,happy,female
2,03-01-08-01-02-02-16.wav,/content/RAVDESS/Actor_16/03-01-08-01-02-02-16...,surprised,female
3,03-01-07-01-02-02-16.wav,/content/RAVDESS/Actor_16/03-01-07-01-02-02-16...,disgust,female
4,03-01-03-01-01-02-16.wav,/content/RAVDESS/Actor_16/03-01-03-01-01-02-16...,happy,female


In [None]:
# Report the dataset size, then the number of clips per emotion
print(f"\nTotal Samples: {len(df)}")

emotion_counts = df["Emotion"].value_counts()
print("\nEmotion Distribution:")
print(emotion_counts)


Total Samples: 1440

Emotion Distribution:
Emotion
fearful      192
happy        192
surprised    192
disgust      192
sad          192
calm         192
angry        192
neutral       96
Name: count, dtype: int64


# 2 Preprocessing steps

In [None]:
import torchaudio
import torchaudio.transforms as T

# Define the target sample rate
target_sample_rate = 16000

# Function to load and preprocess audio (including resampling and normalization)
def load_audio(filepath):
    """Load an audio file, resample it if needed, and peak-normalise it.

    Args:
        filepath: Path handed to ``torchaudio.load``.

    Returns:
        The waveform tensor at ``target_sample_rate``, scaled so that its
        largest absolute sample is 1. A waveform with no samples or with
        only zero samples is returned unscaled.
    """
    waveform, sample_rate = torchaudio.load(filepath)

    # Resample only when the file's rate differs from the target
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    # Peak-normalise to [-1, 1]. A silent clip has a peak of 0 and dividing by
    # it would fill the tensor with NaN; an empty clip has no peak at all
    # (max() raises on it). Leave both untouched.
    if waveform.numel() > 0:
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak

    return waveform

# Function to trim silence from the audio
def trim_silence(waveform, threshold=0.01):
    """Run torchaudio's voice-activity detector (``T.Vad``) over a waveform.

    Args:
        waveform: Audio tensor; the detector is configured for
            ``target_sample_rate``, so the audio is expected to be at that rate.
        threshold: Value forwarded to ``T.Vad`` as ``trigger_level``.

    Returns:
        The tensor produced by the detector.
    """
    return T.Vad(sample_rate=target_sample_rate, trigger_level=threshold)(waveform)

# Demonstrate the trimming on the first file in the list
audio_file = file_paths[0]
waveform = load_audio(audio_file)
trimmed_waveform = trim_silence(waveform)

# Compare the number of samples before and after trimming
original_len, trimmed_len = waveform.shape[1], trimmed_waveform.shape[1]
print(f"Original audio length: {original_len} samples")
print(f"Trimmed audio length: {trimmed_len} samples")


Original audio length: 56056 samples
Trimmed audio length: 41656 samples


In [None]:
# Random Oversampling of Neutral Samples

from sklearn.utils import resample

# Partition the rows: the "neutral" class versus every other emotion
is_neutral = df["Emotion"] == "neutral"
neutral_df = df[is_neutral]
majority_df = df[~is_neutral]

# Draw 192 neutral rows with replacement (192 is the size of each other
# class); the fixed seed makes the draw repeatable
neutral_oversampled = resample(
    neutral_df,
    n_samples=192,
    replace=True,
    random_state=42,
)

# Recombine the untouched classes with the oversampled neutral rows
df_balanced = pd.concat([majority_df, neutral_oversampled])

# Check the new emotion distribution
balanced_counts = df_balanced["Emotion"].value_counts()
print("\nBalanced Emotion Distribution:")
print(balanced_counts)


Balanced Emotion Distribution:
Emotion
fearful      192
happy        192
surprised    192
disgust      192
sad          192
calm         192
angry        192
neutral      192
Name: count, dtype: int64


In [None]:
# Map Labels to Emotion Groups

# Coarse label groups and the fine-grained emotions that belong to each
emotion_groups = {
    "neutral": ("neutral", "calm"),
    "positive": ("happy", "surprised"),
    "negative": ("sad", "disgust", "angry", "fearful"),
}

# Invert the grouping into an emotion -> group lookup
emotion_mapping = {
    emotion: group
    for group, emotions in emotion_groups.items()
    for emotion in emotions
}

# Add a 'Label' column holding each row's group
df_balanced['Label'] = df_balanced['Emotion'].map(emotion_mapping)

# Show the first few rows to check the mapping
print(df_balanced.head())


                  File Name  \
0  03-01-06-01-01-02-16.wav   
1  03-01-03-02-02-01-16.wav   
2  03-01-08-01-02-02-16.wav   
3  03-01-07-01-02-02-16.wav   
4  03-01-03-01-01-02-16.wav   

                                           File Path    Emotion  Gender  \
0  /content/RAVDESS/Actor_16/03-01-06-01-01-02-16...    fearful  female   
1  /content/RAVDESS/Actor_16/03-01-03-02-02-01-16...      happy  female   
2  /content/RAVDESS/Actor_16/03-01-08-01-02-02-16...  surprised  female   
3  /content/RAVDESS/Actor_16/03-01-07-01-02-02-16...    disgust  female   
4  /content/RAVDESS/Actor_16/03-01-03-01-01-02-16...      happy  female   

      Label  
0  negative  
1  positive  
2  positive  
3  negative  
4  positive  


In [None]:
# Report the balanced dataset size, then the number of rows per label group
print(f"\nTotal Samples: {len(df_balanced)}")

label_counts = df_balanced["Label"].value_counts()
print("\nLabel Distribution:")
print(label_counts)


Total Samples: 1536

Label Distribution:
Label
negative    768
positive    384
neutral     384
Name: count, dtype: int64


In [None]:
# Stratified Undersampling for Negative Label

from sklearn.utils import resample

# Separate the negative rows from the positive and neutral rows
negative_df = df_balanced[df_balanced["Label"] == "negative"]
positive_neutral_df = df_balanced[df_balanced["Label"] != "negative"]

# Undersample the negative class to 384 rows. Passing `stratify` makes the
# draw proportional across the emotions that make up the negative label;
# without it the 384 rows are taken at random and the emotions can end up
# unevenly represented.
negative_undersampled = resample(negative_df,
                                 replace=False,      # Sample without replacement
                                 n_samples=384,      # Set to 384 to match the other classes
                                 random_state=42,    # For reproducibility
                                 stratify=negative_df["Emotion"])

# Concatenate the undersampled negative class back with the other classes
df_balMap = pd.concat([positive_neutral_df, negative_undersampled])

# Check the new label distribution
print("\nBalanced Label Distribution after Undersampling:")
print(df_balMap["Label"].value_counts())



Balanced Label Distribution after Undersampling:
Label
positive    384
neutral     384
negative    384
Name: count, dtype: int64


In [None]:
# Resample to 16kHz & Padding/Trimming to 2 seconds

import torch
import torchaudio

# Function to resample audio to 16kHz
def resample_audio(audio_path, target_sample_rate=16000):
    """Load an audio file and return its waveform at ``target_sample_rate``.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        target_sample_rate: Desired sample rate in Hz.

    Returns:
        The loaded waveform, resampled only when the file's own rate differs
        from the target.
    """
    waveform, original_sample_rate = torchaudio.load(audio_path)
    if original_sample_rate == target_sample_rate:
        return waveform

    to_target_rate = torchaudio.transforms.Resample(
        orig_freq=original_sample_rate,
        new_freq=target_sample_rate,
    )
    return to_target_rate(waveform)

# Function to pad or trim the audio to 2 seconds (at 16kHz)
def pad_or_trim_audio(waveform, target_duration_sec=2, sample_rate=16000):
    """Force a waveform to a fixed duration by truncating or zero-padding.

    Args:
        waveform: 2-D tensor whose second dimension is the sample axis.
        target_duration_sec: Desired duration in seconds; may be fractional.
        sample_rate: Samples per second of ``waveform``.

    Returns:
        A tensor with exactly ``round(target_duration_sec * sample_rate)``
        samples along dimension 1. Longer inputs keep their leading samples;
        shorter inputs are padded with zeros at the end.

    Raises:
        ValueError: If the requested duration is negative.
    """
    # Slicing and padding need an integer sample count; rounding lets
    # fractional durations such as 2.5 s work.
    target_length = int(round(target_duration_sec * sample_rate))
    if target_length < 0:
        raise ValueError("target_duration_sec must be non-negative")

    waveform_length = waveform.size(1)

    if waveform_length > target_length:  # Trim the waveform
        waveform = waveform[:, :target_length]
    elif waveform_length < target_length:  # Pad the waveform
        padding = target_length - waveform_length
        waveform = torch.nn.functional.pad(waveform, (0, padding))  # Pad with zeros at the end

    return waveform

# Example usage on your dataset
def preprocess_audio_file(audio_path):
    """Load one audio file and bring it to the fixed model input length.

    Chains ``resample_audio`` and ``pad_or_trim_audio`` with their default
    settings (16 kHz, 2 seconds).

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The resampled waveform, trimmed or zero-padded to the target length.
    """
    return pad_or_trim_audio(resample_audio(audio_path))

# Preprocess every file listed in the DataFrame and keep the resulting
# tensors next to the metadata
df_balMap['Processed Audio'] = df_balMap['File Path'].apply(preprocess_audio_file)

# Sanity check: inspect the shape of the first processed waveform
first_processed = df_balMap['Processed Audio'].iloc[0]
print(f"Processed audio shape for first file: {first_processed.shape}")


Processed audio shape for first file: torch.Size([1, 32000])


# 3 Preparing the Dataset for Wav2Vec 2.0

Step 1: Preparing the Dataset for Wav2Vec 2.0
We need to create a custom dataset class that serves the preprocessed audio together with the corresponding emotion labels.


1.   Audio Input: The input to Wav2Vec 2.0 should be the waveform (as a tensor).
2.   Labels: The labels for emotion classification come from the Label column of the df_balMap DataFrame.


Step 2: Implement Dataset Class
We'll use PyTorch's Dataset class to create a custom dataset that loads the audio and the corresponding labels.

In [None]:
# # Dataset Class

# import torch
# from torch.utils.data import Dataset
# import torchaudio

# class EmotionDataset(Dataset):
#     def __init__(self, dataframe, transform=None):
#         self.dataframe = dataframe
#         self.transform = transform

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         # Get the file path and label
#         audio_path = self.dataframe.iloc[idx]['File Path']
#         label = self.dataframe.iloc[idx]['Label']

#         # Load the audio file
#         waveform = self.dataframe.iloc[idx]['Processed Audio']  # Already processed audio

#         # Convert the label to an integer
#         label_map = {'negative': 0, 'positive': 1, 'neutral': 2}  # Map your labels to integers
#         label_idx = label_map[label]

#         # Apply any transformations (if needed)
#         if self.transform:
#             waveform = self.transform(waveform)

#         return waveform, label_idx

# # Example: Create the dataset for training
# emotion_dataset = EmotionDataset(df_balMap)

# # Example: Get the first item from the dataset
# example_waveform, example_label = emotion_dataset[0]
# print(f"Waveform shape: {example_waveform.shape}, Label: {example_label}")


Waveform shape: torch.Size([1, 32000]), Label: 1


In [None]:
# import torch
# from torch.utils.data import Dataset
# import torchaudio

# class EmotionDataset(Dataset):
#     def __init__(self, dataframe, transform=None):
#         self.dataframe = dataframe
#         self.transform = transform

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         # Get the file path and label
#         audio_path = self.dataframe.iloc[idx]['File Path']
#         label = self.dataframe.iloc[idx]['Label']

#         # Load the processed audio directly
#         waveform = self.dataframe.iloc[idx]['Processed Audio']  # Already processed audio

#         # Map label to integer
#         label_map = {'negative': 0, 'positive': 1, 'neutral': 2}  # Label mapping
#         label_idx = torch.tensor(label_map[label], dtype=torch.long)  # Convert label to tensor

#         # Apply transformations (if needed)
#         if self.transform:
#             waveform = self.transform(waveform)

#         return waveform, label_idx  # Ensure label is returned as a tensor

# # Example: Create the dataset for training
# emotion_dataset = EmotionDataset(df_balMap)

# # Example: Get the first item from the dataset
# example_waveform, example_label = emotion_dataset[0]
# print(f"Waveform shape: {example_waveform.shape}, Label: {example_label}")


Waveform shape: torch.Size([1, 32000]), Label: 1


In [None]:
import torch
from torch.utils.data import Dataset


class EmotionDataset(Dataset):
    """Serve ``(input_values, label)`` pairs built from a preprocessed DataFrame.

    Each row must provide a 'Processed Audio' waveform and a 'Label' string
    that is a key of ``label_map``.

    Args:
        dataframe: Table with 'Processed Audio' and 'Label' columns.
        processor: Callable invoked as
            ``processor(array, sampling_rate=..., return_tensors="pt", padding=True)``
            whose result exposes ``input_values`` (e.g. a Wav2Vec2 processor).
        transform: Optional callable applied to the waveform tensor before
            it is handed to ``processor``.
        sampling_rate: Sample rate reported to ``processor``.
    """

    # Integer class index for each label string
    LABEL_MAP = {'negative': 0, 'positive': 1, 'neutral': 2}

    def __init__(self, dataframe, processor, transform=None, sampling_rate=16000):
        self.dataframe = dataframe
        self.processor = processor  # Wav2Vec2 processor
        self.transform = transform
        self.sampling_rate = sampling_rate
        self.label_map = dict(self.LABEL_MAP)  # Per-instance copy of the label mapping

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processed input values and label tensor for row ``idx``."""
        row = self.dataframe.iloc[idx]

        # as_tensor reuses an existing float32 tensor instead of copy-constructing
        # it; torch.tensor(<tensor>) emits a UserWarning on every sample.
        waveform = torch.as_tensor(row['Processed Audio'], dtype=torch.float32)

        # Apply the optional transform (previously accepted but never used)
        if self.transform is not None:
            waveform = self.transform(waveform)

        # Convert label to an integer class index
        label_idx = torch.tensor(self.label_map[row['Label']], dtype=torch.long)

        # Process audio with the Wav2Vec2 processor
        inputs = self.processor(
            waveform.numpy(),
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs.input_values.squeeze(0)  # Remove batch dimension

        return input_values, label_idx  # Label is returned as a tensor


Step 3: Creating DataLoader
Once we have the EmotionDataset class, we can use PyTorch’s DataLoader to handle batching, shuffling, and parallel processing of the data.

In [None]:
from sklearn.model_selection import train_test_split

# 1. Split Data into Train/Validation (70:30)
# The oversampling step drew "neutral" rows with replacement, so df_balMap can
# hold several copies of the same file. A row-wise split could place copies of
# one recording on both sides and inflate the validation scores. Split the
# unique files instead, then send every row to the side its file belongs to.
unique_files = df_balMap.drop_duplicates(subset="File Path")
train_paths, val_paths = train_test_split(
    unique_files["File Path"],
    test_size=0.3,
    stratify=unique_files["Label"],
    random_state=42,
)
train_df = df_balMap[df_balMap["File Path"].isin(train_paths)]
val_df = df_balMap[df_balMap["File Path"].isin(val_paths)]
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

Training set size: 806
Validation set size: 346


In [None]:
#import torch
#import os
#from torch.utils.data import DataLoader, Dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from torch import nn
from transformers import AdamW
#from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import matplotlib.pyplot as plt
#import numpy as np

In [None]:
# 3. Load Processor and Model
model_name = "facebook/wav2vec2-base"
# Three output classes: negative, positive, neutral
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=3)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

# 4. Wrap the train and validation splits in EmotionDataset objects that
# share the same processor
train_dataset, val_dataset = (
    EmotionDataset(split_df, processor) for split_df in (train_df, val_df)
)

# def custom_collate_fn(batch):
#     # Ensure all waveforms have the same number of channels (1 channel in this case)
#     waveforms, labels = zip(*batch)

#     # Process each waveform in the batch to ensure consistent shape
#     processed_waveforms = []
#     for waveform in waveforms:
#         # If the waveform has more than 1 channel, convert it to mono by averaging over the channels
#         if waveform.ndim > 1:
#             waveform = waveform.mean(axis=0)  # Averaging channels to make it mono

#         # Ensure the waveform is a 1D tensor (shape: [num_samples]) and convert to tensor
#         waveform = torch.tensor(waveform, dtype=torch.float32)

#         # Add to the list of processed waveforms
#         processed_waveforms.append(waveform)

#     # Use the default collate_fn to stack the processed waveforms into a batch
#     waveforms_batch = default_collate(processed_waveforms)
#     labels_batch = torch.tensor(labels)

#     return waveforms_batch, labels_batch


# Same batch size for both loaders; only the training loader is shuffled
loader_batch_size = 32
train_dataloader = DataLoader(dataset=train_dataset, batch_size=loader_batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=loader_batch_size)

# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, collate_fn=custom_collate_fn)
# val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, collate_fn=custom_collate_fn)


In [None]:
#from torch.utils.data import DataLoader

# Create DataLoader for training
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Pull a single batch to confirm the tensor shapes
batch_iterator = iter(train_dataloader)
waveforms, labels = next(batch_iterator)
print(f"Batch of waveforms shape: {waveforms.shape}, Labels shape: {labels.shape}")


Batch of waveforms shape: torch.Size([32, 32000]), Labels shape: torch.Size([32])


  waveform = torch.tensor(self.dataframe.iloc[idx]['Processed Audio'], dtype=torch.float32)


Explanation:


1.   EmotionDataset Class:
  *   Loads the processed audio and corresponding label.
  *   Converts the label to an integer class index (for training with Wav2Vec 2.0).
  *   Accepts an optional `transform` argument.

2.   DataLoader:
  *   Batches the dataset for efficient processing.
  *   Shuffles the training data to ensure randomness during training (the validation loader is not shuffled).
  *   Can load data in parallel when `num_workers` is set; the loaders above use the default single-process loading.


# 4 Fine-tuning Wav2Vec 2.0
To fine-tune Wav2Vec 2.0 on the emotion classification task, we need to:


1.   Load Pre-trained Wav2Vec 2.0: We load the pre-trained model from the Hugging Face transformers library.
2.   Modify the Model for Emotion Classification: Wav2Vec 2.0 outputs embeddings, so a classification head is needed on top of it to predict the emotion labels. `Wav2Vec2ForSequenceClassification` provides this head (the `projector` and `classifier` layers reported as newly initialized when the model was loaded).
3.   Train the Model: Train the model on the dataset for emotion classification.


In [None]:
# # Load Pre-trained Wave2Vec 2.0 Model

# import torch
# from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
# from torch import nn
# from transformers import AdamW
# from tqdm import tqdm

# # Load the pre-trained Wav2Vec 2.0 model and processor
# #model_name = "facebook/wav2vec2-base"
# model_name = "facebook/wav2vec2-small"
# model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 labels: negative, positive, neutral
# processor = Wav2Vec2Processor.from_pretrained(model_name)

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Define loss function and optimizer
# loss_fn = nn.CrossEntropyLoss()
# optimizer = AdamW(model.parameters(), lr=1e-5)

# # Training loop
# def train_model(model, dataloader, optimizer, loss_fn, epochs=3):
#     model.train()  # Set the model to training mode
#     for epoch in range(epochs):
#         total_loss = 0
#         for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
#             input_waveforms, labels = batch
#             input_waveforms = input_waveforms.squeeze(1).to(device)  # Remove channel dimension and move to device
#             labels = labels.to(device)

#             # Process the audio to the correct input format for Wave2Vec 2.0
#             input_values = processor(input_waveforms, return_tensors="pt", padding=True).input_values
#             input_values = input_values.to(device)

#             # Forward pass
#             outputs = model(input_values, labels=labels)
#             loss = outputs.loss
#             logits = outputs.logits

#             # Backward pass
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()

#         print(f"Epoch {epoch + 1} Loss: {total_loss / len(dataloader)}")

# # Train the model
# train_model(model, train_dataloader, optimizer, loss_fn, epochs=3)


In [None]:
# # 2. Custom Dataset Class for Audio Files
# class AudioDataset(Dataset):
#     def __init__(self, dataframe, processor, max_duration=2, sampling_rate=16000):
#         self.dataframe = dataframe
#         self.processor = processor
#         self.max_duration = max_duration
#         self.sampling_rate = sampling_rate

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         file_path = self.dataframe.iloc[idx]['File Path']
#         label = self.dataframe.iloc[idx]['Label']

#         # Load and preprocess audio
#         waveform, _ = torchaudio.load(file_path)
#         waveform = self.resample_audio(waveform)
#         waveform = self.trim_or_pad(waveform)

#         # Process audio with Wav2Vec2 processor
#         inputs = self.processor(waveform, return_tensors="pt", sampling_rate=self.sampling_rate, padding=True)
#         input_values = inputs.input_values.squeeze(0)  # Remove the batch dimension

#         return input_values, label

#     def resample_audio(self, waveform):
#         return torchaudio.transforms.Resample(orig_freq=44100, new_freq=self.sampling_rate)(waveform)

#     def trim_or_pad(self, waveform):
#         target_length = self.max_duration * self.sampling_rate
#         current_length = waveform.shape[1]

#         if current_length > target_length:
#             return waveform[:, :target_length]
#         elif current_length < target_length:
#             padding = target_length - current_length
#             return torch.nn.functional.pad(waveform, (0, padding))
#         else:
#             return waveform


In [None]:
# # 2. Custom Dataset Class for Audio Files
# class AudioDataset(Dataset):
#     def __init__(self, dataframe, processor, max_duration=2, sampling_rate=16000):
#         self.dataframe = dataframe
#         self.processor = processor
#         self.max_duration = max_duration
#         self.sampling_rate = sampling_rate

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         file_path = self.dataframe.iloc[idx]['File Path']
#         label = torch.tensor(self.dataframe.iloc[idx]['Label'], dtype=torch.long)  # Convert label to tensor

#         # Load and preprocess audio
#         waveform, _ = torchaudio.load(file_path)
#         waveform = self.resample_audio(waveform)
#         waveform = self.trim_or_pad(waveform)

#         # Process audio with Wav2Vec2 processor
#         inputs = self.processor(waveform, return_tensors="pt", sampling_rate=self.sampling_rate, padding=True)
#         input_values = inputs.input_values.squeeze(0)  # Remove the batch dimension

#         return input_values, label  # Ensure label is a tensor

#     def resample_audio(self, waveform):
#         return torchaudio.transforms.Resample(orig_freq=44100, new_freq=self.sampling_rate)(waveform)

#     def trim_or_pad(self, waveform):
#         target_length = self.max_duration * self.sampling_rate
#         current_length = waveform.shape[1]

#         if current_length > target_length:
#             return waveform[:, :target_length]
#         elif current_length < target_length:
#             padding = target_length - current_length
#             return torch.nn.functional.pad(waveform, (0, padding))
#         else:
#             return waveform


In [None]:
# 5.1 Define Callbacks
class EarlyStopping:
    """Track a loss across calls and raise a flag once it stops improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``early_stop`` is set.
        delta: Margin by which a loss must undercut ``best_loss`` to count
            as an improvement.

    Attributes are plain instance fields: ``best_loss`` (lowest improving
    loss seen), ``counter`` (current run of non-improving calls) and
    ``early_stop`` (set to True once the run reaches ``patience``).
    """

    def __init__(self, patience=3, delta=0):
        self.patience, self.delta = patience, delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, loss):
        """Record one loss value and update the stopping state."""
        improved = loss < self.best_loss - self.delta
        if improved:
            # New best: remember it and restart the patience count
            self.best_loss, self.counter = loss, 0
            return

        self.counter += 1
        if self.patience <= self.counter:
            self.early_stop = True


# Early stopping and model checkpoint
early_stopping = EarlyStopping(patience=5)

def save_best_model(model, epoch, loss, path='./'):
    """Save the model's state dict to a file named after the epoch and loss.

    Args:
        model: Object exposing ``state_dict()``.
        epoch: Zero-based epoch index; the filename uses ``epoch + 1``.
        loss: Loss value, written into the filename with four decimals.
        path: Directory to save into; created if it does not exist.

    Returns:
        The full path of the file that was written.
    """
    # torch.save does not create missing parent directories
    if path:
        os.makedirs(path, exist_ok=True)

    model_save_path = os.path.join(path, f'model_epoch_{epoch + 1}_loss_{loss:.4f}.pth')
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved at {model_save_path}")
    return model_save_path


In [None]:
# # 5.2 Define Training Loop

# # Optimizer and loss function
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# loss_fn = torch.nn.CrossEntropyLoss()

# # Training function
# def train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, epochs=3):
#     model.train()
#     best_val_loss = float('inf')
#     training_losses = []
#     validation_losses = []
#     train_accuracies = []
#     val_accuracies = []

#     for epoch in range(epochs):
#         total_loss = 0
#         correct_preds_train = 0
#         total_train = 0
#         for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
#             input_waveforms, labels = batch
#             input_waveforms = input_waveforms.squeeze(1).to(device)
#             labels = torch.tensor(labels, dtype=torch.long).to(device)  # Ensure labels are tensors

#         # for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
#         #     input_waveforms, labels = batch
#         #     input_waveforms = input_waveforms.squeeze(1).to(device)
#         #     labels = labels.to(device)

#             # Process the audio
#             input_values = processor(input_waveforms, return_tensors="pt", padding=True).input_values
#             input_values = input_values.to(device)

#             # Forward pass
#             outputs = model(input_values, labels=labels)
#             loss = outputs.loss
#             logits = outputs.logits

#             # Backward pass
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()

#             # Accuracy calculation
#             preds = torch.argmax(logits, dim=-1)
#             correct_preds_train += (preds == labels).sum().item()
#             total_train += labels.size(0)

#         # Calculate training accuracy
#         train_accuracy = correct_preds_train / total_train
#         train_accuracies.append(train_accuracy)

#         # Validation phase
#         model.eval()
#         correct_preds_val = 0
#         total_val = 0
#         with torch.no_grad():
#             for batch in val_dataloader:
#                 input_waveforms, labels = batch
#                 input_waveforms = input_waveforms.squeeze(1).to(device)
#                 labels = labels.to(device)

#                 input_values = processor(input_waveforms, return_tensors="pt", padding=True).input_values
#                 input_values = input_values.to(device)

#                 # Predict
#                 outputs = model(input_values)
#                 logits = outputs.logits
#                 preds = torch.argmax(logits, dim=-1)

#                 correct_preds_val += (preds == labels).sum().item()
#                 total_val += labels.size(0)

#         # Calculate validation accuracy
#         val_accuracy = correct_preds_val / total_val
#         val_accuracies.append(val_accuracy)

#         # Save the best model
#         val_loss = total_loss / len(train_dataloader)
#         if val_loss < best_val_loss:
#             best_val_loss = val_loss
#             save_best_model(model, epoch, best_val_loss)

#         # Early stopping
#         early_stopping(val_loss)
#         if early_stopping.early_stop:
#             print(f"Early stopping at epoch {epoch + 1}")
#             break

#         training_losses.append(total_loss / len(train_dataloader))
#         validation_losses.append(val_loss)

#     return training_losses, validation_losses, train_accuracies, val_accuracies

# # Training the model and getting losses and accuracies
# training_losses, validation_losses, train_accuracies, val_accuracies = train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, epochs=10)

# # Display Training Loss and Accuracy
# display_training_loss_accuracy(training_losses, validation_losses, train_accuracies, val_accuracies, epochs=10)

# # Save model4 after training
# #model4 = model  # Final model


  waveform = torch.tensor(self.dataframe.iloc[idx]['Processed Audio'], dtype=torch.float32)
  labels = torch.tensor(labels, dtype=torch.long).to(device)  # Ensure labels are tensors
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Epoch 1/10:   0%|          | 0/26 [00:00<?, ?it/s]


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 32, 32000]

In [None]:
# 5.2 Define Training Loop

# Optimizer (AdamW over all model parameters) and classification loss
learning_rate = 1e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Training function
def train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, epochs=3):
    """Train ``model`` and record per-epoch loss and accuracy.

    Every epoch performs one optimisation pass over ``train_dataloader``
    followed by a gradient-free pass over ``val_dataloader``. The validation
    loss (not the training loss) drives both best-model checkpointing and
    early stopping.

    Relies on notebook-level names defined in other cells: ``processor``,
    ``device``, ``tqdm``, ``save_best_model`` and ``early_stopping``.

    Args:
        model: Module called as ``model(input_values)``; its result must
            expose ``.logits``.
        train_dataloader: Iterable yielding ``(input_waveforms, labels)``
            batches used for optimisation.
        val_dataloader: Iterable yielding ``(input_waveforms, labels)``
            batches used for validation.
        optimizer: Optimizer whose ``zero_grad``/``step`` are invoked once
            per training batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor; used for both training and validation loss.
        epochs: Maximum number of epochs to run.

    Returns:
        ``(training_losses, validation_losses, train_accuracies,
        val_accuracies)`` - four lists with one entry per completed epoch,
        always of equal length (including the epoch that triggers early
        stopping).
    """
    # NOTE(review): assumes the audio was already resampled to the rate the
    # processor expects - confirm against the preprocessing cells.
    feature_extractor = getattr(processor, "feature_extractor", processor)
    sampling_rate = feature_extractor.sampling_rate

    def _to_input_values(input_waveforms):
        """Convert a batch of waveforms into a [batch, num_samples] model input."""
        if input_waveforms.dim() == 3:
            # Drop a singleton channel axis if the dataset provides one.
            input_waveforms = input_waveforms.squeeze(1)
        # Hand the processor one NumPy array per utterance. Passing the stacked
        # tensor instead yields an extra leading dimension (the source of the
        # conv1d shape error seen earlier) and needs squeeze() workarounds.
        utterances = [waveform.detach().cpu().numpy() for waveform in input_waveforms]
        features = processor(
            utterances,
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True,
        )
        return features.input_values.to(device)

    best_val_loss = float('inf')
    training_losses = []
    validation_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(epochs):
        # The validation phase switches the model to eval mode, so training
        # mode has to be restored at the start of every epoch.
        model.train()
        train_loss_sum = 0.0
        train_batches = 0
        correct_preds_train = 0
        total_train = 0
        print(f"Epoch {epoch + 1}/{epochs} starting...")

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
            input_waveforms, labels = batch
            labels = labels.to(device)
            input_values = _to_input_values(input_waveforms)

            # Forward pass
            logits = model(input_values).logits
            loss = loss_fn(logits, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_batches += 1

            # Accuracy calculation
            preds = torch.argmax(logits, dim=-1)
            correct_preds_train += (preds == labels).sum().item()
            total_train += labels.size(0)

        # Guard the averages so an empty dataloader does not raise ZeroDivisionError.
        train_loss = train_loss_sum / train_batches if train_batches else 0.0
        train_accuracy = correct_preds_train / total_train if total_train else 0.0

        # Validation phase
        model.eval()
        val_loss_sum = 0.0
        val_batches = 0
        correct_preds_val = 0
        total_val = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_waveforms, labels = batch
                labels = labels.to(device)
                input_values = _to_input_values(input_waveforms)

                logits = model(input_values).logits
                val_loss_sum += loss_fn(logits, labels).item()
                val_batches += 1

                preds = torch.argmax(logits, dim=-1)
                correct_preds_val += (preds == labels).sum().item()
                total_val += labels.size(0)

        val_loss = val_loss_sum / val_batches if val_batches else 0.0
        val_accuracy = correct_preds_val / total_val if total_val else 0.0

        # Record the epoch before any early exit so all four histories stay aligned.
        training_losses.append(train_loss)
        validation_losses.append(val_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

        # Save the best model (judged on validation loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_best_model(model, epoch, best_val_loss)

        # Early stopping
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    return training_losses, validation_losses, train_accuracies, val_accuracies


In [None]:
# Training the model and getting losses and accuracies
# One constant feeds both calls so the training run and the plot cannot drift
# apart (previously 100 epochs were trained while the plot was told 10).
NUM_EPOCHS = 100
training_losses, validation_losses, train_accuracies, val_accuracies = train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, epochs=NUM_EPOCHS)

# Display Training Loss and Accuracy
# NOTE(review): display_training_loss_accuracy is defined in a later cell; on a
# fresh kernel that cell has to be executed before this one.
display_training_loss_accuracy(training_losses, validation_losses, train_accuracies, val_accuracies, epochs=NUM_EPOCHS)

# Save model4 after training
#model4 = model  # Final model

Epoch 1/100 starting...


  waveform = torch.tensor(self.dataframe.iloc[idx]['Processed Audio'], dtype=torch.float32)
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Batch 1: Input values shape: torch.Size([32, 32000])


Epoch 1/100:   4%|▍         | 1/26 [00:01<00:35,  1.42s/it]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Batch 2: Input values shape: torch.Size([32, 32000])


Epoch 1/100:   8%|▊         | 2/26 [00:02<00:30,  1.27s/it]


RuntimeError: stack expects each tensor to be equal size, but got [32000] at entry 0 and [2, 32000] at entry 3

## Visualize training performance

In [None]:
# 6. Display Training Loss and Accuracy
def display_training_loss_accuracy(training_losses, validation_losses, train_accuracies, val_accuracies, epochs):
    """Plot accuracy and loss histories side by side.

    The left panel shows train/validation accuracy, the right panel shows
    train/validation loss. Each series is plotted against its list index.

    Args:
        training_losses: Per-epoch training loss values.
        validation_losses: Per-epoch validation loss values.
        train_accuracies: Per-epoch training accuracy values.
        val_accuracies: Per-epoch validation accuracy values.
        epochs: Accepted for call compatibility; not used by the plot.
    """
    plt.figure(figsize=(20, 4))

    # (title, y-axis label, legend location, ((curve label, values), ...))
    panel_specs = (
        (
            'Model Accuracy',
            'Accuracy',
            'upper left',
            (('Train Accuracy', train_accuracies), ('Validation Accuracy', val_accuracies)),
        ),
        (
            'Model Loss',
            'Loss',
            'upper right',
            (('Train Loss', training_losses), ('Validation Loss', validation_losses)),
        ),
    )

    for panel_position, (panel_title, y_label, legend_loc, curves) in enumerate(panel_specs, start=1):
        plt.subplot(1, 2, panel_position)
        for curve_label, curve_values in curves:
            plt.plot(curve_values, label=curve_label)
        plt.title(panel_title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend(loc=legend_loc)

    plt.show()

#display_training_loss_accuracy(training_losses, validation_losses, epochs=10)


In [None]:
# 7. Evaluate and Generate Classification Report
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect true and predicted class ids.

    Relies on the notebook-level ``processor`` and ``device`` defined in other
    cells.

    Args:
        model: Module called as ``model(input_values)``; its result must
            expose ``.logits``.
        dataloader: Iterable yielding ``(input_waveforms, labels)`` batches.

    Returns:
        ``(true_labels, predictions)`` as two NumPy arrays with one entry per
        evaluated sample, in dataloader order.
    """
    model.eval()
    predictions = []
    true_labels = []

    # NOTE(review): assumes the audio was already resampled to the rate the
    # processor expects - confirm against the preprocessing cells.
    feature_extractor = getattr(processor, "feature_extractor", processor)
    sampling_rate = feature_extractor.sampling_rate

    with torch.no_grad():
        for batch in dataloader:
            input_waveforms, labels = batch
            if input_waveforms.dim() == 3:
                # Drop a singleton channel axis if the dataset provides one.
                input_waveforms = input_waveforms.squeeze(1)

            # Give the processor one NumPy array per utterance so it returns a
            # [batch, num_samples] tensor. Passing the stacked tensor adds a
            # leading dimension and triggers the conv1d shape error in the model.
            utterances = [waveform.detach().cpu().numpy() for waveform in input_waveforms]
            input_values = processor(
                utterances,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                padding=True,
            ).input_values
            input_values = input_values.to(device)

            # Predict
            outputs = model(input_values)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=-1)

            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return np.array(true_labels), np.array(predictions)

# Collect ground-truth and predicted class ids for the validation split
true_labels_val, predictions_val = evaluate_model(model=model, dataloader=val_dataloader)

# Display names handed to the report via `target_names`
labels = ['Negative', 'Positive', 'Neutral']

# Header and separator first, then the per-class metrics table
print(
    "\nValidation Data Classification Report:",
    "--------------------------------------------------------",
    sep="\n",
)
print(
    classification_report(
        true_labels_val,
        predictions_val,
        target_names=labels,
    )
)