In [None]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa

# Monitor the training process
# !pip install wandb

In [None]:
# Session-wide environment variables. Comments are kept on their own lines
# because anything after `%env NAME=` is taken as part of the value.

# Use the C.UTF-8 locale for the session.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Both Hugging Face caches (models and datasets) share the same folder.
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): presumably enabled to debug CUDA errors - confirm it is still
# wanted, as it may slow down GPU work.
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [None]:
# One record per readable .wav file: clip name, file path and emotion label.
data = []
# Files torchaudio could not decode, kept for inspection instead of being
# silently discarded.
broken_files = []

dataset_root = Path("/content/drive/MyDrive/vocal_dataset/TESS Toronto emotional speech set data")

for path in tqdm(dataset_root.glob("**/*.wav")):
    # File name up to the first dot, e.g. "YAF_dog_fear".
    name = path.name.split(".")[0]
    # The emotion is the last "_"-separated token of the parent folder name.
    # Lower-case it first so the "surprised" alias matches whatever the casing.
    label = path.parent.name.split("_")[-1].lower()
    if label == "surprised":
        label = "surprise"

    try:
        # There are some broken files: loading is only a decodability check,
        # the decoded audio itself is not kept.
        torchaudio.load(path)
    except Exception as e:
        broken_files.append((str(path), repr(e)))
        continue

    data.append({
        "name": name,
        "path": path,
        "emotion": label,
    })

if broken_files:
    print(f"Skipped {len(broken_files)} unreadable file(s); see `broken_files` for details.")

2800it [01:49, 25.63it/s]


In [None]:
# Build the metadata table from the collected records, then preview every
# 20th row between positions 30 and 299.
df = pd.DataFrame(data=data)
df.iloc[30:300:20]

Unnamed: 0,name,path,emotion
30,YAF_dog_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
50,YAF_gaze_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
70,YAF_kill_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
90,YAF_luck_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
110,YAF_mop_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
130,YAF_rat_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
150,YAF_rush_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
170,YAF_time_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
190,YAF_turn_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear
210,OAF_book_happy,/content/drive/MyDrive/vocal_dataset/TESS Toro...,happy


In [None]:
# Filter broken and non-existent paths

print(f"Step 0: {len(df)}")

# Flag every row by whether its file is still on disk; missing files get None
# so that dropna can remove them.
df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
# Drop on "status", not "path": "path" is never null, so filtering on it
# removed nothing and the flag computed above went unused.
df = df.dropna(subset=["status"])
# "status" is kept on purpose: the cell that reloads the CSV splits removes
# this column explicitly.
print(f"Step 1: {len(df)}")

# Shuffle the rows (seeded, so a re-run gives the same order) and renumber
# the index from zero.
df = df.sample(frac=1, random_state=101).reset_index(drop=True)
df.head()

Step 0: 2800
Step 1: 2800


Unnamed: 0,name,path,emotion,status
0,YAF_pad_disgust,/content/drive/MyDrive/vocal_dataset/TESS Toro...,disgust,True
1,OAF_cheek_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear,True
2,YAF_merge_ps,/content/drive/MyDrive/vocal_dataset/TESS Toro...,surprise,True
3,OAF_youth_fear,/content/drive/MyDrive/vocal_dataset/TESS Toro...,fear,True
4,OAF_raise_disgust,/content/drive/MyDrive/vocal_dataset/TESS Toro...,disgust,True


In [None]:
# Show which emotion labels are present, then the number of clips per label.
emotion_labels = df["emotion"].unique()
print("Labels: ", emotion_labels)
print()
df.groupby("emotion")[["path"]].count()

Labels:  ['disgust' 'fear' 'surprise' 'happy' 'neutral' 'angry' 'sad']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
angry,400
disgust,400
fear,400
happy,400
neutral,400
sad,400
surprise,400


In [None]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Pick one clip at random, report where it sits in the table and what its
# label is, then play it back at 16 kHz as a sanity check.
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path, label = sample["path"], sample["emotion"]

print(f"ID Location: {idx}", f"      Label: {label}", "", sep="\n")

waveform, sr = torchaudio.load(path)
# Keep the first channel only and hand librosa a NumPy array.
speech = np.asarray(waveform[0].numpy().squeeze())
# Pass the original and target sample rates as keyword arguments
speech = librosa.resample(y=speech, orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 856
      Label: happy



In [None]:
save_path = "/content/drive/MyDrive/vocal_dataset/data"
# Create the output folder if needed; to_csv does not create missing
# directories and would fail otherwise.
os.makedirs(save_path, exist_ok=True)

# 80/20 split, stratified on the emotion label and seeded for repeatability.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tab-separated files without the index column; they are read back with the
# same delimiter when the splits are loaded as datasets.
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

(2240, 4)
(560, 4)


In [None]:
# Loading the created dataset using datasets
import datasets
from datasets import load_dataset


# Read the two tab-separated splits back in; test.csv serves as the
# validation split.
data_files = dict(
    train=f"{save_path}/train.csv",
    validation=f"{save_path}/test.csv",
)
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

# "status" is only the file-existence flag written with the CSVs, so it is
# stripped from both splits.
train_dataset = dataset["train"].remove_columns("status")
eval_dataset = dataset["validation"].remove_columns("status")

print(train_dataset, eval_dataset, sep="\n")


Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 2240
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 560
})


In [None]:
# Column names read by the preprocessing step: the audio file path is the
# input and the emotion label is the target.
input_column, output_column = "path", "emotion"

In [None]:
# Collect the distinct labels of the training split; sorting keeps the
# label -> id assignment deterministic.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 7 classes: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [None]:
from transformers import AutoConfig, Wav2Vec2Processor

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
# Checkpoint used for the config and the processor, plus the pooling mode that
# gets attached to the config.
# NOTE(review): the classifier itself is later loaded from a different
# checkpoint ("ehcalabres/...") - confirm which one is intended.
model_name_or_path, pooling_mode = "lighteternal/wav2vec2-large-xlsr-53-greek", "mean"

In [None]:
# config
# Label <-> id lookup tables follow the (sorted) order of label_list.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
# Extra attribute carried on the config object.
config.pooling_mode = pooling_mode

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



In [None]:
# Load the processor of the chosen checkpoint and read the sampling rate its
# feature extractor is configured with; audio is resampled to this rate later.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The target sampling rate: 16000


In [None]:
def speech_file_to_array_fn(path):
    """Load an audio file as a mono 1-D NumPy array at the target sampling rate.

    Args:
        path: Location of the audio file; passed straight to ``torchaudio.load``.

    Returns:
        numpy.ndarray: The waveform resampled to the module-level
        ``target_sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # torchaudio returns (channels, time). Averaging over the channel axis
    # gives a 1-D signal for stereo files as well; squeeze() alone only
    # flattened single-channel audio and left multi-channel input 2-D.
    mono = speech_array.mean(dim=0)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    return resampler(mono).numpy()

def label_to_id(label, label_list):
    """Map a label to its position in ``label_list``.

    Args:
        label: The label value to look up.
        label_list: Sequence of known labels.

    Returns:
        The index of ``label`` in ``label_list``; ``-1`` when the label is not
        in a non-empty list; the label itself, unchanged, when ``label_list``
        is empty.
    """
    # Nothing to map against: hand the label back as it came in.
    if len(label_list) == 0:
        return label

    # Unknown labels are reported with the sentinel -1.
    if label not in label_list:
        return -1

    return label_list.index(label)

def preprocess_function(examples):
    """Turn a batch of rows into processor outputs plus integer labels.

    Every path under ``input_column`` is loaded with
    ``speech_file_to_array_fn``, every value under ``output_column`` is mapped
    to its id in ``label_list``, and the waveforms are passed through
    ``processor`` at ``target_sampling_rate``.

    Args:
        examples: A batch, indexable by column name.

    Returns:
        The processor output with an added ``"labels"`` entry.

    Raises:
        Exception: Whatever the loading or processing step raised; the batch
        and the traceback are printed before it is re-raised.
    """
    import traceback

    try:
        waveforms = []
        for audio_path in examples[input_column]:
            waveforms.append(speech_file_to_array_fn(audio_path))
        label_ids = [label_to_id(name, label_list) for name in examples[output_column]]

        batch = processor(waveforms, sampling_rate=target_sampling_rate)
        batch["labels"] = list(label_ids)
    except Exception as e:
        print(f"Error processing examples: {examples}")
        traceback.print_exc()
        raise e  # Re-raise the exception to stop execution

    return batch

In [None]:
# Both splits go through the same batched preprocessing, 100 rows per call.
# num_proc=4 spreads the work over 4 worker processes, i.e. multiprocessing
# is enabled here.
map_kwargs = dict(batched=True, batch_size=100, num_proc=4)

train_dataset = train_dataset.map(preprocess_function, **map_kwargs)
eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)

In [None]:
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification

# Feature extractor and classifier are loaded from the same pretrained
# speech-emotion checkpoint.
# NOTE(review): the `config` built earlier is not passed here, so the model
# keeps the checkpoint's own configuration - confirm this is intended.
emotion_checkpoint = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(emotion_checkpoint)
model = AutoModelForAudioClassification.from_pretrained(emotion_checkpoint)

# Update the preprocess_function to use the feature extractor
# (this definition replaces the earlier processor-based one of the same name).
def preprocess_function(examples):
    """Turn a batch of rows into feature-extractor outputs plus integer labels.

    Every path under ``input_column`` is loaded with
    ``speech_file_to_array_fn``, every value under ``output_column`` is mapped
    to its id in ``label_list``, and the waveforms are passed through
    ``feature_extractor`` at ``target_sampling_rate``.

    Args:
        examples: A batch, indexable by column name.

    Returns:
        The feature-extractor output with an added ``"labels"`` entry.

    Raises:
        Exception: Whatever the loading or extraction step raised; the batch
        and the traceback are printed before it is re-raised.
    """
    import traceback

    try:
        waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
        label_ids = []
        for name in examples[output_column]:
            label_ids.append(label_to_id(name, label_list))

        # Use feature_extractor instead of processor
        features = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
        features["labels"] = list(label_ids)
    except Exception as e:
        print(f"Error processing examples: {examples}")
        traceback.print_exc()
        raise e  # Re-raise the exception to stop execution

    return features



model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

In [None]:
# The loaded model is a PyTorch module (Wav2Vec2ForSequenceClassification) and
# has no Keras-style .summary() method, so calling it raises AttributeError.
# Printing the module shows its layer tree; the parameter count is reported
# separately.
print(model)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")