<a href="https://colab.research.google.com/github/wiamfa/Speech_emotion_recognition/blob/main/huggingFace_Hubert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Pretrained checkpoint that is fine-tuned in this notebook.
# NOTE(review): the notebook file name mentions HuBERT, but this is a
# wav2vec 2.0 checkpoint - confirm which model is intended.
model_checkpoint = "facebook/wav2vec2-base"

# Per-device batch size, shared by training and evaluation.
batch_size = 32

# Emotion class names; the position in the list is the integer class id.
ravdess_emotions = [
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised",
]



In [2]:
%%capture
!pip install datasets
!pip install transformers
!pip install librosa

In [4]:
from huggingface_hub import notebook_login

# Ask for a Hugging Face access token through a notebook widget. The login is
# needed because training is configured with push_to_hub=True and the final
# cell uploads the model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
%%capture
!apt install git-lfs

In [6]:
from transformers.utils import send_example_telemetry

# Report usage of this example notebook to Hugging Face.
# NOTE(review): nothing below reads a result from this call, so it looks
# optional for the training flow - confirm before removing.
send_example_telemetry("audio_classification_notebook", framework="pytorch")

# Test Hugging Face on HuBERT

In [7]:
from datasets import load_dataset, load_metric, Audio

In [9]:
# Fetch the RAVDESS dataset from the Hub. Only the "train" split is requested;
# the train/test partition is created later in the notebook.
dataset = load_dataset("xbgoose/ravdess",split="train")
# Accuracy metric object for evaluation.
# NOTE(review): `load_metric` has been deprecated in favour of the separate
# `evaluate` package in newer `datasets` releases - confirm the installed
# version still provides it.
metric = load_metric("accuracy")

In [10]:
# Drop the metadata columns so that only the waveform ("audio") and its
# target ("emotion") remain.
metadata_columns = [
    "modality",
    "vocal_channel",
    "emotional_intensity",
    "statement",
    "repetition",
    "actor",
    "gender",
]
dataset000 = dataset.remove_columns(metadata_columns)

In [11]:
# Import from the public `datasets` namespace instead of the internal
# `datasets.tasks.text_classification` module. `Audio` is re-imported here so
# this cell stays correct even if another cell has rebound the name `Audio`.
from datasets import Audio, ClassLabel

# Resample every clip to 16 kHz, the rate the feature extractor reports.
dataset00 = dataset000.cast_column("audio", Audio(sampling_rate=16000))
# Encode the emotion column as integer class ids. The number of classes is
# taken from the list of names, so it is not repeated as a separate literal.
dataset0 = dataset00.cast_column("emotion", ClassLabel(names=ravdess_emotions))


In [12]:
# Hold out 20% of the clips for evaluation.
# A fixed seed makes the split - and every number reported further down -
# repeatable across kernel restarts. Stratifying on the label keeps the
# emotions in the same proportions in both splits.
dataset1 = dataset0.train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="emotion",
)

In [13]:
# Display the resulting DatasetDict: split names, columns and row counts.
dataset1

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 288
    })
})

In [14]:
# Peek at one held-out example (arbitrary index): decoded waveform, sampling
# rate and integer emotion label.
dataset1["test"][100]

{'audio': {'path': '03-01-06-02-02-01-22.wav',
  'array': array([-4.99989983e-06, -6.66290725e-06, -1.61988228e-05, ...,
         -2.66165017e-07,  2.41199956e-07,  0.00000000e+00]),
  'sampling_rate': 16000},
 'emotion': 5}

In [15]:
# Confirm the column types after the casts (16 kHz audio, ClassLabel target).
dataset1["train"].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'emotion': ClassLabel(names=['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'], id=None)}

In [16]:
# Build both directions of the class-name <-> class-id mapping. Ids are kept
# as strings, which is how the rest of the notebook looks them up.
labels = dataset1["train"].features["emotion"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Sanity check: the last class id should map back to its name.
id2label["7"]

'surprised'

In [17]:
import random
# Alias IPython's audio player: importing it as plain `Audio` would rebind the
# `datasets.Audio` feature class imported earlier and break the resampling
# cell whenever that cell is re-run.
from IPython.display import Audio as IPythonAudio, display

# Listen to a handful of randomly chosen training clips with their labels.
for _ in range(5):
    # randrange(n) draws from 0..n-1, i.e. every valid row index.
    rand_idx = random.randrange(len(dataset1["train"]))
    example = dataset1["train"][rand_idx]
    audio = example["audio"]

    print(f'Label: {id2label[str(example["emotion"])]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    display(IPythonAudio(audio["array"], rate=audio["sampling_rate"]))
    print()

In [18]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to `model_checkpoint`; its settings
# (normalisation, expected sampling rate, padding) are echoed below.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [19]:
# Maximum clip length fed to the model; preprocess_function truncates every
# waveform to this many seconds.
# NOTE(review): confirm that 2 s covers the spoken part of the utterances -
# anything after the cut-off is discarded.
max_duration = 2.0  # seconds

In [20]:
def preprocess_function(examples):
    """Convert a batch of decoded audio into model inputs.

    Args:
        examples: Batch whose ``"audio"`` entry is a list of items that each
            carry the raw waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, with every waveform
        truncated to at most ``max_duration`` seconds.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    target_rate = feature_extractor.sampling_rate
    # Longest allowed input, expressed in samples rather than seconds.
    max_samples = int(feature_extractor.sampling_rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
    )

In [21]:
# Smoke-test the preprocessing on the first five training rows.
preprocess_function(dataset1['train'][:5])

{'input_values': [array([-1.01773665e-04, -1.01776764e-04, -1.01771220e-04, ...,
       -2.39907131e-01, -2.63000667e-01, -2.84357309e-01], dtype=float32), array([2.5621802e-04, 2.5621802e-04, 2.5621802e-04, ..., 1.4594896e-01,
       2.3602819e-01, 3.1466085e-01], dtype=float32), array([ 0.00153229,  0.00138206,  0.00188451, ..., -0.11429794,
       -0.20738928,  0.10649358], dtype=float32), array([0.0009017 , 0.0009017 , 0.0009017 , ..., 0.39395788, 0.38719684,
       0.43759483], dtype=float32), array([ 0.00191795,  0.00148678,  0.0019821 , ..., -0.67094785,
       -1.0033948 , -1.3098693 ], dtype=float32)]}

In [22]:
# Run the preprocessing over both splits in batches; the raw "audio" column is
# dropped afterwards, leaving the label and the extracted input values.
encoded_dataset = dataset1.map(
    preprocess_function,
    batched=True,
    remove_columns=["audio"],
)
encoded_dataset

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 288
    })
})

In [23]:
# Spot-check that the label survived preprocessing as an integer class id.
encoded_dataset["train"][0]["emotion"]

7

In [24]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per entry in the label mapping.
num_labels = len(id2label)
# Load the pretrained weights from `model_checkpoint` with a classification
# head sized for `num_labels`. The head weights are newly initialised (see the
# warning printed below), so the model must be fine-tuned before use.
# The label mappings are passed along, presumably so they are stored with the
# model configuration - confirm.
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.bias', 'classifier.weight', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install accelerate -U

In [None]:
!accelerate config



In [27]:
# Print Accelerate's report of the current environment and configuration
# (diagnostic only; nothing below depends on it).
!accelerate env

In [None]:
# The output directory is named after the base checkpoint (the part after the
# organisation prefix).
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-ravdess",
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation schedule.
    learning_rate=3e-5,
    num_train_epochs=50,
    warmup_ratio=0.1,
    # Each optimizer step accumulates 4 batches of `batch_size` per device.
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    gradient_checkpointing=True,
    # Logging and publishing.
    logging_steps=10,
    push_to_hub=True,
)

In [29]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute the accuracy of a batch of predictions.

    Accuracy is calculated directly with NumPy, so this callback does not
    depend on a metric object created in another cell.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the integer class id of each row).

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the reference label.
    """
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = np.asarray(eval_pred.label_ids)
    # float() turns the NumPy scalar into a plain Python number.
    return {"accuracy": float(np.mean(predictions == references))}

In [30]:
# Check the columns of the encoded training split before renaming the target.
encoded_dataset["train"]

Dataset({
    features: ['emotion', 'input_values'],
    num_rows: 1152
})

In [31]:
# Rename the target column from "emotion" to "labels" in both splits.
# NOTE(review): presumably required because the model expects its targets
# under the name `labels` - confirm.
encoded_dataset1= encoded_dataset.rename_column("emotion", "labels")

In [32]:
# Wire the model, hyper-parameters, data splits and metric callback together.
# The feature extractor is handed over through the `tokenizer` argument.
# NOTE(review): the logged output below shows that constructing the Trainer
# clones the Hub repository, so the login cell must have run first.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset1["train"],
    eval_dataset=encoded_dataset1["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/Wiam/wav2vec2-base-finetuned-ravdess into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/361M [00:00<?, ?B/s]

Download file runs/Aug17_11-58-01_976a2f77c133/events.out.tfevents.1692273492.976a2f77c133.7193.0: 100%|######…

Download file training_args.bin: 100%|##########| 3.93k/3.93k [00:00<?, ?B/s]

Download file runs/Aug17_12-18-40_976a2f77c133/events.out.tfevents.1692274753.976a2f77c133.7193.4:  59%|#####8…

Download file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692273523.976a2f77c133.7193.1: 100%|######…

Download file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692274149.976a2f77c133.7193.2: 100%|######…

Download file runs/Aug17_12-18-40_976a2f77c133/events.out.tfevents.1692279222.976a2f77c133.7193.5: 100%|######…

Clean file runs/Aug17_11-58-01_976a2f77c133/events.out.tfevents.1692273492.976a2f77c133.7193.0:  17%|#6       …

Clean file training_args.bin:  25%|##5       | 1.00k/3.93k [00:00<?, ?B/s]

Clean file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692273523.976a2f77c133.7193.1:  17%|#6       …

Clean file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692274149.976a2f77c133.7193.2:  12%|#1       …

Clean file runs/Aug17_12-18-40_976a2f77c133/events.out.tfevents.1692274753.976a2f77c133.7193.4:   3%|3        …

Clean file runs/Aug17_12-18-40_976a2f77c133/events.out.tfevents.1692279222.976a2f77c133.7193.5: 100%|#########…

Download file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692274593.976a2f77c133.7193.3: 100%|######…

Clean file runs/Aug17_11-58-34_976a2f77c133/events.out.tfevents.1692274593.976a2f77c133.7193.3: 100%|#########…

Clean file pytorch_model.bin:   0%|          | 1.00k/361M [00:00<?, ?B/s]

In [None]:
# Run fine-tuning; per-epoch loss and accuracy are reported in the table below.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.073949,0.15625
2,2.078100,2.061116,0.118056
3,2.066800,2.030826,0.253472


In [None]:
# Score the trained model on the evaluation split given to the Trainer.
trainer.evaluate()

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub (requires the login
# performed at the top of the notebook).
trainer.push_to_hub()