In [1]:
from transformers import AutoProcessor, AutoModelForAudioClassification

In [7]:
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import numpy as np
# from pydub import AudioSegment

In [8]:
import numpy as np
import pandas as pd
import librosa

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Load Pre-trained Model

In [9]:

# https://github.com/ehcalabres/EMOVoice
# the preprocessor was derived from https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
# processor1 = AutoProcessor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
# ^^^ no preload model available for this model (above), but the `feature_extractor` works in place


model1 = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.output.weight', 'classifier.output.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

In [72]:
id2label = {
        0: "angry",
        1: "calm",
        2: "disgust",
        3: "fearful",
        4: "happy",
        5: "neutral",
        6: "sad",
        7: "surprised"
    }

label2id = {y:x for x, y in id2label.items()}

In [71]:
crema_id2label = {
        0: "angry",
        2: "disgust",
        3: "fear",
        4: "happy",
        5: "neutral",
        6: "sadness",
    }

crema_label2id = {y:x for x, y in crema_id2label.items()}

In [52]:
def predict_emotion(audio_file):
    # if not audio_file:
    #     # I fetched some samples with known emotions from here: https://www.fesliyanstudios.com/royalty-free-sound-effects-download/poeple-crying-252
    #     audio_file = 'mp3/dude-crying.mp3'

    # sound = AudioSegment.from_file(audio_file)
    # sound = sound.set_frame_rate(16000)

    sig, sr = librosa.load(audio_file)
    wav_data = librosa.resample(sig, orig_sr=sr, target_sr=16000)
    # sound_array = np.array(wav_data.get_array_of_samples())
    
    # this model is VERY SLOW, so best to pass in small sections that contain 
    # emotional words from the transcript. like 10s or less.
    # how to make sub-chunk  -- this was necessary even with very short audio files 
    # test = torch.tensor(input.input_values.float()[:, :100000])

    input = feature_extractor(
        raw_speech=wav_data,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt")

    result = model1.forward(input.input_values.float())
    # making sense of the result 
    id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }
    interp = dict(zip(id2label.values(), list(round(float(i),4) for i in result[0][0])))

    pred = np.argmax(result[0][0].detach().numpy())
    return pred, interp

In [53]:
df = pd.read_csv("crema_dataset_meta.csv")

In [54]:
df.head()

Unnamed: 0,file,file_path,label
0,1022_ITS_ANG_XX.wav,Crema/1022_ITS_ANG_XX.wav,angry
1,1037_ITS_ANG_XX.wav,Crema/1037_ITS_ANG_XX.wav,angry
2,1060_ITS_NEU_XX.wav,Crema/1060_ITS_NEU_XX.wav,neutral
3,1075_ITS_NEU_XX.wav,Crema/1075_ITS_NEU_XX.wav,neutral
4,1073_IOM_DIS_XX.wav,Crema/1073_IOM_DIS_XX.wav,disgust


In [55]:
files = df.loc[:10, "file_path"]
labels = df.loc[:10, "label"]

In [56]:
preds = []
for file in files:
    curr_pred = predict_emotion(file)
    preds.append(curr_pred)

In [66]:
raw_pred = pd.Series(list(zip(*preds))[0])

In [73]:
labels_id = labels.apply(lambda x: crema_label2id[x])
pred_id = raw_pred

In [74]:
matrix = confusion_matrix(labels_id, pred_id)

In [75]:
matrix

array([[2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0],
       [1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0]])

In [77]:
accuracy = matrix.diagonal()/matrix.sum(axis=1)

  accuracy = matrix.diagonal()/matrix.sum(axis=1)


In [78]:
accuracy

array([1. , 0. , 0.5, 0. , 0. , 0.5, nan])

# Experiments

In [13]:
sig, sr = librosa.load(file_path_str)
wav_data = librosa.resample(sig, orig_sr=sr, target_sr=16000)


In [16]:
input = feature_extractor(
    raw_speech=wav_data,
    sampling_rate=16000,
    padding=True,
    return_tensors="pt")

In [17]:
result = model1.forward(input.input_values.float())

In [22]:
result

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0625,  0.1069, -0.0362, -0.0244,  0.0209,  0.0692, -0.0219,  0.0375]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [18]:
id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }
interp = dict(zip(id2label.values(), list(round(float(i),4) for i in result[0][0])))

In [19]:
interp

{'angry': 0.0625,
 'calm': 0.1069,
 'disgust': -0.0362,
 'fearful': -0.0244,
 'happy': 0.0209,
 'neutral': 0.0692,
 'sad': -0.0219,
 'surprised': 0.0375}

In [None]:
    sig, sr = librosa.load(audio_file)
    wav_data = librosa.resample(sig, orig_sr=sr, target_sr=16000)
    sound_array = np.array(wav_data.get_array_of_samples())
    
    # this model is VERY SLOW, so best to pass in small sections that contain 
    # emotional words from the transcript. like 10s or less.
    # how to make sub-chunk  -- this was necessary even with very short audio files 
    # test = torch.tensor(input.input_values.float()[:, :100000])

    input = feature_extractor(
        raw_speech=sound_array,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt")

    result = model1.forward(input.input_values.float())
    # making sense of the result 
    id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }
    interp = dict(zip(id2label.values(), list(round(float(i),4) for i in result[0][0])))

In [11]:
df = pd.read_csv("huge_collated_dataset_meta.csv")

In [12]:
file_path_str = df.loc[1, "file_path"]

In [None]:
sig, sr = librosa.load(file_path_str)
wav_data = librosa.resample(sig, orig_sr=sr, target_sr=16000)

In [23]:
processor = AutoProcessor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")

model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")



OSError: Can't load tokenizer for 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.