## Installing the required libraries

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q git+https://github.com/huggingface/transformers.git datasets huggingface-hub joblib librosa resampy keras-core

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m753.1/753.1 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m

## Getting the data from the cloud storage

In [None]:
# -nc (no-clobber): skip the download when archive.zip is already present, so
# re-running the notebook does not fetch a second copy as archive.zip.1.
!wget -nc https://storage.googleapis.com/kerascvnlp_data/archive.zip

--2023-07-24 17:04:41--  https://storage.googleapis.com/kerascvnlp_data/archive.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.125.128, 142.250.136.128, 142.250.148.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.125.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 450102890 (429M) [application/zip]
Saving to: ‘archive.zip’


2023-07-24 17:04:47 (79.3 MB/s) - ‘archive.zip’ saved [450102890/450102890]



In [None]:
# -n: never overwrite files that were already extracted, so re-running this
# cell does not block on unzip's interactive "replace?" prompt.
!unzip -q -n /content/archive.zip -d ravdess/

In [None]:
import os
import numpy as np
import librosa


We use the librosa library to read the audio files and build the arrays of samples and labels used for training.

In [None]:
def get_feature(file_path: str, mfcc_len: int = 39, mean_signal_length: int = 110000):
    """Load an audio file and return MFCC features of a fixed-length signal.

    The signal is zero-padded on both sides, or cropped around its centre, so
    that exactly ``mean_signal_length`` samples are passed to the MFCC step.

    Args:
        file_path: Path of the audio file, read with ``librosa.load``.
        mfcc_len: Number of MFCC coefficients to compute.
        mean_signal_length: Number of samples the signal is padded/cropped to.

    Returns:
        The transposed result of ``librosa.feature.mfcc``, so the
        ``mfcc_len`` coefficients end up on the last axis.
    """
    signal, fs = librosa.load(file_path)
    s_len = len(signal)

    if s_len < mean_signal_length:
        # Too short: split the missing samples between both ends; when the
        # amount is odd the extra sample goes to the right-hand side.
        missing = mean_signal_length - s_len
        left = missing // 2
        signal = np.pad(signal, (left, missing - left), 'constant', constant_values=0)
    else:
        # Long enough: keep the central window of the requested length.
        start = (s_len - mean_signal_length) // 2
        signal = signal[start:start + mean_signal_length]

    # Use the caller-supplied coefficient count (previously hard-coded to 39,
    # which silently ignored the `mfcc_len` argument).
    mfcc = librosa.feature.mfcc(y=signal, sr=fs, n_mfcc=mfcc_len)
    return mfcc.T

In [None]:
from tqdm import tqdm

path = '/content/ravdess'
CLIP_SAMPLES = 64000  # number of leading samples kept from every clip

# List the files up front so the progress bar knows the total.
audio_paths = [
    os.path.join(subdir, fname)
    for subdir, _dirs, files in os.walk(path)
    for fname in files
]

# NOTE(review): librosa.load is called without `sr`, so the clips are resampled
# to librosa's default rate, while the feature extractor further down is told
# the audio is at `feature_extractor.sampling_rate` -- confirm whether loading
# at the model's rate was intended (inference cells must then change as well).
lst = []
for audio_path in tqdm(audio_paths, desc="loading audio"):
    try:
        signal, sample_rate = librosa.load(audio_path, res_type='kaiser_fast')
        # The digit at position 7 of the file name is the 1-based class code;
        # shift it so that labels start at 0.
        label = int(os.path.basename(audio_path)[7:8]) - 1
    except ValueError as err:
        print(err)
        continue
    lst.append((signal[:CLIP_SAMPLES], label))

X, y = zip(*lst)
X, y = np.asarray(X), np.asarray(y)

CPU times: user 2min 39s, sys: 1.64 s, total: 2min 41s
Wall time: 2min 58s


In [None]:
# Shapes of the stacked audio array and of the label vector.
X.shape,y.shape

((2880, 64000), (2880,))

In [None]:
# Length of audio, in seconds, that is fed to the model (see MAX_SEQ_LENGTH).
MAX_DURATION = 2
# Sampling rate is the number of samples of audio recorded every second
SAMPLING_RATE = 16000
BATCH_SIZE = 2  # Batch-size for training and evaluating our model.
NUM_CLASSES = 8  # Number of emotion classes (one per entry of RAVDESS_CLASS_LABELS).
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio, in samples.
# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
# MAX_FRAMES is the mask length used by mean_pool, i.e. the number of output
# frames expected for an input of MAX_SEQ_LENGTH samples.
MAX_FRAMES = 99
MAX_EPOCHS = 5  # Maximum number of training epochs.

# Pretrained checkpoint used for both the feature extractor and the model.
MODEL_CHECKPOINT = "facebook/wav2vec2-base"

In [None]:
# Ordered by the RAVDESS emotion code embedded in each file name
# (01 neutral, 02 calm, 03 happy, 04 sad, 05 angry, 06 fearful, 07 disgust,
# 08 surprised): the training labels are `code - 1`, so index i of this tuple
# must name the emotion with code i + 1. An alphabetical order mislabels them.
RAVDESS_CLASS_LABELS = ("neutral", "calm", "happy", "sad", "angry", "fear", "disgust", "surprise")

In [None]:
labels = RAVDESS_CLASS_LABELS
# Two lookup tables between label names and their (stringified) positions.
id2label = {str(position): name for position, name in enumerate(labels)}
label2id = {name: str(position) for position, name in enumerate(labels)}

print(id2label)

{'0': 'angry', '1': 'calm', '2': 'disgust', '3': 'fear', '4': 'happy', '5': 'neutral', '6': 'sad', '7': 'surprise'}


We use the feature extractor from the transformers library to extract features from the audio data.

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT,
    return_attention_mask=True,
)


# Encode every clip; anything longer than MAX_SEQ_LENGTH samples is truncated.
audio_arrays = X
extraction_options = dict(
    sampling_rate=feature_extractor.sampling_rate,
    max_length=MAX_SEQ_LENGTH,
    truncation=True,
    padding=True,
)
inputs = feature_extractor(audio_arrays, **extraction_options)


Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



In [None]:
# Number of encoded clips and the shape of the first one after truncation.
len(inputs['input_values']) , inputs['input_values'][0].shape

(2880, (32000,))

In [None]:
import random
import logging

import numpy as np
import tensorflow as tf
import keras_core as keras
from keras_core import layers

Using TensorFlow backend


## Model Building

In [None]:
from transformers import TFWav2Vec2Model


def mean_pool(hidden_states, feature_lengths):
    """Average the hidden states over the frame axis, ignoring padded frames.

    Args:
        hidden_states: Encoder output, indexed as (batch, frame, feature). The
            frame axis has to match MAX_FRAMES, the length of the mask built here.
        feature_lengths: Number of valid frames for each item of the batch.

    Returns:
        One pooled vector per batch item: the sum over the valid frames
        divided by the number of valid frames.
    """
    attention_mask = tf.sequence_mask(
        feature_lengths, maxlen=MAX_FRAMES, dtype=tf.dtypes.int64
    )
    # Reversed cumulative sum: non-zero for every frame up to the last valid one.
    padding_mask = tf.cast(
        tf.reverse(tf.cumsum(tf.reverse(attention_mask, [-1]), -1), [-1]),
        dtype=tf.dtypes.bool,
    )
    # Zero out the padded frames. The mask is broadcast to the runtime shape of
    # `hidden_states` instead of a hard-coded (BATCH_SIZE, MAX_FRAMES, HIDDEN_DIM),
    # so any batch size works (e.g. a single clip at prediction time).
    hidden_states = tf.where(
        tf.broadcast_to(tf.expand_dims(~padding_mask, -1), tf.shape(hidden_states)),
        0.0,
        hidden_states,
    )
    valid_frames = tf.math.reduce_sum(
        tf.cast(padding_mask, dtype=tf.dtypes.float32), axis=1
    )
    pooled_state = tf.math.reduce_sum(hidden_states, axis=1) / tf.reshape(
        valid_frames, [-1, 1]
    )
    return pooled_state


class TFWav2Vec2ForAudioClassification(layers.Layer):
    """Wav2Vec 2.0 encoder followed by pooling, dropout and a softmax head.

    Args:
        model_checkpoint: Name or path passed to ``TFWav2Vec2Model.from_pretrained``.
        num_classes: Number of units of the final softmax layer.
        **kwargs: Standard layer arguments (``name``, ``dtype``, ``trainable``...)
            forwarded to the base class; Keras passes these back when it
            re-creates the layer from a saved config.
    """

    def __init__(self, model_checkpoint, num_classes, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can report how the layer was built.
        self.model_checkpoint = model_checkpoint
        self.num_classes = num_classes
        # Instantiate the Wav2Vec 2.0 model without the Classification-Head
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        self.pooling = layers.GlobalAveragePooling1D()
        self.intermediate_layer_dropout = layers.Dropout(0.5)
        # Classification-Head
        self.final_layer = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return class probabilities for ``inputs``.

        ``inputs[0]`` is fed to the encoder. ``inputs[1]`` is treated as a 0/1
        mask over the audio samples when it is a tensor; otherwise a plain
        average over all frames is used.
        """
        hidden_states = self.wav2vec2(inputs[0])[0]
        if tf.is_tensor(inputs[1]):
            # Summing the mask gives the number of unpadded samples per clip,
            # which is then converted into a number of encoder frames.
            audio_lengths = tf.cumsum(inputs[1], -1)[:, -1]
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths)
        else:
            pooled_state = self.pooling(hidden_states)

        intermediate_state = self.intermediate_layer_dropout(pooled_state)
        final_state = self.final_layer(intermediate_state)

        return final_state

    def get_config(self):
        """Return the constructor arguments so the layer can be rebuilt on load."""
        config = super().get_config()
        config.update(
            {
                "model_checkpoint": self.model_checkpoint,
                "num_classes": self.num_classes,
            }
        )
        return config

In [None]:
def build_model():
    """Assemble and compile the two-input audio classification model."""
    # One input for the audio values, one (int32) of the same length.
    values_input = keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32")
    mask_input = keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="int32")
    model_inputs = [values_input, mask_input]

    classifier = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT, NUM_CLASSES)
    probabilities = classifier(model_inputs)

    model = keras.Model(model_inputs, probabilities)
    # The head already applies softmax, hence from_logits=False.
    model.compile(
        loss=keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=keras.optimizers.Adam(learning_rate=1e-5),
        metrics=["accuracy"],
    )
    return model


model = build_model()
model.summary()



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['project_q.bias', 'project_hid.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'quantizer.codevectors', 'project_hid.bias', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFWav2Vec2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the che

In [None]:
# Collect the extractor's outputs as a plain list, one entry per returned key.
train_x = list(inputs.values())

In [None]:
# Stack the per-key feature lists into one array; tx[0] and tx[1] are the two
# model inputs passed to model.fit.
tx = np.array(train_x)
tx.shape

(2, 2880, 32000)

In [None]:
# One-hot encode the integer labels: row i has a single 1 in column y[i].
label_columns = y.max() + 1
b = np.eye(label_columns)[y]
b.shape

(2880, 8)

In [None]:
# The two stacked feature arrays, in the order the feature extractor returned them.
first_input, second_input = tx[0], tx[1]
model.fit(
    x=[first_input, second_input],
    y=b,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5
[1m1207/1440[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m55s[0m 237ms/step - accuracy: 0.1405 - loss: 2.1368

## Saving and loading the model using inbuilt functions of keras_core

In [None]:
# Keras models expose `save`, not `save_model`; the `.keras` extension selects
# the native Keras format read back by `keras.saving.load_model`.
model.save('model.keras')

In [None]:
# Read the model back from the file written by the save cell.
tmp = keras.saving.load_model('model.keras')

In [None]:
import os
import numpy as np
import librosa
import tensorflow as tf
import keras_core as keras
from keras_core import layers
from transformers import AutoFeatureExtractor

# Length of audio, in seconds, that is fed to the model (see MAX_SEQ_LENGTH).
MAX_DURATION = 2
# Sampling rate is the number of samples of audio recorded every second
SAMPLING_RATE = 16000
BATCH_SIZE = 2  # Batch-size for training and evaluating our model.
NUM_CLASSES = 8  # Number of emotion classes (one per entry of RAVDESS_CLASS_LABELS).
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio, in samples.
# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
MAX_FRAMES = 99
MAX_EPOCHS = 5  # Maximum number of training epochs.
# Ordered by the RAVDESS emotion code embedded in each file name
# (01 neutral, 02 calm, 03 happy, 04 sad, 05 angry, 06 fearful, 07 disgust,
# 08 surprised): the model was trained on labels equal to `code - 1`, so index
# i must name the emotion with code i + 1. An alphabetical order mislabels them.
RAVDESS_CLASS_LABELS = ("neutral", "calm", "happy", "sad", "angry", "fear", "disgust", "surprise")
MODEL_CHECKPOINT = "facebook/wav2vec2-base"

labels = RAVDESS_CLASS_LABELS
# Two lookup tables between label names and their (stringified) positions.
id2label = {str(position): name for position, name in enumerate(labels)}
label2id = {name: str(position) for position, name in enumerate(labels)}

print(id2label)


feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT,
    return_attention_mask=True,
)
tmp = keras.saving.load_model('model.keras')

# Run one clip through the same loading and encoding steps used for training.
sample_path = '/content/ravdess/Actor_01/03-01-02-01-01-02-01.wav'
sample, _ = librosa.load(sample_path, res_type='kaiser_fast')
encoded = feature_extractor(
    sample[:64000],
    sampling_rate=feature_extractor.sampling_rate,
    max_length=MAX_SEQ_LENGTH,
    truncation=True,
    padding=True,
)
# Stack the extractor outputs; inp[0] and inp[1] become the two model inputs.
inp = np.array(list(encoded.values()))

print(inp[0])
pred = tmp.predict([inp[0], inp[1]])

[[ 6.13512413e-04  6.13512413e-04  6.13512413e-04 ... -7.35014141e-01
  -8.40578854e-01 -9.34421659e-01]]


In [None]:
# Look up the label name for the index of the largest value in `pred`.
id2label[str(np.argmax(pred))]

'calm'

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install gradio

## Gradio demo creation

In [None]:
import gradio as gr
import os
import numpy as np
import librosa
import tensorflow as tf
import keras_core as keras
from keras_core import layers
from transformers import AutoFeatureExtractor

# Length of audio, in seconds, that is fed to the model (see MAX_SEQ_LENGTH).
MAX_DURATION = 2
# Sampling rate is the number of samples of audio recorded every second
SAMPLING_RATE = 16000
BATCH_SIZE = 2  # Batch-size for training and evaluating our model.
NUM_CLASSES = 8  # Number of emotion classes (one per entry of RAVDESS_CLASS_LABELS).
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio, in samples.
# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
MAX_FRAMES = 99
MAX_EPOCHS = 5  # Maximum number of training epochs.
# Ordered by the RAVDESS emotion code embedded in each file name
# (01 neutral, 02 calm, 03 happy, 04 sad, 05 angry, 06 fearful, 07 disgust,
# 08 surprised): the model was trained on labels equal to `code - 1`, so index
# i must name the emotion with code i + 1. An alphabetical order mislabels them.
RAVDESS_CLASS_LABELS = ("neutral", "calm", "happy", "sad", "angry", "fear", "disgust", "surprise")
MODEL_CHECKPOINT = "facebook/wav2vec2-base"

labels = RAVDESS_CLASS_LABELS
# Two lookup tables between label names and their (stringified) positions.
id2label = {str(position): name for position, name in enumerate(labels)}
label2id = {name: str(position) for position, name in enumerate(labels)}



# Objects shared by every request handled by the demo.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT,
    return_attention_mask=True,
)
tmp = keras.saving.load_model('model.keras')

def greet(name):
  """Predict the emotion label for audio received from the Gradio audio input.

  Args:
    name: ``(sample_rate, samples)`` pair as delivered by the Gradio "audio"
      component, or ``None`` when nothing was recorded or uploaded.

  Returns:
    The label from ``id2label`` for the highest-scoring class, or a short
    message when no audio was supplied.
  """
  if name is None:
    return "No audio received"
  sample_rate, audio = name

  # Bring the raw samples to mono floats: integer PCM is rescaled by its full
  # scale, and a (samples, channels) array is averaged over the channel axis.
  audio = np.asarray(audio)
  if np.issubdtype(audio.dtype, np.integer):
    full_scale = np.iinfo(audio.dtype).max
    audio = audio.astype(np.float32) / full_scale
  else:
    audio = audio.astype(np.float32)
  if audio.ndim > 1:
    audio = audio.mean(axis=1)

  # The clips used elsewhere in this notebook are read with librosa.load
  # without `sr`, i.e. at librosa's default rate of 22050 Hz; resample the
  # incoming audio to that rate so the demo sees what the model was trained on.
  # NOTE(review): if the loading cells are switched to another rate, update this.
  clip_rate = 22050
  if sample_rate != clip_rate:
    audio = librosa.resample(
      audio, orig_sr=sample_rate, target_sr=clip_rate, res_type='kaiser_fast'
    )

  # Pad short recordings up to MAX_SEQ_LENGTH: the model input has a fixed length.
  inp = feature_extractor(
    audio,
    sampling_rate=feature_extractor.sampling_rate,
    max_length=MAX_SEQ_LENGTH,
    truncation=True,
    padding="max_length",
  )
  pred = tmp.predict(
    [np.asarray(inp["input_values"]), np.asarray(inp["attention_mask"])]
  )
  # Only one clip was submitted, so read the scores of the first row.
  lab = id2label[str(np.argmax(pred[0]))]
  return lab

# Expose `greet` through a Gradio app with an audio input and a text output.
iface = gr.Interface(fn=greet, inputs="audio", outputs="text")
# debug=True keeps the cell running so errors and logs stay visible.
iface.launch(debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Keyboard interruption in main thread... closing server.


