In [2]:
from google.colab import files

# Upload kaggle.json through the Colab file picker.
# The result is stored in a variable instead of being left as the cell's last
# expression: upload() returns {filename: file bytes}, and echoing that dict
# for kaggle.json writes the API key into the saved notebook output.
kaggle_upload = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"maryamchaudhaary","key":"<REDACTED>"}'}

In [4]:
# Install the uploaded API token at ~/.kaggle/kaggle.json for the kaggle CLI
# call in the next cell.
# -p: do not fail if the directory already exists.
!mkdir -p ~/.kaggle
# Move (not copy) the token so no copy is left in the working directory.
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the RAVDESS emotional speech audio dataset as a zip archive into
# the current working directory (requires the Kaggle token installed above).
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
 87% 375M/429M [00:00<00:00, 569MB/s]
100% 429M/429M [00:00<00:00, 531MB/s]


In [6]:
!unzip ravdess-emotional-speech-audio.zip -d ravdess_data

Archive:  ravdess-emotional-speech-audio.zip
  inflating: ravdess_data/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-02-02-02-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-03-01-01-01-01.wav  
  inflating: ravdess_data/Actor_01/03-01-03-01-01-02-01.wav  
  inflating: ravdess_data/Actor_01/03-01-03-01-02-01-01.wav  
  inflating: ravdess_data

In [34]:
!pip install librosa soundfile numpy pandas scikit-learn tensorflow gradio joblib



In [35]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
import joblib

In [36]:
# ----------------- SETTINGS -----------------
# Root folder that is walked for .wav files.
DATASET_PATH = "/content/ravdess_data/audio_speech_actors_01-24"  # <-- change to your dataset folder

# Sampling rate passed to librosa.load when reading each clip.
SAMPLE_RATE = 22050

# Number of MFCC coefficients computed per frame.
N_MFCC = 40

# Number of MFCC frames every clip is zero-padded or truncated to.
MAX_PAD_LEN = 174

# Emotion code (third dash-separated field of the filename) -> class label.
# Files whose code is not listed here are skipped.
TARGET_EMOTIONS = {
    '01': 'neutral',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
}

In [37]:
def get_emotion_from_filename(fname):
    """Return the third dash-separated field of ``fname``.

    Args:
        fname: File name such as ``"03-01-05-01-01-01-01.wav"``.

    Returns:
        The third field (``"05"`` in the example above), or ``None`` when the
        name has fewer than three dash-separated fields.
    """
    fields = fname.split('-')
    if len(fields) < 3:
        return None
    return fields[2]

In [38]:
def extract_features(path):
    """Load an audio file and return a fixed-length MFCC matrix.

    The clip is loaded as mono at ``SAMPLE_RATE``, ``N_MFCC`` coefficients are
    computed per frame, and the frame axis is zero-padded at the end or
    truncated so the result always has ``MAX_PAD_LEN`` rows.

    Args:
        path: Path of the audio file to load.

    Returns:
        Array of shape ``(MAX_PAD_LEN, N_MFCC)``.
    """
    signal, rate = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    # Transpose so rows are time frames and columns are coefficients.
    frames = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=N_MFCC).T
    n_frames = len(frames)
    if n_frames >= MAX_PAD_LEN:
        return frames[:MAX_PAD_LEN, :]
    missing = MAX_PAD_LEN - n_frames
    return np.pad(frames, ((0, missing), (0, 0)), mode='constant')

# Collect files & labels
# os.walk yields directories and file names in filesystem-dependent order.
# Sorting both makes the sample order (and therefore the seeded train/test
# split further down) the same on every run and machine.
files, labels = [], []
for root, dirs, fnames in os.walk(DATASET_PATH):
    dirs.sort()  # in-place sort controls the order os.walk descends in
    for f in sorted(fnames):
        if f.lower().endswith('.wav'):
            code = get_emotion_from_filename(f)
            if code in TARGET_EMOTIONS:
                files.append(os.path.join(root, f))
                labels.append(TARGET_EMOTIONS[code])

# Fail here with a clear message rather than later with an empty-array error.
if not files:
    raise FileNotFoundError(
        f"No .wav files with a target emotion code found under {DATASET_PATH!r}"
    )

In [39]:
# Extract MFCC features
# Build parallel lists of feature matrices and labels, then stack them.
feature_rows, label_rows = [], []
for wav_path, emotion in zip(files, labels):
    mfcc = extract_features(wav_path)
    if mfcc is None:
        continue
    feature_rows.append(mfcc)
    label_rows.append(emotion)

X = np.array(feature_rows)
y = np.array(label_rows)
print("Dataset:", X.shape, len(y))

Dataset: (672, 174, 40) 672


In [40]:
# Encode labels
# String labels -> integer ids -> one-hot rows. The fitted encoder is saved so
# predicted indices can be mapped back to label names later.
le = LabelEncoder()
y_ids = le.fit_transform(y)
y_enc = to_categorical(y_ids)
joblib.dump(le, 'label_encoder.joblib')

['label_encoder.joblib']

In [41]:

# Train/test split
# Hold out 20% of the clips, stratified on the string labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [42]:
# CNN model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable between runs (same seed as the data split).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input with an explicit Input layer. Passing input_shape= to
    # the first Conv1D makes Keras emit a UserWarning.
    # Each sample is (MAX_PAD_LEN frames, N_MFCC coefficients).
    tf.keras.Input(shape=(MAX_PAD_LEN, N_MFCC)),

    # Convolution block 1: 64 filters, kernel width 5 along the frame axis.
    Conv1D(64, 5, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.3),

    # Convolution block 2: 128 filters, kernel width 5.
    Conv1D(128, 5, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.3),

    # Classifier head: one softmax unit per encoded emotion class.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(y_enc.shape[1], activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [44]:
# Train
# Keep the best checkpoint on disk (by validation accuracy) and stop once
# validation accuracy has not improved for 8 epochs, restoring the best weights.
checkpoint_cb = ModelCheckpoint(
    'best_ravdess_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
early_stop_cb = EarlyStopping(
    monitor='val_accuracy',
    patience=8,
    restore_best_weights=True,
)
callbacks = [checkpoint_cb, early_stop_cb]

history = model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
)

Epoch 1/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 43ms/step - accuracy: 0.3841 - loss: 2.2209
Epoch 1: val_accuracy improved from -inf to 0.29630, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 76ms/step - accuracy: 0.3909 - loss: 2.1823 - val_accuracy: 0.2963 - val_loss: 4.4893
Epoch 2/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 43ms/step - accuracy: 0.5498 - loss: 1.0229
Epoch 2: val_accuracy did not improve from 0.29630
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.5491 - loss: 1.0209 - val_accuracy: 0.2963 - val_loss: 4.0761
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6686 - loss: 0.8105
Epoch 3: val_accuracy improved from 0.29630 to 0.33333, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.6684 - loss: 0.8106 - val_accuracy: 0.3333 - val_loss: 5.4041
Epoch 4/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 46ms/step - accuracy: 0.6337 - loss: 0.7910
Epoch 4: val_accuracy improved from 0.33333 to 0.39506, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.6346 - loss: 0.7904 - val_accuracy: 0.3951 - val_loss: 2.9102
Epoch 5/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 75ms/step - accuracy: 0.7359 - loss: 0.6481
Epoch 5: val_accuracy improved from 0.39506 to 0.44444, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 92ms/step - accuracy: 0.7366 - loss: 0.6470 - val_accuracy: 0.4444 - val_loss: 2.8082
Epoch 6/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 48ms/step - accuracy: 0.7168 - loss: 0.6831
Epoch 6: val_accuracy improved from 0.44444 to 0.45679, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.7188 - loss: 0.6770 - val_accuracy: 0.4568 - val_loss: 1.9547
Epoch 7/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.7826 - loss: 0.5501
Epoch 7: val_accuracy improved from 0.45679 to 0.54321, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.7837 - loss: 0.5471 - val_accuracy: 0.5432 - val_loss: 1.1967
Epoch 8/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.8440 - loss: 0.4320
Epoch 8: val_accuracy improved from 0.54321 to 0.61728, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.8440 - loss: 0.4299 - val_accuracy: 0.6173 - val_loss: 0.9250
Epoch 9/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 43ms/step - accuracy: 0.8605 - loss: 0.3660
Epoch 9: val_accuracy improved from 0.61728 to 0.70370, saving model to best_ravdess_model.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.8585 - loss: 0.3672 - val_accuracy: 0.7037 - val_loss: 0.7969
Epoch 10/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 42ms/step - accuracy: 0.8564 - loss: 0.3286
Epoch 10: val_accuracy did not improve from 0.70370
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.8549 - loss: 0.3345 - val_accuracy: 0.6420 - val_loss: 1.0485
Epoch 11/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.8676 - loss: 0.3125
Epoch 11: val_accuracy did not improve from 0.70370
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - accuracy: 0.8671 - loss: 0.3140 - val_accuracy: 0.6914 - val_loss: 0.7758
Epoch 12/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.8684 - loss: 0.3368
Epoch 12: v



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 119ms/step - accuracy: 0.9354 - loss: 0.1999 - val_accuracy: 0.7654 - val_loss: 0.7358
Epoch 15/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 113ms/step - accuracy: 0.9336 - loss: 0.1605
Epoch 15: val_accuracy did not improve from 0.76543
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 122ms/step - accuracy: 0.9320 - loss: 0.1641 - val_accuracy: 0.7531 - val_loss: 0.5891
Epoch 16/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.9574 - loss: 0.1406
Epoch 16: val_accuracy did not improve from 0.76543
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - accuracy: 0.9560 - loss: 0.1436 - val_accuracy: 0.7284 - val_loss: 0.6873
Epoch 17/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.9383 - loss: 0.1685
Epoch 17:



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.9015 - loss: 0.2559 - val_accuracy: 0.8272 - val_loss: 0.6098
Epoch 19/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 43ms/step - accuracy: 0.9135 - loss: 0.2223
Epoch 19: val_accuracy did not improve from 0.82716
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.9153 - loss: 0.2188 - val_accuracy: 0.5679 - val_loss: 1.6851
Epoch 20/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.9145 - loss: 0.2588
Epoch 20: val_accuracy did not improve from 0.82716
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.9145 - loss: 0.2573 - val_accuracy: 0.6173 - val_loss: 1.2988
Epoch 21/50
[1m14/15[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 44ms/step - accuracy: 0.9670 - loss: 0.1144
Epoch 21: va

In [45]:
# Evaluate
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"✅ Test accuracy: {acc:.3f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6515 - loss: 1.1507
✅ Test accuracy: 0.667


In [46]:
import gradio as gr
import librosa
import numpy as np
import tensorflow as tf
import joblib
import pandas as pd

# ---- Load model & encoder ----
# Reload the checkpoint written by ModelCheckpoint during training and the
# LabelEncoder saved with joblib.dump, replacing the in-memory `model` and `le`.
# NOTE: joblib.load unpickles the file; only load artifacts you created yourself.
model = tf.keras.models.load_model("best_ravdess_model.h5")
le = joblib.load("label_encoder.joblib")

# ---- Constants ----
# These repeat the values from the settings cell. They must match the values
# used for training, because the model input shape is (MAX_PAD_LEN, N_MFCC).
SAMPLE_RATE = 22050
N_MFCC = 40
MAX_PAD_LEN = 174

def extract_features_from_audio(path):
    """Turn an audio file into a single-sample MFCC batch for the model.

    The clip is loaded as mono at ``SAMPLE_RATE``, ``N_MFCC`` coefficients are
    computed per frame, and the frame axis is zero-padded at the end or
    truncated to ``MAX_PAD_LEN`` rows.

    Args:
        path: Path of the audio file to load.

    Returns:
        Array of shape ``(1, MAX_PAD_LEN, N_MFCC)``; the leading axis is the
        batch dimension.
    """
    waveform, rate = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    # Transpose so rows are time frames and columns are coefficients.
    coeffs = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=N_MFCC).T
    shortfall = MAX_PAD_LEN - len(coeffs)
    if shortfall > 0:
        coeffs = np.pad(coeffs, ((0, shortfall), (0, 0)), mode='constant')
    else:
        coeffs = coeffs[:MAX_PAD_LEN, :]
    return coeffs[np.newaxis, ...]

def predict_emotion(audio):
    """Predict the emotion of a speech clip for the Gradio interface.

    Args:
        audio: Path to an audio file, a ``(sample_rate, samples)`` tuple, or
            ``None`` when nothing was recorded or uploaded.

    Returns:
        A ``(message, table)`` pair. ``message`` is a Markdown string naming
        the predicted emotion and ``table`` is a DataFrame with one row per
        class in ``le.classes_`` and its probability as a percentage string.
        When ``audio`` is ``None`` the pair is ``(prompt message, None)``.
    """
    # Imported locally so this function does not depend on names imported in
    # other cells.
    import os
    import tempfile
    from scipy.io import wavfile

    if audio is None:
        return "Please record or upload an audio file.", None

    temp_path = None
    try:
        # Handle tuple input from microphone
        if isinstance(audio, tuple):
            sr, data = audio
            # librosa.output.write_wav no longer exists in current librosa, so
            # the raw samples are written with scipy instead. A unique temp
            # file is used rather than a shared "temp.wav" so concurrent
            # requests cannot overwrite each other's audio.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                temp_path = handle.name
            wavfile.write(temp_path, sr, np.asarray(data))
            audio = temp_path

        features = extract_features_from_audio(audio)
    finally:
        # Remove the temporary file even if feature extraction fails.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

    # features holds a single sample, so take the first (only) prediction row.
    preds = model.predict(features)[0]
    pred_idx = np.argmax(preds)
    emotion = le.inverse_transform([pred_idx])[0]

    # make a table of probabilities
    df = pd.DataFrame({
        "Emotion": le.classes_,
        "Probability": [f"{p*100:.2f}%" for p in preds]
    })
    return f"🎯 Predicted Emotion: **{emotion.capitalize()}**", df

# ---- Gradio Interface ----
# Input: one audio component accepting microphone recordings or uploads,
# passed to predict_emotion as a file path.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="🎙️ Speak or Upload Audio (.wav)",
)

# Outputs, in the order predict_emotion returns them: message, then table.
result_outputs = [
    gr.Markdown(label="Prediction"),
    gr.Dataframe(label="Emotion Probabilities"),
]

app = gr.Interface(
    fn=predict_emotion,
    inputs=audio_input,
    outputs=result_outputs,
    title="🎵 Speech Emotion Recognition (RAVDESS)",
    description="Record or upload a short speech clip — the model predicts the speaker's emotion (neutral, happy, sad, angry).",
)

app.launch(share=True)  # set share=False for local run




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8bb01a52c40865aad5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


