In [6]:
import json
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/"

In [7]:
!chmod 600 /content/kaggle.json

In [8]:
!kaggle datasets download -d dmitrybabko/speech-emotion-recognition-en
!unzip -q speech-emotion-recognition-en.zip -d dataset

Dataset URL: https://www.kaggle.com/datasets/dmitrybabko/speech-emotion-recognition-en
License(s): copyright-authors
Downloading speech-emotion-recognition-en.zip to /content
 95% 938M/987M [00:09<00:00, 107MB/s] 
100% 987M/987M [00:09<00:00, 108MB/s]


In [9]:
!pip install librosa scikit-learn tensorflow numpy



In [10]:
import os
import numpy as np
import librosa
import librosa.display
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization)
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint


In [12]:
DATA_PATH = "/content/dataset/Crema"
FIXED_LENGTH = 16000
SAMPLE_RATE = 16000

In [13]:
def add_noise(y, noise_factor=0.005):
    return y + noise_factor * np.random.randn(len(y))

def time_stretch(y, rate=0.8):
    try:
        return librosa.effects.time_stretch(y, rate)
    except:
        return y

def pitch_shift(y, sr, steps=2):
    try:
        return librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)
    except:
        return y

def extract_features(y, sr=SAMPLE_RATE):
    y = librosa.util.fix_length(y, size=FIXED_LENGTH)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64, n_fft=1024, hop_length=256)
    log_mel = librosa.power_to_db(mel)
    return log_mel


In [14]:
def load_data(augment=True):
    X, y = [], []
    emotion_map = {
        "ANG": "angry", "DIS": "disgust", "FEA": "fear",
        "HAP": "happy", "NEU": "neutral", "SAD": "sad"
    }

    for file in os.listdir(DATA_PATH):
        if not file.endswith(".wav"):
            continue
        try:
            parts = file.split("_")
            emotion_code = parts[2].strip().upper()
            if emotion_code not in emotion_map:
                continue
            emotion = emotion_map[emotion_code]
            path = os.path.join(DATA_PATH, file)
            y_raw, sr = librosa.load(path, sr=SAMPLE_RATE, duration=1.0)

            for audio in [y_raw,
                          add_noise(y_raw),
                          pitch_shift(y_raw, sr, steps=2),
                          time_stretch(y_raw, rate=0.9)] if augment else [y_raw]:
                features = extract_features(audio, sr)
                X.append(features)
                y.append(emotion)

        except Exception as e:
            print(f"Error: {file} -> {e}")
            continue

    return np.array(X), np.array(y)

In [15]:
X, y = load_data(augment=True)
X = X[..., np.newaxis]
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, stratify=y_cat, random_state=42)

In [16]:
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(X.shape[1], X.shape[2], 1)),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1),
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ModelCheckpoint("best_modelnow.keras", monitor='val_accuracy', save_best_only=True, verbose=1)
]

model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=callbacks
)


Epoch 1/30
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2941 - loss: 4.1376
Epoch 1: val_accuracy improved from -inf to 0.36580, saving model to best_model2.keras
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.2941 - loss: 4.1348 - val_accuracy: 0.3658 - val_loss: 1.5112 - learning_rate: 0.0010
Epoch 2/30
[1m744/745[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.3694 - loss: 1.4946
Epoch 2: val_accuracy improved from 0.36580 to 0.40578, saving model to best_model2.keras
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.3694 - loss: 1.4946 - val_accuracy: 0.4058 - val_loss: 1.4299 - learning_rate: 0.0010
Epoch 3/30
[1m743/745[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.3940 - loss: 1.4481
Epoch 3: val_accuracy improved from 0.40578 to 0.42912, saving model to best_model2.keras
[1m745/745[0m [32

<keras.src.callbacks.history.History at 0x7ae17c980150>

In [19]:
np.save("label_classes2.npy", le.classes_)