In [None]:
import json
import os
# Point the Kaggle CLI at /content/ as its configuration directory; the
# kaggle.json used by the following cells lives there.
os.environ.update(KAGGLE_CONFIG_DIR="/content/")

In [None]:
# Make kaggle.json (presumably the Kaggle API credentials file) readable and
# writable by its owner only (mode 600).
!chmod 600 /content/kaggle.json

In [None]:
!kaggle datasets download -d dmitrybabko/speech-emotion-recognition-en
!unzip -q speech-emotion-recognition-en.zip -d dataset

Dataset URL: https://www.kaggle.com/datasets/dmitrybabko/speech-emotion-recognition-en
License(s): copyright-authors
Downloading speech-emotion-recognition-en.zip to /content
 98% 964M/987M [00:11<00:00, 222MB/s]
100% 987M/987M [00:11<00:00, 89.2MB/s]


In [None]:
import os
import random

import numpy as np
import torch
import torchaudio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [None]:
# Directory that is scanned for the .wav clips.
DATA_PATH = "/content/dataset/Crema"

# Emotion code taken from each file name -> class label used for training.
# Files whose code is not listed here are ignored by the extraction loop.
EMOTIONS = dict(
    ANG="angry",
    DIS="disgust",
    FEA="fear",
    HAP="happy",
    NEU="neutral",
    SAD="sad",
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained Wav2Vec2 preprocessing and encoder, both loaded from the same
# checkpoint; the encoder is moved to the selected device.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
wav2vec = wav2vec.to(device)



In [None]:
def add_noise(waveform, noise_level=0.005):
    """Return a copy of ``waveform`` with zero-mean Gaussian noise added.

    Args:
        waveform: Floating-point audio tensor of any shape.
        noise_level: Standard deviation of the added noise, in the same
            amplitude units as ``waveform``.

    Returns:
        A new tensor with the same shape, dtype and device as ``waveform``.
        The input tensor is not modified.
    """
    # randn_like samples directly on the waveform's device and in its dtype.
    # torch.randn(waveform.size()) would allocate on the default device, which
    # makes the addition fail for CUDA inputs.
    noise = torch.randn_like(waveform) * noise_level
    return waveform + noise

def change_pitch(waveform, sample_rate, semitones=2):
    """Shift the pitch of ``waveform`` by resampling it.

    The returned samples are meant to be interpreted at the original
    ``sample_rate``. Resampling to a *lower* rate and playing the result back
    at ``sample_rate`` raises the pitch, so a positive ``semitones`` shifts up
    and a negative one shifts down. This is a resampling trick, not a phase
    vocoder: the clip duration changes by the same factor as the pitch.

    Args:
        waveform: Audio tensor whose last dimension is time.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        semitones: Signed pitch shift in semitones (12 per octave).

    Returns:
        The resampled tensor; its length differs from the input unless the
        shift rounds to the same rate.
    """
    factor = 2.0 ** (semitones / 12.0)
    # Divide (not multiply) by the factor: fewer samples for the same audio
    # means faster, higher-pitched playback at the unchanged sample_rate.
    new_rate = max(1, int(round(sample_rate / factor)))
    return torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_rate)(waveform)

def augment(waveform, sample_rate, noise_prob=0.3, pitch_prob=0.3):
    """Randomly apply noise and/or pitch augmentation to ``waveform``.

    Each augmentation is decided by an independent draw from Python's
    ``random`` module, so seed it with ``random.seed`` for repeatable runs.

    Args:
        waveform: Audio tensor to augment.
        sample_rate: Sampling rate of ``waveform`` in Hz; forwarded to
            ``change_pitch``.
        noise_prob: Probability of calling ``add_noise``.
        pitch_prob: Probability of calling ``change_pitch``.

    Returns:
        The augmented waveform, or the input object itself when neither
        augmentation is selected.
    """
    # Noise is decided and applied first, then pitch, so both may be applied
    # to the same clip.
    if random.random() < noise_prob:
        waveform = add_noise(waveform)
    if random.random() < pitch_prob:
        waveform = change_pitch(waveform, sample_rate)
    return waveform


In [None]:
def extract_wav2vec_features(file_path, augment_data=False):
    """Embed one audio file as a mean-pooled Wav2Vec2 feature vector.

    The clip is downmixed to mono, resampled to the rate the processor
    expects, optionally augmented, and passed through the encoder. The
    encoder's last hidden states are averaged over the time axis.

    Args:
        file_path: Path of an audio file readable by ``torchaudio.load``.
        augment_data: When True, pass the waveform through ``augment`` before
            encoding.

    Returns:
        A NumPy array of shape (1, hidden_size); hidden_size is 768 for the
        checkpoint loaded in this notebook.
    """
    waveform, sr = torchaudio.load(file_path)  # (channels, frames)

    # Average the channels into one mono signal. For mono files this equals
    # squeeze(0); for multi-channel files squeeze(0) would be a no-op and the
    # channels would be encoded as separate batch items.
    waveform = waveform.mean(dim=0)

    # The feature extractor rejects audio whose sampling rate differs from the
    # one it was configured with, so convert up front.
    target_sr = processor.feature_extractor.sampling_rate
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
        sr = target_sr

    if augment_data:
        waveform = augment(waveform, sr)

    inputs = processor(waveform.numpy(), sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        hidden_states = wav2vec(**inputs.to(device)).last_hidden_state  # (1, frames, hidden)

    # Mean-pool over time to get one fixed-size vector per clip.
    embeddings = hidden_states.mean(dim=1)
    return embeddings.cpu().numpy()

In [None]:
def get_classifier(input_dim=768, num_classes=6):
    """Build and compile the dense emotion classifier.

    Architecture: Dense(256, relu) -> Dropout(0.3) -> Dense(128, relu) ->
    Dropout(0.2) -> Dense(num_classes, softmax).

    Args:
        input_dim: Length of each input feature vector.
        num_classes: Number of output classes.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer and
        categorical cross-entropy, so targets must be one-hot encoded.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer; passing input_shape
        # to the first Dense layer makes recent Keras versions emit a warning.
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
from tqdm import tqdm
import os
import numpy as np

# On-disk cache: one .npy embedding and one .label text file per clip, so a
# re-run does not have to push every file through the encoder again.
os.makedirs("features", exist_ok=True)
os.makedirs("labels", exist_ok=True)

X, y = [], []
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# X and y (and therefore any seeded split of them) the same on every run.
for file in tqdm(sorted(os.listdir(DATA_PATH))):
    if not file.endswith(".wav"):
        continue

    try:
        # The third "_"-separated field of the file name is used as the
        # emotion code; files with a code missing from EMOTIONS are ignored.
        emotion_key = file.split("_")[2]
        emotion = EMOTIONS.get(emotion_key)
        if not emotion:
            continue

        file_path = os.path.join(DATA_PATH, file)
        # Strip only the trailing extension; str.replace would also rewrite a
        # ".wav" that happened to occur earlier in the name.
        stem = os.path.splitext(file)[0]
        feat_path = f"features/{stem}.npy"
        label_path = f"labels/{stem}.label"

        if os.path.exists(feat_path) and os.path.exists(label_path):
            # Cache hit: reuse the stored embedding and label.
            vec = np.load(feat_path)
            with open(label_path) as f:
                emotion = f.read().strip()
        else:
            # Cache miss: compute the embedding and store both files.
            vec = extract_wav2vec_features(file_path, augment_data=False)
            np.save(feat_path, vec)
            with open(label_path, "w") as f:
                f.write(emotion)

        # Drop the leading batch axis so every sample is a flat vector.
        X.append(vec.squeeze())
        y.append(emotion)

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Skipping {file}: {e}")

100%|██████████| 7442/7442 [03:21<00:00, 36.99it/s]


In [None]:
# Map the string labels to integer ids, then to one-hot rows for the
# categorical cross-entropy loss used by the classifier.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded, num_classes=len(le.classes_))

# Stratify on the integer class ids and fix random_state so the 80/20 split
# is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X),
    y_cat,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

model = get_classifier(input_dim=X_train.shape[1], num_classes=y_cat.shape[1])
# NOTE(review): the held-out split doubles as validation data here, so the
# reported val_* metrics are not an independent test estimate.
# Keeping the History object allows later inspection and avoids printing its
# repr as the cell output.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.3312 - loss: 1.5747 - val_accuracy: 0.4493 - val_loss: 1.3884
Epoch 2/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.4441 - loss: 1.3854 - val_accuracy: 0.4728 - val_loss: 1.3330
Epoch 3/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4579 - loss: 1.3430 - val_accuracy: 0.4856 - val_loss: 1.3081
Epoch 4/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4720 - loss: 1.3199 - val_accuracy: 0.5131 - val_loss: 1.2665
Epoch 5/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4882 - loss: 1.2665 - val_accuracy: 0.5003 - val_loss: 1.2690
Epoch 6/30
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5113 - loss: 1.2406 - val_accuracy: 0.5212 - val_loss: 1.2320
Epoch 7/30
[1m187/187[0m 

<keras.src.callbacks.history.History at 0x7deac4286250>

In [None]:
# Persist the trained classifier together with the label order required to
# turn predicted class indices back into emotion names.
# The stray space before ".keras" in the original file name was removed.
# NOTE(review): update any loader that still expects the old spaced name.
model.save("wav2vec_classifier3.keras")
np.save("label_classes3.npy", le.classes_)