In [8]:
import os
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import speech_to_text
from sklearn.feature_extraction.text import TfidfVectorizer




 Much good can come from the prudent use of power, and much good can come of this. A world once divided into two armed camps now recognizes one soul and preeminent power, the United States of America. And they regard this with no dread, for the world trusts us with power, and the world is right. They trust us to be fair and restrained. They trust us to be on the side of decency. They trust us to do what's right. And I use those words advisedly.
 The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.
 The big argument I have with the governor on this is this taking different positions on different issues. Trying to be one thing to one person here that's opposing the NAFTA agreement and then for it what we call waffling. You can't turn the White House into the Waffle House. You've got to say what you're for. The Washington Post 

In [16]:
IMG_SIZE = (128, 128)
DATASET_PATH = "./data/Emotions"
BATCH_SIZE = 32
EPOCHS = 30


In [17]:
import librosa
import numpy as np

def extract_mel_spectrogram(file_path, img_size=(128, 128)):
    y, sr = librosa.load(file_path, sr=22050)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=img_size[0])
    S_DB = librosa.power_to_db(S, ref=np.max)
    S_DB = librosa.util.fix_length(S_DB, size=img_size[1], axis=1)
    return S_DB


In [20]:
import os
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


def load_dataset():
    X, y = [], []

    for emotion_folder in os.listdir(DATASET_PATH):
        folder_path = os.path.join(DATASET_PATH, emotion_folder)
        if not os.path.isdir(folder_path):
            continue

        for file in tqdm(os.listdir(folder_path), desc=emotion_folder):
            if not file.endswith(".wav"):
                continue
            try:
                file_path = os.path.join(folder_path, file)
                spec = extract_mel_spectrogram(file_path, IMG_SIZE)
                X.append(spec)
                y.append(emotion_folder)
            except:
                continue

    X = np.array(X)[..., np.newaxis]  # (samples, H, W, 1)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    y_cat = to_categorical(y_encoded)
    return X, y_cat, le


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

def build_cnn_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Dropout(0.2),

        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [22]:
# cnn_audio_emotion/train.py

import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from dataset_loader import load_dataset

from config import BATCH_SIZE, EPOCHS

# Incarca datele
X, y, label_encoder = load_dataset()

# Impartire train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Model
model = build_cnn_model(input_shape=X.shape[1:], num_classes=y.shape[1])
model.summary()

# Callback-uri pentru performanta
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

# Antrenare
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks
)

# Evaluare
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

# Salvare model in format Keras modern
model.save("emotion_cnn_audio_only.keras")


Angry: 100%|██████████| 2167/2167 [00:13<00:00, 160.93it/s]
Disgusted: 100%|██████████| 1863/1863 [00:12<00:00, 145.56it/s]
Fearful: 100%|██████████| 2047/2047 [00:14<00:00, 139.50it/s]
Happy: 100%|██████████| 2167/2167 [00:19<00:00, 112.67it/s]
Neutral: 100%|██████████| 1795/1795 [00:19<00:00, 92.17it/s] 
Sad: 100%|██████████| 2167/2167 [00:25<00:00, 84.54it/s] 
Suprised: 100%|██████████| 592/592 [00:06<00:00, 89.28it/s] 
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.1908 - loss: 26.4775 - val_accuracy: 0.3324 - val_loss: 1.6462
Epoch 2/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 72ms/step - accuracy: 0.3659 - loss: 1.5549 - val_accuracy: 0.4418 - val_loss: 1.3681
Epoch 3/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 75ms/step - accuracy: 0.4526 - loss: 1.3390 - val_accuracy: 0.4719 - val_loss: 1.2895
Epoch 4/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 71ms/step - accuracy: 0.5031 - loss: 1.2307 - val_accuracy: 0.4906 - val_loss: 1.2664
Epoch 5/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 64ms/step - accuracy: 0.5212 - loss: 1.1800 - val_accuracy: 0.5289 - val_loss: 1.1864
Epoch 6/30
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 63ms/step - accuracy: 0.5683 - loss: 1.0895 - val_accuracy: 0.5547 - val_loss: 1.1174
Epoch 7/30
[1m



Test Accuracy: 0.61


AttributeError: module 'h5py' has no attribute 'File'

In [7]:
from sklearn.model_selection import train_test_split
from dataset_loader import load_dataset
from model import build_cnn_model
from config import BATCH_SIZE, EPOCHS

# Încarcă datele
X, y, label_encoder = load_dataset()

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Construiește modelul
model = build_cnn_model(input_shape=X.shape[1:], num_classes=y.shape[1])
model.summary()

# Antrenează
model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_test, y_test))

# Evaluează
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

# Salvează modelul
model.save("emotion_cnn_audio_only.h5")


Epoch 1/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 59ms/step - accuracy: 0.1841 - loss: 60.2922 - val_accuracy: 0.2089 - val_loss: 1.8802
Epoch 2/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.2132 - loss: 1.8399 - val_accuracy: 0.2203 - val_loss: 1.8631
Epoch 3/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.2273 - loss: 1.8198 - val_accuracy: 0.2950 - val_loss: 1.7456
Epoch 4/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 64ms/step - accuracy: 0.3460 - loss: 1.6384 - val_accuracy: 0.4088 - val_loss: 1.4524
Epoch 5/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 68ms/step - accuracy: 0.4017 - loss: 1.4852 - val_accuracy: 0.4476 - val_loss: 1.3690
Epoch 6/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.4638 - loss: 1.3286 - val_accuracy: 0.4430 - val_loss: 1.3663
Epoch 7/30
[1m

In [26]:
# ------------------ Config + Imports ------------------

import os
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

IMG_SIZE = (128, 128)
DATASET_PATH = "./data/Emotions"
BATCH_SIZE = 32
EPOCHS = 50

# ------------------ Feature Extraction ------------------

def extract_mel_spectrogram(file_path, img_size=(128, 128), augment=False):
    y, sr = librosa.load(file_path, sr=22050)

    if augment:
        if np.random.rand() < 0.5:
            y = y + 0.005 * np.random.randn(len(y))  # Zgomot alb
        if np.random.rand() < 0.5:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=np.random.choice([-1, 1]))

    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=img_size[0])
    S_DB = librosa.power_to_db(S, ref=np.max)
    S_DB = librosa.util.fix_length(S_DB, size=img_size[1], axis=1)
    return S_DB

# ------------------ Dataset Loader ------------------

def load_dataset():
    X, y = [], []

    for emotion_folder in os.listdir(DATASET_PATH):
        folder_path = os.path.join(DATASET_PATH, emotion_folder)
        if not os.path.isdir(folder_path):
            continue

        for file in tqdm(os.listdir(folder_path), desc=emotion_folder):
            if not file.endswith(".wav"):
                continue
            try:
                file_path = os.path.join(folder_path, file)
                spec = extract_mel_spectrogram(file_path, IMG_SIZE, augment=True)
                X.append(spec)
                y.append(emotion_folder)
            except Exception as e:
                print(f"Error with file {file}: {e}")
                continue

    X = np.array(X)[..., np.newaxis]  # (samples, H, W, 1)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    y_cat = to_categorical(y_encoded)
    return X, y_cat, le

# ------------------ CNN Model ------------------

def build_cnn_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        Conv2D(256, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        GlobalAveragePooling2D(),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer=Adam(learning_rate=3e-4),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# ------------------ Training Pipeline ------------------

# Încarcă datele
X, y, label_encoder = load_dataset()

# Împărțire train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Construiește modelul
model = build_cnn_model(input_shape=X.shape[1:], num_classes=y.shape[1])
model.summary()

# Callback-uri
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

# Antrenează modelul
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks
)

# Evaluează modelul
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

# Salvează modelul în format compatibil TensorFlow
model.save("emotion_cnn_audio_only.keras", save_format="tf")


Angry: 100%|██████████| 2167/2167 [00:54<00:00, 39.95it/s]
Disgusted: 100%|██████████| 1863/1863 [01:32<00:00, 20.21it/s]
Fearful: 100%|██████████| 2047/2047 [01:38<00:00, 20.84it/s]
Happy: 100%|██████████| 2167/2167 [02:06<00:00, 17.19it/s]
Neutral: 100%|██████████| 1795/1795 [01:57<00:00, 15.22it/s]
Sad: 100%|██████████| 2167/2167 [02:51<00:00, 12.62it/s]
Suprised: 100%|██████████| 592/592 [00:41<00:00, 14.17it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 454ms/step - accuracy: 0.2314 - loss: 1.8772 - val_accuracy: 0.2816 - val_loss: 1.7990 - learning_rate: 3.0000e-04
Epoch 2/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 462ms/step - accuracy: 0.3704 - loss: 1.5330 - val_accuracy: 0.2922 - val_loss: 1.8122 - learning_rate: 3.0000e-04
Epoch 3/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 463ms/step - accuracy: 0.4533 - loss: 1.3647 - val_accuracy: 0.4316 - val_loss: 1.4204 - learning_rate: 3.0000e-04
Epoch 4/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 311ms/step - accuracy: 0.4997 - loss: 1.2728 - val_accuracy: 0.5137 - val_loss: 1.2533 - learning_rate: 3.0000e-04
Epoch 5/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 312ms/step - accuracy: 0.5209 - loss: 1.2160 - val_accuracy: 0.3887 - val_loss: 1.8365 - learning_rate: 3.0000e-04
Epoch 6/50
[1m320/320[0m [3



Test Accuracy: 0.65


AttributeError: module 'h5py' has no attribute 'File'