<a href="https://colab.research.google.com/github/suryavamshi866/speech-emotion-recognition/blob/main/Untitled67.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install os numpy librosa keras soundfile

In [1]:
import os

# --- Environment Setup (Critical for avoiding system conflicts) ---
# These variables must be set BEFORE Keras (and the TensorFlow backend it pulls in)
# is imported; setting them afterwards does not reliably affect the startup logs.
# Force TensorFlow to run on CPU if you don't need a GPU (good for debugging)
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Hide TensorFlow native logs (0=show all, 1=hide INFO, 2=also hide WARNING, 3=also hide ERROR)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import librosa
from keras.models import load_model
# Use soundfile for reading audio to explicitly avoid common audioread/backend issues
import soundfile as sf

# --- Global Constants ---
MODEL_PATH = "speech_emotion_model.h5"
AUDIO_PATH = "recorded_audio.wav"
EMOTION_LABELS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'ps', 'sad']
N_MFCC = 40

# --- Model Loading ---
# Check for the file explicitly: load_model does not consistently raise
# FileNotFoundError for a missing path, so an `except FileNotFoundError` is unreliable.
if not os.path.isfile(MODEL_PATH):
    print(f"❌ Error: Model file not found at {MODEL_PATH}")
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception stops "Run All" at this cell and keeps the session alive.
    raise FileNotFoundError(MODEL_PATH)

try:
    # compile=False: the model is only used for inference, so the optimizer/loss
    # state stored in the file does not need to be restored.
    model = load_model(MODEL_PATH, compile=False)
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise
print("✅ Model loaded successfully!")

# --- Feature Extraction Function ---
def extract_mfcc(filename: str, target_sr: int = 22050, offset: float = 0.5,
                 duration: float = 5.0) -> "np.ndarray | None":
    """
    Extract a time-averaged MFCC vector from an audio file using soundfile/librosa.

    The file is read at its native sample rate, down-mixed to mono, trimmed to
    `duration` seconds starting `offset` seconds in, and resampled to `target_sr`
    before the MFCCs are computed. The default of 22050 Hz is the rate
    `librosa.load` resamples to by default, so the features match those produced by a
    `librosa.load(filename, duration=5, offset=0.5)` pipeline. Without the resampling
    step, a 44.1 kHz recording would yield MFCCs on a different frequency/time grid.

    Args:
        filename: Path to the audio file.
        target_sr: Sample rate (Hz) the signal is resampled to before feature extraction.
        offset: Seconds to skip at the start of the file.
        duration: Maximum number of seconds to analyse after the offset.

    Returns:
        1-D array with N_MFCC values (mean of each coefficient over all frames), or
        None if the file is missing, too short, or cannot be processed. The reason is
        printed in that case.
    """
    # soundfile does not reliably raise FileNotFoundError for a missing path,
    # so test for the file explicitly to give the specific message.
    if not os.path.isfile(filename):
        print(f"❌ Error: Audio file not found at {filename}")
        return None

    try:
        # Use soundfile to load the audio data at its original sample rate
        y, sr = sf.read(filename, dtype='float32')

        # soundfile returns (frames, channels) for multi-channel audio; librosa expects (channels, frames)
        if y.ndim > 1:
            y = librosa.to_mono(y.T)

        # Keep `duration` seconds of signal after skipping the first `offset` seconds
        start_sample = int(offset * sr)
        end_sample = start_sample + int(duration * sr)
        y = y[start_sample:end_sample]

        if y.size == 0:
            print(f"❌ Error during feature extraction: audio is shorter than the {offset}s offset")
            return None

        # librosa.feature.mfcc does NOT resample; its `sr` argument only describes the input.
        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

        # mfcc has shape (n_mfcc, frames); average over the frame axis
        mfcc = librosa.feature.mfcc(y=y, sr=target_sr, n_mfcc=N_MFCC)
        return np.mean(mfcc, axis=1)

    except Exception as e:
        print(f"❌ Error during feature extraction: {e}")
        return None

# --- Prediction Function ---
def predict_emotion(audio_path: str = AUDIO_PATH):
    """
    Load an audio file, extract its mean-MFCC features, and predict the emotion.

    Prints the per-label confidence scores and the winning label.

    Args:
        audio_path: Path of the audio file to classify (defaults to AUDIO_PATH).

    Returns:
        The predicted label from EMOTION_LABELS, or None if feature extraction or
        the model call failed (the reason is printed in that case).
    """
    mfcc = extract_mfcc(audio_path)

    if mfcc is None:
        return None

    # The model is fed one sample shaped (1, n_mfcc, 1): a leading batch axis and a
    # trailing channel axis around the mean-MFCC vector.
    # NOTE(review): assumes the saved model was trained on this input shape — confirm.
    mfcc = np.expand_dims(mfcc, axis=0)   # Shape (1, n_mfcc)
    mfcc = np.expand_dims(mfcc, axis=-1)  # Shape (1, n_mfcc, 1)

    try:
        prediction = model.predict(mfcc, verbose=0)
        scores = prediction[0]
        emotion = EMOTION_LABELS[int(np.argmax(scores))]

        # Convert to plain Python floats so the dict prints as {'angry': 0.027, ...}
        # instead of {'angry': np.float32(0.027), ...} under NumPy 2.
        confidences = {
            label: round(float(score), 3)
            for label, score in zip(EMOTION_LABELS, scores)
        }

        print("\n--- Prediction Result ---")
        print(f"File: {audio_path}")
        print(f"Confidence Scores: {confidences}")
        print(f"🎯 Predicted Emotion: {emotion.upper()}")
        print("-------------------------\n")
        return emotion

    except Exception as e:
        print(f"❌ Error during model prediction: {e}")
        return None

# --- Main Execution ---
# Run one prediction on the default AUDIO_PATH when this code is executed as a script
# or as a notebook cell (__name__ is "__main__" in both cases); skipped on import.
if __name__ == "__main__":
    predict_emotion()

✅ Model loaded successfully!

--- Prediction Result ---
File: recorded_audio.wav
Confidence Scores: {'angry': np.float32(0.027), 'disgust': np.float32(0.061), 'fear': np.float32(0.826), 'happy': np.float32(0.002), 'neutral': np.float32(0.055), 'ps': np.float32(0.027), 'sad': np.float32(0.002)}
🎯 Predicted Emotion: FEAR
-------------------------



In [4]:
pip install portaudio


[31mERROR: Could not find a version that satisfies the requirement portaudio (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for portaudio[0m[31m
[0m

In [5]:
pip install pyaudio


Collecting pyaudio
  Downloading PyAudio-0.2.14.tar.gz (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyaudio
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for pyaudio [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for pyaudio (pyproject.toml) ... [?25l[?25herror
[31m  ERROR: Failed building wheel for pyaudio[0m

In [2]:
import numpy as np
import librosa
import pyaudio
import wave
from keras.models import load_model

# Load the trained model.
# compile=False: the model is only used for inference here, so the optimizer/loss
# state stored in the file does not need to be restored.
model = load_model("speech_emotion_model.h5", compile=False)
print("✅ Model loaded successfully!")

# Emotion labels, indexed by the model's output position (update if your model uses different ones)
emotion_labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'ps', 'sad']

# Audio recording configuration
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1              # Mono audio
RATE = 44100              # Sampling rate (Hz)
CHUNK = 1024              # Frames read from the stream per call
RECORD_SECONDS = 5        # Recording duration (seconds)
WAVE_OUTPUT_FILENAME = "recorded_audio.wav"


def record_audio():
    """Record RECORD_SECONDS of audio from the microphone and save it as a WAV file.

    Uses the module-level FORMAT, CHANNELS, RATE and CHUNK settings. The input stream
    and the PyAudio instance are always released, even if opening or reading fails.

    NOTE(review): needs a local audio input device; hosted runtimes such as Colab
    presumably expose none — confirm before relying on this there.

    Returns:
        WAVE_OUTPUT_FILENAME, the path of the file that was written.

    Raises:
        Whatever PyAudio raises when the stream cannot be opened or read
        (propagated after cleanup).
    """
    # Whole chunks only: the recording may be up to one chunk shorter than RECORD_SECONDS.
    n_chunks = int(RATE / CHUNK * RECORD_SECONDS)
    frames = []

    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(FORMAT)
        print(f"🎤 Recording for {RECORD_SECONDS} seconds... Speak now!")

        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)
        try:
            for _ in range(n_chunks):
                # Don't abort the whole recording if the input buffer overflows once.
                frames.append(stream.read(CHUNK, exception_on_overflow=False))
        finally:
            stream.stop_stream()
            stream.close()

        print("✅ Recording complete!")
    finally:
        audio.terminate()

    # Save the recorded audio; the context manager closes the file even on error.
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    return WAVE_OUTPUT_FILENAME


def extract_mfcc(filename):
    """Return the time-averaged MFCC vector of an audio file.

    Loads up to 5 seconds of audio starting 0.5 seconds into the file, computes
    40 MFCCs per frame, and averages each coefficient over all frames.
    """
    signal, sample_rate = librosa.load(filename, duration=5, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    # Transpose to (frames, coefficients) so that averaging over axis 0 collapses time.
    frames_first = coefficients.T
    return np.mean(frames_first, axis=0)


def predict_emotion():
    """Record from the microphone, compute MFCC features, and print the predicted emotion."""
    recorded_path = record_audio()
    features = extract_mfcc(recorded_path)

    # Wrap the feature vector with a leading and a trailing singleton axis before
    # handing it to the model.
    model_input = np.expand_dims(features, axis=(0, -1))
    scores = model.predict(model_input)

    best_index = np.argmax(scores)
    label = emotion_labels[best_index]
    print(f"🎯 Predicted Emotion: {label.upper()}")


# Entry point: record from the microphone and print the predicted emotion when this code
# is executed as a script or as a notebook cell (__name__ is "__main__" in both cases).
if __name__ == "__main__":
    predict_emotion()


ModuleNotFoundError: No module named 'pyaudio'