In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import Adam


In [2]:
# Set the path to the RAVDESS dataset
data_path = "C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features"

# Define function to extract emotion, gender, and file path for each audio file
def load_data(data_path):
    emotion_labels = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    }

    data = []
    for subdir, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith(".wav"):
                file_path = os.path.join(subdir, file)
                file_name = os.path.basename(file_path).split(".")[0]
                emotion = emotion_labels[file_name.split("-")[2]]
                data.append((file_path, emotion))
    return data

data = load_data(data_path)


In [3]:
def extract_features(data):
    X, y = [], []
    for file_path, emotion in data:
        features = librosa.feature.mfcc(y=librosa.load(file_path)[0], n_mfcc=40)
        X.append(np.mean(features, axis=1))
        y.append(emotion)
    return np.array(X), np.array(y)

X, y = extract_features(data)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [6]:
y_train = pd.get_dummies(y_train).values
y_test_encoded = pd.get_dummies(y_test).values


In [7]:
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.1)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [10]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test_encoded, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Accuracy: {accuracy}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)
print(f"Confusion Matrix:\n{conf_matrix}")

# Create a function to predict the emotion from an audio file
def predict_emotion(file_path, model, scaler):
    features = librosa.feature.mfcc(y=librosa.load(file_path)[0], n_mfcc=40)
    features_scaled = scaler.transform(np.mean(features, axis=1).reshape(1, -1))
    prediction = model.predict(features_scaled)
    emotion_labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
    return emotion_labels[np.argmax(prediction)]

# Test the prediction function
file_path = "C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features/Actor_11/03-01-01-01-01-01-11.wav"
print(f"Predicted emotion: {predict_emotion(file_path, model, scaler)}")


Accuracy: 0.9137181874612043
Confusion Matrix:
[[224   0   3   1   2   2   1   5]
 [  0  65   1   0   0   3   2   0]
 [  1   4 177   0   2   2   0   2]
 [  8   2   0 203   2   0  17   3]
 [  2   6   2   3 224   4   0   2]
 [  0  13   2   0   0 181   3   0]
 [  0  11   1   9   3   2 204   2]
 [  1   1   2   0   4   2   1 194]]
Predicted emotion: calm


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_test_classes, y_pred_classes, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_test_classes, y_pred_classes, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_test_classes, y_pred_classes, average='weighted')
print(f"F1 Score: {f1}")


Accuracy: 0.9137181874612043
Precision: 0.9194426128429559
Recall: 0.9137181874612043
F1 Score: 0.9152012730507811


In [12]:
from sklearn.metrics import matthews_corrcoef

# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test_classes, y_pred_classes)
print(f"Matthews Correlation Coefficient: {mcc}")


Matthews Correlation Coefficient: 0.9009190895114253


In [13]:
!pip install SpeechRecognition textblob





[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: C:\Python311\python.exe -m pip install --upgrade pip


In [14]:
pip install SpeechRecognition textblob


Note: you may need to restart the kernel to use updated packages.


In [16]:
import speech_recognition as sr
from textblob import TextBlob

def speech_to_text(file_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            return "Unrecognized speech"
        except sr.RequestError as e:
            return f"Could not request results: {e}"

def sentiment_polarity(text):
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features')

for file_path, emotion in data:
    text = speech_to_text(file_path)
    polarity = sentiment_polarity(text)
    print(f"Audio file: {file_path}\nEmotion: {emotion}\nText: {text}\nPolarity: {polarity}\n")


Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-01-01.wav
Emotion: neutral
Text: talking by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-02-01.wav
Emotion: neutral
Text: kids talking by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-01-01.wav
Emotion: neutral
Text: dogs sitting by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-02-01.wav
Emotion: neutral
Text: talk to Siri why the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-02-01-01-01-01.wav
Emotion: calm
Text: Unrecognized speech
Polarity: 0.0

Audio f

KeyboardInterrupt: 

In [43]:
import speech_recognition as sr
from textblob import TextBlob

def speech_to_text(file_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            return "Unrecognized speech"
        except sr.RequestError as e:
            return f"Could not request results: {e}"

def sentiment_polarity(text):
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def effective_emotion(emotion, polarity):
    if polarity > 0.2:
        return "positive"
    elif polarity < -0.2:
        return "negative"
    else:
        return emotion

data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features')

for file_path, emotion in data:
    text = speech_to_text(file_path)
    polarity = sentiment_polarity(text)
    eff_emotion = effective_emotion(emotion, polarity)
    print(f"Audio file: {file_path}\nEmotion: {emotion}\nText: {text}\nPolarity: {polarity}\nEffective emotion: {eff_emotion}\n")


Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-01-01.wav
Emotion: neutral
Text: talking by the door
Polarity: 0.0
Effective emotion: neutral

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-02-01.wav
Emotion: neutral
Text: kids talking by the door
Polarity: 0.0
Effective emotion: neutral

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-01-01.wav
Emotion: neutral
Text: dogs sitting by the door
Polarity: 0.0
Effective emotion: neutral

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-02-01.wav
Emotion: neutral
Text: talk to Siri why the door
Polarity: 0.0
Effective emotion: neutral

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/datas

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [27]:
from collections import defaultdict

def predict_average_emotion(data):
    # Create a dictionary to store the total polarity and count for each emotion category
    emotions = defaultdict(lambda: {"polarity": 0, "count": 0})

    # Iterate over the audio files and calculate the speech polarity and emotion for each one
    for file_path, emotion in data:
        text = speech_to_text(file_path)
        polarity = sentiment_polarity(text)

        # Add the polarity and count to the corresponding emotion category
        emotions[emotion]["polarity"] += polarity
        emotions[emotion]["count"] += 1

    # Calculate the average polarity for each emotion category
    average_emotions = {}
    for emotion, values in emotions.items():
        if values["count"] == 0:
            average_emotions[emotion] = 0
        else:
            average_emotions[emotion] = values["polarity"] / values["count"]

    return average_emotions
data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features/Actor_16/03-02-06-02-01-02-16.wav')
average_emotions = predict_average_emotion(data)
print(average_emotions)



{}


In [28]:
data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features/Actor_16/03-02-06-02-01-02-16.wav')
average_emotions = predict_average_emotion(data)
print(average_emotions)

{}


In [29]:
data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features')
print(data)


[('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-01-01-01-01-01.wav', 'neutral'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-01-01-01-02-01.wav', 'neutral'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-01-01-02-01-01.wav', 'neutral'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-01-01-02-02-01.wav', 'neutral'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-02-01-01-01-01.wav', 'calm'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-02-01-01-02-01.wav', 'calm'), ('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\\Actor_01\\03-01-02-01-02-01-01.wav', 'calm'), 

In [30]:
for file_path, emotion in data:
    text = speech_to_text(file_path)
    polarity = sentiment_polarity(text)
    print(f"Audio file: {file_path}\nEmotion: {emotion}\nText: {text}\nPolarity: {polarity}\n")


Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-01-01.wav
Emotion: neutral
Text: talking by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-01-02-01.wav
Emotion: neutral
Text: kids talking by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-01-01.wav
Emotion: neutral
Text: dogs sitting by the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-01-01-02-02-01.wav
Emotion: neutral
Text: talk to Siri why the door
Polarity: 0.0

Audio file: C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features\Actor_01\03-01-02-01-01-01-01.wav
Emotion: calm
Text: Unrecognized speech
Polarity: 0.0

Audio f

KeyboardInterrupt: 

In [32]:
def predict_average_emotion(data):
    emotions = defaultdict(lambda: {"polarity": 0, "count": 0})
    for file_path, emotion in data:
        text = speech_to_text(file_path)
        polarity = sentiment_polarity(text)
        emotions[emotion]["polarity"] += polarity
        emotions[emotion]["count"] += 1
    print(emotions)  # print the emotions dictionary to check if it is being populated correctly
    average_emotions = {}
    for emotion, values in emotions.items():
        if values["count"] == 0:
            average_emotions[emotion] = 0
        else:
            average_emotions[emotion] = values["polarity"] / values["count"]
    return average_emotions
print(average_emotions)


{}


In [18]:
pip install pocketsphinx


Collecting pocketsphinx
  Using cached pocketsphinx-5.0.0.tar.gz (33.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting sounddevice
  Using cached sounddevice-0.4.6-py3-none-win_amd64.whl (199 kB)
Building wheels for collected packages: pocketsphinx
  Building wheel for pocketsphinx (PEP 517): started
  Building wheel for pocketsphinx (PEP 517): finished with status 'error'
Failed to build pocketsphinx
Note: you may need to restart the kernel to use updated packages.


  ERROR: Command errored out with exit status 1:
   command: 'c:\Users\Vivek\AppData\Local\Programs\Python\Python39\python.exe' 'c:\Users\Vivek\AppData\Local\Programs\Python\Python39\lib\site-packages\pip\_vendor\pep517\_in_process.py' build_wheel 'C:\Users\Vivek\AppData\Local\Temp\tmpuovkqh44'
       cwd: C:\Users\Vivek\AppData\Local\Temp\pip-install-2ftfo78b\pocketsphinx
  Complete output (308 lines):
  
  
  --------------------------------------------------------------------------------
  -- Trying 'Ninja (Visual Studio 17 2022 x64 v143)' generator
  --------------------------------
  ---------------------------
  ----------------------
  -----------------
  ------------
  -------
  --
  Not searching for unused variables given on the command line.
  -- The C compiler identification is unknown
  CMake Error at CMakeLists.txt:3 (ENABLE_LANGUAGE):
    No CMAKE_C_COMPILER could be found.
  
    Tell CMake where to find the compiler by setting either the environment
    variable "CC" o

In [20]:
import os
from pocketsphinx import AudioFile, get_model_path, get_data_path
from textblob import TextBlob

def speech_to_text(file_path):
    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }

    recognizer = AudioFile(**config)
    text = ""
    for phrase in recognizer:
        text += phrase.segments(detailed=True)[0][0]
    return text

def sentiment_polarity(text):
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

data = load_data('C:/Users/Vivek/Desktop/6th_sem_miniproj_full_code/full_code_speech_emotion/dataset_features')

for file_path, emotion in data:
    text = speech_to_text(file_path)
    polarity = sentiment_polarity(text)
    print(f"Audio file: {file_path}\nEmotion: {emotion}\nText: {text}\nPolarity: {polarity}\n")


ModuleNotFoundError: No module named 'pocketsphinx'

In [37]:
from textblob import TextBlob

text = "i love you"
blob = TextBlob(text)
polarity_score = blob.sentiment.polarity

if polarity_score > 0:
    print("Positive emotion")
elif polarity_score < 0:
    print("Negative emotion")
else:
    print("Neutral emotion")


Positive emotion


In [42]:
import speech_recognition as sr
from textblob import TextBlob

def speech_to_text(file_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            return "Unrecognized speech"
        except sr.RequestError as e:
            return f"Could not request results: {e}"

def get_emotion(file_path):
    text = speech_to_text(file_path)
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    print(polarity_score)

    if polarity_score > 0:
        return "Positive emotion"
    elif polarity_score < 0:
        return "Negative emotion"
    else:
        return "Neutral emotion"

# Example usage
file_path = "dataset_features/Actor_26/10-30-04-50-16-93-69.wav"
emotion_category = get_emotion(file_path)
print(emotion_category)


0.0
Neutral emotion
