In [83]:
!pip install SpeechRecognition
!pip install matplotlib
!pip install librosa
!pip install pandas
!pip install tensorflow



In [1]:
import os
import librosa
import numpy as np
import json
import pandas as pd

Directory not found: /usr4/cs350rm/wnapier/.cache/kagglehub


In [86]:
import speech_recognition as sr
recognizer = sr.Recognizer()
# input a file_path to a .wav file
# returns the transcribed audio as a string
# we can use BERT like in the homework to then tokenize/make into array and analyze it
def getVectorOfWords(file_path):
    """Transcribe an audio file to text via ``recognizer.recognize_google``.

    Args:
        file_path: Path to an audio file that ``sr.AudioFile`` can open
            (the notebook passes .wav files).

    Returns:
        The transcription as a string, or ``None`` when the speech could not
        be understood or the recognition request itself failed.
    """
    # Relies on the module-level ``recognizer`` created next to the ``sr`` import.
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # Unintelligible audio: callers treat None as "skip this file".
        print("UnknownValue")
        return None
    except sr.RequestError as err:
        # Service unreachable / request rejected. Report it and skip the file
        # instead of aborting a long batch run halfway through.
        print(f"RequestError: {err}")
        return None

In [97]:
import librosa.display
import matplotlib.pyplot as plt
# input a file_path to a .wav file
# saves a PNG of the spectrogram and returns the file path to it
def getSpectogram(file_path, emotion_label):
    """Render a dB-scaled, log-frequency spectrogram of an audio file as a PNG.

    The image is written to ``./images/<emotion_label>/<file stem>.png``; the
    target folder is created when it does not exist yet.

    Args:
        file_path: Path to the audio file to load with ``librosa.load``.
        emotion_label: Name of the sub-folder of ``./images`` to save into.

    Returns:
        The path of the saved PNG as a string.
    """
    # sr=None keeps the file's native sample rate (replace with 16,000 if needed).
    # The local is called ``sample_rate`` so it cannot be mistaken for the
    # ``speech_recognition`` module alias ``sr`` used elsewhere in the notebook.
    y, sample_rate = librosa.load(file_path, sr=None)

    # Compute the spectrogram
    D = librosa.stft(y)                                      # Short-Time Fourier Transform
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)    # Convert to decibel scale

    # Work out the destination first so a bad path fails before any plotting.
    # TODO: figure out naming conventions for the file -- either use path or just have a counter that we pass in
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = f"./images/{emotion_label}"
    os.makedirs(output_dir, exist_ok=True)   # savefig does not create missing folders
    output_image_path = f"{output_dir}/{file_stem}.png"

    fig = plt.figure(figsize=(10, 6))
    try:
        # can change the cmap to "viridis" or "plasma" for different color themes
        # Log frequency scale to mimic human audio perception
        librosa.display.specshow(S_db, sr=sample_rate, x_axis="time", y_axis="log", cmap="magma")

        # TODO: at first try hiding as many extra features as possible and compare to when they're included
        # plt.colorbar(format="%+2.0f dB")           # Add a colorbar
        # plt.title("Spectrogram")
        # plt.xlabel("Time (s)")
        # plt.ylabel("Frequency (Hz)")
        plt.tight_layout()

        plt.savefig(output_image_path, dpi=300)    # Save as PNG with high resolution
    finally:
        # Always release the figure, even if plotting or saving raised.
        plt.close(fig)

    return output_image_path

In [98]:
# removes all .png files from the images folder (recursively) so subsequent runs don't have leftover images
def clearImagesFolder():
    """Delete every ``.png`` file found anywhere under ``<cwd>/images``.

    Folders and files with other extensions are left untouched. Each deleted
    path is reported on stdout.
    """
    images_root = os.getcwd() + "/images"
    # Bottom-up walk: the deepest folders are visited before their parents.
    for folder, _subfolders, names in os.walk(images_root, topdown=False):
        for name in names:
            if not name.endswith('.png'):
                continue
            target = os.path.join(folder, name)
            if not os.path.isfile(target):
                continue
            os.remove(target)
            print(target + " has been removed successfully")

In [99]:
# Parsed annotation tables keyed by CSV path, so a batch run reads each CSV once.
# Re-running this cell resets the cache (do that after editing the CSV on disk).
_EMOTION_TABLE_CACHE = {}


def _loadEmotionTable(csv_path):
    """Return the parsed annotation CSV for ``csv_path``, reading it only once."""
    if csv_path not in _EMOTION_TABLE_CACHE:
        _EMOTION_TABLE_CACHE[csv_path] = pd.read_csv(csv_path)
    return _EMOTION_TABLE_CACHE[csv_path]


def getTargetEmotionFromCSV(audio_file_name, csv_path='./train_sent_emo.csv'):
    """Look up the emotion label for an audio clip in the annotation CSV.

    Args:
        audio_file_name: Clip name of the form ``<3 chars><int>_<3 chars><int>.wav``
            (e.g. ``dia12_utt3.wav``). A leading directory is ignored.
        csv_path: CSV with ``Dialogue_ID``, ``Utterance_ID`` and ``Emotion``
            columns.

    Returns:
        The ``Emotion`` value of the first row whose IDs match the file name.

    Raises:
        ValueError: If the file name does not split into exactly two
            ``_``-separated parts or the IDs are not integers.
        IndexError: If no row in the CSV matches the parsed IDs.
    """
    # parse audio_file_name to get distinguishing file info for CSV lookup
    stem = os.path.basename(audio_file_name).split(".wav")[0]
    parts = stem.split('_')
    if len(parts) != 2:
        raise ValueError(
            f"Expected a name like 'dia<N>_utt<M>.wav', got {audio_file_name!r}"
        )
    # Drop the 3-character prefix of each part and keep the numeric ID.
    dialogueID, utteranceID = int(parts[0][3:]), int(parts[1][3:])

    table = _loadEmotionTable(csv_path)
    # Keep only the row(s) where both IDs match.
    matches = table[
        (table['Dialogue_ID'] == dialogueID) & (table['Utterance_ID'] == utteranceID)
    ]
    if matches.empty:
        raise IndexError(
            f"No row with Dialogue_ID={dialogueID} and Utterance_ID={utteranceID} in {csv_path}"
        )
    return matches['Emotion'].iloc[0]

In [102]:
def traverse_audio_files(directory="./train_splits_wav", max_files=200):
    """Build the training table from the .wav files in ``directory``.

    For each clip that can be transcribed, the emotion label is looked up,
    a spectrogram image is saved, and one row is recorded. Existing
    spectrogram PNGs are removed first so runs do not overlap.

    Args:
        directory: Folder containing the .wav clips.
        max_files: Stop once this many clips have been successfully
            processed (keeps the run short). ``None`` processes everything.

    Returns:
        A DataFrame with the columns ``Transcription``, ``Spectogram``
        (path of the saved image) and ``Emotion``; empty when no clip could
        be processed.
    """
    clearImagesFolder() # deletes everything from the image folder
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # selected subset the same on every run.
    for file_name in sorted(os.listdir(directory)):
        # The limit counts usable samples only, so stray non-.wav entries
        # no longer eat into the budget.
        if max_files is not None and len(data) >= max_files:
            break
        file_path = os.path.join(directory, file_name)
        if not (os.path.isfile(file_path) and file_name.endswith('.wav')):
            continue

        transcription = getVectorOfWords(file_path)
        # filter out the audio files that can't get a clear transcription
        if not transcription:
            continue
        emotion = getTargetEmotionFromCSV(file_name)
        image_path = getSpectogram(file_path, emotion)
        data.append({"Transcription": transcription, "Spectogram": image_path, "Emotion": emotion})

    # Explicit columns keep the schema stable even when nothing was collected.
    return pd.DataFrame(data, columns=["Transcription", "Spectogram", "Emotion"])

In [103]:
# Build the dataset with the default directory: one row per transcribed clip,
# holding its transcription, saved spectrogram path and emotion label.
df = traverse_audio_files()
print(df)

/projectnb/ds340/students/wnapier/images/joy/.ipynb_checkpoints/dia1001_utt6-checkpoint.png has been removed successfully
/projectnb/ds340/students/wnapier/images/joy/dia1001_utt6.png has been removed successfully
/projectnb/ds340/students/wnapier/images/neutral/dia532_utt1.png has been removed successfully
/projectnb/ds340/students/wnapier/images/neutral/dia575_utt1.png has been removed successfully
/projectnb/ds340/students/wnapier/images/neutral/dia845_utt8.png has been removed successfully
/projectnb/ds340/students/wnapier/images/neutral/dia689_utt12.png has been removed successfully
UnknownValue
UnknownValue
UnknownValue
UnknownValue
Transcription: Mrs M
UnknownValue
UnknownValue
UnknownValue
Transcription: why did you write
UnknownValue
Transcription: I heard what you said
Transcription: for a walk
Transcription: when did they made me head of purchasing thank you very much
Transcription: let's just get this out in the open okay
UnknownValue
Transcription: thank you Monica
Unknown

In [128]:
# start NN here
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import keras.callbacks
# code following homework model
def preprocessingNN(image_size=(180, 180), batch_size=32, directory="images",
                    validation_split=0.2, seed=1337):
    """Load the spectrogram images as training and validation datasets.

    Class labels are taken from the sub-folder names of ``directory`` and are
    one-hot encoded.

    Args:
        image_size: (height, width) every image is resized to.
        batch_size: Number of images per batch.
        directory: Root folder holding one sub-folder of images per class.
        validation_split: Fraction of the images reserved for validation.
        seed: Seed for the shuffle and split. Both subsets must use the same
            seed and split so that they do not overlap.

    Returns:
        A ``(train_ds, val_ds)`` tuple of datasets.
    """
    # TODO: update image size for our images
    # Settings shared by both subsets live in one place so they cannot drift apart.
    shared_kwargs = dict(
        validation_split=validation_split,
        seed=seed,
        image_size=image_size,
        batch_size=batch_size,
        label_mode='categorical',  # one-hot encodes
    )
    train_ds = tf.keras.utils.image_dataset_from_directory(
        directory, subset="training", **shared_kwargs
    )
    val_ds = tf.keras.utils.image_dataset_from_directory(
        directory, subset="validation", **shared_kwargs
    )
    return train_ds, val_ds

In [148]:
def fitAndRunNN(train_ds, val_ds, input_shape=(180, 180, 3), num_classes=None, epochs=15):
    """Build, compile and train a small CNN classifier on the image datasets.

    Network structure from https://keras.io/examples/vision/mnist_convnet/

    Args:
        train_ds: Training dataset yielding (images, one-hot labels) batches.
        val_ds: Validation dataset in the same format.
        input_shape: (height, width, channels) of the input images.
            TODO: change input shape to match image size.
        num_classes: Size of the softmax output. When ``None`` it is taken
            from ``train_ds.class_names`` if that attribute exists, otherwise
            7 (the previously hard-coded value).
        epochs: Maximum number of training epochs.

    Returns:
        A ``(model, history)`` tuple: the trained model and the object
        returned by ``model.fit``.
    """
    if num_classes is None:
        # One-hot labels are as wide as the number of class folders actually
        # present, which can be fewer than 7 on a small sample.
        class_names = getattr(train_ds, "class_names", None)
        num_classes = len(class_names) if class_names else 7

    model = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            layers.Rescaling(scale=1./255, offset=0.0),  # pixel values 0-255 -> 0-1
            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dense(num_classes, activation="softmax"),
        ]
    )
    model.summary()

    # fit the model
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    callbacks = [
        # Quit after 3 rounds of no validation loss improvement and keep the
        # best weights seen, since the model is now handed back to the caller.
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    ]

    # Validation data instead of fraction
    history = model.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=callbacks)
    return model, history

In [149]:
## TODO: add some more preprocessing to even out the categories of data
def runNN():
    """Load the image datasets with default settings and train the network on them."""
    datasets = preprocessingNN()
    # ``datasets`` is the (train_ds, val_ds) pair, unpacked positionally.
    fitAndRunNN(*datasets)
# Report how dominant the 'neutral' class is before training. An empty frame
# gets a clear message instead of a ZeroDivisionError.
if len(df) > 0:
    neutral_pct = (df['Emotion'] == "neutral").mean() * 100
    print(neutral_pct, "percent of the dataset is labeled 'neutral'.")
else:
    print("The dataset is empty, so the share of 'neutral' labels cannot be computed.")
runNN()

Found 200 files belonging to 7 classes.
Using 160 files for training.
Found 200 files belonging to 7 classes.
Using 40 files for validation.


Epoch 1/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 291ms/step - accuracy: 0.5189 - loss: 3.0521 - val_accuracy: 0.1500 - val_loss: 2.0626
Epoch 2/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 288ms/step - accuracy: 0.1201 - loss: 1.9040 - val_accuracy: 0.1000 - val_loss: 1.6342
Epoch 3/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 274ms/step - accuracy: 0.3530 - loss: 1.5624 - val_accuracy: 0.6250 - val_loss: 1.3508
Epoch 4/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 269ms/step - accuracy: 0.5510 - loss: 1.3484 - val_accuracy: 0.6250 - val_loss: 1.3497
Epoch 5/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 274ms/step - accuracy: 0.5016 - loss: 1.4180 - val_accuracy: 0.6250 - val_loss: 1.3578
Epoch 6/15
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 275ms/step - accuracy: 0.4885 - loss: 1.3505 - val_accuracy: 0.6250 - val_loss: 1.3019
Epoch 7/15
[1m5/5[0m [32m━━━━━━━━━━━━