In [1]:
!pip install SpeechRecognition
!pip install matplotlib
!pip install librosa
!pip install pandas
!pip install tensorflow
!pip install transformers datasets evaluate seqeval



In [2]:
import os
import librosa
import numpy as np
import json
import pandas as pd

In [3]:
import speech_recognition as sr
recognizer = sr.Recognizer()
# input a file_path to a .wav file
# returns the transcribed audio as a string, or None when no transcript could be obtained
# we can use BERT like in the homework to then tokenize/make into array and analyze it
def getVectorOfWords(file_path):
    """Transcribe one audio file using ``recognizer.recognize_google``.

    Args:
        file_path: Path to an audio file that ``sr.AudioFile`` can open.

    Returns:
        The transcript, or ``None`` when the speech could not be understood
        (``sr.UnknownValueError``) or the recognition request itself failed
        (``sr.RequestError``). Callers treat ``None`` as "skip this file".
    """
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return None
    except sr.RequestError as error:
        # A failed request should cost one file, not abort a long batch run;
        # report it so repeated failures are still visible.
        print(f"Speech recognition request failed for {file_path}: {error}")
        return None

In [4]:
# removes all files from images folder so subsequent runs don't have weird overlaps
def clearImagesFolder():
    """Delete every ``.png`` file under ``<current working dir>/images``.

    Sub-folders are searched too. Folders themselves and files with any other
    extension are left in place.
    """
    print("Deleting all data from images folder")
    images_root = f"{os.getcwd()}/images"
    # Bottom-up walk, matching the original traversal order.
    for current_dir, _subdirs, names in os.walk(images_root, topdown=False):
        png_paths = (os.path.join(current_dir, name) for name in names if name.endswith('.png'))
        for png_path in png_paths:
            if os.path.isfile(png_path):
                os.remove(png_path)
    print("All images removed successfully!")

In [5]:
import librosa.display
import matplotlib.pyplot as plt
# input a file_path to a .wav file
# saves a png of the spectrogram and returns the filepath to it
def getSpectogram(file_path, emotion_label):
    """Render a log-frequency spectrogram of an audio file and save it as a PNG.

    The image is written to ``./images/<emotion_label>/<name>.png`` where
    ``<name>`` is the file's base name up to its first dot. The target folder
    is created when it does not exist yet.

    Args:
        file_path: Path to the audio file, loaded with ``librosa.load``.
        emotion_label: Sub-folder of ``./images`` that the image is saved into.

    Returns:
        The path of the PNG that was written.
    """
    # sr=None preserves the file's own sample rate (replace with 16,000 if needed).
    # The local is not called `sr` so it cannot be confused with the
    # speech_recognition alias used elsewhere in the notebook.
    y, sample_rate = librosa.load(file_path, sr=None)

    # Compute the spectrogram
    D = librosa.stft(y)                                     # Short-Time Fourier Transform
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)   # Convert to decibel scale

    # Resolve the output location before plotting; basename also copes with
    # Windows separators that os.path.join may have produced.
    processed_path = os.path.basename(file_path).split(".")[0]
    output_dir = f"./images/{emotion_label}"
    os.makedirs(output_dir, exist_ok=True)
    output_image_path = f"{output_dir}/{processed_path}.png"  # TODO: figure out naming conventions for the file -- either use path or just have a counter that we pass in

    # Plot and save the spectrogram
    fig = plt.figure(figsize=(6, 6))  # 6x6 inches; saved at dpi=300 below, i.e. 1800x1800 px
    try:
        # can change the cmap to "viridis" or "plasma" for different color themes
        # Log frequency scale to mimic human audio perception
        librosa.display.specshow(S_db, sr=sample_rate, x_axis="time", y_axis="log", cmap="magma")

        # TODO: at first try hiding as many extra features as possible and compare to when they're included
        # plt.colorbar(format="%+2.0f dB")           # Add a colorbar
        # plt.title("Spectrogram")
        # plt.xlabel("Time (s)")
        # plt.ylabel("Frequency (Hz)")
        fig.tight_layout()

        # Save the spectrogram as an image file
        fig.savefig(output_image_path, dpi=300)
    finally:
        # Close this specific figure even if plotting/saving failed, so a long
        # traversal does not accumulate open figures.
        plt.close(fig)

    return output_image_path

In [6]:
# Parsed label tables keyed by absolute CSV path, so the file is read once per
# session instead of once per audio file. Re-running this cell resets the cache.
_EMOTION_TABLE_CACHE = {}


def getTargetEmotionFromCSV(audio_file_name, csv_path='./train_sent_emo.csv'):
    """Look up the emotion label for one audio file in the label CSV.

    The file name is split on ``_`` after removing ``.wav``; the first three
    characters of each half are dropped and the remainder parsed as an
    integer (e.g. ``dia12_utt3.wav`` -> dialogue 12, utterance 3). The
    prefixes themselves are not validated.

    Args:
        audio_file_name: Bare file name (no directory) of the audio clip.
        csv_path: CSV with ``Dialogue_ID``, ``Utterance_ID`` and ``Emotion``
            columns. Loaded once per path and then reused; edits made to the
            file afterwards are not picked up until the cache is reset.

    Returns:
        The ``Emotion`` value of the first row matching both IDs.

    Raises:
        ValueError: If the name does not split into exactly two parts or the
            ID parts are not integers.
        IndexError: If no row matches the parsed IDs.
    """
    # parse audio_file_name to get distinguishing file info for CSV lookup
    dialogue_part, utterance_part = (audio_file_name.split(".wav")[0]).split('_')
    dialogueID, utteranceID = int(dialogue_part[3:]), int(utterance_part[3:])

    cache_key = os.path.abspath(csv_path)
    table = _EMOTION_TABLE_CACHE.get(cache_key)
    if table is None:
        table = pd.read_csv(csv_path)
        _EMOTION_TABLE_CACHE[cache_key] = table

    # Keep only the row(s) where both IDs match
    is_match = (table['Dialogue_ID'] == dialogueID) & (table['Utterance_ID'] == utteranceID)
    emotions = table.loc[is_match, 'Emotion']
    if emotions.empty:
        # Same exception type as the bare .iloc[0] raised before, but with context.
        raise IndexError(
            f"No row with Dialogue_ID={dialogueID} and Utterance_ID={utteranceID} "
            f"in {csv_path} (from file name {audio_file_name!r})"
        )
    return emotions.iloc[0]

In [7]:
def traverse_audio_files(directory="./train_splits_wav", max_items=3000):
    """Build the dataset table from the ``.wav`` files in ``directory``.

    For each ``.wav`` file that yields a transcription, looks up its emotion
    label, renders its spectrogram image and records one row.

    Args:
        directory: Folder whose direct entries are scanned (no recursion).
            Entries are visited in whatever order ``os.listdir`` returns.
        max_items: Stop once this many entries have been counted, or ``None``
            for no limit. Files whose transcription comes back empty are
            skipped without being counted; every other entry (including
            non-``.wav`` ones) counts.

    Returns:
        A DataFrame with one row per processed file and the columns
        ``Transcription``, ``Spectogram`` (image path) and ``Emotion``.
        It has no rows and no columns when nothing was processed.
    """
    # clearImagesFolder() # deletes everything from the image folder
    data = []

    # Traverse and process .wav files
    print("Starting audio file traversal")
    counted = 0
    for file_name in os.listdir(directory):
        # limit the number of loops so this doesn't take THAT long
        if max_items is not None and counted >= max_items:
            break
        file_path = os.path.join(directory, file_name)

        if os.path.isfile(file_path) and file_name.endswith('.wav'):
            transcription = getVectorOfWords(file_path)
            # filter out the audio files that can't get a clear transcription
            if not transcription:
                continue
            emotion = getTargetEmotionFromCSV(file_name)
            image_path = getSpectogram(file_path, emotion)
            data.append({"Transcription": transcription, "Spectogram": image_path, "Emotion": emotion})
        counted += 1
    df = pd.DataFrame(data)
    print("Finished creating dataframe and traversing audio files")
    return df

In [None]:
# df = traverse_audio_files()
# df.to_csv('data3000.csv', index=False)
# print(df)

In [23]:
# start NN here
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import keras.callbacks
### code following homework model
def preprocessingNN(image_size=(600, 600), batch_size=32, data_dir="images9000",
                    validation_split=0.2, seed=1337):
    """Load the spectrogram images as training and validation datasets.

    Both datasets are built from the same folder with identical split
    fraction and seed; only the ``subset`` argument differs between the two
    calls.

    Args:
        image_size: ``(height, width)`` the images are resized to.
        batch_size: Number of images per batch.
        data_dir: Root image folder passed to ``image_dataset_from_directory``.
        validation_split: Fraction of the files reserved for validation.
        seed: Seed shared by both calls.

    Returns:
        ``(train_ds, val_ds)`` with one-hot (``label_mode='categorical'``) labels.
    """
    # Everything the two subsets must agree on lives in one place so the
    # calls cannot drift apart.
    shared_options = dict(
        validation_split=validation_split,
        seed=seed,
        image_size=image_size,
        batch_size=batch_size,
        label_mode='categorical',  # one-hot encodes
    )
    train_ds = tf.keras.preprocessing.image_dataset_from_directory(
        data_dir, subset="training", **shared_options
    )
    val_ds = tf.keras.preprocessing.image_dataset_from_directory(
        data_dir, subset="validation", **shared_options
    )
    return train_ds, val_ds

### BALANCED DATASET
# from tensorflow.keras.utils import image_dataset_from_directory
# import tensorflow as tf
# import os
# import numpy as np
# from tensorflow.keras.utils import image_dataset_from_directory

# def preprocessingNN(image_size=(600, 600), batch_size=32, max_neutral=1000):
#     print("cutting down dataset to use just", max_neutral, "neutral samples")
#     # Load the full dataset without splitting
#     dataset = image_dataset_from_directory(
#         "images9000",
#         image_size=image_size,
#         label_mode='categorical',  # One-hot encodes labels
#         batch_size=None  # Load as individual samples for easier processing
#     )

#     # Class names
#     class_names = dataset.class_names
#     neutral_idx = class_names.index("neutral")
    
#     # Separate "neutral" and "other" images
#     neutral_images = []
#     other_images = []

#     for image, label in dataset:
#         if tf.argmax(label).numpy() == neutral_idx:  # Check if it's neutral
#             neutral_images.append((image, label))
#         else:
#             other_images.append((image, label))
    
#     # Limit the number of "neutral" images
#     np.random.shuffle(neutral_images)
#     limited_neutral_images = neutral_images[:max_neutral]

#     # Combine limited "neutral" and other images
#     balanced_dataset = limited_neutral_images + other_images
#     np.random.shuffle(balanced_dataset)

#     # Convert lists back to TensorFlow dataset
#     balanced_images, balanced_labels = zip(*balanced_dataset)
#     balanced_dataset = tf.data.Dataset.from_tensor_slices((list(balanced_images), list(balanced_labels)))

#     # Split into training and validation datasets
#     split_index = int(len(balanced_dataset) * 0.8)
#     train_ds = balanced_dataset.take(split_index).batch(batch_size)
#     val_ds = balanced_dataset.skip(split_index).batch(batch_size)

#     return train_ds, val_ds

In [24]:
def fitAndRunNN(train_ds, val_ds, input_shape=(600, 600, 3), num_classes=7, epochs=15, patience=5):
    """Build, compile and train the spectrogram CNN.

    The network is four Conv2D + MaxPooling2D stages followed by Flatten,
    Dropout(0.5) and a softmax Dense layer. Pixel values are rescaled by
    1/255 inside the model.

    Args:
        train_ds: Training dataset yielding ``(images, one-hot labels)``.
        val_ds: Validation dataset in the same format.
        input_shape: ``(height, width, channels)`` of the images; must match
            the ``image_size`` the datasets were built with.
        num_classes: Size of the softmax output; must match the width of the
            one-hot labels.
        epochs: Maximum number of training epochs.
        patience: Epochs without ``val_loss`` improvement before stopping.

    Returns:
        The trained ``keras.Sequential`` model.
    """
    # Network structure from https://keras.io/examples/vision/mnist_convnet/
    model = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            layers.Rescaling(scale=1./255, offset=0.0),
            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dropout(0.5),  # added dropout to model4
            layers.Dense(num_classes, activation="softmax"),
        ]
    )
    model.summary()

    # fit the model
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    callbacks = [
        # Quit after `patience` rounds of no validation loss improvement.
        # restore_best_weights asks Keras to roll the model back to the
        # best-val_loss weights instead of keeping the last epoch's.
        keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=patience, restore_best_weights=True
        )
    ]

    model.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=callbacks)  # Validation data instead of fraction
    return model

In [25]:
## TODO: add some more preprocessing to even out the categories of data
def runNN():
    """Load the image datasets with default settings, train the CNN on them, and return the model."""
    datasets = preprocessingNN()
    return fitAndRunNN(*datasets)
# Report how dominant the 'neutral' class is, then train the image model.
df = pd.read_csv('data9000.csv')
print((df['Emotion'] == "neutral").sum() / len(df) * 100, "percent of the dataset is labeled 'neutral'.")
runNN()

49.09945184025059 percent of the dataset is labeled 'neutral'.
Found 3831 files belonging to 7 classes.
Using 3065 files for training.
Found 3831 files belonging to 7 classes.
Using 766 files for validation.


Epoch 1/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 574ms/step - accuracy: 0.5116 - loss: 1.5581 - val_accuracy: 0.4961 - val_loss: 1.5076
Epoch 2/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 576ms/step - accuracy: 0.5099 - loss: 1.5082 - val_accuracy: 0.4961 - val_loss: 1.5004
Epoch 3/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 575ms/step - accuracy: 0.5074 - loss: 1.4977 - val_accuracy: 0.4961 - val_loss: 1.4976
Epoch 4/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 575ms/step - accuracy: 0.5096 - loss: 1.4927 - val_accuracy: 0.4961 - val_loss: 1.5085
Epoch 5/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 584ms/step - accuracy: 0.5094 - loss: 1.4904 - val_accuracy: 0.4961 - val_loss: 1.4853
Epoch 6/15
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 583ms/step - accuracy: 0.5054 - loss: 1.4872 - val_accuracy: 0.4961 - val_loss: 1.5001
Epoch 7/15
[1m96/96[

<Sequential name=sequential_3, built=True>

In [15]:
## Start BERT here
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification
# below imports are from github repo linked in NLP homework
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import torch
import transformers as ppb

In [16]:
def modelBERT(df):
    """Encode the emotion labels as integers and load pretrained DistilBERT.

    Args:
        df: DataFrame with ``Transcription`` and ``Emotion`` columns.

    Returns:
        ``(tokenizer, model, batch)`` where ``batch`` holds the
        ``Transcription`` column and the integer-encoded ``Emotion`` column.
        The label-to-integer mapping is printed, not returned.
    """
    encoder = LabelEncoder()
    batch = df.loc[:, ['Transcription', 'Emotion']]
    # replace the emotions with numerical values (should we one-hot encode instead?)
    batch['Emotion'] = encoder.fit_transform(df['Emotion'])
    print("batch", batch)
    print("Emotion (value) counts:\n", batch['Emotion'].value_counts())  # ask for count of each value

    # Emotion name -> integer code, printed for later interpretation
    class_names = encoder.classes_
    emotion_mapping = {name: code for name, code in zip(class_names, encoder.transform(class_names))}
    print(emotion_mapping)

    # DistilBERT is used as the encoder here; a BertForSequenceClassification
    # head on 'bert-base-uncased' was the alternative that was tried and shelved.
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = ppb.DistilBertTokenizer.from_pretrained(pretrained_weights)
    model = ppb.DistilBertModel.from_pretrained(pretrained_weights)
    return tokenizer, model, batch

In [17]:
def prepData(tokenizer, batch):
    """Tokenize the transcriptions and pad them into one rectangular array.

    Args:
        tokenizer: Object whose ``encode(text, add_special_tokens=True)``
            returns a list of token ids.
        batch: DataFrame with a ``Transcription`` column.

    Returns:
        ``(padded, attention_mask)``: the token ids right-padded with 0 to the
        longest sequence, and an array of the same shape holding 1 where the
        id is non-zero and 0 elsewhere.
    """
    token_lists = batch['Transcription'].apply(
        lambda text: tokenizer.encode(text, add_special_tokens=True)
    ).values

    # pad them all to same length
    longest = max((len(ids) for ids in token_lists), default=0)
    padded = np.array([ids + [0] * (longest - len(ids)) for ids in token_lists])
    print("Padded shape:", padded.shape)

    # create mask to tell BERT to ignore padding
    attention_mask = np.where(padded != 0, 1, 0)
    print("Attention shape:", attention_mask.shape)
    return padded, attention_mask

In [20]:
def trainAndRunBERT(padded, attention_mask, bert_model=None, labels=None, batch_size=64, random_state=42):
    """Extract DistilBERT features and score a random forest on them.

    Each row of ``padded`` is run through the model and the hidden state at
    the first token position is kept as that row's feature vector. A
    ``RandomForestClassifier`` is then fit on a train split and its ``score``
    on the held-out split is printed.

    Args:
        padded: 2-D array of token ids (as returned by ``prepData``).
        attention_mask: Array of the same shape marking non-padding positions.
        bert_model: Model to run. Defaults to the notebook-level ``model``
            variable, which is what this function always used before.
        labels: One label per row of ``padded``. Defaults to the
            notebook-level ``batch['Emotion']``.
        batch_size: Rows per forward pass; must be positive.
        random_state: Seed for the train/test split so the printed score is
            repeatable. Pass ``None`` for an unseeded split.

    Returns:
        None. The score is printed.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if bert_model is None:
        bert_model = model
    if labels is None:
        labels = batch['Emotion']

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # Run the model in slices rather than one pass over the whole dataset,
    # which keeps peak memory bounded by batch_size.
    feature_chunks = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            last_hidden_states = bert_model(input_ids[start:stop], attention_mask=attention_mask[start:stop])
            # [0] is the model's first output; [:, 0, :] keeps position 0 of every sequence
            feature_chunks.append(last_hidden_states[0][:, 0, :])
    features = torch.cat(feature_chunks).numpy()

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=random_state
    )

    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_clf.fit(train_features, train_labels)
    print("Score:", rf_clf.score(test_features, test_labels))

In [19]:
# Driver cell for the text model: load the saved transcription table, then run
# the three BERT steps in order.
# NOTE(review): data3000.csv is presumably the file written by the commented-out
# traverse_audio_files cell -- confirm it exists before running.
df = pd.read_csv('data3000.csv')
# `model` and `batch` must keep these exact module-level names: trainAndRunBERT
# reads them as globals when called with only two arguments.
tokenizer, model, batch = modelBERT(df)
padded, attention_mask = prepData(tokenizer, batch)
trainAndRunBERT(padded, attention_mask)

batch                                           Transcription  Emotion
0                                                 Mrs M        4
1                                     why did you write        4
2                                 I heard what you said        4
3                                            for a walk        4
4     when did they made me head of purchasing thank...        3
...                                                 ...      ...
2995                                               fine        0
2996                                      say something        6
2997                       no he's leaving for a better        4
2998                                      you stole the        0
2999                               we need a porn break        4

[3000 rows x 2 columns]
Emotion (value) counts:
 Emotion
4    1469
3     479
0     326
6     317
5     252
2      80
1      77
Name: count, dtype: int64
{'anger': np.int64(0), 'disgust': np.int64(1), 'fear': np.in