# Add noise to data

## Goal

The goal is to add noise to data to test whether model works well.

## Model

We plan to use the Whisper embedings from OpenAI and train a classification model, either using Whisper with a sequence classification head or another classification LLM.

## Data

The data in this notebook is publicly available voice recordings featuring hypernasality and control groups. In the future we hope to train our model on private patient data from Vanderbilt University Medical Center (VUMC).

In [5]:
!pip install torch
!pip install datasets
!pip install librosa
!pip install transformers

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [18]:
# import libraries
import datasets
from datasets import load_dataset, DatasetDict,  Audio
import pandas as pd
import os
import glob
import librosa
import io
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.metrics import f1_score, classification_report, accuracy_score

## Practice

### Preperation

In [19]:
# prompt: mount google drive

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
folder_path = "/content/drive/My Drive/vandy 24spring/SLP project/data/Public data/updated_data" #set a path for the files
files = os.listdir(folder_path)
print(files)

train_catalog = "/content/drive/My Drive/vandy 24spring/SLP project/data/Public data/updated_data/train.csv"
test_catalog = "/content/drive/My Drive/vandy 24spring/SLP project/data/Public data/updated_data/test.csv"

['test.csv', 'train.csv']


In [None]:
!pip install SoundFile
import soundfile as sf

### Define the function

In [80]:
# Define the Noise-Adding Function
def add_gaussian_noise(audio_path, noise_level):
    # Load the audio file
    signal, sr = librosa.load(audio_path, sr=None)

    # Generate Gaussian noise
    noise = np.random.normal(0, noise_level, signal.shape)

    # Add the noise to the original signal
    noisy_signal = signal + noise

    return noisy_signal, sr

### Add noise

In [91]:
# Define a directory to save the noisy audio files
noisy_audio_dir = "C:/Users/hhanl/Downloads/"
os.makedirs(noisy_audio_dir, exist_ok=True)

# Specify the path to your MP3 file
audio_path = '/content/drive/My Drive/vandy 24spring/SLP project/data/Public data/CASES/Video 6_7 (buy baby a bib).mp3'

# Add Gaussian noise to the audio file
noisy_signal, sr = add_gaussian_noise(audio_path, noise_level=0.1) # you can change the noise level

# Construct a path for the noisy audio file
noisy_file_path = os.path.join(noisy_audio_dir, 'noisy_' + os.path.basename(audio_path))

# Save the noisy audio file
sf.write(noisy_file_path, noisy_signal, sr)

print(f"Noisy audio file saved to: {noisy_file_path}")

Noisy audio file saved to: C:/Users/hhanl/Downloads/noisy_Video 6_7 (buy baby a bib).mp3


### Play the sound

In [92]:
from IPython.display import Audio

audio_path = noisy_file_path
Audio(audio_path)

# Draft

### Split Data

We need to split our data into train and test sets, then save those for further experiments.

In [29]:
train_metadata = pd.read_csv(train_catalog)
train_metadata

Unnamed: 0,File_Name,Sampling_Rate_(Hz),Channels,Duration_(seconds),folder,hypernasality,original_text,OPENAI_Whisper_text,WAV_filename,WAV_folder
0,ACPA ted had a dog with white feet-3.mp3,44100.0,1.0,4.13,CASES,1.0,ted had a dog with white feet,Ted and a dog with white feet.,ACPA ted had a dog with white feet-3.wav,CASES_WAV
1,cdc 4 (and then go to school).mp3,44100.0,2.0,1.41,CONTROLS,0.0,and then go to school,and then go to school.,cdc 4 (and then go to school).wav,CONTROLS_WAV
2,Video 1_4 (and can I have some more material).mp3,44100.0,2.0,3.60,CONTROLS,0.0,and can I have some more material,And can I have some more material?,Video 1_4 (and can I have some more material).wav,CONTROLS_WAV
3,NEW - video 2 (three times).mp3,44100.0,2.0,1.28,CONTROLS,0.0,three times,Three times.,NEW - video 2 (three times).wav,CONTROLS_WAV
4,cdc 4 (and then he brushed his teeth).mp3,44100.0,2.0,1.52,CONTROLS,0.0,and then he brushed his teeth,And then he brushed his teeth.,cdc 4 (and then he brushed his teeth).wav,CONTROLS_WAV
...,...,...,...,...,...,...,...,...,...,...
142,video 1 (pizza bundt).mp3,44100.0,2.0,1.80,CONTROLS,0.0,pizza bundt,Pizza Funt!,video 1 (pizza bundt).wav,CONTROLS_WAV
143,ACPA most boys like to play football-3.mp3,48000.0,1.0,3.31,CASES,1.0,most boys like to play football,Most boys like to play football.,ACPA most boys like to play football-3.wav,CASES_WAV
144,Facebook (take a tire).mp3,44100.0,1.0,1.75,CASES,1.0,take a tire,See you next time!,Facebook (take a tire).wav,CASES_WAV
145,Video 5_1 (feet).mp3,44100.0,2.0,1.04,CASES,1.0,feet,Peace.,Video 5_1 (feet).wav,CASES_WAV


# References

In [30]:
# add cols for wav data

# Replace ".mp3" with ".wav" in the "Filename" column
train_metadata['WAV_filename'] = train_metadata['File_Name'].str.replace('.mp3', '.wav')

# Create "WAV_folder" column by concatenating "_WAV" to the "folder" column
train_metadata['WAV_folder'] = train_metadata['folder'] + "_WAV"

train_metadata


  train_metadata['WAV_filename'] = train_metadata['File_Name'].str.replace('.mp3', '.wav')


Unnamed: 0,File_Name,Sampling_Rate_(Hz),Channels,Duration_(seconds),folder,hypernasality,original_text,OPENAI_Whisper_text,WAV_filename,WAV_folder
0,ACPA ted had a dog with white feet-3.mp3,44100.0,1.0,4.13,CASES,1.0,ted had a dog with white feet,Ted and a dog with white feet.,ACPA ted had a dog with white feet-3.wav,CASES_WAV
1,cdc 4 (and then go to school).mp3,44100.0,2.0,1.41,CONTROLS,0.0,and then go to school,and then go to school.,cdc 4 (and then go to school).wav,CONTROLS_WAV
2,Video 1_4 (and can I have some more material).mp3,44100.0,2.0,3.60,CONTROLS,0.0,and can I have some more material,And can I have some more material?,Video 1_4 (and can I have some more material).wav,CONTROLS_WAV
3,NEW - video 2 (three times).mp3,44100.0,2.0,1.28,CONTROLS,0.0,three times,Three times.,NEW - video 2 (three times).wav,CONTROLS_WAV
4,cdc 4 (and then he brushed his teeth).mp3,44100.0,2.0,1.52,CONTROLS,0.0,and then he brushed his teeth,And then he brushed his teeth.,cdc 4 (and then he brushed his teeth).wav,CONTROLS_WAV
...,...,...,...,...,...,...,...,...,...,...
142,video 1 (pizza bundt).mp3,44100.0,2.0,1.80,CONTROLS,0.0,pizza bundt,Pizza Funt!,video 1 (pizza bundt).wav,CONTROLS_WAV
143,ACPA most boys like to play football-3.mp3,48000.0,1.0,3.31,CASES,1.0,most boys like to play football,Most boys like to play football.,ACPA most boys like to play football-3.wav,CASES_WAV
144,Facebook (take a tire).mp3,44100.0,1.0,1.75,CASES,1.0,take a tire,See you next time!,Facebook (take a tire).wav,CASES_WAV
145,Video 5_1 (feet).mp3,44100.0,2.0,1.04,CASES,1.0,feet,Peace.,Video 5_1 (feet).wav,CASES_WAV


In [None]:
train_df, val_df = train_test_split(train_metadata, test_size = 0.3, random_state = 42)

In [None]:
train_files = train_df["WAV_filename"].tolist()

train_folder = train_df["WAV_folder"].tolist()

train_full_paths = [os.path.join(data_path,train_folder[i], train_files[i]) for i in range(0,len(train_files))]

#train_full_paths

In [None]:
train_labels = train_df["hypernasality"].tolist()

train_labels[0:10]

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]

In [None]:
# val set
val_files = val_df["WAV_filename"].tolist()

val_folder = val_df["WAV_folder"].tolist()

val_full_paths = [os.path.join(data_path,val_folder[i], val_files[i]) for i in range(0,len(val_files))]

val_labels = val_df["hypernasality"].tolist()

In [None]:
len(val_labels)

45

In [None]:
test_metadata = pd.read_csv(test_catalog)

In [None]:
# add cols for wav data

# Replace ".mp3" with ".wav" in the "Filename" column
test_metadata['WAV_filename'] = test_metadata['File_Name'].str.replace('.mp3', '.wav')

# Create "WAV_folder" column by concatenating "_WAV" to the "folder" column
test_metadata['WAV_folder'] = test_metadata['folder'] + "_WAV"

  test_metadata['WAV_filename'] = test_metadata['File_Name'].str.replace('.mp3', '.wav')


In [None]:
test_files = test_metadata["WAV_filename"].tolist()

test_folder = test_metadata["WAV_folder"].tolist()

test_full_paths = [os.path.join(data_path,test_folder[i], test_files[i]) for i in range(0,len(test_files))]

#test_full_paths

In [None]:
test_labels = test_metadata["hypernasality"].tolist()

### Create PyTorch datasets

In [None]:


train_audio_dataset = datasets.Dataset.from_dict({"audio": train_full_paths,
                                                  "labels":train_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

test_audio_dataset = datasets.Dataset.from_dict({"audio": test_full_paths,
                                                  "labels": test_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

val_audio_dataset = datasets.Dataset.from_dict({"audio": val_full_paths,
                                                 "labels": val_labels }
                                             ).cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
model_checkpoint = "openai/whisper-base"

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data,  text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):

      inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                   return_tensors="pt",
                                   sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
      input_features = inputs.input_features
      decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

      labels = np.array(self.audio_data[index]['labels'])

      return input_features, decoder_input_ids, torch.tensor(labels)


In [None]:
train_dataset = SpeechClassificationDataset(train_audio_dataset,  feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset,  feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset,  feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Fine Tune Whisper Model

Whisper model from HuggingFace.

In [None]:

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs['last_hidden_state'][:, 0, :]
        logits = self.classifier(pooled_output)
        return logits



In [None]:
num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()



In [None]:
# Define the training function NO VAL
def train(model, train_loader, optimizer, criterion, device, num_epochs):

  for epoch in range(num_epochs):

    model.train()

    for i, batch in enumerate(train_loader):

          input_features, decoder_input_ids, labels = batch

          input_features = input_features.squeeze()
          input_features = input_features.to(device)

          decoder_input_ids = decoder_input_ids.squeeze()
          decoder_input_ids = decoder_input_ids.to(device)

          labels = labels.view(-1)
          labels = labels.type(torch.LongTensor)
          labels = labels.to(device)

          optimizer.zero_grad()

          logits = model(input_features, decoder_input_ids)

          loss = criterion(logits, labels)
          loss.backward()

          optimizer.step()

          if (i+1) % 8 == 0:
              print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

    torch.save(model.state_dict(), 'best_model.pt')

In [None]:

# Define the training function
def train(model, train_loader, val_loader, optimizer,  criterion, device, num_epochs):

    best_accuracy = 0.0

    for epoch in range(num_epochs):

        model.train()

        for i, batch in enumerate(train_loader):

            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze()
            input_features = input_features.to(device)

            decoder_input_ids = decoder_input_ids.squeeze()
            decoder_input_ids = decoder_input_ids.to(device)

            labels = labels.view(-1)
            labels = labels.type(torch.LongTensor)
            labels = labels.to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()

            if (i+1) % 8 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item() :.4f}')
                train_loss = 0.0

        val_loss, val_accuracy, val_f1, _ , _ = evaluate(model, val_loader, device)

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model.pt')

        print("========================================================================================")
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}, Best Accuracy: {best_accuracy:.4f}')
        print("========================================================================================")



In [None]:
def evaluate(model, data_loader,  device):

    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():

        for i, batch in enumerate(data_loader):

          input_features, decoder_input_ids, labels = batch

          input_features = input_features.squeeze()
          input_features = input_features.to(device)

          decoder_input_ids = decoder_input_ids.squeeze()
          decoder_input_ids = decoder_input_ids.to(device)

          labels = labels.view(-1)
          labels = labels.type(torch.LongTensor)
          labels = labels.to(device)

          optimizer.zero_grad()

          logits = model(input_features, decoder_input_ids)

          loss = criterion(logits, labels)
          total_loss += loss.item()

          _, preds = torch.max(logits, 1)
          all_labels.append(labels.cpu().numpy())
          all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds


In [None]:
import librosa
num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Epoch 1/5, Batch 8/19, Train Loss: 0.6754
Epoch 1/5, Batch 16/19, Train Loss: 0.5096
Epoch 1/5, Val Loss: 0.2350, Val Accuracy: 0.9556, Val F1: 0.9554, Best Accuracy: 0.9556
Epoch 2/5, Batch 8/19, Train Loss: 0.0305
Epoch 2/5, Batch 16/19, Train Loss: 0.0067
Epoch 2/5, Val Loss: 0.0244, Val Accuracy: 0.9778, Val F1: 0.9777, Best Accuracy: 0.9778
Epoch 3/5, Batch 8/19, Train Loss: 0.0070
Epoch 3/5, Batch 16/19, Train Loss: 0.0028
Epoch 3/5, Val Loss: 0.1729, Val Accuracy: 0.9556, Val F1: 0.9555, Best Accuracy: 0.9778
Epoch 4/5, Batch 8/19, Train Loss: 0.0006
Epoch 4/5, Batch 16/19, Train Loss: 0.0517
Epoch 4/5, Val Loss: 0.1358, Val Accuracy: 0.9778, Val F1: 0.9777, Best Accuracy: 0.9778
Epoch 5/5, Batch 8/19, Train Loss: 0.0002
Epoch 5/5, Batch 16/19, Train Loss: 0.0005
Epoch 5/5, Val Loss: 0.9425, Val Accuracy: 0.8667, Val F1: 0.8650, Best Accuracy: 0.9778


### Validation

Before running the model on the test set, let's examine the validation set and see how our model is doing.

In [None]:
#VALIDATION
state_dict = torch.load('best_model.pt')

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

In [None]:
#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        22
           1       0.96      1.00      0.98        23

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

0.9777777777777777


This is too good to be true. Checking the contents of labels, preds, and data balance.

In [None]:
all_labels

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1])

In [None]:
all_preds

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1])

In [None]:
sum(train_labels)/len(train_labels)

0.5238095238095238

In [None]:
sum(val_labels)/len(val_labels)

0.5111111111111111

In [None]:
# TESTING ONLY
state_dict = torch.load('best_model.pt')

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, test_loader, device)


print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

I don't want to run testing yet as we want to explore more models.

### Model Troubleshooting

So far our results look too good to be true (98% validation accuracy). In the cells below I run through some troubleshooting methods to ensure our model is not overfit or learning the wrong representations.

Ensure that the labels are correct.

In [None]:
train_df[train_df["WAV_folder"] == "CONTROLS_WAV"]["hypernasality"]

93     0.0
140    0.0
108    0.0
65     0.0
28     0.0
117    0.0
84     0.0
142    0.0
44     0.0
15     0.0
114    0.0
47     0.0
110    0.0
78     0.0
5      0.0
120    0.0
77     0.0
34     0.0
111    0.0
43     0.0
95     0.0
131    0.0
8      0.0
13     0.0
3      0.0
38     0.0
72     0.0
6      0.0
109    0.0
2      0.0
123    0.0
112    0.0
46     0.0
79     0.0
41     0.0
90     0.0
75     0.0
32     0.0
141    0.0
37     0.0
1      0.0
52     0.0
103    0.0
74     0.0
121    0.0
146    0.0
20     0.0
14     0.0
Name: hypernasality, dtype: float64

In [None]:
train_df

Unnamed: 0,File_Name,Sampling_Rate_(Hz),Channels,Duration_(seconds),folder,hypernasality,original_text,OPENAI_Whisper_text,WAV_filename,WAV_folder
93,ACPA Santa came home since the snow fell.mp3,44100.0,1.0,3.19,CONTROLS,0.0,Santa came home since the snow fell,Santa came home since the snow fell.,ACPA Santa came home since the snow fell.wav,CONTROLS_WAV
140,cdc 5 (can I play with Jack).mp3,44100.0,2.0,1.57,CONTROLS,0.0,can I play with Jack,Can I play with Jack?,cdc 5 (can I play with Jack).wav,CONTROLS_WAV
108,cdc 6 (the polar bears are dancing).mp3,44100.0,2.0,2.32,CONTROLS,0.0,the polar bears are dancing,"Um, the polar bears are dancing.",cdc 6 (the polar bears are dancing).wav,CONTROLS_WAV
0,ACPA ted had a dog with white feet-3.mp3,44100.0,1.0,4.13,CASES,1.0,ted had a dog with white feet,Ted and a dog with white feet.,ACPA ted had a dog with white feet-3.wav,CASES_WAV
73,Video 1_4 (seesaw).mp3,44100.0,2.0,1.15,CASES,1.0,seesaw,P.S.A.,Video 1_4 (seesaw).wav,CASES_WAV
...,...,...,...,...,...,...,...,...,...,...
71,Video 4_4 (well it will help me).mp3,44100.0,2.0,2.32,CASES,1.0,well it will help me,"Wow, em vừa học đĩa",Video 4_4 (well it will help me).wav,CASES_WAV
106,ACPA buy baby a bib.mp3,48000.0,1.0,1.92,CASES,1.0,buy baby a bib,"Hi, I'm Hayley Mim.",ACPA buy baby a bib.wav,CASES_WAV
14,Video 1_18 (pretend it stops running when the ...,44100.0,2.0,5.80,CONTROLS,0.0,pretend it stops running when the car is going,"When it stops running, when the car is going.",Video 1_18 (pretend it stops running when the ...,CONTROLS_WAV
92,Video 2_4 (daddy).mp3,44100.0,2.0,0.57,CASES,1.0,daddy,Fanny,Video 2_4 (daddy).wav,CASES_WAV


Making a dummy label set to make sure that my model isn't taking random guesses.

In [None]:
# dummy data
import random

# Define the length of the list you want
length = len(train_labels)  # Change this to your desired length

# Generate a list of random 1s and 0s of the specified length
dummy_list = [random.choice([0, 1]) for _ in range(length)]



In [None]:
dummy_df = train_df
dummy_df["DUMMY"] = dummy_list

In [None]:
dummy_audio_dataset = datasets.Dataset.from_dict({"audio": train_full_paths,
                                                  "labels":dummy_list}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

dummy_dataset = SpeechClassificationDataset(dummy_audio_dataset,  feature_extractor)

batch_size = 8

dummy_loader = DataLoader(dummy_dataset, batch_size=batch_size, shuffle=True)


In [None]:
model_checkpoint = "openai/whisper-base"

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

In [None]:
num_epochs = 5
train(model, dummy_loader, val_loader, optimizer, criterion, device, num_epochs)

Epoch 1/5, Batch 8/19, Train Loss: 0.6309
Epoch 1/5, Batch 16/19, Train Loss: 0.6838
Epoch 1/5, Val Loss: 0.6959, Val Accuracy: 0.5111, Val F1: 0.3382, Best Accuracy: 0.5111
Epoch 2/5, Batch 8/19, Train Loss: 0.7486
Epoch 2/5, Batch 16/19, Train Loss: 0.7218
Epoch 2/5, Val Loss: 0.6875, Val Accuracy: 0.5778, Val F1: 0.4974, Best Accuracy: 0.5778
Epoch 3/5, Batch 8/19, Train Loss: 0.7116
Epoch 3/5, Batch 16/19, Train Loss: 0.7223
Epoch 3/5, Val Loss: 0.6800, Val Accuracy: 0.5778, Val F1: 0.4738, Best Accuracy: 0.5778
Epoch 4/5, Batch 8/19, Train Loss: 0.6936
Epoch 4/5, Batch 16/19, Train Loss: 0.6941
Epoch 4/5, Val Loss: 0.6729, Val Accuracy: 0.7111, Val F1: 0.6890, Best Accuracy: 0.7111
Epoch 5/5, Batch 8/19, Train Loss: 0.6948
Epoch 5/5, Batch 16/19, Train Loss: 0.7082
Epoch 5/5, Val Loss: 0.6789, Val Accuracy: 0.5111, Val F1: 0.3382, Best Accuracy: 0.7111


Model is not learning with the dummy data....

## Simpler Model

Let's train a simpler model to see how our model does compared to a simpler one such as SVM or Random Forrest. Generated with help from ChatGPT4

### SVM

Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


# Define a function to extract MFCCs from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    mfccs_scaled = np.mean(mfccs.T, axis=0)  # Taking the average across time
    return mfccs_scaled

# Paths to your audio files (replace these with your actual file paths)
audio_files = train_full_paths + test_full_paths  # Add more paths as needed
labels = train_labels + test_labels  # Corresponding labels for your audio files

# Extract features from each audio file
features = [extract_mfcc_features(file) for file in audio_files]

# Split the dataset into training and testing sets
X_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')  # You can experiment with different kernels
svm_model.fit(x_train, y_train)

# Predictions
y_pred = svm_model.predict(x_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred))


Accuracy: 0.8717948717948718
Classification Report:               precision    recall  f1-score   support

         0.0       0.89      0.84      0.86        19
         1.0       0.86      0.90      0.88        20

    accuracy                           0.87        39
   macro avg       0.87      0.87      0.87        39
weighted avg       0.87      0.87      0.87        39



### Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100)  # You can adjust the number of trees
rf_model.fit(x_train, y_train)

# Make predictions - VAL
y_pred = rf_model.predict(x_val)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred))

Accuracy: 0.9230769230769231
Classification Report:               precision    recall  f1-score   support

         0.0       0.94      0.89      0.92        19
         1.0       0.90      0.95      0.93        20

    accuracy                           0.92        39
   macro avg       0.92      0.92      0.92        39
weighted avg       0.92      0.92      0.92        39

