# Fine-tuning Whisper on Speech Pathology Dataset
## Goal
- The goal of the Cleft Palate project at Vanderbilt DSI is to classify audio clips of patients' voices as containing hypernasality (a speech impediment) or not. The patients with hypernasality can then be recommended for speech pathology intervention. This is currently evaluated by human speech pathologists, which requires access to these medical providers. Our hope is to train a model that can classify this speech impediment for expedited patient access to a speech pathologist.

- This model is trained with noise added to the public data sample. The tiny, small, base, medium, and large models are trained all together in this notebook.

## Model
- In this notebook we train the Whisper model with a Sequence Classification Head.

## Data
- The data in this notebook is publicly available voice recordings featuring hypernasality and control groups. In the future we hope to train our model on private patient data from Vanderbilt University Medical Center (VUMC).

# Import Libraries

In [None]:
# Install necessary packages only once
!pip install datasets transformers openai whisper librosa soundfile mutagen torch

# Upgrade packages if necessary
!pip install --upgrade typing_extensions

# Import necessary libraries
import os
import io
import glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
import librosa
import soundfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
from datasets import load_dataset, load_from_disk, DatasetDict, Audio

# Special imports from IPython for audio handling in notebooks
from IPython.display import Audio

# Preprocessing the data

In [None]:
data_path = "/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/"

train_catalog = '/workspace/cleft_palate_choja/test_noise.csv'
test_catalog = '/workspace/cleft_palate_choja/test_noise.csv'
train_metadata = pd.read_csv(train_catalog)
test_metadata = pd.read_csv(test_catalog)

train_df, val_df = train_test_split(train_metadata, test_size = 0.3, random_state = 42)

# train set
train_files = train_df["WAV_filename"].tolist()

train_folder = train_df["WAV_folder"].tolist()

train_full_paths = [os.path.join(data_path,train_folder[i], train_files[i]) for i in range(0,len(train_files))]

train_labels = train_df["hypernasality"].tolist()
# val set
val_files = val_df["WAV_filename"].tolist()

val_folder = val_df["WAV_folder"].tolist()

val_full_paths = [os.path.join(data_path,val_folder[i], val_files[i]) for i in range(0,len(val_files))]

val_labels = val_df["hypernasality"].tolist()

# test set
test_files = test_metadata["WAV_filename"].tolist()

test_folder = test_metadata["WAV_folder"].tolist()

test_full_paths = [os.path.join(data_path,test_folder[i], test_files[i]) for i in range(0,len(test_files))]

test_labels = test_metadata["hypernasality"].tolist()

train_audio_dataset = datasets.Dataset.from_dict({"audio": train_full_paths,
                                                  "labels":train_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

test_audio_dataset = datasets.Dataset.from_dict({"audio": test_full_paths,
                                                  "labels": test_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

val_audio_dataset = datasets.Dataset.from_dict({"audio": val_full_paths,
                                                 "labels": val_labels }
                                             ).cast_column("audio", Audio(sampling_rate=16_000))
# save data to disk for later loading
train_audio_dataset.save_to_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset.save_to_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset.save_to_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/51 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/74 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23 [00:00<?, ? examples/s]

# 4. Creating the SpeechClassificationDataset

In [None]:
# load data from disk
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-base"
encoder = WhisperModel.from_pretrained(model_checkpoint)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class SpeechClassificationDataset(torch.utils.data.Dataset):
    # Creates a dataset object in PyTorch which can be used with PyTorch models
    def __init__(self, audio_data,  text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):

      inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                   return_tensors="pt",
                                   sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
      input_features = inputs.input_features
      decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

      labels = np.array(self.audio_data[index]['labels'])

      return input_features, decoder_input_ids, torch.tensor(labels)


# add feature extractor

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)

train_dataset = SpeechClassificationDataset(train_audio_dataset,  feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset,  feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset,  feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Iterate through the datasets by using the train, test, and val loader objects
# Pass data samples to the feature_extractor and store in arrays

train_results = []
test_results = []
val_results = []


# Whisper model Base Noise
- Load PyTorch datasets & Training

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from transformers import WhisperFeatureExtractor, WhisperModel

# Load datasets
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-base"
model_path = '/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                     return_tensors="pt",
                                     sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id
        labels = np.array(self.audio_data[index]['labels'])
        return input_features, decoder_input_ids, torch.tensor(labels)

train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

# Define the training function
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(train_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 8 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

        #model_path = '/content/drive/MyDrive/VUMC_postcleft/WAV_PUBLIC_SAMPLES/noise/whisper_noise_base_best_model.pt'
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        val_loss, val_accuracy, val_f1, _, _ = evaluate(model, val_loader, device)
        print("========================================================================================")
        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader, device):
    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds

num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)
#VALIDATION
state_dict = torch.load(model_path)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

2024-04-14 20:38:46.766171: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt
Epoch 1/5, Val Loss: 0.6530, Val Accuracy: 0.6087, Val F1: 0.5548
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt
Epoch 2/5, Val Loss: 0.5181, Val Accuracy: 0.7391, Val F1: 0.7346
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt
Epoch 3/5, Val Loss: 0.4309, Val Accuracy: 0.7391, Val F1: 0.7346
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt
Epoch 4/5, Val Loss: 0.8335, Val Accuracy: 0.8261, Val F1: 0.8175
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_base_best_model.pt
Epoch 5/5, Val Loss: 1.3502, Val Accuracy: 0.7826, Val F1: 0.7758
              precision    recall  f1-score   support

           0       0.88      0.64      0.74        11
           1       0.73      0.92      0.81   

# Whisper model Noise Tiny - openai/whisper-tiny
- Load PyTorch datasets & Training

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from transformers import WhisperFeatureExtractor, WhisperModel

# Load datasets
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-tiny"
model_path = '/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                     return_tensors="pt",
                                     sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id
        labels = np.array(self.audio_data[index]['labels'])
        return input_features, decoder_input_ids, torch.tensor(labels)

train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

# Define the training function
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(train_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 8 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

        #model_path = '/content/drive/MyDrive/VUMC_postcleft/WAV_PUBLIC_SAMPLES/noise/whisper_noise_base_best_model.pt'
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        val_loss, val_accuracy, val_f1, _, _ = evaluate(model, val_loader, device)
        print("========================================================================================")
        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader, device):
    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds

num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)
#VALIDATION
state_dict = torch.load(model_path)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))



Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt
Epoch 1/5, Val Loss: 0.6756, Val Accuracy: 0.5217, Val F1: 0.4103
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt
Epoch 2/5, Val Loss: 0.5130, Val Accuracy: 0.8696, Val F1: 0.8686
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt
Epoch 3/5, Val Loss: 0.3356, Val Accuracy: 0.9565, Val F1: 0.9565
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt
Epoch 4/5, Val Loss: 0.2838, Val Accuracy: 0.9130, Val F1: 0.9129
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_tiny_best_model.pt
Epoch 5/5, Val Loss: 0.4477, Val Accuracy: 0.9130, Val F1: 0.9129
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        11
           1       1.00      0.83      0.91   

# Whisper model Noise small - openai/whisper-small
- Load PyTorch datasets & Training

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from transformers import WhisperFeatureExtractor, WhisperModel

# Load datasets
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-small"
model_path = '/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                     return_tensors="pt",
                                     sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id
        labels = np.array(self.audio_data[index]['labels'])
        return input_features, decoder_input_ids, torch.tensor(labels)

train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

# Define the training function
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(train_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 8 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

        #model_path = '/content/drive/MyDrive/VUMC_postcleft/WAV_PUBLIC_SAMPLES/noise/whisper_noise_base_best_model.pt'
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        val_loss, val_accuracy, val_f1, _, _ = evaluate(model, val_loader, device)
        print("========================================================================================")
        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader, device):
    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds

num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)
#VALIDATION
state_dict = torch.load(model_path)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))



Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt
Epoch 1/5, Val Loss: 0.6011, Val Accuracy: 0.7826, Val F1: 0.7758
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt
Epoch 2/5, Val Loss: 0.4256, Val Accuracy: 0.7826, Val F1: 0.7810
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt
Epoch 3/5, Val Loss: 0.5614, Val Accuracy: 0.7391, Val F1: 0.7346
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt
Epoch 4/5, Val Loss: 0.8236, Val Accuracy: 0.7826, Val F1: 0.7758
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_small_best_model.pt
Epoch 5/5, Val Loss: 1.1039, Val Accuracy: 0.7826, Val F1: 0.7758
              precision    recall  f1-score   support

           0       0.88      0.64      0.74        11
           1       0.73      0.92      0.

# Whisper model Noise Medium - openai/whisper-medium
- Load PyTorch datasets & Training

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from transformers import WhisperFeatureExtractor, WhisperModel

# Load datasets
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-medium"
model_path = '/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                     return_tensors="pt",
                                     sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id
        labels = np.array(self.audio_data[index]['labels'])
        return input_features, decoder_input_ids, torch.tensor(labels)

train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

# Define the training function
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(train_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 8 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

        #model_path = '/content/drive/MyDrive/VUMC_postcleft/WAV_PUBLIC_SAMPLES/noise/whisper_noise_base_best_model.pt'
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        val_loss, val_accuracy, val_f1, _, _ = evaluate(model, val_loader, device)
        print("========================================================================================")
        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader, device):
    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds

num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)
#VALIDATION
state_dict = torch.load(model_path)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))



Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt
Epoch 1/5, Val Loss: 0.6141, Val Accuracy: 0.8261, Val F1: 0.8258
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt
Epoch 2/5, Val Loss: 0.3366, Val Accuracy: 0.9130, Val F1: 0.9115
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt
Epoch 3/5, Val Loss: 0.3206, Val Accuracy: 0.9130, Val F1: 0.9115
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt
Epoch 4/5, Val Loss: 0.4247, Val Accuracy: 0.9130, Val F1: 0.9115
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_medium_best_model.pt
Epoch 5/5, Val Loss: 0.5796, Val Accuracy: 0.9130, Val F1: 0.9115
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           1       0.86      1.00   

# Whisper model Noise Large - openai/whisper-large-v2
- Load PyTorch datasets & Training

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from transformers import WhisperFeatureExtractor, WhisperModel

# Load datasets
train_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/train_dataset')
test_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/test_dataset')
val_audio_dataset = load_from_disk('/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/val_dataset')

model_checkpoint = "openai/whisper-large-v2"
model_path = '/workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt'
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                     return_tensors="pt",
                                     sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id
        labels = np.array(self.audio_data[index]['labels'])
        return input_features, decoder_input_ids, torch.tensor(labels)

train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()

# Define the training function
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(train_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 8 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(train_loader)}, Train Loss: {loss.item():.4f}')

        #model_path = '/content/drive/MyDrive/VUMC_postcleft/WAV_PUBLIC_SAMPLES/noise/whisper_noise_base_best_model.pt'
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        val_loss, val_accuracy, val_f1, _, _ = evaluate(model, val_loader, device)
        print("========================================================================================")
        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader, device):
    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze().to(device)
            decoder_input_ids = decoder_input_ids.squeeze().to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)

            logits = model(input_features, decoder_input_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds

num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)
#VALIDATION
state_dict = torch.load(model_path)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]



Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt
Epoch 1/5, Val Loss: 0.5672, Val Accuracy: 0.9130, Val F1: 0.9129
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt
Epoch 2/5, Val Loss: 0.4439, Val Accuracy: 0.8261, Val F1: 0.8175
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt
Epoch 3/5, Val Loss: 0.2304, Val Accuracy: 0.9565, Val F1: 0.9562
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt
Epoch 4/5, Val Loss: 0.5277, Val Accuracy: 0.8696, Val F1: 0.8655
Model saved to /workspace/cleft_palate_choja/WAV_PUBLIC_SAMPLES/NOISE/whisper_noise_large_best_model.pt
Epoch 5/5, Val Loss: 0.1738, Val Accuracy: 0.9130, Val F1: 0.9129
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        11
           1       1.00      0.83      0.