# Data Access and Transformation for Speech Classification

Cleft palate dataset that is analyzed to identify hypernasality in speech

## 1. Importing Necessary Libraries

In [3]:
!pip install datasets
import datasets
import pandas as pd
import os
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import WhisperModel, WhisperFeatureExtractor
from datasets import load_from_disk
import torch

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

## 2. Loading the Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data_path = "/content/drive/MyDrive/WAV_PUBLIC_SAMPLES/"

train_catalog = f"{data_path}/noisy_data/train/train.csv"
test_catalog = f"{data_path}/noisy_data/test/test.csv"

In [6]:
train_metadata = pd.read_csv('train_catalog.csv')
test_metadata = pd.read_csv('test_catalog.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train_catalog.csv'

## 3. Preprocessing the Data


In [None]:
train_df, val_df = train_test_split(train_metadata, test_size = 0.3, random_state = 42)

In [None]:
# train set
train_files = train_df["WAV_filename"].tolist()

train_folder = train_df["WAV_folder"].tolist()

train_full_paths = [os.path.join(data_path,train_folder[i], train_files[i]) for i in range(0,len(train_files))]

train_labels = train_df["hypernasality"].tolist()

In [None]:
# val set
val_files = val_df["WAV_filename"].tolist()

val_folder = val_df["WAV_folder"].tolist()

val_full_paths = [os.path.join(data_path,val_folder[i], val_files[i]) for i in range(0,len(val_files))]

val_labels = val_df["hypernasality"].tolist()

In [None]:
# test set
test_files = test_metadata["WAV_filename"].tolist()

test_folder = test_metadata["WAV_folder"].tolist()

test_full_paths = [os.path.join(data_path,test_folder[i], test_files[i]) for i in range(0,len(test_files))]

test_labels = test_metadata["hypernasality"].tolist()

In [None]:

train_audio_dataset = datasets.Dataset.from_dict({"audio": train_full_paths,
                                                  "labels":train_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

test_audio_dataset = datasets.Dataset.from_dict({"audio": test_full_paths,
                                                  "labels": test_labels}
                                                 ).cast_column("audio", Audio(sampling_rate=16_000))

val_audio_dataset = datasets.Dataset.from_dict({"audio": val_full_paths,
                                                 "labels": val_labels }
                                             ).cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
# save data to disk for later loading
train_audio_dataset.save_to_disk(f"{data_path}/noisy_data/train/train_dataset")
test_audio_dataset.save_to_disk(f"{data_path}/noisy_data/test/test_dataset")
val_audio_dataset.save_to_disk(f"{data_path}/noisy_data/val/val_dataset")

## 4. Creating the SpeechClassificationDataset

In [None]:
# load data from disk
train_audio_dataset = load_from_disk(f"{data_path}/noisy_data/train/train_dataset")
test_audio_dataset = load_from_disk(f"{data_path}/noisy_data/test/test_dataset")
val_audio_dataset = load_from_disk(f"{data_path}/noisy_data/val/val_dataset")

In [None]:
class SpeechClassificationDataset(torch.utils.data.Dataset):
    # Creates a dataset object in PyTorch which can be used with PyTorch models
    def __init__(self, audio_data,  text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):

      inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
                                   return_tensors="pt",
                                   sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
      input_features = inputs.input_features
      decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

      labels = np.array(self.audio_data[index]['labels'])

      return input_features, decoder_input_ids, torch.tensor(labels)


In [None]:
# add feature extractor
model_checkpoint = "openai/whisper-base"

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)

In [None]:
train_dataset = SpeechClassificationDataset(train_audio_dataset,  feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset,  feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset,  feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Iterate through the datasets by using the train, test, and val loader objects
# Pass data samples to the feature_extractor and store in arrays

train_results = []
test_results = []
val_results = []

for x in train_loader:
  train_results += feature_extractor.generate(x)
for x in test_loader:
  test_results += feature_extractor.generate(x)
for x in val_loader:
  val_results += feature_extractor.generate(x)

## Visualizing the Data

In [None]:
# Visualize the data by printing the results from the feature_extractor
print(train_results)
print(test_results)
print(val_results)