In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [2]:
root_dir = r"tensorflow-speech-recognition-challenge\train\audio"

# `data` gets one metadata record per clip that could be decoded;
# `waveforms` keeps the decoded tensors in the same order.
data = []
waveforms = []

# Only sub-directories are treated as labels: a stray file inside `root_dir`
# would otherwise make the inner os.listdir() call raise.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which five
# labels end up selected may differ between machines — confirm this is acceptable.
label_names = [
	entry for entry in os.listdir(root_dir)
	if os.path.isdir(os.path.join(root_dir, entry))
][:5]

for label in label_names:
	label_dir = os.path.join(root_dir, label)
	print(f"{label = }")

	for audio_file in tqdm(os.listdir(label_dir)):
		# File name up to the first dot, used as the clip identifier.
		name = audio_file.split('.')[0]
		audio_path = os.path.join(label_dir, audio_file)

		try:
			waveform, _ = torchaudio.load(audio_path)
			waveforms.append(waveform)
			data.append({
				"filename": name,
				"path": audio_path,
				"word": label
			})
		except Exception as e:
			# Best-effort load: skip unreadable files, but report the real
			# file name and the reason instead of discarding the exception.
			print(f"Error loading {audio_file} from {label_dir}: {e!r}")

label = 'bed'


100%|██████████| 1713/1713 [00:01<00:00, 1660.87it/s]


label = 'bird'


100%|██████████| 1713/1713 [00:01<00:00, 1659.57it/s]


label = 'cat'


100%|██████████| 1713/1713 [00:01<00:00, 1701.81it/s]


label = 'dog'


100%|██████████| 1713/1713 [00:01<00:00, 1613.01it/s]


label = 'down'


100%|██████████| 1713/1713 [00:01<00:00, 1670.96it/s]


In [3]:
# Turn the list of per-file records in `data` into a table (columns: filename, path, word).
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

Unnamed: 0,filename,path,word
0,00176480_nohash_0,tensorflow-speech-recognition-challenge\train\...,bed
1,004ae714_nohash_0,tensorflow-speech-recognition-challenge\train\...,bed
2,004ae714_nohash_1,tensorflow-speech-recognition-challenge\train\...,bed
3,00f0204f_nohash_0,tensorflow-speech-recognition-challenge\train\...,bed
4,00f0204f_nohash_1,tensorflow-speech-recognition-challenge\train\...,bed


In [4]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Sanity check: show the label of the first row and play its audio.
first_row = df.loc[0]
print(f"Label: {first_row['word']}")

clip_waveform, sr = torchaudio.load(first_row["path"])
# Keep only the first channel and hand it to librosa as a NumPy array.
first_channel = np.asarray(clip_waveform[0].numpy().squeeze())
audio = librosa.resample(y=first_channel, orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(audio), autoplay=True, rate=16000)

Label: bed


In [5]:
# Directory where the train/test CSV splits are written and later read back.
csv_path = r"tensorflow-speech-recognition-challenge\train"

In [6]:
# Hold out 20% of the rows for testing; the split is stratified on the "word"
# column and seeded (random_state=42) so it can be reproduced.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["word"])

In [7]:
# Write both splits under `csv_path` (without the index column), then report their shapes.
train_csv, test_csv = f"{csv_path}/train.csv", f"{csv_path}/test.csv"
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print(train_df.shape, test_df.shape, sep="\n")

(6852, 3)
(1713, 3)


In [12]:
# Read the splits back from the CSV files under `csv_path`; this rebinds
# `train_df` / `test_df` to freshly parsed frames.
train_df = pd.read_csv(f"{csv_path}/train.csv")
test_df = pd.read_csv(f"{csv_path}/test.csv")

In [13]:
from transformers import AutoConfig, Wav2Vec2Processor

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Checkpoint identifier handed to the `from_pretrained` calls below.
# model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
model_name_or_path = "facebook/wav2vec2-base-960h"
# Stored on the model config as `pooling_mode`.
pooling_mode = "mean"
# Distinct labels of the training split; a label's position in this list is its integer id.
label_list = list(train_df['word'].unique())
num_labels = len(label_list)

In [15]:
# Model configuration, including the label <-> id lookup tables.
label_to_index = dict(zip(label_list, range(len(label_list))))
index_to_label = dict(enumerate(label_list))

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label_to_index,
    id2label=index_to_label,
    finetuning_task="wav2vec2_clf",
)
# Attach the pooling strategy as an extra attribute on the config object.
config.pooling_mode = pooling_mode

In [16]:
# Load the processor published with the chosen checkpoint.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
# Sampling rate reported by the processor's feature extractor; `audio_to_array`
# resamples every clip to this rate.
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [17]:
def audio_to_array(path: str):
	"""Load an audio file as a mono NumPy array at ``target_sampling_rate``.

	Args:
		path: Location of the audio file, passed to ``torchaudio.load``.

	Returns:
		A 1-D NumPy array holding the resampled samples.
	"""
	audio, sr = torchaudio.load(path)
	# torchaudio returns (channels, frames). Averaging over the channel axis
	# drops that axis for mono files and mixes multi-channel files down to one
	# signal, so the result is always 1-D (a bare squeeze() would leave
	# multi-channel audio 2-D).
	audio = audio.mean(dim=0)
	resampler = torchaudio.transforms.Resample(sr, target_sampling_rate)
	return resampler(audio).numpy()

In [18]:
# import torch

In [19]:
# def pad_audios(audios):
#   padded_audios = []
#   max_length = max([audio.shape[1] for audio in audios])
  
#   for audio in audios:
#     padding_length = max_length - audio.shape[1]
#     padded_audio = torch.nn.functional.pad(audio, (0, padding_length), mode= 'constant', value= 0.0)
#     padded_audios.append(padded_audio)
  
#   return padded_audios

In [20]:
def label_to_id(label, label_list):
	"""Map a label to its position in ``label_list``.

	Args:
		label: The label to look up.
		label_list: Ordered collection of known labels.

	Returns:
		The index of ``label`` in ``label_list``; ``-1`` when the label is not
		in the list; the label itself, unchanged, when ``label_list`` is empty.
	"""
	# Nothing to map against: hand the label back untouched.
	if len(label_list) == 0:
		return label

	# Unknown labels are signalled with -1 rather than an exception.
	if label not in label_list:
		return -1

	return label_list.index(label)

In [78]:
def preprocess_function(examples, max_examples=300):
	"""Turn a batch of rows into processor features plus integer labels.

	Args:
		examples: Mapping-like batch with a ``'path'`` column (audio file
			locations) and a ``'word'`` column (labels).
		max_examples: Upper bound on how many leading rows of the batch are
			processed. Defaults to 300, the cap that used to be hard-coded;
			pass ``None`` to process every row.

	Returns:
		The processor output for the selected clips (padded to the longest
		one), with the label ids stored under ``"labels"``.
	"""
	paths = examples['path'][:max_examples]
	words = examples['word'][:max_examples]

	speech_list = [audio_to_array(path) for path in paths]
	target_list = [label_to_id(label, label_list) for label in words]

	result = processor(speech_list, sampling_rate=target_sampling_rate, padding="longest")
	result["labels"] = target_list

	return result

In [22]:
from sklearn.svm import SVC

In [79]:
# Build feature/label batches for both splits. `preprocess_function` only
# looks at the first 300 rows of whatever frame it is given.
train_data = preprocess_function(train_df)
test_data = preprocess_function(test_df)

In [19]:
# Initialize SVM classifier
# probability=True because `predict_proba` is called on this model later; the seed keeps runs repeatable.
svm_model = SVC(probability=True, random_state=42)

In [19]:
# Train the SVM on the processor output.
# NOTE(review): "input_values" presumably holds the padded raw waveforms rather
# than learned embeddings — confirm this is the intended feature representation.
svm_model.fit(train_data["input_values"], train_data["labels"])

In [24]:
import joblib

In [None]:
# save
# The target folder may not exist on a fresh checkout, so create it first.
os.makedirs("models", exist_ok=True)
joblib.dump(svm_model, "models/svm_model.pkl")

In [26]:
# load
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
svm_model = joblib.load("models/svm_model.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [27]:
# Make predictions
# y_pred: predicted label id per test sample; y_prob: per-class probabilities for the same samples.
y_pred = svm_model.predict(test_data["input_values"])
y_prob = svm_model.predict_proba(test_data["input_values"])

In [28]:
from sklearn.metrics import classification_report

In [29]:
# Evaluate the model
# Rows of the report are the integer label ids produced by `label_to_id`, not the word strings.
print(classification_report(test_data["labels"], y_pred))

              precision    recall  f1-score   support

           0       0.01      0.01      0.01       478
           1       0.94      0.79      0.86       340
           2       0.00      0.00      0.00       363
           3       0.09      0.21      0.13       331
           4       0.01      0.01      0.01       345

    accuracy                           0.19      1857
   macro avg       0.21      0.20      0.20      1857
weighted avg       0.20      0.19      0.19      1857



In [30]:
# True label id of the second test sample (the one scored in the next cell).
test_data["labels"][1]

2

In [31]:
# Example of predicting probabilities for a new audio sample
# new_audio_features = extract_features(new_audio_file)
# The sample is wrapped in a list so that a batch containing one sample is passed in.
probabilities = svm_model.predict_proba([test_data["input_values"][1]])
print(probabilities)

[[0.14126703 0.11652079 0.18071419 0.1854912  0.37600679]]


In [None]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric


# Read the same files that were written with `to_csv` under `csv_path`,
# instead of an unrelated absolute location.
data_files = {
    "train": f"{csv_path}/train.csv",
    "validation": f"{csv_path}/test.csv",
}

# The splits were saved with pandas' default comma separator; parsing them as
# tab-separated would collapse every row into one column and hide the
# 'path' / 'word' columns that `preprocess_function` reads.
dataset = load_dataset("csv", data_files=data_files)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

In [None]:
# Both splits go through the same batched, multi-process preprocessing.
map_kwargs = dict(batch_size=100, batched=True, num_proc=4)

train_dataset = train_dataset.map(preprocess_function, **map_kwargs)
eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Output container for a speech classifier, built on ``ModelOutput``.

    Every field defaults to ``None``.
    """
    # NOTE(review): the field meanings are inferred from their names —
    # confirm against the model's forward() that fills this container.
    loss: Optional[torch.FloatTensor] = None
    # NOTE(review): defaults to None although the annotation is not Optional.
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
