In [1]:
import torchaudio
import pandas as pd
from transformers import Wav2Vec2Processor
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint whose processor is reused to turn raw audio into model-ready features.
model_name_or_path = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Sampling rate the processor's feature extractor reports; clips are resampled to it.
target_sampling_rate = processor.feature_extractor.sampling_rate

# Forward slashes resolve on every OS; a backslash is only a path separator on
# Windows, so the previous value broke the CSV loads on Linux/macOS.
csv_path = "tensorflow-speech-recognition-challenge/train"
train_df = pd.read_csv(f"{csv_path}/train.csv")
test_df = pd.read_csv(f"{csv_path}/test.csv")

# A label's integer id is its position in this list (see label_to_id / id_to_label).
label_list = list(train_df['word'].unique())

In [3]:
def audio_to_array(path: str):
    """Load the audio file at ``path`` and return it as a NumPy array.

    The clip is read with ``torchaudio.load``, passed through a ``Resample``
    transform built from the file's own sampling rate and the module-level
    ``target_sampling_rate``, and then squeezed (size-1 dimensions removed)
    before being converted with ``.numpy()``.
    """
    waveform, source_rate = torchaudio.load(path)
    to_target_rate = torchaudio.transforms.Resample(source_rate, target_sampling_rate)
    resampled = to_target_rate(waveform)
    return resampled.squeeze().numpy()

In [4]:
def label_to_id(label, label_list):
    """Map ``label`` to its position in ``label_list``.

    Returns:
        * ``label`` unchanged when ``label_list`` is empty,
        * ``-1`` when ``label`` is not present in ``label_list``,
        * otherwise the index of the first occurrence of ``label``.
    """
    # Nothing to look up against: hand the label back untouched.
    if len(label_list) == 0:
        return label
    # Unknown labels are flagged with the sentinel -1.
    if label not in label_list:
        return -1
    return label_list.index(label)

In [5]:
def id_to_label(label_id):
    """Return the entry of the module-level ``label_list`` at position ``label_id``.

    ``label_id`` is converted with ``int()`` first. Ordinary Python indexing
    applies, so a negative id such as ``-1`` selects from the end of the list
    rather than raising.
    """
    position = int(label_id)
    return label_list[position]

In [6]:
def preprocess_function(examples, max_examples=1000):
    """Turn a table of audio paths and words into processor features plus label ids.

    Args:
        examples: Object indexable by ``'path'`` and ``'word'`` (the notebook
            passes a DataFrame) whose columns support slicing.
        max_examples: Only the first ``max_examples`` rows are processed.
            Defaults to 1000, the limit that used to be hard-coded; pass
            ``None`` to process every row.

    Returns:
        The object returned by ``processor(...)`` with an added ``"labels"``
        entry holding one id per processed row, computed by ``label_to_id``
        against the module-level ``label_list``.
    """
    paths = examples['path'][:max_examples]
    words = examples['word'][:max_examples]

    speech_list = [audio_to_array(path) for path in paths]
    target_list = [label_to_id(word, label_list) for word in words]

    # padding="longest" asks the processor to pad clips to a common length.
    result = processor(speech_list, sampling_rate=target_sampling_rate, padding="longest")
    result["labels"] = target_list

    return result

In [7]:
# Preprocess both splits once; the model cells read "input_values" and "labels" from these.
train_data = preprocess_function(train_df)
test_data = preprocess_function(test_df)
# NOTE(review): dead code — no svm_model is defined at this point in the notebook.
#svm_model.fit(train_data["input_values"], train_data["labels"])

In [8]:
from collections import Counter

# Tally how many training examples carry each label id.
element_counts = Counter(train_data["labels"])

# Report the tally, translating each id back to its label text.
for label_id in element_counts:
    print(f"Element {id_to_label(label_id)} appears {element_counts[label_id]} times.")

Element dog appears 198 times.
Element down appears 198 times.
Element cat appears 209 times.
Element bird appears 191 times.
Element bed appears 204 times.


In [90]:
def rem_dups(data_list1, data_list2, num_classes=5, max_per_class=151):
    """Cap the number of entries per class, removing the surplus from both lists.

    For every class id in ``range(num_classes)`` the first ``max_per_class``
    occurrences in ``data_list1`` are kept; any later occurrence is removed
    from ``data_list1`` and the element at the same position is removed from
    ``data_list2`` so the two lists stay aligned.

    Args:
        data_list1: List of class ids. Modified in place.
        data_list2: List parallel to ``data_list1``. Modified in place.
        num_classes: Number of class ids (``0 .. num_classes - 1``) to cap.
        max_per_class: Maximum occurrences of each class id to keep.

    Returns:
        The same two list objects, ``(data_list1, data_list2)``, after trimming.
    """
    for class_id in range(num_classes):
        surplus = [
            index for index, value in enumerate(data_list1) if value == class_id
        ][max_per_class:]
        # list.reverse() returns None, so the old code crashed when iterating
        # over its result. reversed() yields the positions back to front, which
        # also keeps the not-yet-popped positions valid while we delete.
        for index in reversed(surplus):
            data_list1.pop(index)
            data_list2.pop(index)
    return data_list1, data_list2

In [9]:
def evaluate_model(model, model_name, sample_index=2):
    """Print a classification report for ``model`` on ``test_data`` plus one worked example.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        model_name: Name printed in the banner and in the final prediction line.
        sample_index: Position in ``test_data`` of the example whose per-class
            probabilities are printed. Defaults to 2, the example that used to
            be hard-coded.
    """
    print(f"========{model_name}========")
    features = test_data["input_values"]
    true_labels = test_data["labels"]

    # Metrics over the whole test set. The old full-set predict_proba call was
    # dropped because its result was never used.
    y_pred = model.predict(features)
    print(classification_report(true_labels, y_pred))

    # Per-class probabilities for one example.
    sample = [features[sample_index]]
    print(f"prdictions of {id_to_label(true_labels[sample_index])}")
    probabilities = model.predict_proba(sample)
    # Iterate over however many classes the model reports instead of assuming 5.
    # NOTE(review): assumes column i of predict_proba corresponds to label id i — confirm.
    for class_id, probability in enumerate(probabilities[0]):
        print(f"{id_to_label(class_id)} : {probability}")
    print(f"{model_name} prediction: {id_to_label(model.predict(sample)[0])}")

In [10]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Candidate classifiers. The commented-out ones are disabled: their fit calls
# and their entries in models_list are commented out as well.
# svm_model = SVC(probability=True, random_state=42)
# Random forest with 100 estimators and a fixed random_state.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): num_class is hard-coded to 5 — presumably len(label_list); confirm.
xgb_model = xgb.XGBClassifier(objective="multi:softmax", num_class=5, random_state=42)
# logreg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
# nb_model = GaussianNB()
# k-nearest-neighbours classifier with n_neighbors=5.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [12]:
# svm_model.fit(train_data["input_values"], train_data["labels"])

In [13]:
# Fit the random forest on train_data's "input_values" features and integer "labels".
rf_model.fit(train_data["input_values"], train_data["labels"])

In [14]:
# Fit the XGBoost classifier on train_data's "input_values" features and integer "labels".
xgb_model.fit(train_data["input_values"], train_data["labels"])

In [15]:
# logreg_model.fit(train_data["input_values"], train_data["labels"])

In [16]:
# nb_model.fit(train_data["input_values"], train_data["labels"])

In [17]:
# Fit the k-nearest-neighbours classifier on train_data's "input_values" features and integer "labels".
knn_model.fit(train_data["input_values"], train_data["labels"])

In [18]:
# Full roster including the currently disabled svm / logreg / nb models (kept commented out).
# models_list = (svm_model, rf_model, xgb_model, logreg_model, nb_model, knn_model)
# model_names_list = ("svm_model", "rf_model", "xgb_model", "logreg_model", "nb_model", "knn_model")
# Parallel tuples: model_names_list[i] is the display name for models_list[i].
models_list = (rf_model, xgb_model, knn_model)
model_names_list = ("rf_model", "xgb_model", "knn_model")

In [19]:
# Evaluate every model in turn. zip pairs each model with its display name
# without hard-coding how many models there are.
for fitted_model, display_name in zip(models_list, model_names_list):
    evaluate_model(fitted_model, display_name)
    print()  # blank line between reports

              precision    recall  f1-score   support

           0       0.20      0.17      0.18       199
           1       0.38      0.37      0.37       196
           2       0.33      0.46      0.38       200
           3       0.22      0.14      0.17       214
           4       0.25      0.29      0.27       191

    accuracy                           0.28      1000
   macro avg       0.27      0.29      0.27      1000
weighted avg       0.27      0.28      0.27      1000

prdictions of down
dog : 0.23
down : 0.22
cat : 0.18
bird : 0.15
bed : 0.22
rf_model prediction: dog

              precision    recall  f1-score   support

           0       0.17      0.16      0.16       199
           1       0.26      0.24      0.25       196
           2       0.31      0.38      0.34       200
           3       0.20      0.13      0.16       214
           4       0.24      0.31      0.27       191

    accuracy                           0.24      1000
   macro avg       0.24      