### Experiment with wav2vec2 + noice reduction

In [6]:
import joblib
import torchaudio
import numpy as np
import pandas as pd
from transformers import Wav2Vec2Processor
from sklearn.metrics import classification_report

In [82]:
csv_path = r"CSVs"
train_df = pd.read_csv(f"{csv_path}/train.csv")
test_df = pd.read_csv(f"{csv_path}/test.csv")
target_length = 41760
model_name_or_path = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate

In [68]:
def audio_to_array(path: str):
	audio, sr = torchaudio.load(path)
	resampler = torchaudio.transforms.Resample(sr, target_sampling_rate)
	return resampler(audio).squeeze().numpy()

In [81]:
def preprocess_function(examples):
	speech_list = [audio_to_array(path) for path in examples['path']]
	target_list = [label for label in examples['word']]
	result = processor(speech_list, sampling_rate=target_sampling_rate, padding="longest")
	result["labels"] = target_list
	return result

In [83]:
train_data = preprocess_function(train_df)
test_data = preprocess_function(test_df)

Shapes - Speech List: [(18400,), (16000,), (28320,), (16000,), (16000,), (13920,), (15840,), (19840,), (16000,), (24960,), (21120,), (33120,), (12971,), (17120,), (15680,), (16000,), (16480,), (21120,), (16000,), (14400,), (16000,), (27840,), (16000,), (15680,), (28000,), (33760,), (28640,), (16000,), (13920,), (16000,), (15040,), (29760,), (16320,), (32480,), (15680,), (16000,), (15040,), (16000,), (15680,), (21120,), (20320,), (15680,), (16000,), (15680,), (15604,), (16000,), (16000,), (25280,), (16000,), (20480,), (16000,), (14400,), (19680,), (18080,), (14080,), (25600,), (18400,), (17760,), (34400,), (16000,), (30720,), (17440,), (16000,), (20320,), (20320,), (16000,), (19200,), (6880,), (21120,), (23040,), (13920,), (16000,), (16000,), (13760,), (16000,), (16000,), (16000,), (15040,), (18080,), (11889,), (16000,), (16000,), (21120,), (21920,), (16320,), (17120,), (32480,), (16000,), (25760,), (27360,), (18080,), (25760,), (16000,), (22560,), (15680,), (16000,), (16000,), (25920,)

In [24]:
label_list = list(train_df['word'].unique())
def label_to_id(label, label_list):
	if len(label_list) > 0:
			return label_list.index(label) if label in label_list else -1
	return label

In [25]:
X = train_data["input_values"]
y = train_data['labels']
y_encoded = [label_to_id(label, label_list) for label in y]

In [85]:
test_X = test_data["input_values"]
test_y = test_data['labels']
test_y_encoded = [label_to_id(label, label_list) for label in test_y]

In [36]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [37]:
svm_model = SVC(probability=True, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier(objective="multi:softmax", num_class=3, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=3)
lgr_model = LogisticRegression(random_state=42)

In [38]:
svm_model.fit(X, y)

In [39]:
rf_model.fit(X, y)

In [40]:
xgb_model.fit(X, y_encoded)

In [41]:
knn_model.fit(X, y)

In [42]:
lgr_model.fit(X, y)

In [47]:
joblib.dump(svm_model, r"models\svm_model.pkl")
joblib.dump(rf_model, r"models\rf_model.pkl")
joblib.dump(xgb_model, r"models\xgb_model.pkl")
joblib.dump(knn_model, r"models\knn_model.pkl")
joblib.dump(lgr_model, r"models\lgr_model.pkl")

['models\\lgr_model.pkl']

In [44]:
models_list = (svm_model, rf_model, xgb_model, knn_model, lgr_model)
model_names_list = ("svm_model", "rf_model", "xgb_model", "knn_model", "lgr_model")

In [45]:
def evaluate_model(model, model_name, x, y):
	print(f"========{model_name}========")
	# Make predictions
	y_pred = model.predict(x)
	# Evaluate the model
	print(classification_report(y, y_pred))

In [86]:
for i in range(5):
	if model_names_list[i] == "xgb_model":
		y = test_y_encoded
	else:
		y = test_y
	evaluate_model(models_list[i], model_names_list[i], test_X, y)
	print()

              precision    recall  f1-score   support

        Fuck       0.46      0.52      0.49        21
       Other       0.45      0.25      0.32        20
        Shit       0.31      0.40      0.35        20

    accuracy                           0.39        61
   macro avg       0.41      0.39      0.39        61
weighted avg       0.41      0.39      0.39        61


              precision    recall  f1-score   support

        Fuck       0.50      0.43      0.46        21
       Other       0.72      0.90      0.80        20
        Shit       0.61      0.55      0.58        20

    accuracy                           0.62        61
   macro avg       0.61      0.63      0.61        61
weighted avg       0.61      0.62      0.61        61


              precision    recall  f1-score   support

           0       0.55      0.57      0.56        21
           1       0.75      0.90      0.82        20
           2       0.60      0.45      0.51        20

    accuracy      