In [1]:
import os

import joblib
import librosa
import numpy as np
import pandas as pd
import torchaudio
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from loader.load_audio import audio_array, extract_mfcc

## Data Loading, DataFrame and CSV Creation

In [6]:
# Root folder holding one sub-directory per label, each containing audio clips.
root_dir = r"Dataset-ai\mono_audio"

data = []
waveforms = []
target_sampling_rate = 16000

# One Resample transform per distinct source rate, built on first use and reused
# instead of being re-created for every file.
resamplers = {}

# os.listdir returns entries in arbitrary order, so sort before taking the first
# five labels; stray files next to the label folders are skipped up front.
label_names = [
	entry for entry in sorted(os.listdir(root_dir))
	if os.path.isdir(os.path.join(root_dir, entry))
][:5]

for label in label_names:
	label_dir = os.path.join(root_dir, label)
	print(f"{label = }")

	for audio_file in tqdm(sorted(os.listdir(label_dir))):
		name = audio_file.split('.')[0]
		audio_path = os.path.join(label_dir, audio_file)

		try:
			waveform, sr = torchaudio.load(audio_path)
			if sr not in resamplers:
				resamplers[sr] = torchaudio.transforms.Resample(sr, target_sampling_rate)
			resampled = resamplers[sr](waveform).squeeze().numpy()
		except Exception as e:
			# Best effort: report the real file name and the cause, then move on.
			print(f"Error loading {audio_file} from {label_dir}: {e}")
			continue

		# Appended together so waveforms[i] always belongs to data[i].
		waveforms.append(resampled)
		data.append({
			"filename": name,
			"path": audio_path,
			"word": label
		})

label = 'Fuck'


100%|██████████| 100/100 [00:00<00:00, 184.13it/s]


label = 'Other'


100%|██████████| 100/100 [00:00<00:00, 675.11it/s]


label = 'Shit'


100%|██████████| 100/100 [00:00<00:00, 225.20it/s]


In [7]:
# Folder that receives the train/test split CSV files.
csv_path = r"CSVs"
# Table built from the per-clip records (filename, path, word) collected while loading.
df = pd.DataFrame(data=data)


In [8]:
# Shape of every resampled waveform collected by the loading loop.
shapes = list(map(lambda speech: speech.shape, waveforms))
# Largest first-axis size among the waveforms.
target_length = max(dims[0] for dims in shapes)
print(f"{target_length = }")

target_length = 33793


In [9]:
# Stratified 80/20 split on the word label, seeded for repeatability.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["word"])

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(csv_path, exist_ok=True)
train_df.to_csv(os.path.join(csv_path, "train-ai.csv"), index=False)
test_df.to_csv(os.path.join(csv_path, "test-ai.csv"), index=False)

## Data Pre-Processing

In [30]:
# train_df = pd.read_csv(f"{csv_path}/train.csv")
# test_df = pd.read_csv(f"{csv_path}/test.csv")
# target_length = 41760

In [31]:
# Load one audio file at a fixed sample rate
def load_audio(file_path, sr=16_000):
	"""Load an audio file with librosa.

	Args:
		file_path: Path of the audio file to read.
		sr: Sample rate requested from ``librosa.load``. Defaults to 16 kHz,
			the rate that was previously hard-coded here.

	Returns:
		The ``(audio, sample_rate)`` pair returned by ``librosa.load``.
	"""
	audio, sr = librosa.load(file_path, sr=sr)
	return audio, sr

# File paths and word labels of the training split, in matching row order.
# (audio_files used to be the placeholder `[...]`, i.e. a list holding Ellipsis.)
audio_files = train_df['path'].tolist()
labels = train_df['word'].tolist()
# One (waveform, sample_rate) pair per training clip.
audio_data = [load_audio(file) for file in audio_files]


In [1]:
# Function to extract MFCC features from audio
def extract_mfcc(audio, sr, num_mfcc=13, n_fft=2048, hop_length=512, pad_to=None):
    """Compute MFCCs for one clip and return them as a flat vector.

    Note: this definition shadows the ``extract_mfcc`` imported from
    ``loader.load_audio`` for every cell executed after it.

    Args:
        audio: Waveform passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sample rate of ``audio``.
        num_mfcc: Number of coefficients (``n_mfcc``).
        n_fft: FFT window size.
        hop_length: Hop between successive frames.
        pad_to: Target size of the second MFCC axis. Shorter results are
            zero-padded on the right; longer results are truncated, so every
            clip yields a vector of the same length. A falsy value leaves the
            matrix untouched.

    Returns:
        The MFCC matrix flattened to 1-D.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
    if pad_to:
        n_frames = mfcc.shape[1]
        if n_frames < pad_to:
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_to - n_frames)), mode='constant')
        else:
            # np.pad rejects negative widths, so clips longer than the target
            # are cut down instead of raising.
            mfcc = mfcc[:, :pad_to]
    return mfcc.flatten()  # Flatten MFCC matrix

# One flattened MFCC vector per training clip, all padded to the same width.
# NOTE(review): target_length is computed from waveform sample counts, whereas
# pad_to is compared against mfcc.shape[1]; each vector therefore ends up
# num_mfcc * target_length wide. Confirm this is intended - it may be what drives
# the MemoryError seen during evaluation.
mfcc_features = [extract_mfcc(*clip, pad_to=target_length) for clip in audio_data]


In [13]:
# Feature table: one row per clip, one column per MFCC value, plus the word label.
# Note that this rebinds `df`, which earlier held the filename/path/word table.
df = pd.DataFrame(mfcc_features).assign(label=labels)


In [14]:
# Split the feature table into the target column and the feature matrix.
y = df['label']
X = df.drop(columns='label')


In [None]:
# Integer class ids for XGBoost. Series.map with an explicit cast does not rely
# on replace()'s implicit object-to-int downcasting, and the cast fails loudly
# if a label outside the mapping shows up.
y_encoded = y.map({'Fuck': 0, 'Shit': 1, 'Other': 2}).astype("int64")

  y_encoded = y.replace({'Fuck': 0, 'Shit': 1, 'Other': 2})


In [None]:
# File paths and word labels of the test split, in matching row order.
# (test_audio_files used to be the placeholder `[...]`, i.e. a list holding Ellipsis.)
test_audio_files = test_df['path'].tolist()
test_labels = test_df['word'].tolist()
test_audio_data = [load_audio(file) for file in test_audio_files]

test_mfcc_features = [extract_mfcc(audio, sr, pad_to=target_length) for (audio, sr) in test_audio_data]

# NOTE(review): this rebinds test_df from the path/word table to the feature
# table, so the cell cannot be re-run without redoing the train/test split first.
test_df = pd.DataFrame(test_mfcc_features)
test_df['label'] = test_labels

In [None]:
# Feature matrix and string labels of the test split.
test_X = test_df.drop('label', axis=1)
test_y = test_df['label']
# Integer class ids for XGBoost. Series.map with an explicit cast does not rely
# on replace()'s implicit object-to-int downcasting, and the cast fails loudly
# if a label outside the mapping shows up.
test_y_encoded = test_y.map({'Fuck': 0, 'Shit': 1, 'Other': 2}).astype("int64")

  test_y_encoded = test_y.replace({'Fuck': 0, 'Shit': 1, 'Other': 2})


## Model Training

In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Four candidate classifiers; the ones that accept a seed are seeded with 42.
knn_model = KNeighborsClassifier(n_neighbors=3)
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=3,
    random_state=42,
)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
svm_model = SVC(random_state=42, probability=True)

In [18]:
# Fit the SVM on the MFCC feature table using the string word labels.
svm_model.fit(X, y)

In [19]:
# Fit the random forest on the MFCC feature table using the string word labels.
rf_model.fit(X, y)

In [20]:
# XGBoost is fitted on the integer-encoded labels (y_encoded), not the strings.
xgb_model.fit(X, y_encoded)

In [21]:
# Fit the 3-nearest-neighbours model on the MFCC feature table and string labels.
knn_model.fit(X, y)

In [None]:
# save models
# joblib.dump does not create missing folders, so make sure "models" exists.
os.makedirs("models", exist_ok=True)
# joblib.dump(svm_model, r"models\svm_model.pkl")
joblib.dump(rf_model, r"models\rf_model.pkl")
joblib.dump(xgb_model, r"models\xgb_model.pkl")
joblib.dump(knn_model, r"models\knn_model.pkl")

In [5]:
# Restore the persisted estimators (the SVM is not persisted, so it is not loaded).
# joblib files are pickles: only load files produced by this notebook.
rf_model, xgb_model, knn_model = map(
	joblib.load,
	(
		r"models\rf_model.pkl",
		r"models\xgb_model.pkl",
		r"models\knn_model.pkl",
	),
)

In [6]:
# Parallel tuples: model_names_list[i] is the display name of models_list[i].
# svm_model / "svm_model" are intentionally left out of both tuples.
model_names_list = ("rf_model", "xgb_model", "knn_model")
models_list = (rf_model, xgb_model, knn_model)

## Model Evaluation

In [7]:
def evaluate_model(model, model_name, x, y):
	"""Print a banner for ``model_name`` followed by its classification report.

	Args:
		model: Fitted estimator exposing ``predict``.
		model_name: Label shown in the banner line.
		x: Features handed to ``model.predict``.
		y: True labels the predictions are compared against.
	"""
	print(f"========{model_name}========")
	# Predict and report in one step; nothing is returned.
	print(classification_report(y, model.predict(x)))

In [19]:
# Iterate over the registry itself instead of a hard-coded range(3), so adding
# or removing a model cannot desynchronise the loop.
for model, model_name in zip(models_list, model_names_list):
	# XGBoost was fitted on integer class ids, the other models on string labels.
	# A dedicated name is used here so the training target `y` is not overwritten.
	y_true = test_y_encoded if model_name == "xgb_model" else test_y
	evaluate_model(model, model_name, test_X, y_true)
	print()

              precision    recall  f1-score   support

        Fuck       0.69      0.45      0.55        20
       Other       0.56      1.00      0.71        20
        Shit       0.64      0.35      0.45        20

    accuracy                           0.60        60
   macro avg       0.63      0.60      0.57        60
weighted avg       0.63      0.60      0.57        60


              precision    recall  f1-score   support

           0       0.77      0.85      0.81        20
           1       0.22      0.30      0.26        20
           2       0.00      0.00      0.00        20

    accuracy                           0.38        60
   macro avg       0.33      0.38      0.35        60
weighted avg       0.33      0.38      0.35        60




MemoryError: bad allocation

### Single Evaluation

In [2]:
from shared_data.shared_data import target_length
from segmenter.audio_segmenter import audio_segmenter
from loader.load_audio import extract_mfcc

In [3]:
# Audio file analysed in this section (relative path with Windows separators).
path = r"TestData\noise_removed_audio\pewdiepie angry threat.wav"

In [4]:
# Split the file with a 2.0 second segment length. Later cells read each segment
# as segment[0] (audio) and segment[1] (sample rate), and each timestamp as a
# (start, end) pair - presumably one per segment; confirm against audio_segmenter.
segments, timestamps = audio_segmenter(audio_path= path, segment_length_sec= 2.0)

In [5]:
# MFCC vectors for the first two segments only.
# Note: this rebinds `mfcc_features`, the name the training section used for
# its own feature list.
mfcc_features = [
	extract_mfcc(audio= segment[0], sr= segment[1], pad_to= target_length)
	for segment in segments[:2]
]

In [6]:
# Only the XGBoost model is loaded for the single-file check; the "Experiment 1"
# loads below are kept commented out.
# joblib files are pickles: only load files produced by this project.
# rf_model = joblib.load(r"models\Experiment 1\models_rf_model.pkl")
xgb_model = joblib.load(r"models\xgb_model.pkl")
# lgr_model = joblib.load(r"models\Experiment 1\models_lgr_model.pkl")
# svm_model = joblib.load(r"models\Experiment 1\models_svm_model.pkl")
# knn_model = joblib.load(r"models\Experiment 1\models_knn_model.pkl")

In [7]:
# models_list = (rf_model, xgb_model, lgr_model, svm_model, knn_model)
# model_names_list = ("rf_model", "xgb_model", "lgr_model", "svm_model", "knn_model")
# models_list = (xgb_model, lgr_model, svm_model, knn_model)
# model_names_list = ("xgb_model", "lgr_model", "svm_model", "knn_model")
# The trailing commas matter: without them the parentheses are plain grouping,
# so models_list would be the model itself and model_names_list[0] would be "x".
models_list = (xgb_model,)
model_names_list = ("xgb_model",)

In [11]:
def evaluate_models(model, model_name, mfcc):
	"""Print the prediction of ``model`` for a single MFCC feature vector.

	Args:
		model: Fitted estimator exposing ``predict``.
		model_name: Label shown in the banner and the prediction line.
		mfcc: Feature vector of one segment. A flat 1-D vector is turned into
			a single-row 2-D array, the layout scikit-learn estimators require
			for ``predict``; 2-D input is passed through unchanged.
	"""
	print(f"========{model_name}========")
	# probabilities = model.predict_proba(mfcc)
	# for i in range(3):
	# 	print(f"{['Fuck', 'Shit', 'Other'][i]} : {probabilities[0][i]}")
	features = np.atleast_2d(mfcc)
	pred = model.predict(features)
	print(f"{model_name} prediction: {pred}")
	# The remaining prints expose the raw and converted types of the prediction.
	print(type(pred))
	print(int(pred[0]))
	print(type(int(pred[0])))

In [9]:
# Divide each (start, end) pair by 16_000 - assumes the timestamps are sample
# indices at 16 kHz, which would make these seconds; TODO confirm against audio_segmenter.
timestamps_sec = [(start/16_000, end/16_000) for start, end in timestamps]

In [12]:
# Walk the segment features by position so each one is reported under the
# time span stored at the same index of timestamps_sec.
for index in range(len(mfcc_features)):
	mfcc = mfcc_features[index]
	print(f"=========={timestamps_sec[index]}==========")
	# Only the XGBoost model is evaluated here.
	evaluate_models(xgb_model, "xgb_model", mfcc)
	print()

xgb_model prediction: [0]
<class 'numpy.ndarray'>
0
<class 'int'>

xgb_model prediction: [2]
<class 'numpy.ndarray'>
2
<class 'int'>

