In [1]:

# Import all modules
import os
from time import perf_counter, time

import librosa
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pydub import AudioSegment
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from common import concatenate_features, extract_features, load_audio

In [2]:
# Collect every entry of the recordings folder and hand the paths to
# load_audio(), which returns the male and the female recordings separately.
# The listing is sorted because os.listdir() returns entries in arbitrary,
# platform-dependent order; sorting makes the list passed to load_audio() the
# same on every machine, which keeps the seeded train/test split reproducible.
WAV_DIR = 'wavs/'
wav_files = [os.path.join(WAV_DIR, name) for name in sorted(os.listdir(WAV_DIR))]
male_voices, female_voices = load_audio(wav_files)

# Compute the per-recording features for each group
male_features = extract_features(male_voices)
female_features = extract_features(female_voices)

# Merge the per-recording features of each group into one 2-D array
male_concatenated = concatenate_features(male_features)
female_concatenated = concatenate_features(female_features)

# Shapes from the recorded run of this notebook are noted for reference.
print(male_concatenated.shape)    # Recorded run: (117667, 26)
print(female_concatenated.shape)  # Recorded run: (124815, 26)

100%|██████████| 601/601 [00:02<00:00, 215.70it/s]
100%|██████████| 300/300 [00:18<00:00, 16.17it/s]
100%|██████████| 300/300 [00:42<00:00,  7.13it/s]
100%|██████████| 300/300 [00:00<00:00, 527.83it/s] 
100%|██████████| 300/300 [00:00<00:00, 645.63it/s] 

(117667, 26)
(124815, 26)





In [3]:
# Stack the male rows on top of the female rows to form one feature matrix
X = np.vstack((male_concatenated, female_concatenated))

# Create one label per row of X, in the same order as the rows were stacked:
# 15 marks a male row and 99 marks a female row.
y = np.concatenate((
    np.full(len(male_concatenated), 15),
    np.full(len(female_concatenated), 99),
))

# X and y must describe exactly the same rows. Fail here with a clear message
# instead of relying on a visual comparison of the two printed shapes.
assert X.shape[0] == y.shape[0], (
    'X has {} rows but y has {} labels'.format(X.shape[0], y.shape[0])
)
print(X.shape)  # Recorded run: (242482, 26)
print(y.shape)  # Recorded run: (242482,)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

(242482, 26)
(242482,)


In [4]:
# Create a KNN classifier that uses 5 neighbours
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier.
# Durations are measured with perf_counter(): unlike time(), it is monotonic
# and high-resolution, so short intervals are not distorted by system clock
# adjustments or a coarse wall-clock tick.
train_start = perf_counter()
knn.fit(X_train, y_train)
train_seconds = perf_counter() - train_start
print('Training time: {} seconds'.format(train_seconds))

# Test the classifier on the held-out rows
test_start = perf_counter()
y_pred = knn.predict(X_test)
test_seconds = perf_counter() - test_start
print('Testing time: {} seconds'.format(test_seconds))

# Accuracy is the fraction of test rows whose predicted label equals the true label
accuracy = np.mean(y_pred == y_test)
print('Accuracy: {}'.format(accuracy))

Training time: 0.013572931289672852 seconds
Testing time: 5.855248928070068 seconds
Accuracy: 0.8901993937769347


In [None]:
# Create a confusion matrix.
# The label order is passed explicitly so that row/column 0 is always the male
# class (label 15) and row/column 1 the female class (label 99), and so that
# the matrix stays 2x2 even if one class happens to be missing from the data.
class_labels = [15, 99]
class_names = ['Male', 'Female']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw on an explicit Axes and name the classes on both axes; without tick
# labels the heatmap only shows the positions 0 and 1.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm, annot=True, fmt='d',
            cmap=plt.cm.Blues, cbar=False, annot_kws={'size': 14},
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion matrix on test data')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


In [8]:
# Ask for a recording and classify the speaker with the trained KNN model
user_file = input('Enter the path to the audio file: ')

if not os.path.isfile(user_file):
    print('File not found!')
    # exit without killing kernel
    raise SystemExit

# Get the file format from the extension of the final path component.
# splitext() ignores dots in directory names and yields '' when there is no
# extension; lower-casing lets '.WAV' be recognised as a wav file.
file_format = os.path.splitext(user_file)[1].lstrip('.').lower()
if file_format != 'wav':
    # Convert anything that is not wav into a temporary wav file. When the
    # path has no extension, format=None lets the decoder detect the format
    # instead of being handed a bogus format name.
    sound = AudioSegment.from_file(user_file, format=file_format or None)
    sound.export('user_audio.wav', format='wav')
    user_file = 'user_audio.wav'

# librosa.load() returns the pair (samples, sampling rate).
# NOTE(review): librosa.load() resamples to its default rate unless sr= is
# given; confirm load_audio() loads the training recordings the same way.
audio_file = librosa.load(user_file)

# Wrap the single (samples, sampling rate) pair in a one-row object array;
# presumably this mirrors the layout load_audio() produces (verify in common).
pruebas = np.array([list(audio_file)], dtype=object)

prueba_features = extract_features(pruebas)

# Predict one label per feature row of the (only) recording
prediccion = knn.predict(prueba_features[0])

# Count the votes per class: 15 is the male label and 99 the female label
total = len(prediccion)
hombres = int(np.count_nonzero(prediccion == 15))
mujeres = int(np.count_nonzero(prediccion == 99))

# The majority class wins and the reported "confidence" is its share of the
# votes; an exact tie falls into the female branch.
if hombres > mujeres:
    print('El audio es de un hombre, con un {:.2f}% de confianza'.format(hombres / total * 100))
else:
    print('El audio es de una mujer, con un {:.2f}% de confianza'.format(mujeres / total * 100))

100%|██████████| 1/1 [00:00<00:00, 17.64it/s]


El audio es de un hombre, con un 64.96% de confianza
