In [2]:
import os
import random
import time

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sounddevice as sd
import tensorflow as tf
from scipy.io import wavfile
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models

In [10]:
def feature_extraction(folder_path="voice_samples", desired_duration=5, n_mfcc=50,
                       features_path="features.npy", labels_path="labels.npy"):
    """Extract fixed-length MFCC features for every labelled ``.wav`` file and save them.

    The directory layout is ``folder_path/<label>/<file>.wav``: every sub-folder
    name is used as the label of the files it contains.

    Parameters:
        folder_path (str): Root folder holding one sub-folder per label.
        desired_duration (float): Clip length in seconds; longer audio is
            truncated and shorter audio is zero-padded at the end.
        n_mfcc (int): Number of MFCC coefficients requested from librosa.
        features_path (str): Destination ``.npy`` file for the feature array.
        labels_path (str): Destination ``.npy`` file for the label array.

    Returns:
        None. The arrays are written to ``features_path`` / ``labels_path`` and
        their shapes are printed.
    """
    all_features = []
    all_labels = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the order
    # of the saved samples (and any seeded split made from them) reproducible.
    for subfolder in sorted(os.listdir(folder_path)):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        label = subfolder  # the sub-folder name is the label
        for filename in sorted(os.listdir(subfolder_path)):
            # Case-insensitive match so that "clip.WAV" is not silently skipped.
            if not filename.lower().endswith(".wav"):
                continue

            file_path = os.path.join(subfolder_path, filename)
            audio, sr = librosa.load(file_path)

            # Force every clip to exactly `desired_duration` seconds.
            desired_samples = int(desired_duration * sr)
            if len(audio) > desired_samples:
                audio = audio[:desired_samples]
            else:
                audio = np.pad(audio, (0, desired_samples - len(audio)), "constant")

            mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

            all_features.append(mfcc)
            all_labels.append(label)

    all_features = np.array(all_features)
    all_labels = np.array(all_labels)

    np.save(features_path, all_features)
    np.save(labels_path, all_labels)
    print('features and label saved succesfully')

    print("Features shape:", all_features.shape)
    print("Labels shape:", all_labels.shape)
    
def train_model(x_train, x_test, labels=None, num_classes=3, epochs=10, batch_size=32,
                model_path="newest_trained_model.h5"):
    """Build, train and save the CNN speaker classifier.

    Parameters:
        x_train (numpy.ndarray): Training features shaped (n_samples, height, width);
            the channel axis is added by the model's own Reshape layer.
        x_test (numpy.ndarray): Accepted for backward compatibility; it is not
            used during training.
        labels (array-like or None): Integer class ids for ``x_train``. When
            omitted, the notebook-level ``y_train`` variable is used, as before.
        num_classes (int): Size of the softmax output layer.
        epochs (int): Number of training epochs.
        batch_size (int): Mini-batch size.
        model_path (str): Where the trained model is saved.

    Returns:
        None. The trained model is written to ``model_path``.
    """
    # Fall back to the notebook global only when no labels were passed in.
    targets = y_train if labels is None else labels

    # Per-sample shape without the channel axis, e.g. (n_mfcc, n_frames).
    input_shape = x_train.shape[1:]

    model = models.Sequential()
    # The model is declared with a 3-D per-batch input and appends the channel
    # axis itself, so the arrays passed to fit() must NOT already carry that
    # axis. The target shape is derived from the data instead of hard-coded.
    model.add(layers.Reshape((*input_shape, 1), input_shape=input_shape))
    model.add(layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(96, kernel_size=(3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(96, kernel_size=(3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.2))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.summary()

    # Integer class ids are used directly, hence the sparse loss.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.fit(x_train, targets, epochs=epochs, batch_size=batch_size)

    model.save(model_path)
    print('model trained succesfully')
    
def calculate_similarity(mfcc1, mfcc2):
    """
    Average pairwise cosine similarity between two MFCC arrays.

    Each input is flattened to 2-D while keeping its last axis, every row of
    the first array is compared with every row of the second, and the mean of
    the resulting similarity matrix is returned.

    NOTE(review): arrays produced by ``librosa.feature.mfcc`` elsewhere in this
    notebook are laid out as (n_mfcc, n_frames), so the rows compared here
    would be coefficient trajectories rather than frames - confirm which
    orientation callers intend.

    Parameters:
        mfcc1 (numpy.ndarray): MFCC features of the first speaker.
        mfcc2 (numpy.ndarray): MFCC features of the second speaker.

    Returns:
        float: Mean cosine similarity over all row pairs.
    """
    rows_first = mfcc1.reshape(-1, mfcc1.shape[-1])
    rows_second = mfcc2.reshape(-1, mfcc2.shape[-1])

    # One similarity value per (row of mfcc1, row of mfcc2) pair, then averaged.
    return np.mean(cosine_similarity(rows_first, rows_second))

def augment_audio(audio, sr):
    """Return a randomly augmented copy of ``audio``.

    Between one and three of the following augmentations are chosen at random
    and applied in a random order:

    * ``add_noise``    - add zero-mean Gaussian noise (std in [0.001, 0.01]);
    * ``change_pitch`` - shift the pitch by up to +/- 2 semitones;
    * ``change_speed`` - time-stretch by a factor in [0.9, 1.1], which changes
      the length of the signal.

    Parameters:
        audio (numpy.ndarray): Input waveform; it is not modified.
        sr (int): Sampling rate of ``audio`` (needed for pitch shifting).

    Returns:
        numpy.ndarray: The augmented waveform.
    """
    augmentation_types = ['add_noise', 'change_pitch', 'change_speed']
    chosen_augmentations = random.sample(augmentation_types, k=random.randint(1, len(augmentation_types)))

    augmented_audio = audio.copy()

    for augmentation in chosen_augmentations:
        if augmentation == 'add_noise':
            # The previous implementation applied a pre-emphasis filter here,
            # which reshapes the spectrum but adds no noise at all.
            noise_level = random.uniform(0.001, 0.01)  # noise standard deviation
            noise = np.random.normal(0.0, noise_level, size=augmented_audio.shape)
            if np.issubdtype(augmented_audio.dtype, np.floating):
                # Keep the waveform dtype (e.g. float32) unchanged.
                noise = noise.astype(augmented_audio.dtype)
            augmented_audio = augmented_audio + noise

        elif augmentation == 'change_pitch':
            pitch_shift = random.uniform(-2, 2)  # semitones
            augmented_audio = librosa.effects.pitch_shift(augmented_audio, sr=sr, n_steps=pitch_shift)

        elif augmentation == 'change_speed':
            speed_factor = random.uniform(0.9, 1.1)  # >1 speeds up, <1 slows down
            augmented_audio = librosa.effects.time_stretch(augmented_audio, rate=speed_factor)

    return augmented_audio

def apply_snr_filter(audio_data, threshold=20, sr=22050, hop_length=512):
    """Keep only the samples whose analysis frame has a spectral centroid above ``threshold``.

    The centroid is computed per frame from a mel spectrogram of the
    pre-emphasised signal. The per-frame decision is then expanded to
    per-sample resolution before masking.

    The previous implementation truncated the waveform to ``n_frames`` samples
    and indexed it with the frame mask directly, which threw away almost all
    of the audio.

    Parameters:
        audio_data (numpy.ndarray): 1-D waveform.
        threshold (float): Minimum centroid value for a frame to be kept.
        sr (int): Sampling rate passed to librosa (librosa's default is used
            when omitted, matching the earlier behaviour).
        hop_length (int): Hop between analysis frames, in samples.

    Returns:
        numpy.ndarray: The samples of ``audio_data`` that belong to kept frames.
    """
    preemphasized_audio = librosa.effects.preemphasis(audio_data)
    spectogram = librosa.feature.melspectrogram(y=preemphasized_audio, sr=sr, hop_length=hop_length)
    s_centroid = librosa.feature.spectral_centroid(S=spectogram, sr=sr)[0, :]

    frame_mask = s_centroid > threshold

    # Frame t is treated as covering samples [t * hop_length, (t + 1) * hop_length).
    # This is an approximation of the frame positions; the expanded mask is
    # cut back to the length of the waveform.
    sample_mask = np.repeat(frame_mask, hop_length)[:len(audio_data)]
    if len(sample_mask) < len(audio_data):
        # Defensive: drop any tail samples that no frame accounts for.
        sample_mask = np.pad(sample_mask, (0, len(audio_data) - len(sample_mask)), "constant")

    filtered_audio = audio_data[sample_mask]
    return filtered_audio

def extract_mfcc_features_from_file(file_path, desired_duration=5, n_mfcc=50):
    """Load an audio file and return MFCC features for a fixed-length clip.

    Parameters:
        file_path (str): Path of the audio file to load.
        desired_duration (float): Clip length in seconds; longer audio is
            truncated and shorter audio is zero-padded at the end.
        n_mfcc (int): Number of MFCC coefficients requested from librosa.

    Returns:
        numpy.ndarray or None: The MFCC array, or ``None`` when anything goes
        wrong (the error is printed). Callers must check for ``None``.
    """
    try:
        audio, sr = librosa.load(file_path)

        # Force the clip to exactly `desired_duration` seconds.
        desired_samples = int(desired_duration * sr)
        if len(audio) > desired_samples:
            audio = audio[:desired_samples]
        else:
            audio = np.pad(audio, (0, desired_samples - len(audio)), "constant")

        user_mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

        return user_mfcc

    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print("An error occurred:", str(e))
        return None
    
def record_audio_and_save_it(duration=5, sample_rate=44100):
    """Record from the default input device, save the clip and return its MFCC features.

    A 3-2-1-0 countdown is printed (one step per second) before a mono
    float32 recording of ``duration`` seconds is captured at ``sample_rate``.
    The audio is written to 'recorded_audio.wav' and that file is passed to
    ``extract_mfcc_features_from_file``, whose result is returned.
    """
    output_file = 'recorded_audio.wav'
    total_samples = int(duration * sample_rate)

    # Countdown on a single console line (carriage return, no newline).
    for seconds_left in reversed(range(4)):
        print(f'Recording in {seconds_left}', end='\r')
        time.sleep(1)

    print('Listening...')
    recorded_audio = sd.rec(total_samples, samplerate=sample_rate, channels=1, dtype=np.float32)
    sd.wait()  # block until the recording has finished
    print('Recording complete.')

    wavfile.write(output_file, sample_rate, recorded_audio)
    return extract_mfcc_features_from_file(output_file)


def make_predcitions(user_mfcc, model=None, classes=None, confidence_threshold=99,
                     model_path=r'C:\Users\UKKASHA\Anaconda3\man\newest_trained_model.h5'):
    """Classify one MFCC feature array and describe the result as a string.

    Parameters:
        user_mfcc (numpy.ndarray): MFCC features holding 50 * 216 values (they
            are reshaped to a single-sample batch of shape (1, 50, 216, 1)).
        model: An already loaded model exposing ``predict``. When omitted, the
            model is loaded from ``model_path`` on every call, as before; pass
            a model to avoid reloading it inside loops.
        classes (sequence or None): Class names indexed by class id. When
            omitted, ``label_encoder.classes_`` from the notebook is used.
        confidence_threshold (float): Minimum confidence, in percent, for the
            predicted speaker to be reported.
        model_path (str): File the model is loaded from when ``model`` is None.

    Returns:
        str: ``'<label> - (<confidence>%)'`` when the confidence exceeds the
        threshold, otherwise ``'predicted: Unknown (<confidence>%)'``.

    Raises:
        ValueError: If ``user_mfcc`` is None (feature extraction failed).
    """
    if user_mfcc is None:
        raise ValueError('user_mfcc is None: feature extraction failed for this input')

    batch = np.asarray(user_mfcc).reshape((1, 50, 216, 1))

    if model is None:
        model = models.load_model(model_path)
    if classes is None:
        classes = label_encoder.classes_

    predictions = model.predict(batch)
    predicted_index = np.argmax(predictions[0])
    confidence = np.round((predictions[0][predicted_index] * 100), 2)
    predicted_label = classes[predicted_index]

    # The earlier membership checks compared the predicted label with itself
    # and were always true, so the decision depends on the confidence alone.
    if confidence > confidence_threshold:
        return f'{predicted_label} - ({confidence}%)'
    return f'predicted: Unknown ({confidence}%)'

from gtts import gTTS
import os

def say(text):
    """Speak ``text``: synthesise English speech with gTTS, save it as an MP3 and open the file.

    The file is opened through the shell's ``start`` command.
    """
    audio_file = 'ouput.mp3'
    speech = gTTS(text=text, lang='en')
    speech.save(audio_file)
    os.system('start ' + audio_file)

In [12]:
pip install pyttsx3

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pyttsx3 (from versions: none)
ERROR: No matching distribution found for pyttsx3


In [3]:
# Read the cached MFCC features and their speaker labels from disk.
features = np.load(r"C:\Users\UKKASHA\Anaconda3\man\features.npy")
labels = np.load(r"C:\Users\UKKASHA\Anaconda3\man\labels.npy")

# Map every speaker name to an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
labels_encoded = label_encoder.transform(labels)

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels_encoded, test_size=0.2, random_state=42
)

In [1]:
# Requires the function-definition cell and the data-loading cell to have been
# run first (the model prediction relies on `label_encoder`).
path = r'C:\Users\UKKASHA\Downloads\recorded_audio.wav'
user_mfcc = extract_mfcc_features_from_file(path)

# extract_mfcc_features_from_file returns None when the file cannot be
# processed; predicting on None would fail with an unrelated error.
if user_mfcc is None:
    print(f'Could not extract features from {path}')
else:
    print(make_predcitions(user_mfcc))

NameError: name 'extract_mfcc_features_from_file' is not defined

In [26]:
# Expected speaker of each test file, in file order (test_1.wav, test_2.wav, ...).
# NOTE(review): this list has 17 entries but only 15 files are evaluated below,
# so the last two entries are never used - confirm whether test_16 / test_17 exist.
actual = ['Albani zaria','Albani zaria','Albani zaria',
          'Okasha Kameny', 'Okasha Kameny', 'Okasha Kameny', 'Okasha Kameny',
          'Okasha Kameny', 'Okasha Kameny', 'Okasha Kameny', 'Okasha Kameny',
          'Okasha Kameny','Omar Sulaiman','Omar Sulaiman','Omar Sulaiman',
          'Omar Sulaiman','Omar Sulaiman']

# Pair every evaluated file with its expected speaker so that the printed line
# shows both (previously `actual` was defined but never used).
for file_number, actual_label in zip(range(1, 16), actual):
    path = fr'C:\Users\UKKASHA\Anaconda3\man\test data\test_{file_number}.wav'
    user_mfcc = extract_mfcc_features_from_file(path)
    if user_mfcc is None:
        # Feature extraction already printed the error; skip this file.
        print(f'actual: {actual_label}, could not extract features from {path}')
        continue
    print(f'actual: {actual_label}, {make_predcitions(user_mfcc)}')

actual: Albani zaria, predicted: albani zaria (100.0%)
actual: Albani zaria, predicted: albani zaria (100.0%)
actual: Albani zaria, predicted: albani zaria (100.0%)
actual: Okasha Kameny, predicted: okasha kameny (99.99%)
actual: Okasha Kameny, predicted: okasha kameny (100.0%)
actual: Okasha Kameny, predicted: okasha kameny (100.0%)
actual: Okasha Kameny, predicted: Unknown (93.5%)
actual: Okasha Kameny, predicted: okasha kameny (100.0%)
actual: Okasha Kameny, predicted: okasha kameny (100.0%)
actual: Okasha Kameny, predicted: Unknown (98.4%)
actual: Okasha Kameny, predicted: okasha kameny (100.0%)
actual: Okasha Kameny, predicted: omar sulaiman (100.0%)
actual: Omar Sulaiman, predicted: omar sulaiman (100.0%)
actual: Omar Sulaiman, predicted: omar sulaiman (100.0%)
actual: Omar Sulaiman, predicted: omar sulaiman (100.0%)


In [None]:
# Mel spectrogram (in dB) of 'recorded_audio.wav'.
audio, sr = librosa.load('recorded_audio.wav')
spectogram = librosa.feature.melspectrogram(y=audio, sr=sr)

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.power_to_db(spectogram, ref=np.max), sr=sr,
                               y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel spectrogram: recorded_audio.wav')
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

In [None]:
# Mel spectrogram (in dB) of 'recorded_audioo.wav'.
# NOTE(review): the file name has a doubled 'o' - confirm it is intentional.
audio, sr = librosa.load('recorded_audioo.wav')
spectogram = librosa.feature.melspectrogram(y=audio, sr=sr)

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.power_to_db(spectogram, ref=np.max), sr=sr,
                               y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel spectrogram: recorded_audioo.wav')
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

In [None]:
# Mel spectrogram (in dB) of test file 1.
audio, sr = librosa.load(r'C:\Users\UKKASHA\Anaconda3\man\test data\test_1.wav')
spectogram = librosa.feature.melspectrogram(y=audio, sr=sr)

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.power_to_db(spectogram, ref=np.max), sr=sr,
                               y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel spectrogram: test_1.wav')
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

In [None]:
# Mel spectrogram (in dB) of test file 5.
audio, sr = librosa.load(r'C:\Users\UKKASHA\Anaconda3\man\test data\test_5.wav')
spectogram = librosa.feature.melspectrogram(y=audio, sr=sr)

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.power_to_db(spectogram, ref=np.max), sr=sr,
                               y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel spectrogram: test_5.wav')
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

In [None]:
# Mel spectrogram (in dB) of test file 15.
audio, sr = librosa.load(r'C:\Users\UKKASHA\Anaconda3\man\test data\test_15.wav')
spectogram = librosa.feature.melspectrogram(y=audio, sr=sr)

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.power_to_db(spectogram, ref=np.max), sr=sr,
                               y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel spectrogram: test_15.wav')
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

In [None]:
# Time-domain waveform of test file 1.
audio, sr = librosa.load(r'C:\Users\UKKASHA\Anaconda3\man\test data\test_1.wav')

fig, ax = plt.subplots()
librosa.display.waveshow(y=audio, sr=sr, ax=ax)
ax.set_xlabel('time (s)')
ax.set_ylabel('Amplitude')
plt.show()

In [None]:
# Time-domain waveform of test file 5.
audio, sr = librosa.load(r'C:\Users\UKKASHA\Anaconda3\man\test data\test_5.wav')

fig, ax = plt.subplots()
librosa.display.waveshow(y=audio, sr=sr, ax=ax)
ax.set_xlabel('time (s)')
ax.set_ylabel('Amplitude')
plt.show()

In [None]:
# Inspect the model outputs. `y_pred` is created by the cell that loads the
# model and calls model.predict(X_test); run that cell first.
# argmax needs axis=1 to yield one class id per sample (without an axis it
# returns a single index into the flattened array), and its result has to be
# printed to be visible from the middle of a cell.
print(np.argmax(y_pred, axis=1))
print(y_pred)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import confusion_matrix
from tensorflow.keras import models

# Reload the saved classifier and score the held-out split.
model = models.load_model(r'C:\Users\UKKASHA\Anaconda3\man\newest_trained_model.h5')
y_pred = model.predict(X_test)

# Most probable class per sample, then true-vs-predicted counts.
y_pred_classes = y_pred.argmax(axis=1)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_classes)

In [None]:
import seaborn as sns

# Draw the confusion matrix `cm` as an annotated heatmap (integer counts, no colorbar).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# target_names must follow the integer class ids. LabelEncoder assigns ids in
# the order of `label_encoder.classes_`, whereas list(set(labels)) has an
# arbitrary order and could attach metrics to the wrong speaker.
report_class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_test,
    y_pred_classes,
    labels=report_class_ids,
    target_names=[str(name) for name in label_encoder.classes_],
    output_dict=True,
)
classification_report_values = list(report.values())
classification_report_values

In [None]:
# Heatmap of precision / recall / F1 for every dict-valued entry of `report`
# (the scalar entries are skipped). Row labels are taken from the report keys
# so that they always match the rows, in number and in order.
metric_keys = ['precision', 'recall', 'f1-score']
row_labels = [key for key, item in report.items() if isinstance(item, dict)]
data = np.array([[report[key][metric] for metric in metric_keys] for key in row_labels])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data, annot=True, fmt=".3f", cmap="Blues",
            xticklabels=['Precision', 'Recall', 'F1-Score'], yticklabels=row_labels, ax=ax)
ax.set_title('Classification Metrics by Class')
plt.show()

In [None]:
# Hard-coded classification-report values: three per-class entries, one scalar
# (presumably the overall accuracy - TODO confirm) and one aggregate entry.
classification_report_values = [
    {'precision': 1.0, 'recall': 0.9965811965811966, 'f1-score': 0.9982876712328768, 'support': 585},
    {'precision': 0.9950248756218906, 'recall': 0.9983361064891847, 'f1-score': 0.9966777408637874, 'support': 601},
    {'precision': 0.998371335504886, 'recall': 0.998371335504886, 'f1-score': 0.998371335504886, 'support': 614},
    0.9977777777777778,
    {'precision': 0.9977987370422587, 'recall': 0.997762879525089, 'f1-score': 0.9977789158671833, 'support': 1800}
]

class_names = ['Class 1', 'Class 2', 'Class 3']

# Keep only the dict entries; the scalar entry has no precision/recall/F1.
data = np.array([[item['precision'], item['recall'], item['f1-score']] for item in classification_report_values if isinstance(item, dict)])

# One label per plotted row: the three classes plus the aggregate entry.
# (Previously list(set(labels)) was used, which is arbitrarily ordered and has
# fewer labels than there are rows, while `class_names` was left unused.)
row_labels = class_names + ['Average']
assert len(row_labels) == data.shape[0], "row labels must match the plotted rows"

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data, annot=True, fmt=".3f", cmap="Blues",
            xticklabels=['Precision', 'Recall', 'F1-Score'], yticklabels=row_labels, ax=ax)
ax.set_title('Classification Report')
plt.show()