In [None]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2D CNN**

# Importing Required Libraries

In [None]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pickle

# Define Directory and Hyperparameters

In [None]:
# Feature-extraction and training settings
n_mfcc = 13         # MFCC coefficients computed per frame
max_pad_len = 100   # every MFCC matrix is padded or truncated to this many frames
batch_size = 64     # mini-batch size handed to model.fit

# Dataset root: one sub-folder per speaker
data_path = '/content/drive/MyDrive/archive (14)/50_speakers_audio_data'

# MFCC Feature Extraction

In [None]:
def extract_mfcc_features(audio, sample_rate, n_mfcc=n_mfcc, max_pad_len=max_pad_len):
    """Compute a fixed-width MFCC matrix for one audio signal.

    Args:
        audio: Audio samples as accepted by ``librosa.feature.mfcc``
            (assumed mono, so the result is 2-D).
        sample_rate: Sampling rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute.
        max_pad_len: Number of frames (columns) in the returned matrix.

    Returns:
        Array of shape ``(n_mfcc, max_pad_len)``: zero-padded on the right when
        the clip yields fewer frames, truncated when it yields more. ``None``
        when the extraction produced no data at all.
    """
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # len(mfccs) counts coefficient rows (always n_mfcc), so it can never detect
    # an empty result; test the total element count instead.
    if mfccs is None or mfccs.size == 0:
        print("Warning: MFCC extraction returned None or empty array.")
        return None
    # Truncate first, then pad only by the number of frames still missing.
    mfccs = mfccs[:, :max_pad_len]
    missing_frames = max_pad_len - mfccs.shape[1]
    if missing_frames > 0:
        mfccs = np.pad(mfccs, ((0, 0), (0, missing_frames)), mode='constant')
    return mfccs

# Data Augmentation

In [None]:
def augment_audio(audio, rate_range=(0.9, 1.1), noise_std=0.005, rng=None):
    """Return a randomly time-stretched copy of ``audio`` with Gaussian noise added.

    Args:
        audio: Audio samples to augment (assumed mono / 1-D, since the noise
            vector is sized with ``len``).
        rate_range: ``(low, high)`` bounds of the uniformly drawn stretch rate.
        noise_std: Standard deviation of the additive Gaussian noise.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used for
            all random draws, so augmentation can be made reproducible. When
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        The array returned by ``librosa.effects.time_stretch`` with the noise
        added to it in place.
    """
    random_source = np.random if rng is None else rng
    speed_change = random_source.uniform(*rate_range)
    augmented_audio = librosa.effects.time_stretch(audio, rate=speed_change)
    noise = random_source.standard_normal(len(augmented_audio)) * noise_std
    # In-place add keeps the dtype of the stretched signal.
    augmented_audio += noise
    return augmented_audio

# Load data and extract features

In [None]:
features = []
labels = []

# NOTE(review): every augmented clip is stored next to its source clip, so a
# later random train/test split can put a clip in one split and its augmented
# copy in the other. Augmenting only the training portion after splitting would
# avoid that leakage.

# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore any seeded split of it) reproducible across runs.
for speaker in sorted(os.listdir(data_path)):
    speaker_folder = os.path.join(data_path, speaker)
    if not os.path.isdir(speaker_folder):
        continue
    for file_name in sorted(os.listdir(speaker_folder)):
        file_path = os.path.join(speaker_folder, file_name)
        try:
            audio, sample_rate = librosa.load(file_path, sr=None)
        except Exception as exc:
            # librosa.load raises on unreadable / non-audio files; skip the
            # file with a warning instead of aborting the whole scan.
            print(f"Warning: Audio file {file_path} could not be loaded ({exc}).")
            continue
        if audio is None or len(audio) == 0:
            print(f"Warning: Audio file {file_path} could not be loaded.")
            continue
        mfcc_features = extract_mfcc_features(audio, sample_rate)
        if mfcc_features is None:
            continue
        features.append(mfcc_features)
        labels.append(speaker)

        # Augment and extract features for the augmented audio
        augmented_audio = augment_audio(audio)
        augmented_mfcc = extract_mfcc_features(augmented_audio, sample_rate)
        if augmented_mfcc is not None:
            features.append(augmented_mfcc)
            labels.append(speaker)

# Preprocess data

In [None]:
# Fail early with a clear message instead of an IndexError on an empty array
if not features:
    raise ValueError("No features were extracted; check data_path and the audio files.")

# Stack the per-clip MFCC matrices and append a channel axis for Conv2D
X = np.array(features)
X = X[..., np.newaxis]
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Split data into training and testing sets.
# stratify keeps each speaker's share equal in both splits, so no speaker is
# missing or under-represented in the test set by chance. It requires at least
# two samples per speaker.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

# Model architecture

In [None]:
# CNN over the (n_mfcc, max_pad_len, 1) MFCC matrix: three Conv2D + BatchNorm
# stages, global average pooling, then a dense softmax classifier with one
# output per encoded speaker.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(n_mfcc, max_pad_len, 1)))

# Stage 1: 32 filters, 3x3 kernels, 2x2 max-pooling
model.add(tf.keras.layers.Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D((2, 2), padding='same'))

# Stage 2: 64 filters, 3x3 kernels, 2x2 max-pooling
model.add(tf.keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D((2, 2), padding='same'))

# Stage 3: 128 filters, 2x2 kernels, collapsed by global average pooling
model.add(tf.keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.GlobalAveragePooling2D())

# Classifier head
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax'))

# Compile and Train Model

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks for training: halve the learning rate after 3 stagnant epochs and
# stop (restoring the best weights) after 5 stagnant epochs of validation loss.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model.
# Validation uses a slice held out from the training data rather than
# (X_test, y_test): both callbacks select the model on validation loss, so
# validating on the test set would bias the test accuracy reported afterwards.
history = model.fit(X_train, y_train, epochs=100, batch_size=batch_size,
                    validation_split=0.1, callbacks=[early_stopping, lr_scheduler])

Epoch 1/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 243ms/step - accuracy: 0.0961 - loss: 3.5952 - val_accuracy: 0.1512 - val_loss: 3.3519 - learning_rate: 0.0010
Epoch 2/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 239ms/step - accuracy: 0.2875 - loss: 2.5973 - val_accuracy: 0.3065 - val_loss: 2.6096 - learning_rate: 0.0010
Epoch 3/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 240ms/step - accuracy: 0.4260 - loss: 2.0365 - val_accuracy: 0.4488 - val_loss: 2.1142 - learning_rate: 0.0010
Epoch 4/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 239ms/step - accuracy: 0.5282 - loss: 1.6138 - val_accuracy: 0.6050 - val_loss: 1.4654 - learning_rate: 0.0010
Epoch 5/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 238ms/step - accuracy: 0.6364 - loss: 1.2684 - val_accuracy: 0.6756 - val_loss: 1.2012 - learning_rate: 0.0010
Epoch 6/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# Evaluate and Save Model

In [None]:
# Evaluate the trained model on the test split produced by train_test_split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Save the model (HDF5 file) and the fitted LabelEncoder (pickle) to the
# current working directory.
# NOTE(review): the GMM section of this notebook also writes 'label_encoder.pkl',
# so running it replaces the file written here.
model.save('speaker_identification_cnn_model.h5')
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9371 - loss: 0.1620




Test Accuracy: 93.23%


# Prediction function

In [None]:
def predict_speaker(file_path):
    """Identify the speaker of one audio file with the trained CNN.

    Uses the notebook-level ``model``, ``label_encoder`` and
    ``extract_mfcc_features``. The GMM section of this notebook redefines
    ``extract_mfcc_features`` (transposed output) and ``predict_speaker``
    itself, so re-run the CNN cells before calling this after that section.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The predicted speaker label, or ``None`` when the file cannot be
        loaded, is empty, or yields no MFCC features.
    """
    try:
        audio, sample_rate = librosa.load(file_path, sr=None)
    except Exception as exc:
        # librosa.load raises on unreadable files; report it and return None,
        # which is what the warning branch below was meant to do.
        print(f"Warning: Audio file {file_path} could not be loaded ({exc}).")
        return None
    if audio is None or len(audio) == 0:
        print(f"Warning: Audio file {file_path} could not be loaded.")
        return None
    mfcc_features = extract_mfcc_features(audio, sample_rate)
    if mfcc_features is None:
        return None
    # Add batch and channel axes: (n_mfcc, frames) -> (1, n_mfcc, frames, 1)
    model_input = mfcc_features[np.newaxis, ..., np.newaxis]
    prediction = model.predict(model_input)
    speaker_label = int(np.argmax(prediction, axis=-1)[0])
    speaker_name = label_encoder.inverse_transform([speaker_label])[0]
    return speaker_name

# Main function

In [None]:
# Demo: run the CNN predictor on a single recording and print the result.
# The file lives under data_path, so it was part of the data that was loaded
# and split earlier; it may have been seen during training.
if __name__ == "__main__":
    test_audio_path = '/content/drive/MyDrive/archive (14)/50_speakers_audio_data/Speaker0026/Speaker26_000.wav'
    identified_speaker = predict_speaker(test_audio_path)
    print(f"The identified speaker is: {identified_speaker}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
The identified speaker is: Speaker0026


# **GMM**

In [None]:
from sklearn.model_selection import train_test_split
import os
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
import pickle

# Directory containing folders for respective speakers
# (same path as the one assigned in the CNN section)
data_path = '/content/drive/MyDrive/archive (14)/50_speakers_audio_data'

# Hyperparameters
# n_mfcc and max_pad_len repeat the values used in the CNN section;
# n_components is specific to the GMM pipeline.
n_mfcc = 13      # Number of MFCC features
max_pad_len = 100  # Pad or truncate MFCCs to this length
n_components = 16  # Number of components for each speaker's GMM

# MFCC extraction for the GMM pipeline (one row per frame).
# NOTE: this redefinition shadows the earlier extract_mfcc_features, which
# returns the untransposed (n_mfcc, max_pad_len) layout used by the CNN code.
def extract_mfcc_features(audio, sample_rate, n_mfcc=n_mfcc, max_pad_len=max_pad_len):
    """Compute a fixed-length MFCC matrix with frames as rows.

    Args:
        audio: Audio samples as accepted by ``librosa.feature.mfcc``
            (assumed mono, so the result is 2-D).
        sample_rate: Sampling rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute.
        max_pad_len: Number of frames (rows) in the returned matrix.

    Returns:
        Array of shape ``(max_pad_len, n_mfcc)``: truncated when the clip
        yields more frames, zero-padded when it yields fewer. ``None`` when
        the extraction produced no data at all.
    """
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # len(mfccs) counts coefficient rows (always n_mfcc), so it can never detect
    # an empty result; test the total element count instead.
    if mfccs is None or mfccs.size == 0:
        print("Warning: MFCC extraction returned None or empty array.")
        return None
    mfccs = mfccs[:, :max_pad_len]
    missing_frames = max_pad_len - mfccs.shape[1]
    if missing_frames > 0:
        # NOTE(review): padded frames become all-zero feature vectors that the
        # GMMs are later fitted on and scored against; consider dropping the
        # padding for the GMM pipeline.
        mfccs = np.pad(mfccs, ((0, 0), (0, missing_frames)), mode='constant')
    return mfccs.T  # Transpose for GMM compatibility

# Prepare data and labels
features = []
labels = []

# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore the seeded train/test split) reproducible across runs.
for speaker in sorted(os.listdir(data_path)):
    speaker_folder = os.path.join(data_path, speaker)
    if not os.path.isdir(speaker_folder):
        continue
    for file_name in sorted(os.listdir(speaker_folder)):
        file_path = os.path.join(speaker_folder, file_name)
        try:
            audio, sample_rate = librosa.load(file_path, sr=None)
        except Exception as exc:
            # librosa.load raises on unreadable / non-audio files; skip the
            # file with a warning instead of aborting the whole scan.
            print(f"Warning: Audio file {file_path} could not be loaded ({exc}).")
            continue
        if audio is None or len(audio) == 0:
            print(f"Warning: Audio file {file_path} could not be loaded.")
            continue
        mfcc_features = extract_mfcc_features(audio, sample_rate)
        if mfcc_features is not None:
            features.append(mfcc_features)
            labels.append(speaker)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
speakers = label_encoder.classes_

# Split data into training and testing sets.
# stratify guarantees every speaker contributes clips to the training split, so
# each speaker's GMM has data to fit on. It requires at least two clips per
# speaker.
X_train, X_test, y_train, y_test = train_test_split(
    features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Train one GMM per speaker on that speaker's training clips
gmm_models = {}
for speaker_index, speaker_name in enumerate(speakers):
    # Gather the MFCC frame matrices of every training clip of this speaker
    speaker_clips = [clip for clip, label in zip(X_train, y_train) if label == speaker_index]
    if not speaker_clips:
        # np.vstack([]) would raise; skip speakers that received no training clips
        print(f"Warning: no training clips for speaker {speaker_name}; skipping its GMM.")
        continue
    # Stack all frames of all clips into one (total_frames, n_mfcc) matrix
    speaker_features = np.vstack(speaker_clips)
    gmm = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=200, random_state=42)
    gmm.fit(speaker_features)
    gmm_models[speaker_name] = gmm

# Save the GMM models and label encoder for future use (pickle files in the
# current working directory).
# NOTE(review): 'label_encoder.pkl' is the same filename the CNN section writes,
# so this replaces that file with the encoder fitted in this cell.
with open('gmm_models.pkl', 'wb') as f:
    pickle.dump(gmm_models, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Pick the speaker whose GMM scores the given MFCC frames highest
def predict_speaker(mfcc_features, gmm_models):
    """Return the key of the model with the highest ``score`` on ``mfcc_features``.

    Args:
        mfcc_features: Feature matrix passed unchanged to each model's ``score``.
        gmm_models: Mapping of speaker name to a fitted model exposing ``score``.

    Returns:
        The speaker name whose model gives the largest score; on ties, the
        first such name in the mapping's iteration order.

    Raises:
        ValueError: If ``gmm_models`` is empty.
    """
    def model_score(speaker):
        return gmm_models[speaker].score(mfcc_features)

    best_speaker = max(gmm_models, key=model_score)
    return best_speaker

# Evaluate the model: decode all true labels once, then count the test clips
# whose predicted speaker matches
true_speakers = label_encoder.inverse_transform(y_test)
correct_predictions = sum(
    1
    for clip_features, expected_speaker in zip(X_test, true_speakers)
    if predict_speaker(clip_features, gmm_models) == expected_speaker
)

# Calculate accuracy as a percentage of the test clips
accuracy = correct_predictions / len(X_test) * 100
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 93.04%


In [None]:
def predict_speaker_from_audio(file_path, gmm_models, label_encoder):
    """Identify the speaker of one audio file using the per-speaker GMMs.

    Relies on the notebook-level ``extract_mfcc_features`` returning frames as
    rows (the GMM-section definition).

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.
        gmm_models: Mapping of speaker name to a fitted model exposing ``score``.
        label_encoder: Unused; kept so existing calls keep working.

    Returns:
        The speaker name whose GMM gives the highest score, or ``None`` when
        the file cannot be loaded, is empty, or yields no MFCC features.
    """
    try:
        audio, sample_rate = librosa.load(file_path, sr=None)
    except Exception as exc:
        # librosa.load raises on unreadable files; report it and return None,
        # which is what the warning branch below was meant to do.
        print(f"Warning: Audio file {file_path} could not be loaded ({exc}).")
        return None
    if audio is None or len(audio) == 0:
        print(f"Warning: Audio file {file_path} could not be loaded.")
        return None
    mfcc_features = extract_mfcc_features(audio, sample_rate)
    if mfcc_features is None:
        return None

    # Compute the average log likelihood for each model
    log_likelihoods = {speaker: gmm.score(mfcc_features) for speaker, gmm in gmm_models.items()}
    predicted_speaker = max(log_likelihoods, key=log_likelihoods.get)
    return predicted_speaker

# Example usage of predicting speaker from a single audio file.
# The file lives under data_path, so it was part of the data that was loaded
# and split earlier; it may have been used to fit the GMMs.
if __name__ == "__main__":
    test_audio_path = '/content/drive/MyDrive/archive (14)/50_speakers_audio_data/Speaker0026/Speaker26_000.wav'
    identified_speaker = predict_speaker_from_audio(test_audio_path, gmm_models, label_encoder)
    print(f"The identified speaker for the given audio file is: {identified_speaker}")

The identified speaker for the given audio file is: Speaker0026
