In [1]:
import os  # Provides functions to interact with the operating system
import tkinter as tk  # Used for creating GUI applications
from tkinter import filedialog, messagebox  # Provides file dialog boxes and pop-up messages for the GUI
from tkinter.ttk import Progressbar  # Provides a themed progress bar widget for the GUI
import threading  # Enables multi-threading for running tasks without freezing the GUI
import numpy as np  # Library for handling numerical operations and working with arrays
from tensorflow.keras.utils import to_categorical  # Converts labels into one-hot encoding for classification tasks
import matplotlib.pyplot as plt  # Library for creating plots and visualizations
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score  # Metrics for evaluating model performance
import seaborn as sns  # Enhances matplotlib visuals, used for creating heatmaps, etc.

# Import necessary components for speech recognition tasks
from speechbrain.inference import SpeakerRecognition  # Pre-built inference pipeline for speaker recognition tasks
import torchaudio  # PyTorch library for audio processing
import tensorflow as tf  # Framework for building and training machine learning models
from tensorflow.keras import Sequential, layers  # Provides utilities for creating deep learning models
from sklearn.model_selection import train_test_split  # Splits datasets into training and testing subsets
from sklearn.preprocessing import LabelEncoder  # Encodes class labels as integers for model compatibility
import tkinter as tk  # Used again to handle GUI components
from tkinter import ttk, filedialog, messagebox, simpledialog  # Provides additional tkinter components


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [3]:
# Define ECAPA-TDNN architecture as a custom Keras model
class ECAPA_TDNN(tf.keras.Model):
    """ECAPA-TDNN-style classifier implemented as a subclassed Keras model.

    The network stacks three 1-D convolutions ("TDNN" layers), gates the
    convolution output with a learned per-channel weighting, averages over the
    time axis and classifies the pooled vector with two dense layers.

    Args:
        input_dim: Accepted for backward compatibility; not used by the model
            (layer input sizes are inferred when the model is built/called).
        num_classes: Number of output classes (width of the final softmax).
        channels: Filters in each convolution and units in the
            attention/scale layers.
        bottleneck_channels: Accepted for backward compatibility; not used.
        embedding_dim: Units in the penultimate dense layer.
    """

    def __init__(self, input_dim, num_classes, channels=512, bottleneck_channels=1536, embedding_dim=384):
        super(ECAPA_TDNN, self).__init__()  # Initialize the base class

        # Three stacked TDNN (1-D convolution) layers. 'same' padding with
        # stride 1 keeps the time axis length unchanged.
        self.tdnn1 = layers.Conv1D(channels, kernel_size=5, strides=1, padding='same', activation='relu')
        self.tdnn2 = layers.Conv1D(channels, kernel_size=3, strides=1, padding='same', activation='relu')
        self.tdnn3 = layers.Conv1D(channels, kernel_size=3, strides=1, padding='same', activation='relu')

        # Gating branch: tanh projection followed by a softmax projection.
        # Dense applies its softmax over the last axis, i.e. across channels
        # for each time step (not across time).
        self.attention = layers.Dense(channels, activation='tanh')
        self.scale = layers.Dense(channels, activation='softmax')

        # Classification head: dense -> dropout -> softmax over classes
        self.fc1 = layers.Dense(embedding_dim, activation='relu')
        self.dropout = layers.Dropout(0.5)
        self.fc2 = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Rank-3 tensor (batch, time, features), as required by Conv1D.
            training: Optional flag forwarded to the dropout layer. ``None``
                lets Keras decide from the calling context (fit vs. predict).

        Returns:
            Tensor of shape (batch, num_classes) with softmax probabilities.
        """
        x = self.tdnn1(inputs)
        x = self.tdnn2(x)
        x = self.tdnn3(x)

        # Per-channel gate computed from the convolution output
        attn = self.attention(x)
        scale = self.scale(attn)

        # Apply the gate, then average over the time axis (axis=1) so the
        # result no longer has a time dimension.
        x = tf.reduce_mean(x * scale, axis=1)

        x = self.fc1(x)
        # Forward the training flag explicitly so dropout is only active
        # while training.
        x = self.dropout(x, training=training)

        return self.fc2(x)

    def build(self, input_shape):
        """Create the sub-layer weights for the given input shape.

        Args:
            input_shape: Shape of the model input, including the batch axis,
                e.g. ``(None, time, features)``.
        """
        # Call the parent class's build method
        super(ECAPA_TDNN, self).build(input_shape)

        # Each layer is built from the *output* shape of the layer that feeds
        # it. Sub-layers that are already built are skipped so their weights
        # are not created a second time (the base-class build may already have
        # built them by tracing call()).
        conv_shape = input_shape
        for conv in (self.tdnn1, self.tdnn2, self.tdnn3):
            if not conv.built:
                conv.build(conv_shape)
            conv_shape = conv.compute_output_shape(conv_shape)

        # attention consumes the conv output; scale consumes attention's output
        if not self.attention.built:
            self.attention.build(conv_shape)
        attn_shape = self.attention.compute_output_shape(conv_shape)
        if not self.scale.built:
            self.scale.build(attn_shape)

        # reduce_mean over axis=1 in call() removes the time axis, so the
        # dense head sees (batch, channels).
        pooled_shape = (conv_shape[0], conv_shape[-1])
        if not self.fc1.built:
            self.fc1.build(pooled_shape)
        fc1_shape = self.fc1.compute_output_shape(pooled_shape)
        if not self.dropout.built:
            self.dropout.build(fc1_shape)
        if not self.fc2.built:
            self.fc2.build(fc1_shape)


# Complete Code with Added Functionality

In [4]:
# --- Shared state used by the GUI callbacks ---------------------------------
directories_and_labels = list()  # (directory, label) pairs chosen by the user
all_features = list()            # feature arrays appended by the training step
all_labels = list()              # label arrays appended alongside the features
label_encoder = None             # placeholder; assigned later by a callback
ecapa_tdnn_model = None          # placeholder; assigned later by a callback

# --- Main window ------------------------------------------------------------
root = tk.Tk()
root.title("Language Recognition GUI")
root.geometry("800x700")  # width x height, in pixels

# --- Row of widgets for choosing how many directories to load ---------------
frame_files = tk.Frame(master=root)
frame_files.pack(pady=10)

# Caption for the count entry
lbl_num_files = tk.Label(master=frame_files, text="Number of File Paths to Load")
lbl_num_files.grid(column=0, row=0, padx=5)

# Entry holding the number of directories the user wants to pick
entry_num_files = tk.Entry(master=frame_files, width=10)
entry_num_files.grid(column=1, row=0, padx=5)

# Button that starts the directory/label selection (its command is attached
# after the handler has been defined)
btn_load_files = tk.Button(master=frame_files, width=15, text="Load File Paths")
btn_load_files.grid(column=2, row=0, padx=5)

# --- List of the directory/label pairs selected so far -----------------------
listbox_files = tk.Listbox(master=root, height=10, width=80)
listbox_files.pack(pady=10)




# Function to collect file paths and labels
def load_file_paths():
    """Ask the user for N directories plus a label each and record the pairs.

    Reads the requested count from ``entry_num_files``. For every slot a
    directory chooser is shown, followed by a prompt for the label. Each
    accepted (directory, label) pair is appended to ``directories_and_labels``
    and echoed in ``listbox_files``. Slots where the user cancels either dialog
    (or enters a blank label) are skipped.

    An error dialog is shown, and nothing is recorded, when the count is not a
    positive integer.
    """
    # Only the parsing of the count is guarded: a ValueError raised later must
    # not be misreported as "enter a valid number".
    try:
        num_files = int(entry_num_files.get())
        if num_files <= 0:
            # Zero/negative counts would otherwise silently do nothing
            raise ValueError("count must be positive")
    except ValueError:
        messagebox.showerror("Error", "Please enter a valid number.")
        return

    for i in range(num_files):
        # Let the user pick the directory for this slot
        file_path = filedialog.askdirectory(title=f"Select Directory for File {i+1}")
        if not file_path:
            continue  # dialog cancelled

        # Ask for the label that belongs to the chosen directory
        label = simpledialog.askstring("Input", f"Enter Label for {file_path}")
        # The label later becomes part of a file name, so surrounding
        # whitespace is removed and blank labels are rejected.
        label = label.strip() if label else ""
        if not label:
            continue

        directories_and_labels.append((file_path, label))
        listbox_files.insert(tk.END, f"Path: {file_path}, Label: {label}")

# Attach the handler to the "Load File Paths" button now that it is defined
btn_load_files.configure(command=load_file_paths)

# Determinate, horizontal progress bar (600 px long)
progress = Progressbar(master=root, mode='determinate', length=600, orient=tk.HORIZONTAL)
progress.pack(pady=10)

# Log output area: 80 characters wide, 10 lines tall
text_logs = tk.Text(master=root, width=80, height=10)
text_logs.pack(pady=10)




# Step 2: Feature Extraction and Saving
def start_feature_extraction():
    """Validate the preconditions, ask for a per-directory file limit and
    launch ``extract_and_save_features`` on a background thread.

    The limit is stored in the module-level ``max_files_per_directory`` so the
    worker can read it. Error dialogs are shown when no directories have been
    loaded or when the entered limit is missing or not greater than zero.
    """
    global max_files_per_directory

    # Guard: nothing to process until at least one directory/label pair exists
    if not directories_and_labels:
        messagebox.showerror("Error", "No directories and labels loaded.")
        return

    # Ask how many files (at most) should be taken from each directory
    requested_limit = simpledialog.askinteger(
        "Input",
        "Enter the maximum number of files to process from each directory (e.g., 100):",
        parent=root,
    )
    max_files_per_directory = requested_limit

    # A cancelled dialog yields None; zero and negative limits are rejected too
    limit_is_usable = requested_limit is not None and requested_limit > 0
    if not limit_is_usable:
        messagebox.showerror("Error", "Please provide a valid number greater than 0.")
        return

    # Run the extraction off the GUI thread so the window stays responsive
    worker = threading.Thread(target=extract_and_save_features)
    worker.start()

    
    
    
    
    

# Function to extract features from audio files and save them
def _list_mp3_files(directory, limit):
    """Return at most `limit` MP3 file names from `directory`, sorted by name."""
    # Sorting makes the selected subset reproducible (os.listdir order is
    # arbitrary); the extension check ignores case so ".MP3" is accepted too.
    names = sorted(f for f in os.listdir(directory) if f.lower().endswith('.mp3'))
    return names[:limit]


def extract_and_save_features():
    """Extract one ECAPA embedding per MP3 file and save them per label.

    For every (directory, label) pair in ``directories_and_labels`` up to
    ``max_files_per_directory`` MP3 files are loaded, down-mixed to mono,
    resampled to 16 kHz and encoded with the pretrained SpeechBrain model.
    The embeddings and labels of each directory are written to
    ``ecapa_train_features_<label>.npy`` and ``ecapa_train_labels_<label>.npy``
    in the current working directory. Progress and errors are reported through
    the ``progress`` bar and the ``text_logs`` widget.

    Files that fail to load or encode are logged and skipped.
    """
    # NOTE(review): this function is started on a worker thread but talks to
    # Tk widgets directly; confirm this is acceptable for the Tk build in use.

    def log(message):
        # Append a line to the log widget and let the GUI repaint
        text_logs.insert(tk.END, message)
        root.update()

    # Initialize the pretrained ECAPA-TDNN encoder from SpeechBrain
    classifier = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

    # List every directory once so the progress total and the files actually
    # processed can never disagree.
    work_items = [
        (directory, label_name, _list_mp3_files(directory, max_files_per_directory))
        for directory, label_name in directories_and_labels
    ]
    total_files = sum(len(files) for _, _, files in work_items)

    processed_files = 0
    progress["maximum"] = total_files
    progress["value"] = 0

    for directory, label_name, files in work_items:
        log(f"\nProcessing directory: {directory} with label: {label_name}")

        features = []
        labels = []

        for file_name in files:
            file_path = os.path.join(directory, file_name)
            try:
                # torchaudio returns the signal as (channels, samples)
                signal, sample_rate = torchaudio.load(file_path)

                # encode_batch treats the first axis as the batch axis, so a
                # stereo file would yield one embedding per channel. Down-mix
                # to mono to get exactly one embedding per file.
                if signal.shape[0] > 1:
                    signal = signal.mean(dim=0, keepdim=True)

                # The encoder expects 16 kHz audio
                if sample_rate != 16000:
                    signal = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(signal)

                embeddings = classifier.encode_batch(signal).detach().cpu().numpy()

                features.append(embeddings)
                labels.append(label_name)
            except Exception as e:
                # Best effort: report the failure and continue with the next file
                log(f"\nError processing {file_name}: {e}")

            processed_files += 1
            progress["value"] = processed_files
            root.update()

        if not features:
            # Saving empty arrays would only make the training step fail later
            log(f"\nNo features extracted for {label_name}; nothing saved.")
            continue

        # Every embedding has the same shape after the mono down-mix, so a
        # plain float32 array can be stored (no pickled object array needed).
        np.save(f"ecapa_train_features_{label_name}.npy", np.asarray(features, dtype=np.float32))
        np.save(f"ecapa_train_labels_{label_name}.npy", np.array(labels))

        log(f"\nFeatures and labels saved for {label_name}.")

    messagebox.showinfo("Feature Extraction Complete", "Feature extraction for all languages is complete!")

# Button that kicks off feature extraction
btn_extract_features = tk.Button(master=root, command=start_feature_extraction, text="Extract Features")
btn_extract_features.pack(pady=10)

# --- Row of widgets for the number of languages to train on ------------------
frame_languages = tk.Frame(master=root)
frame_languages.pack(pady=10)

# Caption for the language-count entry
lbl_num_languages = tk.Label(master=frame_languages, text="Enter Number of Languages to Train On")
lbl_num_languages.grid(column=0, row=0, padx=5)

# Entry read by the training step to learn how many languages to load
entry_num_languages = tk.Entry(master=frame_languages, width=10)
entry_num_languages.grid(column=1, row=0, padx=5)

# NOTE(review): no command is attached to this button in the visible code, so
# clicking it currently does nothing - confirm whether that is intended.
btn_confirm_languages = tk.Button(master=frame_languages, width=10, text="Confirm")
btn_confirm_languages.grid(column=2, row=0, padx=5)







# Step 3: Train and Evaluate Model with Accuracy Display
def train_and_evaluate_model():
    """Load per-language feature/label files, train ECAPA_TDNN and report accuracy.

    Steps:
      1. Read the number of languages from ``entry_num_languages`` and let the
         user pick one feature file and one label file (.npy) per language.
      2. Load and concatenate the data, encode the labels, split 80/20 into
         train/test and one-hot encode the targets.
      3. Build, compile and train the model (30 epochs, batch size 32, 20 % of
         the training split used for validation), then evaluate on the test split.
      4. Save the model to ``trained_ecapa_tdnn_model`` and the label classes
         to ``trained_label_encoder.npy``.
      5. Show training/validation/test accuracy in a dialog and in ``text_logs``.

    On success the module-level ``all_features``, ``all_labels``,
    ``label_encoder`` and ``ecapa_tdnn_model`` are updated. Any exception is
    reported in an error dialog instead of being raised.
    """
    global all_features, all_labels, label_encoder, ecapa_tdnn_model

    # Size of one stored embedding; the model input shape is (1, embedding_dim)
    embedding_dim = 192

    try:
        num_languages = int(entry_num_languages.get())
        if num_languages <= 0:
            raise ValueError("the number of languages must be a positive integer")

        # Collect one feature file and one label file per language
        feature_paths = []
        label_paths = []
        for i in range(num_languages):
            feature_file = filedialog.askopenfilename(
                title=f"Select Feature File for Language {i + 1}",
                filetypes=(("NumPy Files", "*.npy"),),
            )
            if not feature_file:
                messagebox.showerror("Error", f"Feature file for Language {i + 1} is required.")
                return

            label_file = filedialog.askopenfilename(
                title=f"Select Label File for Language {i + 1}",
                filetypes=(("NumPy Files", "*.npy"),),
            )
            if not label_file:
                messagebox.showerror("Error", f"Label file for Language {i + 1} is required.")
                return

            feature_paths.append(feature_file)
            label_paths.append(label_file)

        # Start from empty lists: without this, a second training run would
        # also train on the data accumulated by every earlier run.
        all_features = []
        all_labels = []
        for feature_path, label_path in zip(feature_paths, label_paths):
            # NOTE(review): allow_pickle=True can execute arbitrary code when a
            # crafted .npy file is loaded - only open files you trust.
            features = np.load(feature_path, allow_pickle=True).astype(np.float32)
            labels = np.load(label_path, allow_pickle=True)

            # The reshape further down assumes exactly one embedding per label;
            # fail early with a clear message when a file pair does not match.
            if features.size != len(labels) * embedding_dim:
                raise ValueError(
                    f"{feature_path} does not hold exactly one {embedding_dim}-value "
                    f"embedding per label in {label_path}"
                )

            all_features.append(features)
            all_labels.append(labels)

        # Combine the data of all languages
        all_features_combined = np.concatenate(all_features, axis=0)
        all_labels_combined = np.concatenate(all_labels, axis=0)

        # Encode the text labels as integers
        encoder = LabelEncoder()
        y = encoder.fit_transform(all_labels_combined)
        num_classes = len(encoder.classes_)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(all_features_combined, y, test_size=0.2, random_state=42)

        # Pass num_classes explicitly: otherwise the one-hot width is inferred
        # from the largest index present in each split and can be narrower
        # than the model output when a split lacks the last class.
        y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
        y_test_one_hot = to_categorical(y_test, num_classes=num_classes)

        # Shape the features as (samples, 1 time step, embedding_dim)
        X_train = X_train.reshape(-1, 1, embedding_dim)
        X_test = X_test.reshape(-1, 1, embedding_dim)

        # Define and build the model
        input_shape = (1, embedding_dim)
        model = ECAPA_TDNN(input_shape, num_classes)
        model.build(input_shape=(None, input_shape[0], input_shape[1]))

        # Compile with AdamW and categorical cross-entropy
        optimizer = tf.keras.optimizers.AdamW(learning_rate=0.0001)
        model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        # Train; 20 % of the training split is held out for validation
        history = model.fit(
            X_train, y_train_one_hot,
            epochs=30,
            batch_size=32,
            validation_split=0.2
        )

        # Evaluate on the held-out test split
        test_loss, test_accuracy = model.evaluate(X_test, y_test_one_hot)

        # Persist the model and the label classes
        model.save("trained_ecapa_tdnn_model", save_format="tf")
        np.save("trained_label_encoder.npy", encoder.classes_)

        # Publish the results only after training and saving succeeded
        label_encoder = encoder
        ecapa_tdnn_model = model

        # Accuracy of the last epoch
        training_accuracy = history.history['accuracy'][-1]
        validation_accuracy = history.history['val_accuracy'][-1]

        messagebox.showinfo(
            "Training Complete",
            f"Model training complete!\n\n"
            f"Training Accuracy: {training_accuracy:.4f}\n"
            f"Validation Accuracy: {validation_accuracy:.4f}\n"
            f"Test Accuracy: {test_accuracy:.4f}"
        )

        text_logs.insert(
            tk.END,
            f"\nTraining Accuracy: {training_accuracy:.4f}\n"
            f"Validation Accuracy: {validation_accuracy:.4f}\n"
            f"Test Accuracy: {test_accuracy:.4f}\n"
        )
        root.update()  # Repaint so the logged details become visible

    except Exception as e:
        # GUI callback: report the problem to the user instead of raising
        messagebox.showerror("Error", f"An error occurred: {e}")

# Button that runs training, evaluation and saving in one go
btn_train_model = tk.Button(master=root, command=train_and_evaluate_model, text="Train and Save Model")
btn_train_model.pack(pady=10)








# Function to load the trained model and make predictions
def load_model_and_predict():
    """Load a saved model and label classes, then classify one MP3 file.

    The user is asked for the saved-model directory, the ``.npy`` file holding
    the label classes and the MP3 file to classify. The audio is down-mixed to
    mono, resampled to 16 kHz, encoded with the pretrained SpeechBrain model
    and passed to the loaded classifier. The predicted label and the per-class
    probabilities are shown in a dialog.

    The module-level ``ecapa_tdnn_model`` and ``label_encoder`` are replaced by
    the loaded objects. Any exception is reported in an error dialog instead
    of being raised.
    """
    global ecapa_tdnn_model, label_encoder

    try:
        # Ask for the trained model directory
        model_path = filedialog.askdirectory(title="Select Trained Model Directory")
        if not model_path:
            messagebox.showerror("Error", "Model directory is required.")
            return

        # Ask for the file holding the label classes
        label_encoder_path = filedialog.askopenfilename(
            title="Select Label Encoder File", filetypes=(("NumPy Files", "*.npy"),)
        )
        if not label_encoder_path:
            messagebox.showerror("Error", "Label encoder file is required.")
            return

        # Load the trained model and rebuild the label encoder from its classes
        ecapa_tdnn_model = tf.keras.models.load_model(model_path)
        # NOTE(review): allow_pickle=True can execute arbitrary code when a
        # crafted .npy file is loaded - only open files you trust.
        label_encoder_classes = np.load(label_encoder_path, allow_pickle=True)
        label_encoder = LabelEncoder()
        label_encoder.classes_ = label_encoder_classes

        # Ask for the audio file to classify
        file_path = filedialog.askopenfilename(
            title="Select an MP3 File for Prediction", filetypes=(("MP3 Files", "*.mp3"),)
        )
        if not file_path:
            messagebox.showerror("Error", "MP3 file is required.")
            return

        # Load the pretrained encoder and the audio (returned as channels x samples)
        classifier = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
        signal, sample_rate = torchaudio.load(file_path)

        # encode_batch treats the first axis as the batch axis; a stereo file
        # would produce two embeddings and break the reshape below, so the
        # channels are averaged into a single mono signal first.
        if signal.shape[0] > 1:
            signal = signal.mean(dim=0, keepdim=True)

        # The encoder expects 16 kHz audio
        if sample_rate != 16000:
            signal = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(signal)

        embeddings = classifier.encode_batch(signal).detach().cpu().numpy()
        embeddings = embeddings.reshape(1, 1, 192)  # (samples, time steps, features) as used for training

        # Predict and map the most probable class index back to its label
        predictions = ecapa_tdnn_model.predict(embeddings)
        predicted_index = np.argmax(predictions)
        predicted_label = label_encoder.inverse_transform([predicted_index])[0]

        # Per-class probabilities, one line per label
        confidence_scores = predictions[0]
        confidence_info = "\n".join(
            [f"{label}: {confidence * 100:.2f}%" for label, confidence in zip(label_encoder.classes_, confidence_scores)]
        )

        messagebox.showinfo(
            "Prediction Complete",
            f"Predicted Language: {predicted_label}\n\n"
            f"Confidence for each label:\n{confidence_info}\n\n"
            f"Trained Language Labels: {list(label_encoder.classes_)}"
        )

    except Exception as e:
        # GUI callback: report the problem to the user instead of raising
        messagebox.showerror("Error", f"An error occurred: {e}")

# Button that loads a saved model and classifies a single audio file
btn_predict_model = tk.Button(master=root, command=load_model_and_predict, text="Load Model and Predict")
btn_predict_model.pack(pady=10)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, c

Epoch 1/30


  if hasattr(m, '__file__') and m.__file__ == obj_file:


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
INFO:tensorflow:Assets written to: trained_ecapa_tdnn_model\assets


INFO:tensorflow:Assets written to: trained_ecapa_tdnn_model\assets
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classi

