<a href="https://colab.research.google.com/github/theerisarak/voice-gender-recognition/blob/main/Voice_Gender_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydub resampy
!pip install pydub
!pip install librosa noisereduce
!pip install noisereduce
!pip show noisereduce
!pip install --upgrade noisereduce

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pydub, resampy
Successfully installed pydub-0.25.1 resampy-0.4.3
Collecting noisereduce
  Downloading noisereduce-3.0.2-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.2
Name: noisereduce
Version: 3.0.2
Summary: Noise reduction using Spectral Gating in Python
Home-page: https://github.com/timsainb/noisereduce
Author: Tim Sainburg
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: librosa, matplotlib, numpy, scipy, tqdm
Required-by: 


In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset directory used
# by later cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Importing libraries
import os
import numpy as np
import librosa
import noisereduce as nr
from pydub import AudioSegment
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

def convert_and_load_audio(file_path):
    """Return the path of a .wav version of ``file_path``, converting if needed.

    Despite the name, no audio is loaded here: the function only makes sure a
    .wav file exists next to an .m4a input and returns the path of that file.

    Args:
        file_path: Path to an audio file. Only a trailing ``.m4a`` extension
            (in any letter case) triggers a conversion.

    Returns:
        The path of the sibling .wav file for an .m4a input, or ``file_path``
        unchanged for any other input.
    """
    stem, extension = os.path.splitext(file_path)
    # Inspect the real extension only. A plain str.replace('.m4a', '.wav')
    # rewrites every occurrence in the path, so a directory such as
    # 'session.m4a/clip.wav' would be turned into a path that does not exist.
    if extension.lower() != '.m4a':
        return file_path

    wav_path = stem + '.wav'
    # Reuse an earlier conversion instead of re-encoding on every call.
    if not os.path.exists(wav_path):
        audio = AudioSegment.from_file(file_path, format='m4a')
        audio.export(wav_path, format='wav')
    return wav_path

#Normalize the waveform, then denoise the normalized result.
def preprocess_signal(y, sr):
    """Normalize an audio signal and run noisereduce over the result.

    Args:
        y: Audio signal, passed to ``librosa.util.normalize``.
        sr: Sampling rate of ``y``, forwarded to ``nr.reduce_noise``.

    Returns:
        The output of ``nr.reduce_noise`` for the normalized signal.
    """
    return nr.reduce_noise(y=librosa.util.normalize(y), sr=sr)


def extract_features(file_path):
    """Build one feature vector for a single audio file.

    The file is converted to .wav if needed, loaded with librosa, normalized
    and noise-reduced, and then summarized by the time-average of five feature
    families: 40 MFCCs, chroma, mel-spectrogram, spectral contrast and tonnetz.

    Args:
        file_path: Path to the audio file to analyze.

    Returns:
        A 1-D numpy array with all feature families concatenated, or ``None``
        when the file could not be converted, loaded or analyzed (the error is
        printed).
    """
    # Start from the input path so the error message below always has a path
    # to report, even when the conversion step itself is what fails.
    temp_path = file_path
    try:
        # Conversion runs inside the try block so that one unreadable file is
        # reported and skipped like any other failure instead of aborting the
        # caller's loop over the whole dataset.
        temp_path = convert_and_load_audio(file_path)
        audio, sample_rate = librosa.load(temp_path, res_type='kaiser_fast')
        preprocessed_audio = preprocess_signal(audio, sample_rate)

        # Every feature family is computed from the preprocessed signal.
        # Previously only the MFCCs used it, so one vector mixed features of
        # the denoised signal with features of the raw signal.

        #MFCCs
        mfccs = librosa.feature.mfcc(y=preprocessed_audio, sr=sample_rate, n_mfcc=40)
        mfccs_processed = np.mean(mfccs.T, axis=0)

        #Chroma (the magnitude STFT is shared with spectral contrast below)
        stft = np.abs(librosa.stft(preprocessed_audio))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

        #Mel-spectrogram
        mel = np.mean(librosa.feature.melspectrogram(y=preprocessed_audio, sr=sample_rate).T, axis=0)

        #Spectral contrast
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

        #Tonnetz
        try:
            harmonic = librosa.effects.harmonic(preprocessed_audio)
            tonnetz = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0)
        except Exception as e:
            # Best effort: a zero vector keeps the overall feature length
            # constant when tonnetz cannot be computed for this clip.
            print("Tonnetz calculation failed: ", e)
            tonnetz = np.zeros(6)  #fallback plan

        #linking all features into one numpy array
        features = np.hstack([mfccs_processed, chroma, mel, contrast, tonnetz])

    except Exception as e:
        print(f"Error encountered while parsing file: {temp_path}\nError: {e}")
        return None
    return features

def load_data_and_balance(dataset_dir):
    """Extract features for every .wav clip and downsample the classes equally.

    Note: a later cell in this notebook defines ``load_data_and_balance``
    again with a three-value return; this version returns two values.

    Args:
        dataset_dir: Directory containing the ``Males`` and ``Females`` folders.

    Returns:
        A ``(features, labels)`` pair of numpy arrays. Labels are the integer
        codes produced by ``LabelEncoder``; each class is downsampled to the
        smaller of the class-0 and class-1 counts.
    """
    collected_features, collected_labels = [], []
    for gender in ('Males', 'Females'):
        folder = os.path.join(dataset_dir, gender)
        for entry in os.listdir(folder):
            clip_path = os.path.join(folder, entry)
            # Anything that is not a .wav file is skipped.
            if not clip_path.endswith('.wav'):
                continue
            vector = extract_features(clip_path)
            # extract_features returns None for clips it could not process.
            if vector is None:
                continue
            collected_features.append(vector)
            collected_labels.append(gender)
    feature_matrix = np.array(collected_features)

    # Turn the folder names into integer class ids.
    encoded = LabelEncoder().fit_transform(collected_labels)

    # Target size per class: the smaller of the two class counts.
    count_zero = sum(encoded == 0)
    count_one = sum(encoded == 1)
    per_class = min(count_zero, count_one)

    balanced_x, balanced_y = [], []
    for cls in np.unique(encoded):
        mask = encoded == cls
        # Sampling without replacement, seeded for repeatability.
        sampled_x, sampled_y = resample(
            feature_matrix[mask],
            encoded[mask],
            replace=False,
            n_samples=per_class,
            random_state=42,
        )
        balanced_x.extend(sampled_x)
        balanced_y.extend(sampled_y)

    return np.array(balanced_x), np.array(balanced_y)

# Root folder of the dataset on the mounted Drive; load_data_and_balance()
# reads the 'Males' and 'Females' subfolders inside it.
dataset_dir = '/content/drive/MyDrive/VoxCeleb'
# Runs feature extraction over the whole dataset.
# NOTE(review): the next cell redefines load_data_and_balance and rebinds both
# result names, so this call looks redundant — confirm before removing.
features_balanced, labels_balanced = load_data_and_balance(dataset_dir)


In [None]:
#Import libraries
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import os

#Load data, extract features, and balance the dataset
def load_data_and_balance(dataset_dir):
    """Extract features for every .wav clip and return a class-balanced dataset.

    Args:
        dataset_dir: Directory containing the ``Males`` and ``Females`` folders.

    Returns:
        A ``(features_balanced, labels_balanced, le)`` tuple: the feature
        matrix, the integer-encoded labels, and the fitted ``LabelEncoder``
        (its ``classes_`` map the integer codes back to folder names). Every
        class contributes as many rows as the smallest class has.

    Raises:
        ValueError: If no clip could be turned into a feature vector.
    """
    labels = []
    features = []
    for label in ['Males', 'Females']:
        gender_folder = os.path.join(dataset_dir, label)
        # os.listdir returns entries in arbitrary order; sorting fixes the row
        # order so the seeded downsampling below picks the same clips each run.
        for filename in sorted(os.listdir(gender_folder)):
            file_path = os.path.join(gender_folder, filename)
            # Only .wav files are processed; other files are skipped.
            if file_path.endswith('.wav'):
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    if not features:
        raise ValueError(f"No usable .wav files found under: {dataset_dir}")

    features = np.array(features)
    le = LabelEncoder()
    labels_encoded = le.fit_transform(labels)

    # Downsample each class separately to the size of the smallest class.
    # A single resample(..., n_samples=<minority count>, stratify=labels) call
    # does not balance anything: it returns <minority count> rows in total and
    # keeps the original class proportions.
    n_per_class = int(np.bincount(labels_encoded).min())
    per_class_features, per_class_labels = [], []
    for class_id in np.unique(labels_encoded):
        class_mask = labels_encoded == class_id
        class_features, class_labels = resample(
            features[class_mask],
            labels_encoded[class_mask],
            replace=False,
            n_samples=n_per_class,
            random_state=42,
        )
        per_class_features.append(class_features)
        per_class_labels.append(class_labels)

    features_balanced = np.concatenate(per_class_features)
    labels_balanced = np.concatenate(per_class_labels)
    return features_balanced, labels_balanced, le

# Root folder of the dataset on the mounted Drive.
dataset_dir = '/content/drive/MyDrive/VoxCeleb'

#balance the dataset
features_balanced, labels_balanced, le = load_data_and_balance(dataset_dir)

#splitting the dataset into training and testing sets
# stratify keeps the class ratio the same in the train and test parts, so the
# per-class scores in the report are measured on comparable amounts of data.
X_train, X_test, y_train, y_test = train_test_split(
    features_balanced,
    labels_balanced,
    test_size=0.25,
    random_state=42,
    stratify=labels_balanced,
)

#Model training, hyperparameter optimization
# 100 random draws from the grid below, each scored with 5-fold cross-validation.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
clf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

#Display the best parameters
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

#Evaluate the model
# With the default refit=True, the search has already refit the best
# configuration on all of X_train, so training an identical forest a second
# time is unnecessary.
model = random_search.best_estimator_
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30}
Best Score: 0.9052080536912752
              precision    recall  f1-score   support

     Females       0.85      0.82      0.84        78
       Males       0.92      0.94      0.93       172

    accuracy                           0.90       250
   macro avg       0.89      0.88      0.88       250
weighted avg       0.90      0.90      0.90       250

