In [1]:
!pip install datasets



In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset, Audio
import tqdm
from huggingface_hub import login

In [3]:
# Load Common Voice Uzbek Dataset
# SECURITY: never hard-code an access token in a notebook. The token that
# used to be written on this line has been exposed and should be revoked in
# the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when the variable
# is unset, `token=None` lets `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
dataset = load_dataset("mozilla-foundation/common_voice_13_0", "uz", split="train+validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Convert the combined train+validation split into a pandas DataFrame so the
# rows can be filtered with boolean masks in the next cell.
# NOTE(review): this converts every column of the dataset, although only
# `path` and `gender` are kept afterwards — consider selecting just those
# columns first if memory becomes a problem.
df = dataset.to_pandas()

In [5]:
# Filter Audio Samples by Gender
# Keep only rows labelled male/female, and only the two columns used later.
#
# The index is deliberately NOT reset: the feature-extraction loop looks clips
# up with `dataset[i]`, where `i` comes from `df.iterrows()`. The index that
# `dataset.to_pandas()` produced is the row position inside `dataset`, so it
# must survive the filtering. Resetting it would pair each gender label with
# the audio of an unrelated row as soon as a single row is filtered out.
is_labelled = df['gender'].isin(['male', 'female'])
df = df.loc[is_labelled, ['path', 'gender']].dropna()

In [6]:
# Feature Extraction Function
def extract_features(file_path, min_samples=22050, n_mfcc=13):
    """Summarise one audio clip as a fixed-length spectral feature vector.

    The clip is loaded at its native sampling rate (``sr=None``). Three
    frame-level descriptors are computed, averaged over time and concatenated
    in this order: MFCCs, STFT chroma, spectral contrast.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        min_samples: Clips with fewer samples than this are skipped. Because
            the clip is not resampled, this is a sample count at the file's
            own rate, not a fixed duration in seconds.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A 1-D ``numpy.ndarray`` with the time-averaged descriptors, or
        ``None`` when the clip is shorter than ``min_samples`` or could not
        be loaded or processed.
    """
    try:
        signal, rate = librosa.load(file_path, sr=None)
        if len(signal) < min_samples:
            return None
        descriptors = (
            librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc),
            librosa.feature.chroma_stft(y=signal, sr=rate),
            librosa.feature.spectral_contrast(y=signal, sr=rate),
        )
        # Each descriptor is (n_coefficients, n_frames); average over frames.
        return np.concatenate([np.mean(d.T, axis=0) for d in descriptors])
    except Exception:
        # Best-effort: a corrupt or unreadable clip is skipped, not fatal.
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
        # SystemExit through, so a long extraction run can still be stopped.
        return None

In [7]:
# Download Audio Files and Extract Features
# Only the file path of each clip is read from the `audio` column further
# down; librosa does the actual loading. Automatic decoding is therefore
# switched off: with decode=False an item comes back as a {"path", "bytes"}
# dict, and the waveform is no longer decoded (and thrown away) on every row
# access.
dataset = dataset.cast_column("audio", Audio(decode=False))

In [8]:
# Accumulators filled by the extraction loop: one feature vector and one
# gender label per successfully processed clip.
features, labels = [], []

In [9]:
# Extract one feature vector per labelled clip and keep its gender label
# alongside it. Clips that are too short or cannot be read are counted and
# reported rather than dropped silently.
n_skipped = 0
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    try:
        # NOTE(review): `i` is df's index label and is used here as a row
        # position in `dataset`. The two only coincide while df still carries
        # the index it got from `dataset.to_pandas()` — verify this holds
        # after any filtering / reset_index upstream.
        audio_file = dataset[i]['audio']['path']
        feat = extract_features(audio_file)
    except Exception:
        # Best-effort: one bad row must not abort the run. `Exception` rather
        # than a bare `except` keeps Ctrl-C (KeyboardInterrupt) working.
        n_skipped += 1
        continue
    if feat is None:
        n_skipped += 1
        continue
    features.append(feat)
    labels.append(row['gender'])

print(f"Skipped {n_skipped} of {len(df)} clips (too short or unreadable)")

100%|██████████| 33464/33464 [29:23<00:00, 18.97it/s]


In [10]:
# Freeze the accumulated Python lists into NumPy arrays: X holds one feature
# vector per clip, y the matching gender strings.
X, y = np.array(features), np.array(labels)

In [11]:
# Encode Labels and Split Dataset
# Map the gender strings to integer class ids; `le` is kept so the ids can be
# translated back to names when reporting.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# 80/20 split, stratified on the label so both parts keep the class balance;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Train a Simple Random Forest Classifier
# 100 trees with a fixed seed so repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [13]:
# Evaluate Model Performance
# Predict the held-out clips and score the fraction labelled correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [14]:
# Overall accuracy first, then per-class precision / recall / F1 with the
# original gender names restored from the label encoder.
print(f"Accuracy: {accuracy:.2f}")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=le.classes_,
    )
)

Accuracy: 0.88
              precision    recall  f1-score   support

      female       0.86      0.81      0.83      2447
        male       0.89      0.92      0.91      4246

    accuracy                           0.88      6693
   macro avg       0.87      0.86      0.87      6693
weighted avg       0.88      0.88      0.88      6693



In [15]:
# Save the Processed Dataset
# One row per clip: the feature columns followed by a `label` column holding
# the gender string, written without the DataFrame index.
final_df = pd.DataFrame(X).assign(label=y)
final_df.to_csv("processed_audio_features.csv", index=False)