In [1]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import os
import soundfile as sf # For potentially handling different audio formats

# --- 1. Feature Extraction Function ---
def extract_features(audio_path, sr=22050):
    """
    Extracts pitch (F0) and Mel-Frequency Cepstral Coefficients (MFCCs) from an audio file.
    Handles potential FileNotFoundError.
    """
    try:
        y, sr = librosa.load(audio_path, sr=sr)
    except FileNotFoundError:
        print(f"Error: Audio file not found at '{audio_path}'. Skipping.")
        return None
    except Exception as e:
        print(f"Error loading audio file '{audio_path}': {e}. Skipping.")
        return None

    # Extract Pitch (F0)
    # Using 'pyin' for more robust pitch tracking
    # fmin and fmax define the range of frequencies to search for F0
    f0, voiced_flag, voiced_probs = librosa.soundfile.pyin(y, fmin=librosa.note_to_hz('C2'),
                                                        fmax=librosa.note_to_hz('C5'), sr=sr)
    f0_mean = np.nanmean(f0) if np.any(~np.isnan(f0)) else 0 # Handle cases with no detected pitch

    # Extract MFCCs
    # n_mfcc=13 is a common choice for speech
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Combine features
    features = np.concatenate(([f0_mean], mfccs_mean))
    return features

# --- 2. Dummy Dataset Creation (REPLACE THIS WITH YOUR REAL, LABELED DATA) ---
# For demonstration purposes, we'll create a very small, illustrative dummy dataset.
# In a real-world scenario, you would have many audio files for each gender.
# You need to replace these paths with actual paths to YOUR labeled male/female audio files.

# Create some dummy audio files for demonstration if they don't exist
# In a real scenario, you'd load your actual dataset
dummy_male_path = "dummy_male_voice.wav"
dummy_female_path = "dummy_female_voice.wav"

if not os.path.exists(dummy_male_path):
    print(f"Creating a dummy male voice audio file at {dummy_male_path} for demonstration.")
    # Simulate a male voice (lower frequency sine wave)
    sr_dummy = 22050
    duration_dummy = 2 # seconds
    t = np.linspace(0, duration_dummy, int(sr_dummy * duration_dummy), endpoint=False)
    male_freq = 120 # Hz (typical male pitch range)
    dummy_male_audio = 0.5 * np.sin(2 * np.pi * male_freq * t)
    sf.write(dummy_male_path, dummy_male_audio, sr_dummy)

if not os.path.exists(dummy_female_path):
    print(f"Creating a dummy female voice audio file at {dummy_female_path} for demonstration.")
    # Simulate a female voice (higher frequency sine wave)
    sr_dummy = 22050
    duration_dummy = 2 # seconds
    t = np.linspace(0, duration_dummy, int(sr_dummy * duration_dummy), endpoint=False)
    female_freq = 220 # Hz (typical female pitch range)
    dummy_female_audio = 0.5 * np.sin(2 * np.pi * female_freq * t)
    sf.write(dummy_female_path, dummy_female_audio, sr_dummy)

# Define the training data paths and labels
# IMPORTANT: Replace these with paths to your actual, properly labeled male and female audio files.
train_audio_paths = [
    dummy_male_path,
    dummy_male_path,
    dummy_male_path,
    dummy_female_path,
    dummy_female_path,
    dummy_female_path,
]

train_labels = [0, 0, 0, 1, 1, 1] # 0 for male, 1 for female

# --- Process the training data ---
X = []
y = []
print("\nExtracting features from training data...")
for i, audio_path in enumerate(train_audio_paths):
    features = extract_features(audio_path)
    if features is not None:
        X.append(features)
        y.append(train_labels[i])

X = np.array(X)
y = np.array(y)

if len(X) == 0:
    print("No features extracted from the training dataset. Cannot train model. Please check audio paths.")
else:
    # --- 3. Train a Machine Learning Model ---
    # Split data into training and testing sets
    # test_size=0.3 means 30% of data will be used for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initialize and train a Logistic Regression model
    # Logistic Regression is a simple, yet often effective, baseline classifier
    model = LogisticRegression(max_iter=1000, solver='lbfgs') # Increased max_iter and specified solver for convergence
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print("\n--- Model Evaluation (on dummy data) ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Male', 'Female']))
    print("Note: Accuracy on dummy data is not indicative of real-world performance.")

    # --- 4. Predict on Your Uploaded Audio File ---
    your_audio_file = "Video_Created_New_Veo_Launch.wav" # This is the file you provided

    print(f"\n--- Predicting gender for your audio file: '{your_audio_file}' ---")
    new_features = extract_features(your_audio_file)

    if new_features is not None:
        prediction = model.predict(new_features.reshape(1, -1)) # Reshape for single sample prediction
        predicted_gender = "Male" if prediction[0] == 0 else "Female"
        print(f"The predicted gender for '{your_audio_file}' is: {predicted_gender}")
        print("\nDisclaimer: This prediction is based on a very small, dummy training set.")
        print("For accurate results, train the model on a large and diverse dataset of male and female voices.")
    else:
        print(f"Could not extract features from '{your_audio_file}'. Please ensure the file is accessible and not corrupted.")

# --- Visualizing the Waveform, Frequency Spectrum, and Mel Spectrogram ---
# This part uses the actual 'Video_Created_New_Veo_Launch.wav' for visualization

print(f"\n--- Visualizing your audio file: '{your_audio_file}' ---")
try:
    y_plot, sr_plot = librosa.load(your_audio_file, sr=None)
    print(f"Audio loaded successfully for visualization! Sampling Rate (sr): {sr_plot} Hz")
    print(f"Audio duration: {len(y_plot)/sr_plot:.2f} seconds")

    # Waveform
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(y_plot, sr=sr_plot, color='blue')
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Frequency Spectrum (FFT)
    Y_fft = np.fft.fft(y_plot)
    freqs_fft = np.fft.fftfreq(len(Y_fft), 1/sr_plot)
    magnitude_fft = np.abs(Y_fft)
    plt.figure(figsize=(12, 6))
    plt.plot(freqs_fft[:len(freqs_fft)//2], magnitude_fft[:len(magnitude_fft)//2])
    plt.title("Frequency Spectrum (FFT)")
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude")
    plt.grid()
    plt.show()

    # Mel Spectrogram
    S_mel = librosa.feature.melspectrogram(y=y_plot, sr=sr_plot)
    S_dB_mel = librosa.power_to_db(S_mel, ref=np.max)

    plt.figure(figsize=(12, 4))
    librosa.display.specshow(S_dB_mel, sr=sr_plot, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.xlabel('Time (s)')
    plt.ylabel('Mel Frequency (Hz)')
    plt.tight_layout()
    plt.show()

except FileNotFoundError:
    print(f"Error: Your audio file '{your_audio_file}' was not found for visualization.")
    print("Please make sure it's in the same directory as the script or provide the full path.")
except Exception as e:
    print(f"An unexpected error occurred during visualization: {e}")

Creating a dummy male voice audio file at dummy_male_voice.wav for demonstration.
Creating a dummy female voice audio file at dummy_female_voice.wav for demonstration.

Extracting features from training data...


AttributeError: No librosa attribute soundfile