# CNN

In [1]:
import pandas as pd

df = pd.read_csv("df_cleaned.csv")
df.head()

Unnamed: 0,path,gender
0,common_voice_en_41534732.mp3,male_masculine
1,common_voice_en_41472897.mp3,male_masculine
2,common_voice_en_41909191.mp3,male_masculine
3,common_voice_en_41650031.mp3,male_masculine
4,common_voice_en_41888663.mp3,male_masculine


In [2]:
import librosa
import parselmouth
import numpy as np

FILE_PATH = "../../datasets/cv-corpus-20.0-delta-2024-12-06/en/clips/"
SAMPLE_RATE = 16000
TARGET_RMS = 0.1  # Target RMS energy level

In [3]:
def trim_silence(y):
    """
    Trims leading and trailing silence from an audio signal while preserving small pauses.
    """
    y_trimmed, _ = librosa.effects.trim(y, top_db=20)
    return y_trimmed

In [4]:
def load_and_normalize_audio(file_name):
    """Loads and normalizes an audio file to a target RMS loudness."""
    # Load the audio as a waveform `y`
    y, sr = librosa.load(FILE_PATH + file_name, sr=SAMPLE_RATE)
    
    y = trim_silence(y)

    # Compute current RMS
    rms = np.sqrt(np.mean(y**2))

    # Compute scaling factor
    scaling_factor = TARGET_RMS / (rms + 1e-6)  # Avoid division by zero

    # Apply normalization
    y_normalized = y * scaling_factor

    return y_normalized, sr

In [5]:
X = []
y = []

count = 0
for index, row in df.iterrows():
    count += 1
    print("Sample:", count)
    samples, sr = load_and_normalize_audio(file_name=row["path"])
    X.append(samples)
    y.append(row["gender"])

Sample: 1
Sample: 2
Sample: 3
Sample: 4
Sample: 5
Sample: 6
Sample: 7
Sample: 8
Sample: 9
Sample: 10
Sample: 11
Sample: 12
Sample: 13
Sample: 14
Sample: 15
Sample: 16
Sample: 17
Sample: 18
Sample: 19
Sample: 20
Sample: 21
Sample: 22
Sample: 23
Sample: 24
Sample: 25
Sample: 26
Sample: 27
Sample: 28
Sample: 29
Sample: 30
Sample: 31
Sample: 32
Sample: 33
Sample: 34
Sample: 35
Sample: 36
Sample: 37
Sample: 38
Sample: 39
Sample: 40
Sample: 41
Sample: 42
Sample: 43
Sample: 44
Sample: 45
Sample: 46
Sample: 47
Sample: 48
Sample: 49
Sample: 50
Sample: 51
Sample: 52
Sample: 53
Sample: 54
Sample: 55
Sample: 56
Sample: 57
Sample: 58
Sample: 59
Sample: 60
Sample: 61
Sample: 62
Sample: 63
Sample: 64
Sample: 65
Sample: 66
Sample: 67
Sample: 68
Sample: 69
Sample: 70
Sample: 71
Sample: 72
Sample: 73
Sample: 74
Sample: 75
Sample: 76
Sample: 77
Sample: 78
Sample: 79
Sample: 80
Sample: 81
Sample: 82
Sample: 83
Sample: 84
Sample: 85
Sample: 86
Sample: 87
Sample: 88
Sample: 89
Sample: 90
Sample: 91
Sample: 

In [None]:
import numpy as np

# Use the 95th percentile as the max length (ignores extreme outliers)
max_length = int(np.percentile([len(x) for x in X], 95))

# Pad or truncate all samples to this length
X_padded = [np.pad(x[:max_length], (0, max(0, max_length - len(x))), mode='constant') for x in X]
X_padded = np.array(X_padded).reshape(len(X_padded), max_length, 1)

print("Padded X shape:", X_padded.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model

X_waveform = X
y_labels = y

# Define CNN model for raw waveform input
inputs = Input(shape=(X_waveform.shape[1], 1))  # X_waveform is the raw waveform data
x = Conv1D(64, kernel_size=5, activation="relu")(inputs)
x = MaxPooling1D(pool_size=2)(x)
x = Conv1D(128, kernel_size=5, activation="relu")(x)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.3)(x)
output = Dense(1, activation="sigmoid")(x)  # Binary classification (Male/Female)

model = Model(inputs, output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train Model
model.fit(X_waveform, y_labels, epochs=20, batch_size=32)