In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
import matplotlib.image as mpimg
import librosa
import librosa.display

In [None]:
def audio_to_mel_spectrogram(file_path, sr=16000, n_mels=128, max_len=44):
    """Load an audio file and return a fixed-length log-mel spectrogram.

    The clip is loaded at sample rate ``sr``, converted to a mel power
    spectrogram with ``n_mels`` bands, and expressed in decibels relative to
    its own peak (``ref=np.max``), so the loudest bin is 0 dB and every other
    bin is negative. The result is transposed to time-major layout and then
    padded or truncated along the time axis to exactly ``max_len`` frames.

    Parameters
    ----------
    file_path : str
        Path of the audio file, passed to ``librosa.load``.
    sr : int, default 16000
        Sample rate used for loading and for the mel filter bank.
    n_mels : int, default 128
        Number of mel bands.
    max_len : int, default 44
        Number of time frames in the returned array.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, n_mels)`` holding dB values.
    """
    signal, _ = librosa.load(file_path, sr=sr)
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
    # Transpose so that axis 0 is time and axis 1 is the mel band.
    log_mel_spec = librosa.power_to_db(mel_power, ref=np.max).T

    n_frames = log_mel_spec.shape[0]
    if n_frames < max_len:
        # Because the dB scale is relative to the peak, 0 is the *loudest*
        # possible value. Padding with the default 0 would append frames of
        # maximum energy; pad with the quietest value of this clip instead so
        # the padding looks like silence.
        log_mel_spec = np.pad(
            log_mel_spec,
            ((0, max_len - n_frames), (0, 0)),
            mode='constant',
            constant_values=log_mel_spec.min(),
        )
    else:
        log_mel_spec = log_mel_spec[:max_len, :]
    return log_mel_spec

# Sanity check: convert one sample clip and look at its spectrogram.
spec = audio_to_mel_spectrogram("six/00b01445_nohash_0.wav")

fig, ax = plt.subplots(figsize=(10, 4))
# `spec` is time-major, but specshow expects (frequency, time), hence the transpose.
mesh = librosa.display.specshow(
    spec.T,
    sr=16000,
    hop_length=512,
    x_axis='time',
    y_axis='mel',
    ax=ax,
)
ax.set_title("Mel Spectrogram")
fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
fig.tight_layout()
plt.show()

In [None]:
# Map each spoken-digit word to its integer class id (zero -> 0 ... nine -> 9).
labels = {
    word: class_id
    for class_id, word in enumerate(
        (
            'zero', 'one', 'two', 'three', 'four',
            'five', 'six', 'seven', 'eight', 'nine',
        )
    )
}

In [None]:
# Build the dataset: one log-mel spectrogram per clip, labelled by the name of
# the folder it lives in.
X = []
y = []

# Iterate over the known class names instead of every dot-less entry in the
# working directory: an unrelated folder would raise KeyError in `labels`, and a
# dot-less regular file would make os.listdir() fail.
for class_name, class_id in labels.items():
    if not os.path.isdir(class_name):
        continue
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split) reproducible.
    for entry in sorted(os.listdir(class_name)):
        # Skip anything that is not a WAV clip (hidden files, checkpoints, ...).
        if not entry.lower().endswith('.wav'):
            continue
        file_name = os.path.join(class_name, entry)
        y.append(class_id)
        X.append(audio_to_mel_spectrogram(file_name))
X = np.array(X)
y = np.array(y)

In [None]:
# Add the trailing channel axis expected by Conv2D. The sample count is inferred
# with -1 so the cell works for any number of loaded clips rather than exactly 17000.
X = X.reshape(-1, 44, 128, 1)

In [None]:
from keras.utils import to_categorical
# One-hot encode the integer class ids. The ndim guard keeps the cell idempotent:
# re-running it on an already encoded `y` would otherwise encode it a second time.
if y.ndim == 1:
    y = to_categorical(y, num_classes=10)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples as a test set; the fixed seed makes the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
from keras import Sequential
from keras.layers import (
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Flatten,
    GlobalAveragePooling2D,
    Input,
    MaxPooling2D,
)
from keras.optimizers import Adam

In [None]:
def build_stable_model(input_shape=(44, 128, 1), num_classes=10, learning_rate=0.001):
    """Build and compile a small CNN classifier for log-mel spectrograms.

    Architecture: three 3x3 convolution stages (32, 64, 128 filters), each
    followed by batch normalisation. The first two stages are down-sampled
    with 2x2 max pooling, the third is reduced with global average pooling,
    and a 128-unit dense layer feeds the softmax output. Dropout is applied
    after the second stage (0.3) and before the output layer (0.5).

    Parameters
    ----------
    input_shape : tuple, default (44, 128, 1)
        Shape of a single input sample, passed to ``Input``.
    num_classes : int, default 10
        Number of units in the softmax output layer.
    learning_rate : float, default 0.001
        Learning rate of the Adam optimizer.

    Returns
    -------
    keras.Sequential
        Model compiled with categorical cross-entropy loss and an accuracy
        metric, so targets are expected to be one-hot encoded.
    """
    model = Sequential([
        Input(shape=input_shape),

        # Stage 1
        Conv2D(32, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),

        # Stage 2
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        # Stage 3: global average pooling collapses the spatial axes to one
        # value per filter, so no Flatten layer is needed before the head.
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        GlobalAveragePooling2D(),

        # Classification head
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop training once the validation loss has not improved for 5 epochs and roll
# back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 3 epochs without validation-loss improvement.
lr_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    verbose=1,
)

# The model has to be created before it can be trained; without this line the
# cell fails with NameError on a fresh kernel.
model = build_stable_model()

# Validate on a slice of the training data. Using the test set here would let
# early stopping and best-weight restoration select the model on the very data
# that is meant to measure final performance.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    epochs=30,
    callbacks=[early_stop, lr_plateau],
)

# Single final evaluation on the untouched test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")

In [None]:
# Persist the trained model to disk in the native Keras format.
model.save(filepath='model.keras')