In [196]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Define constants
DATA_DIR = 'GuitarNotes/train'  # Replace with your dataset path
CLASS_NAMES = os.listdir(DATA_DIR)
NUM_CLASSES = len(CLASS_NAMES)
SAMPLE_RATE = 44100  # Adjust based on your audio data
DURATION = 2  # Duration of each audio clip in seconds
NUM_MFCCS = 20  # Number of MFCCs to extract
BATCH_SIZE = 32
EPOCHS = 50  # Adjust based on your needs
MAX_NUM_FRAMES = 173

# Function to load and preprocess audio files
def load_and_preprocess_data(data_dir):
    X, y = [], []
    for class_idx, class_name in enumerate(CLASS_NAMES):
        class_dir = os.path.join(data_dir, class_name)
        for audio_file in os.listdir(class_dir):
            audio_path = os.path.join(class_dir, audio_file)
            audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION)
            mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=NUM_MFCCS)
            # Transpose MFCCs to shape (num_frames, num_mfccs)
            mfccs = mfccs.T  # Transpose the MFCCs
            
            if mfccs.shape[0] < MAX_NUM_FRAMES:
                mfccs = np.pad(mfccs, ((0, MAX_NUM_FRAMES - mfccs.shape[0]), (0, 0)), 'constant')
            elif mfccs.shape[0] > MAX_NUM_FRAMES:
                mfccs = mfccs[:MAX_NUM_FRAMES, :]
            
            X.append(mfccs[:, :, np.newaxis])
            y.append(class_idx)
    return np.array(X), np.array(y)

# Load and preprocess the data
X, y = load_and_preprocess_data(DATA_DIR)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1], X_train.shape[2], X_train.shape[3])),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.GlobalAveragePooling2D(),  # Use Global Average Pooling
    layers.Dense(64, activation='relu'),
    layers.Dense(NUM_CLASSES, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=EPOCHS, batch_size=BATCH_SIZE)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

# Save the model if needed
# model.save('audio_classification_model.h5')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 30.00%


In [185]:
X_train.shape

(38, 173, 25, 1)

In [182]:
model.save('model.keras')

In [164]:

def process(audio_path):
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION)
    mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=NUM_MFCCS)
                # Transpose MFCCs to shape (num_frames, num_mfccs)
    mfccs = mfccs.T  # Transpose the MFCCs
                
    if mfccs.shape[0] < MAX_NUM_FRAMES:
        mfccs = np.pad(mfccs, ((0, MAX_NUM_FRAMES - mfccs.shape[0]), (0, 0)), 'constant')
    elif mfccs.shape[0] > MAX_NUM_FRAMES:
        mfccs = mfccs[:MAX_NUM_FRAMES, :]
    
    return np.array([mfccs[:, :, np.newaxis]])

In [178]:
np.argmax(model.predict(process('k.wav')))



1

In [167]:
CLASS_NAMES

['A', 'B', 'D', 'Ehi', 'Elo', 'G']

In [168]:
from pydub import AudioSegment

In [173]:
import subprocess

def convert(file):  
# convert mp3 to wav file
    subprocess.call(['ffmpeg', '-i', file,   
                 'r.wav'])

In [176]:
os.path.exists('raw.mp3')

True

In [177]:
with open('raw.mp3', 'rb') as file:
    with open('k.wav', 'wb') as f:
        f.write(file.read())

In [187]:
import pickle

In [188]:
def predict(audio_path):
    data = process(audio_path)
    prediction = model.predict(data)
    prediction = CLASS_NAMES[np.argmax(prediction)]
    return prediction

In [189]:
predict('raw.wav')



'B'

In [192]:
with open('abc.pkl', 'wb') as file:
    pickle.dump(model, file)

In [199]:
import onnx

In [198]:
import tf2onnx


onnx_model, _ = tf2onnx.convert.from_keras(
    model,
    input_signature=[tf.TensorSpec(shape=[None, 173, 20, 1], dtype=tf.float32, name='x')],
    opset=13)


In [200]:
onnx.save(onnx_model, "model.onnx")

In [203]:
import onnxruntime as ort

In [205]:
sess = ort.InferenceSession('model.onnx', providers=['AzureExecutionProvider', 'CPUExecutionProvider'])

In [214]:
sess.run(None, {'x': process('raw.wav')})

[array([[0.06233804, 0.11958687, 0.14967401, 0.19894259, 0.02399746,
         0.445461  ]], dtype=float32)]