In [1]:
import sounddevice as sd
import numpy as np
import librosa
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import time

In [2]:
# Function to record audio
def record_audio(duration, sample_rate):
    try:
        print("Recording...")
        audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
        sd.wait() 
        print("Recording Finished")
        return audio.flatten()
    except Exception as e:
        print(f"Error during recording: {e}")
        return np.array([])

In [3]:
# Function to extract features
def extract_features(audio_data, sample_rate):
    # Check if the audio is silent (ZCR threshold)
    zcr = librosa.feature.zero_crossing_rate(y=audio_data)
    if np.mean(zcr) < 0.05:  
        return np.zeros(154)  

    try:
        
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
        mfccs_mean = np.mean(mfccs.T, axis=0)

        
        chroma = librosa.feature.chroma_stft(y=audio_data, sr=sample_rate)
        chroma_mean = np.mean(chroma.T, axis=0)

        
        mel = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
        mel_mean = np.mean(mel.T, axis=0)

    
        return np.concatenate((mfccs_mean, chroma_mean, mel_mean, np.mean(zcr.T, axis=0)), axis=0)
    except Exception as e:
        print(f"Error during feature extraction: {e}")
        return np.zeros(154)

In [4]:

# Load dataset and extract features
def load_data(dataset_path, sample_rate=22050):
    features = []
    labels = []

    if not os.path.exists(dataset_path):
        print(f"Dataset path {dataset_path} does not exist.")
        return np.array(features), np.array(labels)

    for genre in os.listdir(dataset_path):
        genre_folder = os.path.join(dataset_path, genre)

        if os.path.isdir(genre_folder):
            wav_files = [f for f in os.listdir(genre_folder) if f.lower().endswith('.wav')]
            if not wav_files:
                continue

            for file in wav_files:
                file_path = os.path.join(genre_folder, file)

                try:
                    audio_data, sr = librosa.load(file_path, sr=sample_rate)
                    features_matrix = extract_features(audio_data, sr)
                    features.append(features_matrix)
                    labels.append(genre)
                except Exception as e:
                    print(f"Error processing {file}: {e}")

    return np.array(features), np.array(labels)

In [5]:
# Dataset path
dataset_path = r"C:\dataset\genres_original"
sample_rate = 22050
X, y = load_data(dataset_path, sample_rate)

  audio_data, sr = librosa.load(file_path, sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing jazz.00054.wav: 


In [6]:

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Reshape the data for CNN input (samples, height, width, channels)
X_train_cnn = X_train.reshape(-1, 154, 1)  # 154 features per sample
X_test_cnn = X_test.reshape(-1, 154, 1)


model = Sequential()

# First Convolutional Layer
model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(154, 1)))
model.add(MaxPooling1D(pool_size=2))

# Second Convolutional Layer
model.add(Conv1D(64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Third Convolutional Layer
model.add(Conv1D(128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Flatten the output
model.add(Flatten())


model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))

# Output Layer (Softmax for multi-class classification)
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Train the model
history = model.fit(X_train_cnn, y_train, epochs=20, batch_size=32, validation_data=(X_test_cnn, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_cnn, y_test)
print(f"Test accuracy: {test_acc * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.2100 - loss: 5.7906 - val_accuracy: 0.3200 - val_loss: 3.4529
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3921 - loss: 3.1100 - val_accuracy: 0.3400 - val_loss: 2.4907
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4415 - loss: 2.3813 - val_accuracy: 0.4550 - val_loss: 2.0824
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4384 - loss: 2.0174 - val_accuracy: 0.4300 - val_loss: 2.0319
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4638 - loss: 1.8770 - val_accuracy: 0.4650 - val_loss: 1.8678
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5517 - loss: 1.7032 - val_accuracy: 0.4200 - val_loss: 1.8417
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━

In [7]:
# Real-time genre prediction
def predict_genre_cnn(audio_data, sample_rate, model):
    features = extract_features(audio_data, sample_rate)  

    if np.all(features == 0):
        return "Unknown"

    # Normalize the features
    features_scaled = scaler.transform(features.reshape(1, -1))  

    # Reshape for CNN input (1D input shape for CNN)
    features_reshaped = features_scaled.reshape(1, 154, 1)  

    # Predict genre using the trained CNN model
    genre_prediction = model.predict(features_reshaped)

    # Get the predicted genre and its confidence
    genre = label_encoder.inverse_transform([np.argmax(genre_prediction)])
    confidence = np.max(genre_prediction) * 100

    print(f"Prediction confidence: {confidence:.2f}%")

    return genre[0] if confidence > 60 else "Unknown"  

In [8]:
# Real-time prediction loop
while True:
    try:
        audio_data = record_audio(duration=8, sample_rate=sample_rate)
        if audio_data.size > 0:
            genre = predict_genre_cnn(audio_data, sample_rate, model=model)
            print(f"Predicted genre: {genre}")
        else:
            print("No audio detected.")
        time.sleep(1)  
    except KeyboardInterrupt:
        print("Recording interrupted. Exiting...")
        break

Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Prediction confidence: 44.74%
Predicted genre: Unknown
Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction confidence: 95.98%
Predicted genre: classical
Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Prediction confidence: 61.37%
Predicted genre: hiphop
Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Prediction confidence: 49.09%
Predicted genre: Unknown
Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Prediction confidence: 75.30%
Predicted genre: metal
Recording...
Recording Finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Prediction confidence: 22.57%
Predicted genre: Unknown
Recording...
Recording Finished
[1m1/1[0m [3