In [1]:
import numpy as np
import librosa
from pathlib import Path
import random

directory = Path("Chords")

max_frames = 40  # fix length for all samples

features = []
labels = []

def augment_audio(y, sr):
    """Apply random audio augmentation to waveform y."""
    
    augmentations = ['gaussian', 'background', 'pitch', 'time', None]
    choice = random.choice(augmentations)
    
    if choice == 'gaussian':
        noise_factor = random.uniform(0.002, 0.02)
        noise = np.random.randn(len(y))
        y = y + noise_factor * noise
        y = np.clip(y, -1.0, 1.0)
    
    elif choice == 'background':
        snr_db = random.uniform(1, 10)  # Signal-to-noise ratio in dB
        noise = np.random.randn(len(y))
        rms_signal = np.sqrt(np.mean(y**2))
        rms_noise = np.sqrt(np.mean(noise**2))
        desired_rms_noise = rms_signal / (10**(snr_db/20))
        noise = noise * (desired_rms_noise / (rms_noise + 1e-6))
        y = np.clip(y + noise, -1.0, 1.0)
    
    elif choice == 'time':
        rate = random.uniform(0.93, 1.06)
        y = librosa.effects.time_stretch(y=y, rate=rate)
    
    return y

for file in directory.rglob("*"):
    if file.is_file():
        filename = file.stem
        chord_name = filename.split('_')[0]  # adjust as needed

        for i in range(1):
            y, sr = librosa.load(str(file))
            y = augment_audio(y, sr)  # apply augmentation

            # Chroma CQT feature
            chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
            chroma = chroma / (np.sum(chroma, axis=0, keepdims=True) + 1e-6)

            # Mel spectrogram feature (in dB)
            mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40)
            mel_db = librosa.power_to_db(mel, ref=np.max)

            # Normalize mel per frame (optional)
            mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)

            # Pad or truncate chroma to fixed length
            if chroma.shape[1] < max_frames:
                pad_width = max_frames - chroma.shape[1]
                chroma = np.pad(chroma, ((0, 0), (0, pad_width)), mode='constant')
            else:
                chroma = chroma[:, :max_frames]

            # Pad or truncate mel to fixed length (same max_frames)
            if mel_db.shape[1] < max_frames:
                pad_width = max_frames - mel_db.shape[1]
                mel_db = np.pad(mel_db, ((0, 0), (0, pad_width)), mode='constant')
            else:
                mel_db = mel_db[:, :max_frames]

            # Combine chroma and mel along frequency/channel axis
            combined = np.vstack([chroma, mel_db])  # shape: (12 + 40, max_frames)

            features.append(combined)
            labels.append(chord_name)
    
X_mel = np.array(features)  # shape: (num_samples, 40, max_frames)

# Convert to numpy arrays
X = np.array(features)   # shape: (samples, 12, max_frames)
y = np.array(labels)

import numpy as np

X_mean = np.mean(X, axis=2)  # mean chroma per pitch class → shape (samples, 12)
X_std = np.std(X, axis=2)    # std chroma per pitch class → shape (samples, 12)

# Concatenate mean and std features
X_summary = np.concatenate([X_mean, X_std], axis=1)  # shape (samples, 24)




In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_summary, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)



model = SVC(kernel="rbf")

cv = cross_val_score(model, X_summary, y, cv=5)

print(cv.mean())

# model.fit(X_train, y_train)

# y_pred = model.predict(X_valid)

# y_pred_labels = le.inverse_transform(y_pred)
# y_valid_labels = le.inverse_transform(y_valid)

# acc = accuracy_score(y_pred, y_valid)

# print(acc)

0.25622783812686445


# CNN

In [30]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Assuming X and y are loaded from your preprocessing

# Add channel dimension
X_cnn = X[..., np.newaxis]  # (samples, 12, max_frames, 1)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X_cnn, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

num_classes = len(le.classes_)
y_train_cat = to_categorical(y_train, num_classes)
y_valid_cat = to_categorical(y_valid, num_classes)

# Define model
model = Sequential([
    Conv2D(32, (3, 3), padding='same', input_shape=(12, X.shape[2], 1)),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Conv2D(64, (3, 3), padding='same'),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Flatten(),

    Dense(256),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    Dropout(0.3),

    Dense(128),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    Dropout(0.2),

    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train_cat,
    validation_data=(X_valid, y_valid_cat),
    epochs=100,
    batch_size=32,
    verbose=2,
    callbacks=[early_stop]
)

y_pred_prob = model.predict(X_valid)
y_pred = np.argmax(y_pred_prob, axis=1)

acc = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {acc:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_69" is incompatible with the layer: expected axis -1 of input shape to have value 1920, but received input with shape (None, 8320)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 52, 40, 1), dtype=float32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [13]:
import numpy as np
import tensorflow as tf
import cv2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.applications import EfficientNetB0  # placeholder for PANNs CNN14

# Assuming you have:
# - X_mel: list or np.array of mel spectrograms (samples, mel_bins=40, time_frames=max_frames)
# - y: list or np.array of labels (strings or ints)

# Resize mel bins from 40 → 64 to match pretrained model input
def resize_mel(mel, target_bins=64):
    # mel shape: (mel_bins, time_frames)
    # cv2.resize expects (width, height) so order: (time_frames, target_bins)
    mel_resized = cv2.resize(mel, (mel.shape[1], target_bins))  # width=time_frames, height=target_bins
    return mel_resized

mel_resized = np.array([resize_mel(mel) for mel in X_mel])  # shape: (samples, 64, max_frames)
print("Resized mel shape:", mel_resized.shape)  # e.g. (num_samples, 64, max_frames)

# Add channel dimension for Conv2D
X_cnn = mel_resized[..., np.newaxis]  # (samples, 64, max_frames, 1)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(le.classes_)

# Split after resizing and channel add
X_train, X_valid, y_train, y_valid = train_test_split(
    X_cnn, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

y_train_cat = to_categorical(y_train, num_classes)
y_valid_cat = to_categorical(y_valid, num_classes)

# Model input shape must match X_cnn shape (excluding batch)
input_layer = Input(shape=(64, mel_resized.shape[2], 1))  # (64, max_frames, 1)

# Convert 1 channel to 3 channels for EfficientNet pretrained on ImageNet
x = tf.keras.layers.Conv2D(3, (3,3), padding='same')(input_layer)

backbone = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(64, mel_resized.shape[2], 3)
)(x)

x = GlobalAveragePooling2D()(backbone)
x = Dropout(0.3)(x)
output_layer = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    X_train, y_train_cat,
    validation_data=(X_valid, y_valid_cat),
    epochs=50,
    batch_size=32,
    verbose=2
)

# Evaluate
y_pred_prob = model.predict(X_valid)
y_pred = np.argmax(y_pred_prob, axis=1)
acc = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {acc:.4f}")


Resized mel shape: (1632, 64, 40)
Epoch 1/50
41/41 - 42s - 1s/step - accuracy: 0.0460 - loss: 3.4762 - val_accuracy: 0.0428 - val_loss: 3.2201
Epoch 2/50
41/41 - 8s - 201ms/step - accuracy: 0.1073 - loss: 3.1616 - val_accuracy: 0.0398 - val_loss: 3.3506
Epoch 3/50
41/41 - 8s - 204ms/step - accuracy: 0.1510 - loss: 2.9055 - val_accuracy: 0.0398 - val_loss: 3.3155
Epoch 4/50
41/41 - 8s - 201ms/step - accuracy: 0.2345 - loss: 2.6460 - val_accuracy: 0.0459 - val_loss: 3.3141
Epoch 5/50
41/41 - 8s - 200ms/step - accuracy: 0.3234 - loss: 2.3725 - val_accuracy: 0.0428 - val_loss: 3.3800
Epoch 6/50
41/41 - 8s - 199ms/step - accuracy: 0.3816 - loss: 2.1520 - val_accuracy: 0.0367 - val_loss: 3.3377
Epoch 7/50
41/41 - 8s - 201ms/step - accuracy: 0.4437 - loss: 1.9389 - val_accuracy: 0.0489 - val_loss: 3.2870
Epoch 8/50
41/41 - 8s - 204ms/step - accuracy: 0.5142 - loss: 1.6888 - val_accuracy: 0.0428 - val_loss: 3.2900
Epoch 9/50
41/41 - 9s - 210ms/step - accuracy: 0.5556 - loss: 1.5252 - val_accur