# **Get Dataset**

In [1]:
import os
from scipy.io import wavfile
import numpy as np
import librosa

In [43]:
# Define the folder paths for training, testing, and validation files
folder_path_train = "MUS/train"
folder_path_test = "MUS/test"
folder_path_validation = "MUS/validation"

# Check and list files in the training folder
if os.path.exists(folder_path_train) and os.path.isdir(folder_path_train):
    files_train = os.listdir(folder_path_train)
    print(f"Files in 'train' folder ({folder_path_train}):", files_train)
else:
    print(f"The folder '{folder_path_train}' does not exist or is not a directory.")
    files_train = []  # Assign empty list if folder doesn't exist

# Check and list files in the testing folder
if os.path.exists(folder_path_test) and os.path.isdir(folder_path_test):
    files_test = os.listdir(folder_path_test)
    print(f"Files in 'test' folder ({folder_path_test}):", files_test)
else:
    print(f"The folder '{folder_path_test}' does not exist or is not a directory.")
    files_test = []  # Assign empty list if folder doesn't exist

# Check and list files in the validation folder
if os.path.exists(folder_path_validation) and os.path.isdir(folder_path_validation):
    files_val = os.listdir(folder_path_validation)
    print(f"Files in 'validation' folder ({folder_path_validation}):", files_val)
else:
    print(f"The folder '{folder_path_validation}' does not exist or is not a directory.")
    files_val = []  # Assign empty list if folder doesn't exist

Files in 'train' folder (MUS/train): ['MAPS_MUS-mond_2_SptkBGCl.wav', 'MAPS_MUS-gra_esp_3_SptkBGCl.txt', 'MAPS_MUS-mz_331_1_SptkBGCl.wav', 'MAPS_MUS-chpn-p2_SptkBGCl.txt', 'MAPS_MUS-liz_et4_SptkBGCl.mid', 'MAPS_MUS-bk_xmas2_SptkBGCl.txt', 'MAPS_MUS-mond_2_SptkBGCl.txt', 'MAPS_MUS-gra_esp_3_SptkBGCl.wav', 'MAPS_MUS-mz_331_1_SptkBGCl.txt', 'MAPS_MUS-chpn-p2_SptkBGCl.wav', 'MAPS_MUS-bk_xmas2_SptkBGCl.wav', 'MAPS_MUS-alb_esp6_SptkBGCl.txt', 'MAPS_MUS-bk_xmas4_SptkBGCl.txt', 'MAPS_MUS-br_im5_SptkBGCl.mid', 'MAPS_MUS-mz_330_1_SptkBGCl.txt', 'MAPS_MUS-bach_847_SptkBGCl.mid', 'MAPS_MUS-alb_esp6_SptkBGCl.wav', 'MAPS_MUS-bk_xmas4_SptkBGCl.wav', 'MAPS_MUS-mz_330_1_SptkBGCl.wav', 'MAPS_MUS-mond_2_SptkBGCl.mid', 'MAPS_MUS-mz_331_1_SptkBGCl.mid', 'MAPS_MUS-liz_et4_SptkBGCl.wav', 'MAPS_MUS-gra_esp_3_SptkBGCl.mid', 'MAPS_MUS-chpn-p2_SptkBGCl.mid', 'MAPS_MUS-liz_et4_SptkBGCl.txt', 'MAPS_MUS-bk_xmas2_SptkBGCl.mid', 'MAPS_MUS-br_im5_SptkBGCl.wav', 'MAPS_MUS-bach_847_SptkBGCl.wav', 'MAPS_MUS-alb_esp6_Sptk

# **Preprocessing**

In [70]:
# Constants for processing parameters
HOP_LENGTH = 512
N_BINS = 252
BINS_PER_OCTAVE = 36
NUM_NOTES = 88
MAX_SAMPLES_PER_FILE = 4000000

# Training, validation, and test sets
train_features, train_labels = [], []
val_features, val_labels = [], []
test_features, test_labels = [],[]
train_count = 0
val_count = 0
test_count = 0

def preprocess(files, feature_storage, label_storage, count, folder_path, type):
    """
    Processes a list of audio files, extracts their CQT features, aligns them with ground truth labels,
    and stores them in feature and label storage lists.

    Args:
        files (list): List of file names to process.
        feature_storage (list): The list to store CQT features.
        label_storage (list): The list to store label matrices.
        count (int): Counter to keep track of the number of files processed.
        folder_path (str): Path to the folder containing the audio files.

    Returns:
        int: The updated count of processed files.
    """
    for filename in files:
        # Load audio file and calculate window length
        if not filename.lower().endswith('.wav'):
            continue
        
        full_file_path = os.path.join(folder_path, filename)
        sampling_freq, stereo_vector = wavfile.read(full_file_path)
        
        # If audio is stereo (ndim == 2), convert to mono by averaging channels
        if stereo_vector.ndim == 2:
            stereo_vector = stereo_vector.mean(axis=1)

        # Ensure Fortran-contiguous format for librosa
        float_array = np.asfortranarray(stereo_vector / 1.0)

        # Extract CQT features
        try:
            cqt_features = np.abs(librosa.cqt(float_array, sr=sampling_freq, hop_length=HOP_LENGTH, 
                                              n_bins=N_BINS, bins_per_octave=BINS_PER_OCTAVE)).T
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue  # Skip the file if there's an issue

        # Determine the number of frames and create a time vector
        num_frames = cqt_features.shape[0]
        window_length = HOP_LENGTH / float(sampling_freq)
        time_vector = np.arange(1, num_frames + 1) * window_length

        # Initialize binary label matrix with dimensions (frames, notes)
        labels = np.zeros((num_frames, NUM_NOTES))

        # Read and process aligned labels from the corresponding text file
        folder_path_label = "MUS/"+type
        label_file_path = os.path.join(folder_path_label, f"{filename.split('.')[0]}.txt")
        try:
            with open(label_file_path, "r") as file:
                lines = file.readlines()[1:]  # Skip the first line (header)
        except FileNotFoundError:
            print(f"Label file not found for {filename}, skipping.")
            continue  # Skip if the label file does not exist

        for line in lines:
            if line.strip():
                try:
                    start, end, pitch = map(float, line.split('\t')[:3])
                    pitch = int(pitch) - 21
                    start_idx = np.where(time_vector >= start)[0]
                    end_idx = np.where(time_vector > end)[0]

                    if start_idx.size and end_idx.size:
                        labels[start_idx[0]:end_idx[0], pitch] = 1
                except ValueError:
                    print(f"Error parsing line: {line}")
                    continue

        # Ensure files don't exceed the maximum allowed samples per file
        while (len(feature_storage) + len(cqt_features)) >= MAX_SAMPLES_PER_FILE:
            to_add = MAX_SAMPLES_PER_FILE - len(feature_storage)
            feature_storage.extend(cqt_features[:to_add, :])
            label_storage.extend(labels[:to_add, :])

            feature_storage_np = np.array(feature_storage)
            label_storage_np = np.array(label_storage)

            count += 1
            feature_storage = []
            label_storage = []
            cqt_features = cqt_features[to_add:, :]
            labels = labels[to_add:, :]

        # Add remaining CQT features and labels if less than the max allowed
        if len(cqt_features) == MAX_SAMPLES_PER_FILE:
            feature_storage.extend(cqt_features)
            label_storage.extend(labels)
            feature_storage_np = np.array(feature_storage)
            label_storage_np = np.array(label_storage)

            count += 1
            feature_storage = []
            label_storage = []
        elif len(cqt_features) > 0:
            feature_storage.extend(cqt_features)
            label_storage.extend(labels)

    return count

In [81]:
train_count = preprocess(files_train, train_features, train_labels, train_count, folder_path_train,"train")
test_count = preprocess(files_test, test_features, test_labels, test_count, folder_path_test,"test")
val_count = preprocess(files_val, val_features, val_labels, val_count, folder_path_validation,"validation")

In [83]:
print(len(train_features), len(train_labels))
print(len(test_features), len(test_labels))
print(len(val_features), len(val_labels))

1002404 1002404
120046 120046
214852 214852


In [87]:
print(val_features[1])
print(val_labels[3128])

[1.26247715 3.34382455 4.193564   1.21715206 2.56971742 3.92374946
 2.62094867 0.49291579 1.83522757 1.97284751 1.24932324 0.22804384
 0.60954276 0.84884565 0.68742419 0.38717084 0.10932005 0.20133487
 0.38653377 0.19615492 0.07954734 0.05405585 0.05381713 0.07940027
 0.22942333 0.1004976  0.25170722 0.21345577 0.08918274 0.11633008
 0.12876456 0.23205345 0.09492521 0.40929257 0.1622173  0.1742389
 0.11214504 0.38406738 0.08026482 0.37651087 0.52047902 0.3553573
 0.23245577 0.55621974 0.54132612 0.51501621 0.58460044 0.48808568
 0.14209117 0.64718753 0.83478702 0.53035571 0.40087926 0.52906981
 0.72820123 1.27839933 1.07559764 0.34710241 0.64346272 1.73411278
 1.01550131 2.03847055 1.96620169 1.00971576 0.53550953 0.36454564
 1.4919704  0.89495478 3.31558091 2.97018455 1.28069179 0.64816916
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.       

# Normalization

In [85]:
# Get the minimum and maximum values from the X values in the training set
max_train = max(max(val) for val in train_features)
min_train = min(min(val) for val in train_features)
min_train, max_train

(0.0, 201740.499183033)

In [86]:
# Normalize each X value in the training set for every single frame (but this one hasnt taken into account the mean)
initial_norm_train_features = [[(x - min_train) / (max_train - min_train) for x in inner_list] for inner_list in train_features]

In [88]:
# Calculate the mean of the normalized values
flattened_normalized_X_train = [value for sublist in initial_norm_train_features for value in sublist]
train_mean = sum(flattened_normalized_X_train) / len(flattened_normalized_X_train)

In [89]:
train_mean

0.005631972873553208

In [90]:
# Normalize each X value in the training set for every single frame
norm_train_X = [[value - train_mean for value in sublist] for sublist in initial_norm_train_features]

# Normalize each X value in the validation set for every single frame
norm_val_X = [[value - train_mean for value in sublist] for sublist in val_features]

# Normalize each X value in the test set for every single frame
norm_test_X = [[value - train_mean for value in sublist] for sublist in test_features]


# Baseline Log Regression Model

In [91]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, multilabel_confusion_matrix

In [92]:
# Training data
X_train = np.array(norm_train_X)
y_train = np.array(train_labels)

model = OneVsRestClassifier(LogisticRegression(solver='saga', tol=0.01, max_iter=100))
model.fit(X_train, y_train)



OneVsRestClassifier(estimator=LogisticRegression(solver='saga', tol=0.01))

In [93]:
# Testing set
X_test = np.array(norm_test_X)
y_test = np.array(test_labels)

In [94]:
# Making Prediction
y_pred = model.predict(X_test)

In [95]:
report = classification_report(y_test, y_pred)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.07      1.00      0.13      3336
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.04      0.73      0.08       816
           9       0.00      1.00      0.00        32
          10       0.09      0.95      0.17      1206
          11       0.03      0.96      0.07       698
          12       0.28      0.95      0.44      6366
          13       0.09      0.99      0.17      3116
          14       0.08      0.87      0.14      1236
          15       0.32      0.94      0.47      3894
          16       0.09      0.82      0.17       986
          17       0.19    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
# Confusion matrices
conf_matrices = multilabel_confusion_matrix(y_test, y_pred)

In [104]:
TN_total = conf_matrices[:, 0, 0].sum()
FP_total = conf_matrices[:, 0, 1].sum()
FN_total = conf_matrices[:, 1, 0].sum()
TP_total = conf_matrices[:, 1, 1].sum()
precision = TP_total / float(TP_total + FP_total)
recall = TP_total / float(TP_total + FN_total)
f1_score = 100 * 2 * precision * recall / (precision + recall)
accuracy = TP_total / float(TP_total + FP_total + FN_total)

print(f"Total True Negatives (TN): {TN_total}")
print(f"Total False Positives (FP): {FP_total}")
print(f"Total False Negatives (FN): {FN_total}")
print(f"Total True Positives (TP): {TP_total}")

print(f"\nAccuracy: {accuracy:.15f}")

print(f"\nRecall: {recall:.15f}")

print(f"\nPrecision: {precision:.15f}")

print(f"\nF-measure: {f1_score:.2f}")

Total True Negatives (TN): 8714120
Total False Positives (FP): 1394664
Total False Negatives (FN): 44188
Total True Positives (TP): 411076

Accuracy: 0.222211891489831

Recall: 0.902939832712448

Precision: 0.227649606255607

F-measure: 36.36


# Application

In [110]:
def preprocess_single_audio(file_path):
    """
    Processes a single audio file, extracts its CQT features.

    Args:
        file_path (str): Path to the .wav file to process.

    Returns:
        np.ndarray: Extracted CQT features from the audio file.
    """
    # Load the audio file
    sampling_freq, stereo_vector = wavfile.read(file_path)

    # If stereo (2D), convert to mono by averaging the channels
    if len(stereo_vector.shape) == 2:
        float_array = stereo_vector.mean(axis=1)  # Convert to mono by averaging channels
    else:
        float_array = stereo_vector

    # Convert to float type and ensure Fortran-contiguity
    float_array = float_array / 1.0
    float_array = np.asfortranarray(float_array)  # Ensure Fortran contiguity

    # Extract CQT features
    cqt_features = np.abs(librosa.cqt(float_array, sr=sampling_freq, hop_length=HOP_LENGTH, n_bins=N_BINS, bins_per_octave=BINS_PER_OCTAVE)).T

    return cqt_features


In [115]:
cqt_features = preprocess_single_audio("sample/bee.wav")
print(cqt_features.shape) 
print(cqt_features)

(1850, 252)
[[2.17684854e+02 1.57981411e+02 1.47884884e+03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.17604614e+02 1.60361315e+02 1.47921326e+03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.17425644e+02 1.66343285e+02 1.48032090e+03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [4.95789456e+02 1.00039470e+03 4.62234693e+02 ... 3.11889514e+02
  9.93173209e+02 4.86098692e+02]
 [4.97109182e+02 1.00095879e+03 4.44941630e+02 ... 5.32992270e+00
  2.00924157e+00 2.55606982e+00]
 [4.97609825e+02 1.00117257e+03 4.38081383e+02 ... 1.21831726e+00
  1.41130249e+00 1.27370883e+00]]


In [117]:

cqt_min = np.min(cqt_features)
cqt_max = np.max(cqt_features)

cqt_features_normalized = (cqt_features - cqt_min) / (cqt_max - cqt_min)

print(cqt_features_normalized.shape)
print(cqt_features_normalized)

cqt_features_normalized = 2 * (cqt_features - cqt_min) / (cqt_max - cqt_min) - 1

print(cqt_features_normalized.shape)
print(cqt_features_normalized)



(1850, 252)
[[6.48963131e-04 4.70974941e-04 4.40875126e-03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.48723920e-04 4.78069922e-04 4.40983766e-03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.48190374e-04 4.95903398e-04 4.41313978e-03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [1.47804991e-03 2.98238148e-03 1.37801628e-03 ... 9.29806518e-04
  2.96085274e-03 1.44915975e-03]
 [1.48198428e-03 2.98406314e-03 1.32646212e-03 ... 1.58895911e-05
  5.98996061e-06 7.62016763e-06]
 [1.48347680e-03 2.98470048e-03 1.30601032e-03 ... 3.63205326e-06
  4.20738175e-06 3.79718687e-06]]
(1850, 252)
[[-0.99870207 -0.99905805 -0.9911825  ... -1.         -1.
  -1.        ]
 [-0.99870255 -0.99904386 -0.99118032 ... -1.         -1.
  -1.        ]
 [-0.99870362 -0.99900819 -0.99117372 ... -1.         -1.
  -1.        ]
 ...
 [-0.9970439  -0.99403524 -0.99724397 ... -0.99814039 -0.99407829
  -0.99710168]
 [-0.99703603 -0.99403187 -0.99734708 ... -0.99996822 -0.99998802
  -0.