# **Get Dataset**

In [1]:
import os
from scipy.io import wavfile
import numpy as np
import librosa

In [43]:
# Define the folder paths for training, testing, and validation files
folder_path_train = "MUS/train"
folder_path_test = "MUS/test"
folder_path_validation = "MUS/validation"

# Check and list files in the training folder
if os.path.exists(folder_path_train) and os.path.isdir(folder_path_train):
    files_train = os.listdir(folder_path_train)
    print(f"Files in 'train' folder ({folder_path_train}):", files_train)
else:
    print(f"The folder '{folder_path_train}' does not exist or is not a directory.")
    files_train = []  # Assign empty list if folder doesn't exist

# Check and list files in the testing folder
if os.path.exists(folder_path_test) and os.path.isdir(folder_path_test):
    files_test = os.listdir(folder_path_test)
    print(f"Files in 'test' folder ({folder_path_test}):", files_test)
else:
    print(f"The folder '{folder_path_test}' does not exist or is not a directory.")
    files_test = []  # Assign empty list if folder doesn't exist

# Check and list files in the validation folder
if os.path.exists(folder_path_validation) and os.path.isdir(folder_path_validation):
    files_val = os.listdir(folder_path_validation)
    print(f"Files in 'validation' folder ({folder_path_validation}):", files_val)
else:
    print(f"The folder '{folder_path_validation}' does not exist or is not a directory.")
    files_val = []  # Assign empty list if folder doesn't exist

Files in 'train' folder (MUS/train): ['MAPS_MUS-mond_2_SptkBGCl.wav', 'MAPS_MUS-gra_esp_3_SptkBGCl.txt', 'MAPS_MUS-mz_331_1_SptkBGCl.wav', 'MAPS_MUS-chpn-p2_SptkBGCl.txt', 'MAPS_MUS-liz_et4_SptkBGCl.mid', 'MAPS_MUS-bk_xmas2_SptkBGCl.txt', 'MAPS_MUS-mond_2_SptkBGCl.txt', 'MAPS_MUS-gra_esp_3_SptkBGCl.wav', 'MAPS_MUS-mz_331_1_SptkBGCl.txt', 'MAPS_MUS-chpn-p2_SptkBGCl.wav', 'MAPS_MUS-bk_xmas2_SptkBGCl.wav', 'MAPS_MUS-alb_esp6_SptkBGCl.txt', 'MAPS_MUS-bk_xmas4_SptkBGCl.txt', 'MAPS_MUS-br_im5_SptkBGCl.mid', 'MAPS_MUS-mz_330_1_SptkBGCl.txt', 'MAPS_MUS-bach_847_SptkBGCl.mid', 'MAPS_MUS-alb_esp6_SptkBGCl.wav', 'MAPS_MUS-bk_xmas4_SptkBGCl.wav', 'MAPS_MUS-mz_330_1_SptkBGCl.wav', 'MAPS_MUS-mond_2_SptkBGCl.mid', 'MAPS_MUS-mz_331_1_SptkBGCl.mid', 'MAPS_MUS-liz_et4_SptkBGCl.wav', 'MAPS_MUS-gra_esp_3_SptkBGCl.mid', 'MAPS_MUS-chpn-p2_SptkBGCl.mid', 'MAPS_MUS-liz_et4_SptkBGCl.txt', 'MAPS_MUS-bk_xmas2_SptkBGCl.mid', 'MAPS_MUS-br_im5_SptkBGCl.wav', 'MAPS_MUS-bach_847_SptkBGCl.wav', 'MAPS_MUS-alb_esp6_Sptk

# **Preprocessing**

In [49]:
# Constants for processing parameters
HOP_LENGTH = 512
N_BINS = 252
BINS_PER_OCTAVE = 36
NUM_NOTES = 88
MAX_SAMPLES_PER_FILE = 4000000

# Training, validation, and test sets
train_features, train_labels = [], []
val_features, val_labels = [], []
test_features, test_labels = [],[]
train_count = 0
val_count = 0
test_count = 0

def preprocess(files, feature_storage, label_storage, count, folder_path, type):
    """
    Processes a list of audio files, extracts their CQT features, aligns them with ground truth labels,
    and stores them in feature and label storage lists.

    Args:
        files (list): List of file names to process.
        feature_storage (list): The list to store CQT features.
        label_storage (list): The list to store label matrices.
        count (int): Counter to keep track of the number of files processed.
        folder_path (str): Path to the folder containing the audio files.

    Returns:
        int: The updated count of processed files.
    """
    for filename in files:
        # Load audio file and calculate window length
        if not filename.lower().endswith('.wav'):
            continue
        
        full_file_path = os.path.join(folder_path, filename)
        sampling_freq, stereo_vector = wavfile.read(full_file_path)
        
        # If audio is stereo (ndim == 2), convert to mono by averaging channels
        if stereo_vector.ndim == 2:
            stereo_vector = stereo_vector.mean(axis=1)

        # Ensure Fortran-contiguous format for librosa
        float_array = np.asfortranarray(stereo_vector / 1.0)

        # Extract CQT features
        try:
            cqt_features = np.abs(librosa.cqt(float_array, sr=sampling_freq, hop_length=HOP_LENGTH, 
                                              n_bins=N_BINS, bins_per_octave=BINS_PER_OCTAVE)).T
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue  # Skip the file if there's an issue

        # Determine the number of frames and create a time vector
        num_frames = cqt_features.shape[0]
        window_length = HOP_LENGTH / float(sampling_freq)
        time_vector = np.arange(1, num_frames + 1) * window_length

        # Initialize binary label matrix with dimensions (frames, notes)
        labels = np.zeros((num_frames, NUM_NOTES))

        # Read and process aligned labels from the corresponding text file
        folder_path_label = "MUS/"+type
        label_file_path = os.path.join(folder_path_label, f"{filename.split('.')[0]}.txt")
        try:
            with open(label_file_path, "r") as file:
                lines = file.readlines()[1:]  # Skip the first line (header)
        except FileNotFoundError:
            print(f"Label file not found for {filename}, skipping.")
            continue  # Skip if the label file does not exist

        for line in lines:
            if line.strip():
                try:
                    start, end, pitch = map(float, line.split('\t')[:3])
                    pitch = int(pitch) - 21
                    start_idx = np.where(time_vector >= start)[0]
                    end_idx = np.where(time_vector > end)[0]

                    if start_idx.size and end_idx.size:
                        labels[start_idx[0]:end_idx[0], pitch] = 1
                except ValueError:
                    print(f"Error parsing line: {line}")
                    continue

        # Ensure files don't exceed the maximum allowed samples per file
        while (len(feature_storage) + len(cqt_features)) >= MAX_SAMPLES_PER_FILE:
            to_add = MAX_SAMPLES_PER_FILE - len(feature_storage)
            feature_storage.extend(cqt_features[:to_add, :])
            label_storage.extend(labels[:to_add, :])

            feature_storage_np = np.array(feature_storage)
            label_storage_np = np.array(label_storage)

            count += 1
            feature_storage = []
            label_storage = []
            cqt_features = cqt_features[to_add:, :]
            labels = labels[to_add:, :]

        # Add remaining CQT features and labels if less than the max allowed
        if len(cqt_features) == MAX_SAMPLES_PER_FILE:
            feature_storage.extend(cqt_features)
            label_storage.extend(labels)
            feature_storage_np = np.array(feature_storage)
            label_storage_np = np.array(label_storage)

            count += 1
            feature_storage = []
            label_storage = []
        elif len(cqt_features) > 0:
            feature_storage.extend(cqt_features)
            label_storage.extend(labels)

    return count

In [50]:
train_count = preprocess(files_train, train_features, train_labels, train_count, folder_path_train,"train")
test_count = preprocess(files_test, test_features, test_labels, test_count, folder_path_test,"test")
val_count = preprocess(files_val, val_features, val_labels, val_count, folder_path_validation,"validation")

Error parsing line: 1.07144	1.35715	68

Error parsing line: 1.07144	1.35715	73

Error parsing line: 1.07144	1.35715	65

Error parsing line: 1.35774	1.93015	72

Error parsing line: 1.35774	1.93015	68

Error parsing line: 1.35774	1.93015	63

Error parsing line: 1.92954	2.21851	67

Error parsing line: 1.92954	2.21851	70

Error parsing line: 1.92954	2.21851	61

Error parsing line: 2.21851	2.29076	72

Error parsing line: 2.21851	2.29076	75

Error parsing line: 2.21851	2.29076	60

Error parsing line: 2.79643	2.86765	65

Error parsing line: 2.79643	2.86765	73

Error parsing line: 2.79643	2.86765	49

Error parsing line: 2.79643	2.86765	58

Error parsing line: 3.08128	3.1525	68

Error parsing line: 3.08128	3.1525	72

Error parsing line: 3.08128	3.1525	63

Error parsing line: 3.08128	3.1525	51

Error parsing line: 3.65097	3.72291	67

Error parsing line: 3.65097	3.72291	70

Error parsing line: 3.65097	3.72291	51

Error parsing line: 3.65097	3.72291	61

Error parsing line: 3.93873	4.01067	68

Erro

In [51]:
print(len(train_features), len(train_labels))
print(len(test_features), len(test_labels))
print(len(val_features), len(val_labels))

501202 501202
60023 60023
107426 107426


In [69]:
print(val_features[1])
print(val_labels[12])

[1.26247715 3.34382455 4.193564   1.21715206 2.56971742 3.92374946
 2.62094867 0.49291579 1.83522757 1.97284751 1.24932324 0.22804384
 0.60954276 0.84884565 0.68742419 0.38717084 0.10932005 0.20133487
 0.38653377 0.19615492 0.07954734 0.05405585 0.05381713 0.07940027
 0.22942333 0.1004976  0.25170722 0.21345577 0.08918274 0.11633008
 0.12876456 0.23205345 0.09492521 0.40929257 0.1622173  0.1742389
 0.11214504 0.38406738 0.08026482 0.37651087 0.52047902 0.3553573
 0.23245577 0.55621974 0.54132612 0.51501621 0.58460044 0.48808568
 0.14209117 0.64718753 0.83478702 0.53035571 0.40087926 0.52906981
 0.72820123 1.27839933 1.07559764 0.34710241 0.64346272 1.73411278
 1.01550131 2.03847055 1.96620169 1.00971576 0.53550953 0.36454564
 1.4919704  0.89495478 3.31558091 2.97018455 1.28069179 0.64816916
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.       

# Normalization

In [52]:
# Get the minimum and maximum values from the X values in the training set
max_train = max(max(val) for val in train_features)
min_train = min(min(val) for val in train_features)
min_train, max_train

(0.0, 201740.499183033)

In [53]:
# Normalize each X value in the training set for every single frame (but this one hasnt taken into account the mean)
initial_norm_train_features = [[(x - min_train) / (max_train - min_train) for x in inner_list] for inner_list in train_features]

In [54]:
# Calculate the mean of the normalized values
flattened_normalized_X_train = [value for sublist in initial_norm_train_features for value in sublist]
train_mean = sum(flattened_normalized_X_train) / len(flattened_normalized_X_train)

In [55]:
train_mean

0.005631972873551941

In [56]:
# Normalize each X value in the training set for every single frame
norm_train_X = [[value - train_mean for value in sublist] for sublist in initial_norm_train_features]

# Normalize each X value in the validation set for every single frame
norm_val_X = [[value - train_mean for value in sublist] for sublist in val_features]

# Normalize each X value in the test set for every single frame
norm_test_X = [[value - train_mean for value in sublist] for sublist in test_features]


# Baseline Log Regression Model

In [57]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, multilabel_confusion_matrix

In [59]:
# Training data
X_train = np.array(norm_train_X)
y_train = np.array(train_labels)

model = OneVsRestClassifier(LogisticRegression(solver='saga', tol=0.01, max_iter=100))
model.fit(X_train, y_train)



OneVsRestClassifier(estimator=LogisticRegression(solver='saga', tol=0.01))

In [60]:
# Testing set
X_test = np.array(norm_test_X)
y_test = np.array(test_labels)

In [61]:
# Making Prediction
y_pred = model.predict(X_test)

In [62]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
# Confusion matrices
conf_matrices = multilabel_confusion_matrix(y_test, y_pred)

In [64]:
TN_total = conf_matrices[:, 0, 0].sum()
FP_total = conf_matrices[:, 0, 1].sum()
FN_total = conf_matrices[:, 1, 0].sum()
TP_total = conf_matrices[:, 1, 1].sum()
precision = TP_total / float(TP_total + FP_total)
recall = TP_total / float(TP_total + FN_total)
f1_score = 100 * 2 * precision * recall / (precision + recall)
accuracy = TP_total / float(TP_total + FP_total + FN_total)

print(f"Total True Negatives (TN): {TN_total}")
print(f"Total False Positives (FP): {FP_total}")
print(f"Total False Negatives (FN): {FN_total}")
print(f"Total True Positives (TP): {TP_total}")

print(f"\nAccuracy: {accuracy:.15f}")

print(f"\nRecall: {recall:.15f}")

print(f"\nPrecision: {precision:.15f}")

print(f"\nF-measure: {f1_score:.2f}")

Total True Negatives (TN): 5282024
Total False Positives (FP): 0
Total False Negatives (FN): 0
Total True Positives (TP): 0

Accuracy: nan

Recall: nan

Precision: nan

F-measure: nan


  precision = TP_total / float(TP_total + FP_total)
  recall = TP_total / float(TP_total + FN_total)
  accuracy = TP_total / float(TP_total + FP_total + FN_total)
