In [1]:
# Free-text banner describing the notebook. It is only assigned in this cell; nothing here prints it.
# NOTE(review): the text mentions SVM models while model_type is set to "NN" further down, and its
# last sentence is cut off ("In this notebook we also") - complete or trim it.
SCRIPT_INTRODUCTION = """
   ==================== [_ESP_Pipe003_For_GitHub_Refactored_EVAL_DetectionAndRecognition.ipynb] ====================
  This Script was intended to build a recognition evaluation, for the onsets, using SVM models. Copy this notebooks as a reference
  to make it for other models. In this notebook we also 
  ==================== ==================[ INFO ] ==============================
"""

import librosa
from google.colab import drive
import os,sys,re,pandas as pd,numpy as np
import glob
import logging
from sympy import Interval
import warnings

In [2]:
# Mount Google Drive inside the Colab runtime; every path below is built under this mount point.
ROOT_DIR = "/content/drive"
drive.mount(ROOT_DIR)
# Root folder of the MDB Drums dataset on Drive.
MUSIC_DIR = os.path.join(ROOT_DIR,'My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums')
# Pickle with the RNN onset-detector predictions; it is read further down with pd.read_pickle.
model_predictions_path_hardcoded = MUSIC_DIR+'/models/model_predictions_rnn_fulltrained.pkl'
# Bare expression so the notebook displays the resolved dataset path.
MUSIC_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums'

# Define the NN architecture again, so that models saved as .pth (instead of pkl or joblib) can be loaded

In [3]:
# TORCH MODULES
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torch
import torch.nn as nn
# Load the Net module

# Unfortunately I did not manage to load the models without defining the class first
class Net(nn.Module):
  """
  Generic CNN used for our models: two conv + max-pool stages followed by three linear layers.

  This class must be defined before loading a saved model; if it is not, torch.load on our
  model will throw an error.

  params:
    nchannels -> number of input channels of the first convolution
    nclasses -> number of output logits produced by the last linear layer
    unique_labels -> sequence indexed by class id, used to turn predicted ids into label names
    meanstd_normalize -> dict of keyword arguments expanded into
                         torchvision.transforms.Normalize (e.g. mean and std)

  Note: fc1 expects 2000 flattened features, which is what a 1x513x17 input yields
  after both conv/pool stages (16 channels x 125 x 1).
  """
  def __init__(self,nchannels,nclasses, unique_labels, meanstd_normalize):
    super().__init__()
    # normalizer applied to the input inside predict()
    self.meanstd_normalizer = torchvision.transforms.Normalize(**meanstd_normalize, inplace=False)
    # only needed to return label names instead of class ids in predict()
    self.unique_labels = unique_labels
    # other attributes
    self.nchannels = nchannels
    self.nclasses = nclasses
    self.conv1 = nn.Conv2d(self.nchannels, 6, 5)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(6, 16, 5)
    self.fc1 = nn.Linear(2000, 120)
    self.dropout1 = nn.Dropout(p=0.5, inplace=False)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, self.nclasses)

  def forward(self, x):
    """
    Forward pass.
    params:
      x -> float tensor shaped (batch, nchannels, H, W)
    return: raw logits shaped (batch, nclasses); no softmax is applied here
    """
    # conv -> relu -> pool, twice
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    # flatten all dimensions except the batch one
    x = torch.flatten(x, 1)
    x = F.relu(self.fc1(x))
    x = self.dropout1(x)
    x = F.relu(self.fc2(x))
    # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss
    return self.fc3(x)

  def map_idx2labels(self,mapped_labels,unique_labels):
    """
    Translate class ids into label names.
    params:
      mapped_labels -> iterable of class ids
      unique_labels -> sequence indexed by class id
    return: list with unique_labels[idx] for every idx in mapped_labels
    """
    return [unique_labels[mapped_lab] for mapped_lab in mapped_labels]

  def predict(self,x_batch):
    """
    Final prediction function
    params:
      x_batch -> array-like of shape (data_len, H, W); the docs of this notebook use data_len x 513 x 17
    return: list with one label name (taken from self.unique_labels) per element of the batch;
            an empty batch returns an empty list
    """
    data_tensor = torch.tensor(x_batch, dtype=torch.float32)
    if data_tensor.numel() == 0:
      # nothing to classify; avoids failing on the shape unpacking below
      return []
    N,H,W = data_tensor.shape
    # add the single channel dimension expected by conv1
    data_tensor = data_tensor.reshape(N,1,H,W)
    # normalize data
    data_tensor = self.meanstd_normalizer(data_tensor)
    # Inference must run in eval mode (dropout disabled) and without building an autograd graph.
    # The previous mode is restored so calling predict() never changes the training state.
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        logits = self.forward(data_tensor)
    finally:
      self.train(was_training)
    # winning class id per element, then id -> label name
    predictions = logits.argmax(1).tolist()
    return self.map_idx2labels(predictions,self.unique_labels)

In [4]:
%cd "/content/drive/My Drive/Colab Notebooks/tesis_esp"
import eval_utils
import pred_utils

/content/drive/My Drive/Colab Notebooks/tesis_esp


# You should take models from ./models instead of using this path

In [5]:
# Folder that stores the trained per-drum-type models loaded in the next cell.
EXP_PIPE_DATA = os.path.join(MUSIC_DIR,'pipe005_multiplemodelsdata_corrected_over60')
# Model family to evaluate; the loading cell accepts "SVC" or "NN".
model_type = "NN"

In [6]:
# Pick the per-family settings first, then load the six drum-type models with one shared loop.
if model_type == "SVC":
  # SVC takes flattened vectors as input, not the data as a matrix
  flatten_data = True
  print("[INFO ]Loading SVC models")
  # each loaded SVC object keeps its own parameters (e.g. model_hh.best_estimator_)
  _model_file_template = 'model_{}.joblib'
elif model_type == "NN":
  # the conv net consumes the matrix as-is, no flattening
  flatten_data = False
  print("[INFO ]Loading NN  models")
  _model_file_template = 'nn_model_{}.pth'
else:
  raise ValueError("Param: model_type must be 'SVC' or 'NN' for now, please choose a valid model")

# load order (TT, CY, KD, SD, HH, OT) is the order kept in models_list
model_tt, model_cy, model_kd, model_sd, model_hh, model_ot = [
  pred_utils.load_model(os.path.join(EXP_PIPE_DATA, _model_file_template.format(drum_code)))
  for drum_code in ("TT", "CY", "KD", "SD", "HH", "OT")
]

models_list = [model_tt,model_cy,model_kd,model_sd,model_hh,model_ot]

[INFO ]Loading NN  models


In [7]:
# Folder with the drum-only audio files; the pipeline function searches the matching .wav here.
AUDIO_DIR = os.path.join(MUSIC_DIR,'audio','drum_only')
# Class-level annotation files; the test split lives in its own sub-folder.
ANNOTATIONS_DIR = os.path.join(MUSIC_DIR,'annotations','class')
ANNOTATIONS_DIR_TEST = os.path.join(ANNOTATIONS_DIR,'test')
# Every .txt annotation of the test split; this list drives the evaluation loop further down.
annotations_test_filepaths = glob.glob(ANNOTATIONS_DIR_TEST+"/*.txt")
annotations_test_filepaths

['/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_Hendrix_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_SwingJazz_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_FreeJazz_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_Beatles_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_Country1_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_SpeedMetal_class.txt',
 '/content/drive/My Drive/Maestria DM y KDD/Especializacion tesis/MDBDrums/MDB Drums/annotations/class/test/MusicDelta_Punk_class.txt',
 '/content/drive/My Dri

In [8]:
# Planned evaluation steps:
# step 1: load the annotations dataset you want to scan
# step 2: load the corresponding .wav
# step 3: for each sound of the annotations, add the label (break this step into more steps)
# step 4: use the evalmetrics class
config_signal_params = {"hop_size":256,"n_fft":1024,"desired_signal_size_for_padding":4096,"seconds_window":0.05}
# Read every setting by key instead of unpacking .values() positionally: the positional form
# silently assigns wrong values as soon as a key is added to or reordered in the dict.
hop_size = config_signal_params["hop_size"]
n_fft = config_signal_params["n_fft"]
desired_signal_size_for_padding = config_signal_params["desired_signal_size_for_padding"]
seconds_window = config_signal_params["seconds_window"]

In [9]:
# step1
# Default to the first test annotation file; the batch loop further down rebinds annotation_path for every file.
annotation_path = annotations_test_filepaths[0]

def pipeline_generate_datasets(annotation_path: str, onsets_annotation_list = None):
  """
  --------------------------------------------------------------------------------
  Build the ground-truth table and the predicted table for one annotated song.
  The annotation .txt is loaded, its matching .wav is looked up inside AUDIO_DIR
  and the drum-type models in models_list label the given onsets. Both tables are
  the inputs needed later to calculate precision and recall.
  --------------------------------------------------------------------------------
  args:
    annotation_path -> path of the annotation .txt; the corresponding .wav is
                       searched in AUDIO_DIR
    onsets_annotation_list -> onsets to classify, coming from an onset detector or
                       from user annotations. If it is None, the onsets of the
                       MDB Drums annotation are used; that default list is
                       expressed in samples (onset_time * sampling_rate).

  return:
    df_annotation -> annotations loaded into a dataframe, with the extra columns
                     onset_sample and annotation_path
    df_predicted -> model predictions with the columns onset_time,
                    predicted_drumtype and annotation_path

  This function reads the notebook globals AUDIO_DIR, config_signal_params,
  models_list and flatten_data; change those to evaluate other models.
  """
  # step 1: ground truth
  df_annotation = eval_utils.load_labels(annotation_path, set2df=True)

  # step 2: locate and load the audio that belongs to this annotation
  wav_path = eval_utils.search_correspondingpath_given_annotation(
      annotation_path_txt=annotation_path,
      audio_directory=AUDIO_DIR,
  )
  _audio, sampling_rate = librosa.load(wav_path)

  # step 3: express the annotated onset times as sample positions
  onset_seconds = df_annotation["onset_time"].astype("float")
  df_annotation["onset_sample"] = (onset_seconds * sampling_rate).astype("int")

  if onsets_annotation_list is not None:
    print("[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task")
  else:
    print("[INFO] Since onsets_annotation_list is None, we will be using labeled onsets for presettled onsets; if you have an OnsetDetector, put their predicts output in this param ")
    onsets_annotation_list = df_annotation["onset_sample"].tolist()

  # step 4: classify every onset and collect the formatted output
  detector = pred_utils.DrumTypesDetector(
      config_signal_params,
      sklearn_models_list=models_list,
      flatten_data=flatten_data,
  )
  detector(wav_path, presettled_onsets=onsets_annotation_list)
  df_predicted = pd.DataFrame(
      detector.list_formatted_onsets,
      columns=["onset_time", "predicted_drumtype"],
  )

  # tag both tables with the file they come from
  df_predicted["annotation_path"] = annotation_path
  df_annotation["annotation_path"] = annotation_path
  return df_annotation, df_predicted



# First load the RNN onset detector predictions produced by the other script (onset detector RNN)

In [10]:
# sampling rate shared by all songs (half of 44100)
sampling_rate =  int(44100/2)
# NOTE: unpickling can execute arbitrary code; only read pickle files you produced yourself.
df_predictions_rnn_onsetdetector = pd.read_pickle(model_predictions_path_hardcoded)
# generate the sample column from the onset time in seconds
df_predictions_rnn_onsetdetector["onset_sample"] =  (df_predictions_rnn_onsetdetector["onset_time"].astype("float")*sampling_rate).astype("int")
# Rewrite the audio path so it matches the annotation paths; a smarter way would be to compare
# only file names instead of full paths --> we will do it later.
# regex=False makes both replacements literal: otherwise the "." in "_Drum.wav" is treated as a
# regex wildcard on older pandas, which also emits a FutureWarning about the default changing.
df_predictions_rnn_onsetdetector["audio_path"] = (
    df_predictions_rnn_onsetdetector["audio_path"]
    .str.replace("_Drum.wav", "_class.txt", regex=False)
    .str.replace("/audio/drum_only/", "/annotations/class/", regex=False)
)
df_predictions_rnn_onsetdetector.head()

  import sys


Unnamed: 0,onset_time,audio_path,onset_sample
0,0.02,/content/drive/My Drive/Maestria DM y KDD/Espe...,441
1,0.29,/content/drive/My Drive/Maestria DM y KDD/Espe...,6394
2,0.57,/content/drive/My Drive/Maestria DM y KDD/Espe...,12568
3,0.85,/content/drive/My Drive/Maestria DM y KDD/Espe...,18742
4,1.0,/content/drive/My Drive/Maestria DM y KDD/Espe...,22050


# Now generate the drum-type predictions for all onsets detected in df_predictions_rnn_onsetdetector

In [11]:
# Run the per-song pipeline over every test annotation and stack the results into two big tables
annotation_list, predicted_list = [], []

total_files2process = len(annotations_test_filepaths)
counter = 0
for annotation_path in annotations_test_filepaths:
  progress = round(counter/total_files2process,2)
  print("Proportion of processed:",progress)

  # onsets that our RNN detector found for this song
  is_current_song = df_predictions_rnn_onsetdetector["audio_path"] == annotation_path
  predetected_onset_samples = df_predictions_rnn_onsetdetector.loc[is_current_song, "onset_sample"].tolist()

  # classify those detected onsets instead of the annotated ones
  df_annotation, df_predicted = pipeline_generate_datasets(
      annotation_path,
      onsets_annotation_list = predetected_onset_samples,
  )
  annotation_list.append(df_annotation)
  predicted_list.append(df_predicted)
  counter = counter + 1

# one dataframe per side, built from the per-song dataframes
df_annotation_all = pd.concat(annotation_list)
df_predicted_all = pd.concat(predicted_list)

Proportion of processed: 0.0
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.09
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.18
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.27
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.36
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.45
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.55
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.64
[INFO] Since onsets_annotation_list is NOT None, this is a Detection+Recognition task
Proportion of processed: 0.73
[INFO] Since onsets_annotation_list is NOT 

# Compute metrics (F1,recall, precision) for each drumtype

In [12]:
# Matching tolerance, in seconds, between an annotated onset and a predicted one.
# The criterion comes from onset detection, but several drum-type models are applied here:
# e.g. with HH at second 0.02 and SD at 0.03, the time window lets us recognize HH at 0.03
# and SD at 0.02 and still be correct, since both belong to the same STFT / sound.
# Hence the criterion does not need to change much (0.01 is the minimum in recognition).
seconds_criterion = 0.03
# same rate for all of our songs (see the value returned by librosa.load)
sampling_rate = 44100 // 2
samples_criterion = int(seconds_criterion*sampling_rate)

# columns needed for the metrics: onset position in samples and the drum type
df_predicted_all["onset_sample"] = (df_predicted_all["onset_time"]*sampling_rate).astype("int")
df_predicted_all["drum_type"] = df_predicted_all["predicted_drumtype"]

# all drumtypes
drum_types_list = ["KD","SD","HH","CY","OT","TT"]

# one metrics dict per drumtype, tagged with the drumtype it belongs to
metrics_dict_list = []
for drum_type in drum_types_list:
  compute_metrics = eval_utils.ComputeMetrics(
      true_labels = df_annotation_all,
      predicted_labels = df_predicted_all,
      samples_criterion = samples_criterion,
      filter_drumtype = drum_type,
  )
  metrics_dict = compute_metrics()
  metrics_dict["drum_type"] = drum_type
  metrics_dict_list.append(metrics_dict)

  self.precision = np.sum(total_matches_precision)/self.precision_denominator


# Show metrics 

In [13]:
# Display the list with one metrics dict per drum type built in the previous cell.
metrics_dict_list

[{'recall': 0.9635535307517085,
  'precision': 0.9724137931034482,
  'f1_score': 0.9679633867276888,
  'drum_type': 'KD'},
 {'recall': 0.9022222222222223,
  'precision': 0.9115442278860569,
  'f1_score': 0.9068592694369233,
  'drum_type': 'SD'},
 {'recall': 0.7268339768339769,
  'precision': 0.7917981072555205,
  'f1_score': 0.7579265223955711,
  'drum_type': 'HH'},
 {'recall': 0.4421768707482993,
  'precision': 0.7168141592920354,
  'f1_score': 0.5469561603989196,
  'drum_type': 'CY'},
 {'recall': 0.0, 'precision': nan, 'f1_score': nan, 'drum_type': 'OT'},
 {'recall': 0.0, 'precision': nan, 'f1_score': nan, 'drum_type': 'TT'}]