In [1]:
drive = False  # False for Local
if drive:
    !pip install pretty_midi

In [2]:
import json
import math
import os
from bisect import bisect

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pretty_midi
import torch as torch
from mido import Message, MidiFile, MidiTrack
from tqdm import tqdm

In [3]:
# Choose the folder that contains the `maestro-v3.0.0/` dataset directory.
if drive:
    # Import under an alias so the boolean `drive` flag is not replaced by the module;
    # otherwise re-running this cell would always take the Colab branch.
    from google.colab import drive as colab_drive
    colab_drive.mount('/content/drive')
    # TODO: DRIVE PATH HERE - placeholder pointing at the top of the mounted Drive;
    # change it to the Drive folder that holds the dataset (keep the trailing slash).
    my_drive_path = "/content/drive/MyDrive/"
else:  # local
    my_drive_path = "./"

In [4]:
# Read the dataset's metadata table; later cells look up "midi_filename" and "split" per row.
metadata_path = my_drive_path + 'maestro-v3.0.0/maestro-v3.0.0.csv'
my_data = pd.read_csv(metadata_path)

In [5]:
# Spot-check one metadata row (position 45) to see which columns are available.
my_data.iloc[45]

canonical_composer                                       Claude Debussy
canonical_title                          "Voiles" from Preludes, Book I
split                                                             train
year                                                               2008
midi_filename         2008/MIDI-Unprocessed_07_R3_2008_01-05_ORIG_MI...
audio_filename        2008/MIDI-Unprocessed_07_R3_2008_01-05_ORIG_MI...
duration                                                     212.295461
Name: 45, dtype: object

## Initial Processing of Pieces
- convert to piano roll
- get num beats; tempo
- remove silence at beginning/end

In [None]:
path_beginning = my_drive_path + "maestro-v3.0.0/"
pieces = []  # one metadata Series per piece, extended with "tempo", "beats" and "roll"


def _trim_silence(roll):
    """Return `roll` without the all-zero rows at its start and end.

    `roll` is indexed (time step, pitch). Silent rows in the interior are kept.
    If no row contains a non-zero value, an empty (0-row) slice is returned
    instead of raising.
    """
    sounding_rows = np.where(roll.any(axis=1))[0]
    if sounding_rows.size == 0:
        return roll[:0]
    return roll[sounding_rows[0]:sounding_rows[-1] + 1]


# process every piece listed in the metadata table
for i in tqdm(range(my_data.shape[0])):  # 1276 pieces
    # look the metadata row up once and load the piece into pretty_midi form
    piece_info = my_data.iloc[i]
    path = path_beginning + piece_info["midi_filename"]
    midi_data = pretty_midi.PrettyMIDI(path)

    # estimates the tempo of the piece
    tempo = midi_data.estimate_tempo()
    print(" estimated tempo:", tempo, end="")

    # gets piano roll sampled at the estimated tempo (a different rate for each piece);
    # tempo/60 is passed as the sampling frequency, and the transpose makes rows time steps
    roll = midi_data.get_piano_roll(tempo/60).T
    print("\tbeats:", roll.shape[0], end=" -> ")  # before silence removed or any other cut/padding

    # removes silence at beginning and end of piece
    roll = _trim_silence(roll)

    # converts roll to binary matrix (only 0s and 1s)
    roll = np.where(roll > 0, 1, 0)

    # recalculate num beats after removing silence etc.
    beats = roll.shape[0]
    print(beats)  # ends print line from above (pre-processing beats)

    # a roll with no sounding rows has nothing to bin; skip it rather than crash
    if beats == 0:
        print(" skipping silent piece:", path)
        continue

    # save beats, tempo and roll alongside the original metadata for this piece
    this_piece = piece_info.copy()
    this_piece["tempo"] = tempo
    this_piece["beats"] = beats
    this_piece["roll"] = roll

    pieces.append(this_piece)

## Add to bins

In [43]:
class Bin:
    """One length bucket: every roll added here is fit to `bin_length` rows.

    Rolls are arrays whose first axis is time (beats). A roll at least as long as
    the bin is truncated from the end. A shorter roll is lengthened either by
    repeating itself (`add_piece`) or by borrowing rows from neighbouring
    sub-pieces (`add_sub_piece`).
    """

    def __init__(self, length, lower_bound, upper_bound, verbose=False):
        """
        Args:
            length: number of beats every roll in this bin is fit to.
            lower_bound: inclusive; shortest piece length assigned to this bin.
            upper_bound: exclusive; longest piece length assigned to this bin.
            verbose: print a line for every roll added.

        The bounds are stored for reference only; no method of this class reads them.
        """
        self.verbose = verbose
        self.bin_length = length  # how long the pieces are in this bin
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.pieces = []  # fitted rolls, only filled when save_to_bins=True
        self.bin_count = 0  # number of rolls added, whether or not they were saved

    def _begin_add(self, length):
        """Shared bookkeeping for both add methods: optional log line plus the counter."""
        if self.verbose:
            print(f"adding piece of length {length}", end=" ")
        self.bin_count += 1

    def add_piece(self, roll, length, save_to_bins=False):
        """Fit a whole piece to this bin and return the fitted roll.

        Args:
            roll: array with time on the first axis.
            length: number of beats in `roll`.
            save_to_bins: also keep the fitted roll in `self.pieces`.

        Returns:
            The roll truncated to `bin_length`, or repeated until it reaches it.
        """
        self._begin_add(length)

        if length >= self.bin_length:
            fit_roll = self.truncate(roll, length)
            if self.verbose:
                print(f"truncated to {fit_roll.shape[0]}")
        else:
            fit_roll = self.add_beats_repeat(roll, length)
            if self.verbose:
                print(f"repeated to {fit_roll.shape[0]}")

        if save_to_bins:
            self.pieces.append(fit_roll)

        return fit_roll

    def add_sub_piece(self, roll, length, roll_before, roll_after, save_to_bins=False):
        """Fit one sub-piece of a split piece to this bin and return the fitted roll.

        Args:
            roll: the sub-piece, time on the first axis.
            length: number of beats in `roll`.
            roll_before: the preceding sub-piece, or None for the first one.
            roll_after: the following sub-piece, or None for the last one.
            save_to_bins: also keep the fitted roll in `self.pieces`.

        Returns:
            The roll truncated to `bin_length`, or extended with neighbouring beats.
        """
        self._begin_add(length)

        if length >= self.bin_length:
            fit_roll = self.truncate(roll, length)
            if self.verbose:
                print(f"truncated to {fit_roll.shape[0]}")
        else:
            fit_roll = self.add_beats_from_neighbor(roll, length, roll_before, roll_after)
            if self.verbose:
                print(f"extended to {fit_roll.shape[0]}")

        if save_to_bins:
            self.pieces.append(fit_roll)

        return fit_roll

    def get_bin_length(self):
        """Return the number of beats rolls in this bin are fit to."""
        return self.bin_length

    def get_bin_count(self):
        """Return how many rolls have been added to this bin."""
        return self.bin_count

    def truncate(self, roll, length):
        """Drop the last `length - bin_length` rows of `roll`; no-op if that is not positive."""
        beats_to_remove = length - self.bin_length
        if beats_to_remove > 0:
            roll = roll[:-beats_to_remove]
        return roll

    def add_beats_repeat(self, roll, length):
        """Lengthen `roll` to `bin_length` rows by looping it from the start.

        The roll is repeated as many times as needed, so a roll shorter than half
        the bin length is still filled completely. An empty roll is returned
        unchanged because there is nothing to repeat.
        """
        beats_to_add = self.bin_length - length
        rows = roll.shape[0]
        if beats_to_add <= 0 or rows == 0:
            return roll

        copies = -(-self.bin_length // rows)  # ceiling division
        copies = max(copies, 2)  # at least one repeat, as before
        return np.vstack([roll] * copies)[:self.bin_length]

    def add_beats_from_neighbor(self, roll, length, roll_before=None, roll_after=None):
        """For a sub-piece that must expand to meet a bin length, borrow beats from neighboring sub-pieces.

        The first sub-piece borrows from the start of `roll_after`, the last from
        the end of `roll_before`. A middle sub-piece takes half of the missing
        beats from each side (the extra beat, if any, from `roll_after`). With no
        neighbour at all an error line is printed and `roll` is returned unchanged.
        A neighbour with fewer rows than requested contributes only what it has,
        so the result can then be shorter than `bin_length`.
        """
        beats_to_add = self.bin_length - length

        # Nothing to borrow. Without this guard a slice such as roll_before[-0:]
        # would select the whole neighbour instead of zero rows.
        if beats_to_add <= 0:
            return roll

        if self.verbose:
            print(f"added {beats_to_add} beats", end=" ")

        if (roll_before is None) and (roll_after is None):
            print("error: cannot use method on one sub-piece; must be at least two sub-pieces")
        elif roll_before is None:  # first sub-piece
            roll = np.vstack((roll, roll_after[:beats_to_add]))
        elif roll_after is None:  # last sub-piece
            roll = np.vstack((roll_before[-beats_to_add:], roll))
        elif beats_to_add == 1:  # middle piece only adds 1 beat
            # half of 1 rounds down to 0 beats from before, so take it from after
            roll = np.vstack((roll, roll_after[:beats_to_add]))
        else:  # a sub-piece in the middle
            beats_from_before = beats_to_add // 2
            beats_from_after = beats_to_add - beats_from_before
            roll = np.vstack((roll_before[-beats_from_before:], roll, roll_after[:beats_from_after]))
        return roll
        
class Bin_Holder:
    """Create and contain all the bins, and route each roll to the bin matching its length.

    Bin lengths follow f(x) = a*e^(b*x) for x = 1..num_bins, with the constants
    chosen so that f(1) is `min_size` and f(num_bins) is `max_size` before
    truncation to an integer. Bin x accepts lengths in [f(x-0.5), f(x+0.5)).
    """

    def __init__(self, num_bins, min_size, max_size, verbose=True):
        """
        Args:
            num_bins: number of bins to create (at least 2).
            min_size: target length of the smallest bin.
            max_size: target length of the largest bin; also the chunk size used
                when an over-long piece is split.
            verbose: print bin bounds on creation and a line per routed piece.

        Raises:
            ValueError: if `num_bins` < 2 or a size is not positive (the formula
                divides by num_bins-1 and takes log(max_size/min_size)).
        """
        if num_bins < 2:
            raise ValueError("num_bins must be at least 2")
        if min_size <= 0 or max_size <= 0:
            raise ValueError("min_size and max_size must be positive")

        self.verbose = verbose
        self.num_bins = num_bins
        self.bins = []
        self.max_size = max_size
        self.min_size = min_size

        # shorten names for conciseness
        pl = min_size
        pu = max_size

        # calculate exponential formula constants
        # formula is f(x) = a*e^(b*x)
        self.a = pl * (pl / pu)**(1/(num_bins-1))
        self.b = (1/(num_bins-1)) * math.log(pu/pl)

        # create bins
        for i in range(1, num_bins+1):
            lower_bound, bin_length, upper_bound = self.get_bin_bounds(i)
            self.bins.append(Bin(bin_length, lower_bound, upper_bound))  # add verbose=verbose for bins to inherit verbose

        # for sorting purposes, create list of bin bounds
        self.bin_bounds = [self.apply_exp(x+0.5) for x in range(num_bins+1)]  # start at 0 not 1, to include lowest bound
        self.bin_lengths = [self.apply_exp(x) for x in range(1, num_bins+1)]
        self.max_bin_bound = self.bin_bounds[-1]  # largest piece that fits into a bin without splitting into sub-pieces

    def get_bin_bounds(self, bin_num):
        """Get (lower_bound, bin_length, upper_bound) for a bin using the exponential formula."""
        lower_bound = self.apply_exp(bin_num - 0.5)
        bin_length = self.apply_exp(bin_num)
        upper_bound = self.apply_exp(bin_num + 0.5)

        if self.verbose:
            print(f"current bin bounds for bin {bin_num}: [{lower_bound}, {bin_length}, {upper_bound})")

        return lower_bound, bin_length, upper_bound

    def print_bin_info(self):
        """Print the configuration plus the computed bounds and lengths."""
        print("num_bins:", self.num_bins)
        print("max_size:", self.max_size, "\tmin_size:", self.min_size)
        print("bin_bounds:", self.bin_bounds)
        print("bin_lengths:", self.bin_lengths)

    def print_bin_counts(self):
        """Print how many rolls each bin has received, in bin order."""
        print([this_bin.get_bin_count() for this_bin in self.bins])

    def print_bin_lengths(self):
        """Print the fitted length of each bin, in bin order."""
        print([this_bin.get_bin_length() for this_bin in self.bins])

    def apply_exp(self, bin_num):
        """Evaluate a*e^(b*bin_num), truncated to an int."""
        return int(self.a*math.exp(self.b * bin_num))

    def get_bin_ind(self, length):
        """Return the index of the bin whose bounds contain `length`.

        Returns -1 when `length` is below the smallest bound and `num_bins` when
        it is at or above the largest bound; neither is a valid bin index.
        """
        return bisect(self.bin_bounds, length) - 1

    def get_bin_length(self, length):
        """Return the fitted length of the bin that `length` falls into."""
        bin_ind = self.get_bin_ind(length)
        return self.bins[bin_ind].get_bin_length()

    def get_bin_ind_and_length(self, length):
        """Returns both ind and length at once."""
        bin_ind = self.get_bin_ind(length)
        bin_length = self.bins[bin_ind].get_bin_length()
        return bin_ind, bin_length

    def add_piece(self, roll, length):
        """Fit `roll` to its bin, splitting it first if it is too long for any bin.

        Args:
            roll: array with time on the first axis.
            length: number of beats in `roll`.

        Returns:
            List of fitted rolls: one for a piece that fits a bin, several for a
            piece that was split, and empty when the piece is shorter than the
            smallest bin's lower bound.
        """
        fit_rolls = []

        # if larger than can fit in largest bin (len>=max_bin_bound), break into sub-pieces
        # note: max_bin_bound is exclusive
        if length >= self.max_bin_bound:
            # how many sub-pieces to split this roll into
            n_splits = math.ceil(length / self.max_size)
            sub_pieces = np.array_split(roll, n_splits, axis=0)  # shape (beats,128)

            if self.verbose:
                print(f"piece size {length} is too large; split into {n_splits} sub-pieces of sizes {[r.shape[0] for r in sub_pieces]}")

            last = len(sub_pieces) - 1
            for i, sub_piece in enumerate(sub_pieces):
                sub_length = sub_piece.shape[0]

                # neighbours, in case this sub-piece needs expanding to fit in bin
                roll_before = sub_pieces[i-1] if i > 0 else None
                roll_after = sub_pieces[i+1] if i < last else None

                # a sub-piece at or above the top bound is truncated into the largest bin
                bin_ind = min(self.get_bin_ind(sub_length), len(self.bins) - 1)
                if self.verbose:
                    print("bin:", bin_ind, "length:", sub_length)

                # too small for any bin: drop it, as for whole pieces; indexing
                # with -1 would silently place it in the largest bin
                if bin_ind < 0:
                    continue

                fit_rolls.append(self.bins[bin_ind].add_sub_piece(sub_piece, sub_length, roll_before, roll_after))

        # fits in bin w/o breaking into pieces
        else:
            bin_ind = self.get_bin_ind(length)
            if self.verbose:
                print("bin", bin_ind, "length", length)

            if bin_ind >= 0:  # large enough for a bin
                fit_rolls.append(self.bins[bin_ind].add_piece(roll, length))

        return fit_rolls
                

In [44]:
def add_roll_to_split(roll, tempo, beats, split):
    """Bundle a roll with its tempo and beat count and append it to the list for its split.

    The roll is converted to a torch tensor and stored with `tempo` and `beats`
    in a 3-element object array. "test" and "train" go to the lists of the same
    name; any other split value goes to `validation`.
    """
    roll_data = np.array([torch.tensor(roll), tempo, beats], dtype="object")

    if split == "train":
        destination = train
    elif split == "test":
        destination = test
    else:
        destination = validation
    destination.append(roll_data)


# clear lists each time data are added to rolls
train, test, validation = [], [], []

In [45]:
# set constants
num_bins = 16
max_length = 700
shortest_rank = 10  # the smallest bin is sized from the Nth-shortest piece
# NOTE(review): presumably the rank skips a few unusually short outliers - confirm.

# find shortest piece length
all_beat_lengths = sorted(piece["beats"] for piece in pieces)  # note: pieces is NOT sorted by length
if not all_beat_lengths:
    raise ValueError("pieces is empty; run the piece-processing cell first")

# with fewer than `shortest_rank` pieces (e.g. a subset run) fall back to the longest available
min_length = all_beat_lengths[min(shortest_rank, len(all_beat_lengths)) - 1]

In [None]:
# build the full set of bins, spanning min_length to max_length, then show their layout
all_bins = Bin_Holder(
    num_bins=num_bins,
    min_size=min_length,
    max_size=max_length,
    verbose=False,
)
all_bins.print_bin_info()

In [47]:
# Start from empty split lists so that re-running this cell does not append
# every roll a second time. The lists are cleared in place, so
# add_roll_to_split keeps appending to the same objects.
# Note: the per-bin counters inside all_bins still accumulate across re-runs;
# re-create all_bins to reset them.
for split_list in (train, test, validation):
    split_list.clear()

for piece in pieces:
    roll = piece["roll"]
    tempo = piece["tempo"]
    split = piece["split"]
    beats = piece["beats"]

    # fit roll to a bin
    # may return multiple 'fit_roll's - from pieces larger than max_size,
    # or none at all for pieces too short for the smallest bin
    fit_rolls = all_bins.add_piece(roll, beats)

    # save each fit_roll to split, recording its length after fitting
    for fit_roll in fit_rolls:
        add_roll_to_split(fit_roll, tempo, fit_roll.shape[0], split)

In [None]:
# show the fitted length of each bin, in bin order
all_bins.print_bin_lengths()

In [None]:
# show how many rolls were routed to each bin, in bin order
all_bins.print_bin_counts()

In [50]:
# number of rows in the roll tensor (element 0) of every train entry
train_lengths_all = [entry[0].shape[0] for entry in train]

In [None]:
# tally lengths of train pieces in a single pass
# (calling list.count once per distinct length rescans the whole list each time)
length_counts = {}
for length in train_lengths_all:
    length_counts[length] = length_counts.get(length, 0) + 1

train_lengths = set(length_counts)

# (length, count) pairs ordered by length; lengths are unique, so tuple order equals length order
length_tallies = sorted(length_counts.items())
print(length_tallies)

In [None]:
# print lengths, then counts, in matching order
lengths = [pair[0] for pair in length_tallies]
counts = [pair[1] for pair in length_tallies]


def _print_column(header, values):
    """Print a header line followed by one value per line."""
    print(header)
    for value in values:
        print(value)


print("FOR TRAIN ONLY")
_print_column("lengths:", lengths)
_print_column("\ncounts:", counts)

In [None]:
# may help estimate run time:
# for each distinct length, length multiplied by how many train rolls have it
len_times_count = [length * count for length, count in length_tallies]
len_times_count

In [None]:
# see how many pieces are in each set
for label, split_list in (("validation:\t", validation), ("test:\t", test), ("train:\t", train)):
    print(label, len(split_list))

In [55]:
# the data does not save until you run this.
# NOTE: torch.save does not write CSV text, despite the ".csv" suffix. The file
# names are kept unchanged so anything that loads these paths keeps working.
output_dir = my_drive_path + "usable_data/"

# create the output folder if needed, so a long processing run is not lost at the save step
os.makedirs(output_dir, exist_ok=True)

torch.save(validation, output_dir + "mar-1-variable_bin_bounds_val.csv")
torch.save(test, output_dir + "mar-1-variable_bin_bounds_test.csv")
torch.save(train, output_dir + "mar-1-variable_bin_bounds_train.csv")