In [1]:
import mido
import pandas as pd
import numpy as np
import os
import pickle

In [None]:
# Base folder holding this notebook's music data
# (the path looks like a Colab Google Drive mount -- adjust if running elsewhere).
working_dir = "/content/drive/My Drive/audio-processing-notebook/music-files/generating"

# Sub-folder of working_dir that contains the source MIDI files.
midi_dir = "midi"

# Entry names (not full paths) found in the MIDI folder; the processing loop
# joins them back onto working_dir/midi_dir when loading.
midi_source_dir = os.path.join(working_dir, midi_dir)
midi_files = os.listdir(midi_source_dir)

In [None]:
# Convert every MIDI file into pickled training segments.
#
# Per file: read note events from the first track, cut the song at unusually
# long rests, encode pitches as intervals relative to the previous note, scale
# and cap durations, then pickle each segment as a list of (note, duration)
# tuples of plain Python ints.

# --- Encoding parameters (loop-invariant, so they are defined once) ---
min_len = 50               # minimum number of events between two accepted cut points
long_rest_percentile = 97  # rests at/above this percentile of rest durations split the song
duration_scale = 10        # raw MIDI ticks per encoded duration unit
max_duration = 100         # longest duration (in scaled units) a single event may carry
# +1 for rest; +2 for <start> and <end> tag
relative_distance_vector_len = 61 + 1 + 2
# -3 for the three positions that aren't for notes (rest, <start>, <end>)
relative_distance_zero_pos = (relative_distance_vector_len - 3) // 2 + 1

# Make sure the output folder exists before the first pickle is written.
dataset_path = os.path.join(working_dir, 'csv')
os.makedirs(dataset_path, exist_ok=True)

for midi_file in midi_files:
  midi_path = os.path.join(working_dir, midi_dir, midi_file)

  # Load midi
  print("loading", midi_file)
  midi_song = mido.MidiFile(midi_path)

  # Convert midi to dataframe.
  # Only the first track is read. The logic assumes a monophonic track where
  # note_on / note_off alternate: the delta time carried by a note_on is the
  # silence before the note (stored as a rest, pitch -1) and the delta time
  # carried by a note_off is the length of the note that just ended.
  print("converting to dataframe")
  events = list()
  for msg in midi_song.tracks[0]:
    if msg.type == 'note_on':
      events.append((-1, msg.time))
    elif msg.type == 'note_off':
      events.append((msg.note, msg.time))
  # Drop the silence that precedes the very first note.
  if events and events[0][0] == -1:
    events = events[1:]
  if not events:
    # Nothing usable (e.g. the first track only holds meta messages).
    print("no note events found in the first track, skipping\n")
    continue
  mdf = pd.DataFrame(events, columns=['note', 'duration'])

  # Get long rests index
  print("getting long rests index")
  rests = mdf[mdf['note'] < 0]
  # MIDI pitch 0 is a valid note, hence >= 0. Not used below; kept so the
  # notes-only view stays available for inspection after the cell has run.
  notes = mdf[mdf['note'] >= 0]
  if rests.empty:
    # No rests means nothing to cut on (and a percentile of nothing is undefined).
    long_rest_th = None
    segment_idx = rests.index
  else:
    long_rest_th = np.percentile(rests['duration'], long_rest_percentile)
    segment_idx = rests[rests['duration'] >= long_rest_th].index

  # Segment dataframe by long rests.
  # mdf has a default 0..n-1 index, so index labels double as row positions.
  print("segmenting dataframe")
  segments = list()
  prev_idx = 0
  for i in segment_idx:
    # Ignore cut points that would produce a too-short segment.
    if np.abs(i - prev_idx) < min_len:
      continue
    # The long rest itself (row i) is excluded from both neighbouring segments.
    segments.append(mdf.iloc[prev_idx:i].reset_index(drop=True))
    prev_idx = i + 1
  tail_segment = mdf.iloc[prev_idx:].reset_index(drop=True)
  if not tail_segment.empty:
    # The tail may be shorter than min_len; it is kept, but never written empty.
    segments.append(tail_segment)

  # Reformat notes: pitch -> interval to the previous note in the segment,
  # shifted by relative_distance_zero_pos so that "no change" sits mid-range;
  # rests -> 0; durations -> scaled units.
  # NOTE(review): intervals beyond +/-30 semitones are not clipped, so they
  # would fall outside 1..61 (e.g. -31 maps to 0, the rest symbol) -- confirm
  # whether the data can contain such leaps.
  print("reformating notes and durations")
  for segment in segments:
    seg_notes = segment[segment['note'] >= 0]
    seg_rests = segment[segment['note'] < 0]
    seg_note_diff = seg_notes['note'].diff().fillna(0).astype(int) + relative_distance_zero_pos
    segment.loc[seg_notes.index, 'note'] = seg_note_diff
    segment.loc[seg_rests.index, 'note'] = 0
    segment.loc[:, 'duration'] = (segment['duration'] / duration_scale).astype(int)

  # Break down long notes.
  # Durations are stored as (units - 1), so the valid range is 0..max_duration-1.
  # NOTE(review): every chunk of a split note repeats the same interval code;
  # confirm the consumer treats repeated chunks as a continuation.
  print("breaking down notes")
  new_segments = list()
  for segment in segments:
    long_duration_brokens = list()
    # tolist() yields plain Python ints, keeping NumPy scalars out of the pickles.
    for note, duration in zip(segment['note'].tolist(), segment['duration'].tolist()):
      # Events shorter than one unit are rounded up to one unit instead of
      # being written with an out-of-range duration of -1.
      duration = max(duration, 1)
      full_chunks, remainder = divmod(duration, max_duration)
      long_duration_brokens += [(note, max_duration - 1)] * full_chunks
      # An exact multiple of max_duration has no leftover chunk to emit.
      if remainder:
        long_duration_brokens.append((note, remainder - 1))
    new_segments.append(long_duration_brokens)

  # Pickle segments: one file per segment, named <midi name>.partNN
  print("pickling...")
  for i, v in enumerate(new_segments):
    dataset_fname = midi_file.replace('.', '_') + '.part{:02d}'.format(i)
    dataset_fpath = os.path.join(dataset_path, dataset_fname)
    with open(dataset_fpath, 'wb') as fpickle:
      pickle.dump(v, fpickle)

  print("Done!\n")