In [2]:
import mido
import pandas as pd
import numpy as np
import os
import pickle

In [3]:
working_dir="../dataset"
midi_dir = "midi"
midi_files = os.listdir(os.path.join(working_dir, midi_dir))

In [3]:
for midi_file in midi_files:
  midi_file="Backstreet Boys - I Want It That Way.mid"
  midi_path = os.path.join(working_dir, midi_dir, midi_file)

  # Load midi
  print("loading", midi_file)
  midi_song=mido.MidiFile(midi_path)

  # Convert midi to dataframe
  print("converting to dataframe")
  notes=list()
  for msg in midi_song.tracks[0]:
    if msg.type=='note_on' and msg.time > 0: # this is rests
      notes += [(-1, msg.time)]
    elif msg.type=='note_off':
      notes += [(msg.note, msg.time)]
  if notes[0][0] == -1:
    notes = notes[1:]
  mdf = pd.DataFrame(notes, columns=['note', 'duration'])
  mdf['duration'] = (np.round(mdf['duration'] / 10) * 10).astype(int)

  # Get long rests index
  print("getting long rests index")
  rests = mdf[mdf['note'] < 0]
  notes = mdf[mdf['note'] > 0]
  long_rest_th = np.percentile(rests['duration'], 97) # using 97th percentile as minimum threshold
  segment_idx = rests[rests['duration'] >= long_rest_th].index

  # Segment dataframe by long rests
  print("segmenting dataframe")
  segments = list()
  prev_idx=0
  min_len = 80 # minimum number of notes
  for i in segment_idx:
    if np.abs(i-prev_idx) < min_len:
      continue
    segments += [mdf[prev_idx: i].reset_index(drop=True)]
    prev_idx = i + 1
  segments += [mdf[prev_idx:].reset_index(drop=True)]

  # Reformat notes
  print("reformating notes and durations")
  # +1 for rest; +2 for <start> and <end> tag
  relative_distance_vector_len = 61 + 1 + 2
  # -3 for two positions that aren't for notes
  relative_distance_zero_pos = (relative_distance_vector_len - 3) // 2 + 1 

  for segment in segments:
    seg_notes = segment[segment['note'] > 0]
    seg_rests = segment[segment['note'] < 0]
    seg_note_diff = seg_notes['note'].diff().fillna(0).astype(int) + relative_distance_zero_pos
    segment.loc[seg_notes.index, 'note'] = seg_note_diff
    segment.loc[seg_rests.index, 'note'] = 0
    segment.loc[:, 'duration'] = (segment['duration'] / 10).astype(int)

  # Break down long notes
  print("breaking down notes")
  new_segments = list()
  for segment in segments:
    long_duration_brokens = list()
    for i, (note, duration) in segment.iterrows():
      if duration > 100:
        new_dur_multipliers, new_dur_rem = divmod(duration, 100)
        long_duration_brokens += [(note, 99)] * new_dur_multipliers + [(note, new_dur_rem)]
        continue
      long_duration_brokens += [(note, duration - 1)]
    new_segments += [long_duration_brokens]

  break
  # Pickle segments
  print("pickling...")
  dataset_path = os.path.join(working_dir, 'csv')
  for i, v in enumerate(new_segments):
    dataset_fname = midi_file.replace('.', '_') + '.part{:02d}'.format(i)
    dataset_fpath = os.path.join(dataset_path, dataset_fname)
    with open(dataset_fpath, 'wb') as fpickle:
      pickle.dump(v, fpickle)

  print("Done!\n")

loading Backstreet Boys - I Want It That Way.mid
converting to dataframe
getting long rests index
segmenting dataframe
reformating notes and durations
breaking down notes


In [16]:
for midi_file in midi_files:
  midi_path = os.path.join(working_dir, midi_dir, midi_file)

  # Load midi
  print("loading", midi_file)
  midi_song=mido.MidiFile(midi_path)

  # Convert midi to dataframe
  print("converting to dataframe")
  notes=list()
  for msg in midi_song.tracks[0]:
    if msg.type=='note_on' and msg.time > 0: # this is rests
      notes += [(-1, msg.time)]
    elif msg.type=='note_off':
      notes += [(msg.note, msg.time)]
  if notes[0][0] == -1:
    notes = notes[1:]
  mdf = pd.DataFrame(notes, columns=['note', 'duration'])
  mdf['duration'] = (np.round(mdf['duration'] / 10)).astype(int)
    
  seg_notes = mdf[mdf['note'] > 0]
  mdf['note_str'] = 'R'
  seg_note_diff = seg_notes['note'].diff().fillna(0).astype(int).astype(str)
  mdf.loc[seg_notes.index, 'note_str'] = seg_note_diff
    
  assert (mdf.duration >= 0).all()
  # Pickle segments
  print("pickling...")
  dataset_path = os.path.join(working_dir, 'csv-2')
  dataset_fname = midi_file.replace('.', '_')
  dataset_fpath = os.path.join(dataset_path, dataset_fname)
  with open(dataset_fpath, 'wb') as fpickle:
      pickle.dump([(n, d) for n, d in zip(mdf.note_str.to_list(), mdf.duration.to_list())], fpickle)

  print("Done!\n")

loading taylor_swift-you_belong_with_me.mid
converting to dataframe
pickling...
Done!

loading westlife-flying_without_wings.mid
converting to dataframe
pickling...
Done!

loading taylor_swift-begin_again.mid
converting to dataframe
pickling...
Done!

loading fall_out_boy-beat_it.mid
converting to dataframe
pickling...
Done!

loading taylor_swift-shake_it_off.mid
converting to dataframe
pickling...
Done!

loading taylor_swift-safe_and_sound_feat_the_civil_wars.mid
converting to dataframe
pickling...
Done!

loading bruno_mars-locked_out_of_heaven.mid
converting to dataframe
pickling...
Done!

loading coldplay-trouble.mid
converting to dataframe
pickling...
Done!

loading demi_lovato-skyscraper.mid
converting to dataframe
pickling...
Done!

loading alicia_keys-no_one.mid
converting to dataframe
pickling...
Done!

loading Backstreet Boys - I Want It That Way.mid
converting to dataframe
pickling...
Done!

loading taylor_swift-untouchable.mid
converting to dataframe
pickling...
Done!

loadi

In [17]:
assert (mdf.duration >= 0).all()

In [18]:
mdf

Unnamed: 0,note,duration,note_str
0,76,38,0
1,-1,2,R
2,76,38,0
3,-1,2,R
4,76,18,0
...,...,...,...
778,74,18,-2
779,-1,2,R
780,72,38,-2
781,-1,2,R


In [39]:
midi_path = os.path.join(working_dir, midi_dir, midi_file)

# Load midi
print("loading", midi_file)
midi_song=mido.MidiFile(midi_path)

# Convert midi to dataframe
print("converting to dataframe")
notes=list()
for msg in midi_song.tracks[0]:
    if msg.type=='note_on': # this is rests
        notes += [(-1, msg.time)]
    elif msg.type=='note_off':
        notes += [(msg.note, msg.time)]
if notes[0][0] == -1:
    notes = notes[1:]
mdf = pd.DataFrame(notes, columns=['note', 'duration'])

loading bruno_mars-when_i_was_your_man.mid
converting to dataframe


In [40]:
print("getting long rests index")
rests = mdf[mdf['note'] < 0]
notes = mdf[mdf['note'] > 0]
long_rest_th = np.percentile(rests['duration'], 97) # using 97th percentile as minimum threshold
segment_idx = rests[rests['duration'] >= long_rest_th].index
segment_idx, long_rest_th

getting long rests index


(Int64Index([25, 51, 125, 197, 269, 295, 321, 401, 471, 543, 649, 759], dtype='int64'),
 486.6000000000023)

In [42]:
mdf[:125]

Unnamed: 0,note,duration
0,76,385
1,-1,20
2,76,385
3,-1,20
4,76,183
...,...,...
120,81,384
121,-1,21
122,79,791
123,-1,20
