# Using music21 to parse MIDI data

In [1]:
from music21 import *

Let's start with a single piece, Ave Maria (taken from the Bach folder of the MIDI archive).

In [2]:
# Parse the MIDI file; the path is relative to the notebook's working directory.
sBach = converter.parse("MIDI_Archive/Classical_Greatest/Bach/AveMaria.mid")
# sBach.show()  # left disabled: for this to work, MuseScore has to be installed

We can also play it!

In [3]:
# Ask music21 for the 'midi' rendering of the parsed piece so it can be played back.
sBach.show('midi')

# Extract MIDI sequence

To build a network, we want to extract the MIDI sequence of each part. After a piece is parsed, its parts are available as `piece.parts`. For each part `p`, `p.notesAndRests` gives access to the notes (or chords, i.e. notes sounding together) and rests in the sequence, and `secondsMap` provides not only each element but also when it starts and when it ends.

However, some rests are not legitimate (they do not actually appear in the sheet music), and some notes have a duration of 0. We probably also want to keep track of very long rests, since they indicate that a new phrase has begun. Therefore we make a second pass over the piece: rests shorter than a threshold (the shortest note duration) are filtered out, and rests longer than a threshold (the longest note duration) are marked with a separate code.

In [2]:
def _encode_element(element, rest_code=128):
    """Encode one note, chord or rest; return ``(code, is_single_note)``.

    * single note -> ``element.pitch.midi``
    * chord       -> list with the ``midi`` value of every item in ``element.pitches``
    * otherwise   -> ``rest_code``

    Only ``AttributeError`` is treated as "this element is not of that kind";
    any other exception (for example a timeout raised by a signal handler, or
    a keyboard interrupt) is allowed to propagate to the caller.
    """
    try:
        return element.pitch.midi, True
    except AttributeError:
        pass
    try:
        chord_codes = [item.midi for item in element.pitches]
    except AttributeError:
        return rest_code, False
    # NOTE(review): some music21 releases appear to give rests an empty
    # `pitches` tuple instead of no attribute at all -- confirm. Either way an
    # element without any pitch is coded as a rest rather than as an empty chord.
    if not chord_codes:
        return rest_code, False
    return chord_codes, False


def quantify_music(piece):
    """Turn a parsed piece into per-part sequences of MIDI codes and time spans.

    First pass: for every part, record the code of each element of
    ``part.flat.notesAndRests.secondsMap`` together with its
    ``(offsetSeconds, endTimeSeconds)`` pair, and collect the non-zero
    durations of all single notes.

    Second pass: keep every note and chord; drop rests shorter than the
    shortest collected note duration; re-code rests longer than the longest
    collected note duration as 129 (long rest).

    Parameters
    ----------
    piece
        Object providing ``analyze('key')`` (whose result has a ``name``) and
        ``parts``; in this notebook it is the result of ``converter.parse``.

    Returns
    -------
    tuple
        ``(key, midi_dict_prune, time_dict_prune)``: the key name, and two
        ``defaultdict(list)`` objects indexed by part number holding the
        retained codes and their ``(start, end)`` times. A part with no
        retained element has no entry.

    Notes
    -----
    If the piece contains no single note with a non-zero duration, no
    thresholds can be derived; rests are then kept as plain rests (128)
    unless their duration is not positive.
    """
    from collections import defaultdict

    rest_code, long_rest_code = 128, 129

    key = piece.analyze('key').name  # name of the key of the piece (eg. F major, C minor)

    # ---- first pass: encode everything, nothing is removed yet ----
    midi_dict = {}
    time_dict = {}
    duration_notes = []
    for i, p in enumerate(piece.parts):
        part_midi = []
        time = []
        # NOTE(review): newer music21 releases reportedly deprecate `.flat` in
        # favour of `.flatten()` -- confirm against the installed version.
        for n in p.flat.notesAndRests.secondsMap:
            start = n['offsetSeconds']
            end = n['endTimeSeconds']
            code, is_single_note = _encode_element(n['element'], rest_code)
            part_midi.append(code)
            time.append((start, end))
            # Only single notes with a real duration define the thresholds.
            if is_single_note and (end - start) != 0.0:
                duration_notes.append(end - start)
        midi_dict[i] = part_midi
        time_dict[i] = time

    # ---- second pass: identify short rests and long rests ----
    # `default=None` avoids the ValueError min()/max() raise on an empty list.
    min_threshold = min(duration_notes, default=None)
    max_threshold = max(duration_notes, default=None)

    midi_dict_prune = defaultdict(list)
    time_dict_prune = defaultdict(list)
    for p in time_dict:
        for code, (start, end) in zip(midi_dict[p], time_dict[p]):
            if code == rest_code:
                length = end - start
                if min_threshold is None:
                    # No note durations to compare against: only discard
                    # rests that have no positive length.
                    if length <= 0:
                        continue
                elif length < min_threshold:
                    continue  # shorter than any note: not a real rest
                elif length > max_threshold:
                    code = long_rest_code
            midi_dict_prune[p].append(code)
            time_dict_prune[p].append((start, end))
    return key, midi_dict_prune, time_dict_prune

In [5]:
# Parse a second piece (the Don Giovanni overture) for the timing test below.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# sMozart used afterwards presumably came from an earlier run -- confirm.
sMozart = converter.parse("MIDI_Archive/Classical_Greatest/Mozart/K527 Overture ''Don Giovanni''.mid")

KeyboardInterrupt: 

In [6]:
# Measure how long quantify_music takes on the parsed Mozart piece.
# NOTE(review): names assigned inside a %timeit statement are, as far as I
# know, not kept in the notebook namespace -- confirm before relying on
# key/midi_dict/time_dict from this cell.
%timeit key, midi_dict, time_dict = quantify_music(sMozart)

2.05 s ± 319 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Parse a piece from the Beatles folder of the archive as a further example.
sBeatles = converter.parse("MIDI_Archive/Beatles/Beatles_Blackbird.mid")

In [7]:
# Extract the key name plus the per-part code sequences and their (start, end) times.
key, midi_dict, time_dict = quantify_music(sBeatles)

In [8]:
# Number of parts for which at least one element was retained.
len(midi_dict)

6

# Extract the whole corpus

## Get all the midi file name under this directory (including subdirectory)

In [3]:
import os
from glob import glob
import random
# Placeholder; replaced further down by the result of walk_dir(start_dir1).
midis = []
# Root directory that is searched, subdirectories included, for .mid files.
start_dir1 = "../../dataset/midi/"
# Alternative roots inside MIDI_Archive, currently disabled:
#start_dir2 = "MIDI_Archive/Jazz"
#start_dir3 = "MIDI_Archive/Metal_Rock"
#start_dir4 = "MIDI_Archive/american_folk"

def walk_dir(start_dir, pattern="*.mid"):
    """Collect the paths of all files matching ``pattern`` under ``start_dir``.

    Every directory reached by ``os.walk(start_dir)`` (the start directory
    itself and all of its subdirectories) is searched.

    Parameters
    ----------
    start_dir : str
        Directory at which the recursive search starts.
    pattern : str, optional
        Glob pattern applied to the entries of each directory; defaults to
        ``"*.mid"``.

    Returns
    -------
    list of str
        Matching paths, each prefixed with the directory it was found in.
        The order is whatever ``os.walk`` and ``glob`` produce.
    """
    # Local import: the notebook only imports the ``glob`` function itself,
    # and ``escape`` is needed as well.
    from glob import escape, glob

    midis = []
    for dirpath, _, _ in os.walk(start_dir):
        # Escape the directory part so that characters such as "[" or "*" in a
        # folder name are matched literally; only ``pattern`` acts as wildcard.
        midis.extend(glob(os.path.join(escape(dirpath), pattern)))
    return midis

# Collect every .mid path below start_dir1 (subdirectories included).
midis = walk_dir(start_dir1)
# Disabled alternatives: combine several roots, or work on a random sample of files.
#midis = walk_dir(start_dir1) + walk_dir(start_dir2) + walk_dir(start_dir3) + walk_dir(start_dir4)
#midis = random.sample(walk_dir(start_dir1), 10)
# random.sample(walk_dir(start_dir2), 50) + 
# random.sample(walk_dir(start_dir3), 50) + 
# random.sample(walk_dir(start_dir4), 50)

In [4]:
# Number of MIDI file paths collected by walk_dir.
len(midis)

45129

In [None]:
import json
import os
import signal

# Seconds one file may take (parsing + quantifying) before it is given up on.
TIMEOUT_SECONDS = 30
# Number of successfully processed pieces written to each JSON chunk file.
CHUNK_SIZE = 1000

midi_corpus = {}   # pieces of the chunk currently being filled: name -> [key, midi_dict, duration_dict]
count = 0          # pieces in the current chunk
ind = 0            # index of the chunk file that is written next
failed_midi = []   # files whose processing raised an error
big_midi = []      # files that exceeded TIMEOUT_SECONDS


class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a file exceeds its time budget."""


def timeout_handler(signum, frame):
    """SIGALRM handler: interrupt the running work by raising TimeoutException."""
    raise TimeoutException


def _dump_corpus_chunk(corpus, index):
    """Write one corpus chunk to its numbered JSON file and close the file."""
    with open("../../dataset/corpus/midi_corpus_%s.json" % index, 'w') as handle:
        json.dump(corpus, handle, indent=4)


# Change the behavior of SIGALRM (signal.alarm is only available on Unix).
signal.signal(signal.SIGALRM, timeout_handler)

for midi in midis:
    # Start the timer. Once TIMEOUT_SECONDS are over, a SIGALRM signal is sent
    # and timeout_handler raises TimeoutException inside the work below.
    signal.alarm(TIMEOUT_SECONDS)
    try:
        piece = converter.parse(midi)
        key, midi_dict, duration_dict = quantify_music(piece)
    except TimeoutException:
        # Must be handled before the generic handler, otherwise a timeout
        # would be reported as an ordinary failure.
        print("big:", midi)
        big_midi.append(midi)
        continue
    except Exception:
        # Deliberately broad: a broken file must not stop the batch. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt stop the loop.
        print("failed:", midi)
        failed_midi.append(midi)
        continue
    finally:
        # Cancel the pending alarm on every path (success, failure, timeout).
        signal.alarm(0)

    # Use the file name without directory and without its final extension as
    # key, so names containing further dots are not truncated.
    fname = os.path.splitext(os.path.basename(midi))[0]
    midi_corpus[fname] = [key, midi_dict, duration_dict]
    count += 1

    if count == CHUNK_SIZE:
        _dump_corpus_chunk(midi_corpus, ind)
        midi_corpus = {}
        count = 0
        ind += 1

# Write the last, partially filled chunk.
if midi_corpus:
    _dump_corpus_chunk(midi_corpus, ind)

failed: ../../dataset/midi/69ab8697eb22b3c3dd0439d7546382fe.mid
failed: ../../dataset/midi/321d1c5f7afe08aaf196d2d0f23164a8.mid
failed: ../../dataset/midi/cccdc75525a7962269b0973c1ed993dd.mid
failed: ../../dataset/midi/23f7af1f08bc322d66556831265a556f.mid
failed: ../../dataset/midi/e58d485bcad6ef75f5cbc2826aa23170.mid
failed: ../../dataset/midi/93cb6e40788e60d65a89045394d3a42d.mid
failed: ../../dataset/midi/e0027ee77ffdbe7293f47a4690792c84.mid
failed: ../../dataset/midi/449e5619fe20213165bc57af74755d8c.mid
failed: ../../dataset/midi/e8824ac3b15d9588a53eb779a54a50dd.mid
failed: ../../dataset/midi/7e111ba414653c3fe7278045f8c181fb.mid
failed: ../../dataset/midi/361d6a1e62a298d29bfd32529daed630.mid
failed: ../../dataset/midi/10d393fe16bc9c600312b3dae16e7725.mid
failed: ../../dataset/midi/90a6c2bf6cd22c93f85d688ed45f7594.mid
failed: ../../dataset/midi/71225f8f0c59f72c9aec9ced11452bf3.mid
failed: ../../dataset/midi/f55e5e66b1f270716a7d8ecf56e8cf0a.mid
failed: ../../dataset/midi/ceb116de1d5cc

failed: ../../dataset/midi/36d8616d4274ea18413359325ceaf11e.mid
failed: ../../dataset/midi/6cfe36761cb4360a32ced939c9bf03f0.mid
failed: ../../dataset/midi/b70f7c02289335be6c03ab9c64e9e7cc.mid
failed: ../../dataset/midi/71480258445c67246788681e2856a6da.mid
failed: ../../dataset/midi/48d8a8cfe243bef88c3e8d4984b2707a.mid
failed: ../../dataset/midi/6cbfd12293a501e183c255ed67d7c701.mid
failed: ../../dataset/midi/2edd91a24578f82d34e9460d7a31f5e6.mid
failed: ../../dataset/midi/18614fcad13f24f2fec52f7f7b27c9b8.mid
failed: ../../dataset/midi/aa38b3edf435b40bd3e3ecbd7a78a748.mid
failed: ../../dataset/midi/9bdf022dfee62927ca28dbaaa851b8b9.mid
failed: ../../dataset/midi/b7e94a44d987ef22398ec6a11086f4fe.mid
failed: ../../dataset/midi/4e3308b868d715b37332893e064b8175.mid
failed: ../../dataset/midi/e256818c4f2e25068962f4b0452f498c.mid
failed: ../../dataset/midi/ace5710ddc324af667ad4472c1f82e15.mid
failed: ../../dataset/midi/3f67feebb8f6bbecdffbf7c263fb7df0.mid
failed: ../../dataset/midi/c4aed19e9959a

In [10]:
# Number of pieces currently held in midi_corpus.
# NOTE(review): the extraction cell empties midi_corpus after every 1000
# pieces, so the saved output below presumably predates that chunking -- confirm.
len(midi_corpus)

8090

# Simple network generation (one order)

In [11]:
import pickle
from collections import Counter

# For every part of every piece, build a network of consecutive codes:
# each edge is a pair (code, next code) and its weight is how often that
# transition occurs in the part's sequence.
for each_piece in midi_corpus:
    # Index 1 of a corpus entry holds the per-part code sequences.
    for each_part, sequence in midi_corpus[each_piece][1].items():
        # A chord is stored as a list of MIDI numbers; represent it by its
        # highest number so that every step is a single code.
        sequence = [max(item) if isinstance(item, list) else item for item in sequence]
        edges_with_weight = dict(Counter(zip(sequence[:-1], sequence[1:])))
        out_path = "Simple Network/%s %s.pickle" % (each_piece.replace('/', ' '), each_part)
        # `with` closes the file after writing; the original left every
        # opened handle to the garbage collector.
        with open(out_path, "wb") as handle:
            pickle.dump(edges_with_weight, handle)