# Using music21 to parse MIDI data

In [1]:
from music21 import *

Let's start with a piece Ave Maria...

In [2]:
sBach = converter.parse("MIDI_Archive/Classical_Greatest/Bach/AveMaria.mid")
# sBach.show() #for this to work, you have to install a software called musescore

We can also play it!

In [3]:
sBach.show('midi')

# Extract MIDI sequence

To make a network, we want to extract the MIDI sequence for different parts. After piece is parsed, you can access different parts using piece.parts, and for each part p, p.notesAndRests would access the notes (or chords, aka notes happening together) and rests in the sequence, and secondsMap will provide the information of not only the note pitch, but also when it starts and when it ends.

However, there are rests that is not legit (those who didn't actually show in the sheet music) and notes with duration 0. We also probably want to keep track of very long rest, since that means a new sentence has begun. Therefore, we do a second round on the music piece, in order to filter notes that is shorter than a threhold and mark rest longer than a threshold.

In [17]:
def quantify_music(piece):
    from collections import defaultdict
    # first pass of the music, not removing notes and rests that is not reasonable
    midi_dict = {}
    time_dict = {}
    key = piece.analyze('key').name # tell us the key of the piece (eg. F major, C minor)
    duration_notes = []
    for i, p in enumerate(piece.parts):
        part_midi = []
        time = []
        for n in p.flat.notesAndRests.secondsMap: # secondsMap would include not only the note, but also the time information
            start = n['offsetSeconds']
            end = n['endTimeSeconds']
            time.append((start, end))
            element = n['element']
            try:
                part_midi.append(element.pitch.midi)
                if (end-start)!=0.0:
                    duration_notes.append((end-start))
            except:
                try:
                    part_midi.append([item.midi for item in element.pitches])
                except:
                    part_midi.append(128) # coding rest as 128
        midi_dict[i] = part_midi
        time_dict[i] = time
    # Indentify short rest and long rest
    midi_dict_prune = defaultdict(list)
    time_dict_prune = defaultdict(list)
    min_threshold = min(duration_notes)
    max_threshold = max(duration_notes)
#     print(min_threshold, max_threshold)
    for p in time_dict:
        for note, (start, end) in zip(midi_dict[p], time_dict[p]):
            if note!=128:
                midi_dict_prune[p].append(note)
                time_dict_prune[p].append((start, end))
            else:
                if end-start >= min_threshold:
                    if end-start > max_threshold:
                        midi_dict_prune[p].append(129)
                        time_dict_prune[p].append((start, end))
                    else:
                        midi_dict_prune[p].append(128)
                        time_dict_prune[p].append((start, end))
    return key, midi_dict_prune, time_dict_prune

In [None]:
sMozart = converter.parse("MIDI_Archive/Classical_Greatest/Mozart/K527 Overture ''Don Giovanni''.mid")

In [None]:
key, midi_dict, time_dict = quantify_music(sMozart)

# Extract the whole corpus

## Get all the midi file name under this directory (including subdirectory)

In [27]:
import os
from glob import glob
import random
midis = []
start_dir1 = "MIDI_Archive/Classical_Greatest"
start_dir2 = "MIDI_Archive/Jazz"
start_dir3 = "MIDI_Archive/Metal_Rock"
start_dir4 = "MIDI_Archive/american_folk"

def walk_dir(start_dir):
    midis = []
    pattern   = "*.mid"
    for dir,_,_ in os.walk(start_dir):
        midis.extend(glob(os.path.join(dir,pattern))) 
    return midis

midis = random.sample(walk_dir(start_dir1), 50) + random.sample(walk_dir(start_dir2), 50) + random.sample(walk_dir(start_dir3), 50) + random.sample(walk_dir(start_dir4), 50)

In [28]:
len(midis)

200

In [29]:
import json
midi_corpus = {}
count = 0
ind = 0
failed_midi = []
for midi in midis:
    try:
        piece = converter.parse(midi)
        key, midi_dict, duration_dict = quantify_music(piece)
        midi_corpus[midi.split("MIDI_Archive/")[1].split('.mid')[0].replace('/',' ')] = [key, midi_dict, duration_dict]
        count+=1
    except:
        print("here")
        failed_midi.append(midi)
json.dump(midi_corpus, open("midi_corpus_%s.json"%ind,'w'),indent = 4)

In [26]:
failed_midi

[]

In [24]:
piece = converter.parse(failed_midi[0])
key, midi_dict, duration_dict = quantify_music(piece)
midi_corpus[midi.split("MIDI_Archive/")[1].split('.mid')[0].replace('/',' ')] = [key, midi_dict, duration_dict]

NameError: name 'time_dict' is not defined

In [None]:
list(piece.parts[1].notesAndRests)

# Simple network generation (one order)

In [None]:
import pickle
for each_piece in midi_corpus:
    from collections import Counter
    for each_part in midi_corpus[each_piece][1]:
        edges = []
        sequence = midi_corpus[each_piece][1][each_part]
        sequence = [item if type(item)!=list else min(item) for item in sequence]
        edges = list(zip(sequence[:-1], sequence[1:]))
        edges_with_weight = dict(Counter(edges))
        pickle.dump(edges_with_weight, open("Simple Network/%s %s.pickle"%(each_piece[1:].replace('/',' '), each_part), "wb"))