**Goal:** Parse the abc music files to only get the elements related to the rhythm.

In [1]:
# Imports
import pandas as pd
import music21 as m21
import os
import textract
import pathlib

In [2]:
# Function that, from an abc file, gets the beat distribution of every bar of the piece
# Returns an array (piece) of arrays (bars)
def extractBeatArray(path):

    # getting the parsable score
    sample = m21.converter.parse(path)
    part = sample.parts[0]
    notes = part.recurse().notes
    length_in_bars = int(sample.duration.quarterLength/sample.getTimeSignatures()[0].numerator) 
    # should be numerator*(denominator/4), but here denominator is already 4 since all pieces are in 3/4
    
    bars = []

    # iteratively adding every beat of every bar. Each bar is a list and the piece is a list of bars i.e. a list of lists
    for i in range(0, length_in_bars):
        if part.measure(i) is not None:
            notes = part.measure(i).notes
            beats = []
            for n in notes:
                beats.append(n.beat)
                    
        bars.append(beats)
        i += 1

    return bars

In [3]:
# Function that, from an abc file, gets the pitch distribution of every bar of the piece
# Returns an array (piece) of arrays (bars)
def extractPitchArray(path):

    # getting the parsable score
    sample = m21.converter.parse(path)
    part = sample.parts[0]
    notes = part.recurse().notes
    length_in_bars = int(sample.duration.quarterLength/sample.getTimeSignatures()[0].numerator) 
    # should be numerator*(denominator/4), but here denominator is already 4 since all pieces are in 3/4
    
    bars = []

    # iteratively adding every beat of every bar. Each bar is a list and the piece is a list of bars i.e. a list of lists
    for i in range(0, length_in_bars):
        if part.measure(i) is not None:
            notes = part.measure(i).notes
            beats = []
            for n in notes:
                beats.append(n.pitch.midi) # should we also use pitchClass (no octave) or is the octave important
                    
        bars.append(beats)
        
        i += 1

    return bars

In [4]:
# Parsing the dataset with the above function

data = pd.DataFrame(columns=['piece', 'beats', 'pitches'])

path = "./abc/_1814_698087.abc"

directory = '../abc'

pieces = 0 # check every piece in the directory (599) is encoded

for name in os.listdir(directory):
    pieces += 1
    path = directory + '/' + name 
    bars_beats = []
    bars_pitches = []
    try:
        bars_beats = extractBeatArray(path)
    except:
        bars_beats = []
    try:
        bars_pitches = extractPitchArray(path)
    except:
        bars_pitches = []

    if ((bars_beats != []) & (bars_pitches != [])): # not adding unparsed pieces to dataframe
        data.loc[len(data)] = [path, bars_beats, bars_pitches]


print(f"{pieces} pieces were analysed.")    

print(f"{len(data)} pieces were correctly parsed.")

599 pieces were analysed.
484 pieces were correctly parsed.


In [13]:
# List of pieces that throw an exception (9)
exceptions = ['./abc/Näckapolska_efter_Anders_Bredal_c55b3d.abc', './abc/Pollonesse_ur_Andreas_Dahlgrens_notbok_no_84_f2a90d.abc',
'./abc/Polonäs_a39d56.abc', './abc/Polonäs_efter_Pehr_Andersson_Bild_30_nr_90_d06dcf.abc', './abc/Slängpolska_efter_Olof_Larsson_2f914f.abc',
'./abc/_Polonesse_in_G_Moll_no_1_ur_Anders_Larssons_notbok_61aa5e.abc', './abc/_Polonesse_in_G_Måll_no_2_769438.abc', 
'./abc/_Polonäs_sexdregasamlingen_del_2_nr_70_1b9662.abc', './abc/_Polska_Kringellåt_efter_Snickar_Erik_985b63.abc']

# Testing: 14 pieces are registered as note-less, for some reason. The 9 ones that threw an exception + 5 unknown others. 
# We choose to omit them from the dataset.

In [14]:
# Function that maps the beat distribution of a bar to a series of numbers between 1 and 12 (position of each sixteenth-note in the bar)
def map_beats(notes):
    notes_mapped = []
    for bar in notes:
        bar_mapped = []
        for x in bar:
            bar_mapped.append(int(4*(x-1)+1))
        notes_mapped.append(bar_mapped)
    return notes_mapped

In [15]:
# Function that maps a series of numbers between 1 and 12 (position of each sixteenth-note in the bar) to a music21 beat distribution
def map_beats_reverse(notes):
    notes_mapped = []
    for bar in notes:
        bar_mapped = []
        for y in bar:
            bar_mapped.append((y+3)/4)
        notes_mapped.append(bar_mapped)
    return notes_mapped

In [16]:
# Testing the function
example_notes = data[data['piece'] == '../abc/1b651a94a8f8680da3cef1e60705194d.abc']['beats'].tolist()[0]
# when we retrieve a specific piece and convert to list it wraps the piece's array in another array for some reason, hence the [0]
# but when doing iterrows we don't have that problem
print(example_notes)
map_12 = map_beats(example_notes)
print(map_12)
print(map_beats_reverse(map_12))

[[1.0, 1.5, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5], [1.0, 1.5, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5], [1.0, 1.5, 1.75, 2.0, 2.5, 2.75, 3.0, 3.5]]
[[1, 3, 5, 6, 7, 8, 9, 11], [1, 3, 5, 6, 7, 8, 9, 11], [1, 3, 4, 5, 7, 8, 9, 11]]
[[1.0, 1.5, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5], [1.0, 1.5, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5], [1.0, 1.5, 1.75, 2.0, 2.5, 2.75, 3.0, 3.5]]


In [17]:
# Using the mapping function on every piece
data_mapped = data.copy()

for _, d in data_mapped.iterrows():
    d['beats'] = map_beats(d['beats'])

print("Done")

data_mapped.head() # our dataframe for the model!

Done


Unnamed: 0,piece,beats,pitches
0,../abc/1b651a94a8f8680da3cef1e60705194d.abc,"[[1, 3, 5, 6, 7, 8, 9, 11], [1, 3, 5, 6, 7, 8,...","[[69, 76, 81, 83, 81, 79, 77, 74], [74, 76, 79..."
1,../abc/3679c385a618c89f00d159f418022b39.abc,"[[1, 3, 4, 5, 7, 8, 9, 11], [1, 4, 5, 6, 7, 8,...","[[76, 76, 81, 76, 76, 77, 76, 74], [72, 71, 69..."
2,../abc/60ae81952628613e03d563b0dae8bbc2.abc,"[[1, 4, 5, 7, 9, 10, 11, 12], [1, 3, 5, 9, 11]]","[[62, 67, 71, 74, 72, 71, 72, 69], [67, 71, 62..."
3,../abc/6a7d307d1f0712090e4043a6b4637efa.abc,"[[1, 5, 9], [1, 1, 3, 4, 5, 7, 9, 11], [1, 4, ...","[[64, 69, 71], [71, 72, 71, 72, 74, 72, 71, 69..."
4,../abc/8c6965e3c831e8cb769e578423c12d1e.abc,"[[1, 5, 7, 8, 9], [1, 3, 4, 5, 6, 7, 8, 9, 10,...","[[69, 67, 66, 67, 69], [69, 71, 73, 74, 73, 74..."


In [18]:
data_mapped['beat_pitch'] = data_mapped.apply(lambda x: [[(beat, pitch) for beat, pitch in zip(beats, pitches)] for beats, pitches in zip(x['beats'], x['pitches'])], axis=1)
data_mapped.head()

Unnamed: 0,piece,beats,pitches,beat_pitch
0,../abc/1b651a94a8f8680da3cef1e60705194d.abc,"[[1, 3, 5, 6, 7, 8, 9, 11], [1, 3, 5, 6, 7, 8,...","[[69, 76, 81, 83, 81, 79, 77, 74], [74, 76, 79...","[[(1, 69), (3, 76), (5, 81), (6, 83), (7, 81),..."
1,../abc/3679c385a618c89f00d159f418022b39.abc,"[[1, 3, 4, 5, 7, 8, 9, 11], [1, 4, 5, 6, 7, 8,...","[[76, 76, 81, 76, 76, 77, 76, 74], [72, 71, 69...","[[(1, 76), (3, 76), (4, 81), (5, 76), (7, 76),..."
2,../abc/60ae81952628613e03d563b0dae8bbc2.abc,"[[1, 4, 5, 7, 9, 10, 11, 12], [1, 3, 5, 9, 11]]","[[62, 67, 71, 74, 72, 71, 72, 69], [67, 71, 62...","[[(1, 62), (4, 67), (5, 71), (7, 74), (9, 72),..."
3,../abc/6a7d307d1f0712090e4043a6b4637efa.abc,"[[1, 5, 9], [1, 1, 3, 4, 5, 7, 9, 11], [1, 4, ...","[[64, 69, 71], [71, 72, 71, 72, 74, 72, 71, 69...","[[(1, 64), (5, 69), (9, 71)], [(1, 71), (1, 72..."
4,../abc/8c6965e3c831e8cb769e578423c12d1e.abc,"[[1, 5, 7, 8, 9], [1, 3, 4, 5, 6, 7, 8, 9, 10,...","[[69, 67, 66, 67, 69], [69, 71, 73, 74, 73, 74...","[[(1, 69), (5, 67), (7, 66), (8, 67), (9, 69)]..."


In [19]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(data_mapped) # split 75% train 25% test

print(len(x_train))
print(len(x_test))

363
121


In [20]:
# Writing train and test to csv
x_train.to_csv("train.csv", index=False)
x_test.to_csv("test.csv", index=False)