**Goal:** Parse the abc music files to only get the elements related to the rhythm.

In [19]:
# Imports
import pandas as pd
import music21 as m21
import os
import textract
import pathlib

In [20]:
# Function that, from an abc file, gets the beat distribution of every bar of the piece
# Returns an array (piece) of arrays (bars)
def extractBeatArray(path):

    # getting the parsable score
    sample = m21.converter.parse(path)
    part = sample.parts[0]
    notes = part.recurse().notes
    length_in_bars = int(sample.duration.quarterLength/sample.getTimeSignatures()[0].numerator) 
    # should be numerator*(denominator/4), but here denominator is already 4 since all pieces are in 3/4
    
    bars = []

    # iteratively adding every beat of every bar. Each bar is a list and the piece is a list of bars i.e. a list of lists
    for i in range(0, length_in_bars):
        if part.measure(i) is not None:
            notes = part.measure(i).notes
            beats = []
            for n in notes:
                beats.append(n.beat)
                    
        bars.append(beats)
        i += 1

    return bars

In [21]:
# Parsing the dataset with the above function

data = pd.DataFrame(columns=['piece', 'notes'])

path = "./abc/_1814_698087.abc"

directory = './abc'

pieces = 0 # check every piece in the directory (599) is encoded

for name in os.listdir(directory):
    pieces += 1
    path = directory + '/' + name 
    bars = []
    try:
        bars = extractBeatArray(path)
    except:
        bars = []

    if (bars != []): # not adding unparsed pieces to dataframe
        data.loc[len(data)] = [path, bars]


print(f"{pieces} pieces were analysed.")    

print(f"{len(data)} pieces were correctly parsed.")

599 pieces were analysed.
585 pieces were correctly parsed.


In [22]:
# List of pieces that throw an exception (9)
exceptions = ['./abc/Näckapolska_efter_Anders_Bredal_c55b3d.abc', './abc/Pollonesse_ur_Andreas_Dahlgrens_notbok_no_84_f2a90d.abc',
'./abc/Polonäs_a39d56.abc', './abc/Polonäs_efter_Pehr_Andersson_Bild_30_nr_90_d06dcf.abc', './abc/Slängpolska_efter_Olof_Larsson_2f914f.abc',
'./abc/_Polonesse_in_G_Moll_no_1_ur_Anders_Larssons_notbok_61aa5e.abc', './abc/_Polonesse_in_G_Måll_no_2_769438.abc', 
'./abc/_Polonäs_sexdregasamlingen_del_2_nr_70_1b9662.abc', './abc/_Polska_Kringellåt_efter_Snickar_Erik_985b63.abc']

# Testing: 14 pieces are registered as note-less, for some reason. The 9 ones that threw an exception + 5 unknown others. 
# We choose to omit them from the dataset.

In [23]:
# Function that maps the beat distribution of a bar to a series of numbers between 1 and 12 (position of each sixteenth-note in the bar)
def map_beats(notes):
    notes_mapped = []
    for bar in notes:
        bar_mapped = []
        for x in bar:
            bar_mapped.append(int(4*(x-1)+1))
        notes_mapped.append(bar_mapped)
    return notes_mapped

In [24]:
# Function that maps a series of numbers between 1 and 12 (position of each sixteenth-note in the bar) to a music21 beat distribution
def map_beats_reverse(notes):
    notes_mapped = []
    for bar in notes:
        bar_mapped = []
        for y in bar:
            bar_mapped.append((y+3)/4)
        notes_mapped.append(bar_mapped)
    return notes_mapped

In [25]:
# Testing the function
example_notes = data[data['piece'] == './abc/_1814_698087.abc']['notes'].tolist()[0]
# when we retrieve a specific piece and convert to list it wraps the piece's array in another array for some reason, hence the [0]
# but when doing iterrows we don't have that problem
print(example_notes)
map_12 = map_beats(example_notes)
print(map_12)
print(map_beats_reverse(map_12))

[[1.0, 1.5, 2.0, 3.0], [1.0, 1.5, 2.0, 3.0], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0, 3.0], [1.0, 1.5, 2.0, 3.0], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0], [1.0, 1.5, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0, 2.5, 3.0, 3.5], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5], [1.0, 1.5, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0, 2.5, 3.0, 3.5], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 2.0]]
[[1, 3, 5, 9], [1, 3, 5, 9], [1, 4, 5, 7, 9, 11], [1, 3, 4, 5, 7, 9, 11], [1, 3, 5, 9], [1, 3, 5, 9], [1, 4, 5, 7, 9, 11], [1, 3, 5], [1, 3, 5, 7, 9, 11], [1, 3, 5, 7, 9, 11], [1, 4, 5, 7, 9, 11], [1, 5, 6, 7, 8, 9, 11], [1, 3, 5, 7, 9, 11], [1, 3, 5, 7, 9, 11], [1, 4, 5, 7, 9, 11], [1, 5]]
[[1.0, 1.5, 2.0, 3.0], [1.0, 1.5, 2.0, 3.0], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0, 3.0], [1.0, 1.5, 2.0, 3.0], [1.0, 1.75, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 2.0], [1.0, 1.5, 2.0, 2.5, 3.0, 3.5], [1.0, 1.5, 

In [26]:
# Using the mapping function on every piece
data_mapped = data.copy()

for _, d in data_mapped.iterrows():
    d['notes'] = map_beats(d['notes'])

print("Done")

data_mapped.head() # our dataframe for the model!

Done


Unnamed: 0,piece,notes
0,./abc/1b651a94a8f8680da3cef1e60705194d.abc,"[[1, 3, 5, 6, 7, 8, 9, 11], [1, 3, 5, 6, 7, 8,..."
1,./abc/3679c385a618c89f00d159f418022b39.abc,"[[1, 3, 4, 5, 7, 8, 9, 11], [1, 4, 5, 6, 7, 8,..."
2,./abc/60ae81952628613e03d563b0dae8bbc2.abc,"[[1, 4, 5, 7, 9, 10, 11, 12], [1, 3, 5, 9, 11]]"
3,./abc/6a7d307d1f0712090e4043a6b4637efa.abc,"[[1, 5, 9], [1, 1, 3, 4, 5, 7, 9, 11], [1, 4, ..."
4,./abc/8c6965e3c831e8cb769e578423c12d1e.abc,"[[1, 5, 7, 8, 9], [1, 3, 4, 5, 6, 7, 8, 9, 10,..."


In [27]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(data_mapped) # split 75% train 25% test

print(len(x_train))
print(len(x_test))

438
147


In [28]:
# Writing train and test to csv
x_train.to_csv("train.csv", index=False)
x_test.to_csv("test.csv", index=False)