In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import os.path
import pickle
import glob

### Filter out filler pages

The file `filler.txt` lists, for each score, which pages are filler (e.g. cover, foreword) and should be excluded.

In [None]:
def getFillerList(filler_file, feat_dir):
    '''
    Build the collection of page IDs that are marked as filler and should be skipped.

    Each useful line of filler_file has two tab-separated fields: the score's relative
    path (e.g. Bach/00748) and a removal field, optionally wrapped in double quotes.
    The removal field is either 'r' / 'rl' (remove every page of the score) or a
    comma-separated list of page numbers, where negative numbers count back from the
    number of pages (e.g. "0,1,-2,-1"). Lines without a second field are ignored, and
    blank entries in the list (e.g. from a trailing comma) are skipped.

    Args:
        filler_file: path to the tab-separated filler annotation file.
        feat_dir: root directory holding <relpath>/<pageID>.pkl feature files.

    Returns:
        dict mapping each filler pageID (e.g. '00748-0') to 1; it is used as a set.

    Raises:
        ValueError: if a non-blank entry of the removal list is not an integer.
    '''
    d = {} # pageID -> 1 for every page to remove
    with open(filler_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2: # no removal field on this line
                continue
            relpath = parts[0] # e.g. Bach/00748
            scoreID = os.path.basename(relpath) # e.g. 00748
            removeField = parts[1].strip('"') # e.g. "0,1,-2,-1" or "r" or "rl"

            if removeField in ('r', 'rl'): # remove all pages
                for pkl_file in glob.glob('{}/{}/*.pkl'.format(feat_dir, relpath)):
                    pageID = os.path.splitext(os.path.basename(pkl_file))[0] # e.g. 00822-3
                    d[pageID] = 1
                continue

            numPages = None # counted lazily: only negative indices need it
            for pageNumStr in removeField.split(','):
                pageNumStr = pageNumStr.strip()
                if not pageNumStr: # tolerate "0,1," and similar
                    continue
                pageNum = int(pageNumStr)
                if pageNum < 0:
                    if numPages is None:
                        numPages = getNumPages(relpath, feat_dir)
                    pageNum += numPages # e.g. -1 -> last page
                d['{}-{}'.format(scoreID, pageNum)] = 1
    return d

In [None]:
def getNumPages(relpath, indir):
    '''Return how many .pkl files sit directly under <indir>/<relpath>/.'''
    pattern = '{}/{}/*.pkl'.format(indir, relpath)
    return sum(1 for _ in glob.iglob(pattern))

In [None]:
def _pageSortKey(pkl_file):
    '''Sort key that orders page files by numeric page suffix (00748-2 before 00748-10).'''
    pageID = os.path.splitext(os.path.basename(pkl_file))[0]
    suffix = pageID.rsplit('-', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), pageID)
    return (1, 0, pageID) # no numeric suffix: keep after numbered pages, by name


def getNonFillerFeatures(filler_file, feat_dir):
    '''
    Collect bootleg score features from all pages that are (a) not filler and (b) have a valid
    bootleg score matrix.

    Score directories and the pages inside each score are visited in sorted order (pages by
    their numeric suffix). glob returns paths in arbitrary, filesystem-dependent order, and
    downstream code concatenates the pages of a score, so the order must be made explicit.

    Args:
        filler_file: path to the filler annotation file understood by getFillerList.
        feat_dir: root feature directory laid out as <feat_dir>/<composer>/<scoreID>/<pageID>.pkl.

    Returns:
        dict mapping each score directory path (as returned by glob, with trailing slash,
        e.g. score_feat/Bach/00748/) to a list of boolean matrices, one per kept page.
        Scores with no usable page are omitted.
    '''

    filler = getFillerList(filler_file, feat_dir)
    feats = {}

    for pieceDir in sorted(glob.glob('{}/*/*/'.format(feat_dir))): # e.g. score_feat/Bach/00748/

        accum = [] # collect features from all valid pages in this score

        for pkl_file in sorted(glob.glob('{}/*.pkl'.format(pieceDir)), key=_pageSortKey):

            pageID = os.path.splitext(os.path.basename(pkl_file))[0] # e.g. 00748-2
            if pageID in filler: # filler page, skip
                continue
            # NOTE: pickle.load can execute arbitrary code; only use trusted feature files.
            with open(pkl_file, 'rb') as f:
                bscore = pickle.load(f)['bscore']
            if bscore is not None: # if None, no features were computed
                accum.append(bscore == 1) # convert from float to bool to compress memory

        if len(accum) > 0:
            feats[pieceDir] = accum

    return feats

In [None]:
# Paths (relative to the notebook's working directory) to the filler annotations
# and to the root directory of per-page feature pickles.
filler_file = 'cfg_files/filler.txt'
score_feat_dir = 'score_feat'
# Load the features of every non-filler page, keyed by score directory.
feats = getNonFillerFeatures(filler_file, score_feat_dir)

### Investigate Feature Statistics

In [None]:
def getFeatureStats(feats):
    '''
    Print summary statistics and show three plots describing the collected features:
    a histogram of the number of feature columns per page, and bar charts of the total
    page count and total column count per composer.

    feats maps a score directory (e.g. score_feat/Bach/00748/) to a list of per-page
    matrices; the composer is taken from the third-from-last '/'-separated path component
    and each matrix contributes shape[1] events.
    '''

    # one pass over the data gathers everything the three reports need
    perPageCounts = []
    pagesByComposer = {}
    eventsByComposer = {}
    for pieceDir, pageMats in feats.items(): # e.g. score_feat/Bach/00748/
        name = pieceDir.split('/')[-3]
        widths = [mat.shape[1] for mat in pageMats]
        perPageCounts.extend(widths)
        pagesByComposer[name] = pagesByComposer.get(name, 0) + len(pageMats)
        eventsByComposer[name] = eventsByComposer.get(name, 0) + sum(widths)

    tickLabels = [name[0:5] for name in pagesByComposer] # abbreviated composer names

    def showBars(counts, ylabel):
        # bar chart with one bar per composer
        positions = np.arange(len(counts))
        plt.bar(positions, counts)
        plt.xticks(positions, tickLabels)
        plt.ylabel(ylabel)
        plt.show()

    # number of features per page
    perPageCounts = np.array(perPageCounts)
    printStats(perPageCounts, "Number of Features Per Page")
    plt.subplot(2,1,1)
    plt.hist(perPageCounts, bins=100)
    plt.xlabel('Number of Events In Single Page')
    plt.ylabel('Frequency')
    plt.show()

    # total number of pages by composer
    pageTotals = list(pagesByComposer.values())
    printStats(pageTotals, "Total Number of Pages by Composer")
    showBars(pageTotals, 'Total # Pages')

    # total number of note events by composer
    eventTotals = list(eventsByComposer.values())
    printStats(eventTotals, "Total Number of Note Events by Composer")
    showBars(eventTotals, 'Total # Note Events')

In [None]:
def printStats(arr, title = None):
    '''Print the mean, standard deviation, minimum and maximum of arr, after an optional title line.'''
    if title:
        print(title)
    summaries = (
        ('Mean: {}', np.mean),
        ('Std: {}', np.std),
        ('Min: {}', np.min),
        ('Max: {}', np.max),
    )
    for template, reducer in summaries:
        print(template.format(reducer(arr)))

In [None]:
# Print the statistics and show the plots for the loaded features.
getFeatureStats(feats)

### Split Data into Train, Validation, & Test

In [None]:
def splitTrainValidTest(d, train=.6, validation=.2, test=.2, savefile = None):
    '''
    Randomly partition the scores in d into train, validation and test subsets.

    The split is by score (dict key), not by page. Keys are sorted before the seeded
    shuffle so the result does not depend on the insertion order of d, which upstream
    comes from filesystem listing order.

    Args:
        d: dict mapping score directory -> list of per-page feature matrices.
        train, validation, test: fractions of the scores for each subset; they must sum
            to 1 (within floating-point tolerance). The train and validation sizes are
            rounded down and the test subset receives all remaining scores.
        savefile: if given, [d, train keys, validation keys, test keys] is pickled there.

    Returns:
        (d_train, d_valid, d_test): three dicts holding the corresponding entries of d.

    Raises:
        AssertionError: if the fractions do not sum to 1.
    '''

    # shuffle
    # tolerance-based check: e.g. .7 + .2 + .1 is not exactly 1.0 in floating point
    assert np.isclose(train + validation + test, 1.0), 'train + validation + test must sum to 1'
    np.random.seed(0) # fixed seed (note: reseeds numpy's global RNG)
    pieceDirs = sorted(d.keys())
    np.random.shuffle(pieceDirs)

    # split
    breakpt1 = int(len(pieceDirs) * train)
    breakpt2 = breakpt1 + int(len(pieceDirs) * validation)
    pieceDirs_train = pieceDirs[0:breakpt1]
    pieceDirs_valid = pieceDirs[breakpt1:breakpt2]
    pieceDirs_test = pieceDirs[breakpt2:]

    # build subsets and optionally save
    d_train = getDataSubset(d, pieceDirs_train)
    d_valid = getDataSubset(d, pieceDirs_valid)
    d_test = getDataSubset(d, pieceDirs_test)
    if savefile:
        saveToPickle([d, pieceDirs_train, pieceDirs_valid, pieceDirs_test], savefile)

    return d_train, d_valid, d_test

In [None]:
def getDataSubset(dAll, toKeep):
    '''Return a new dict holding only the entries of dAll whose keys are listed in toKeep.'''
    return {key: dAll[key] for key in toKeep}

In [None]:
def saveToPickle(d, outfile):
    '''Pickle the object d into the file at outfile, replacing any existing content.'''
    with open(outfile, 'wb') as sink:
        pickle.Pickler(sink).dump(d)

In [None]:
def loadPickle(infile):
    '''Read the file at infile and return the object that was pickled into it.'''
    with open(infile, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
# Split the scores 60/20/20 into train/validation/test and pickle the full feature
# dict together with the three key lists inside the feature directory.
save_pages_file = '{}/data.pages.pkl'.format(score_feat_dir)
d_train, d_valid, d_test = splitTrainValidTest(feats, train=.6, validation=.2, test=.2, savefile=save_pages_file)

### Format data in chunks

In [None]:
def getComposer2IndexMapping(feat_dir):
    '''
    Assign an integer index to every immediate subdirectory of feat_dir, in sorted path order.

    Returns (c_to_i, composers): a dict from subdirectory name to index, and the list of
    names where composers[i] is the name with index i.
    '''
    subdirs = sorted(glob.glob('{}/*/'.format(feat_dir))) # trailing slash: directories only
    composers = [path.split('/')[-2] for path in subdirs]
    c_to_i = dict(zip(composers, range(len(composers))))

    return c_to_i, composers

In [None]:
def getChunkedData(d, chunkSize, c_to_i, hopSize = None):
    '''
    Cut each score into fixed-width, overlapping chunks labelled with the composer index.

    The pages of a score are concatenated along axis 1 and a window of chunkSize columns
    is slid across the result; only windows that fit entirely inside the score are kept.

    Args:
        d: dict mapping score directory (e.g. score_feat/Bach/00748/) -> list of 2-D page
            matrices sharing the same number of rows.
        chunkSize: number of columns per chunk (>= 1).
        c_to_i: dict mapping composer name -> integer label. The composer name is the
            third-from-last '/'-separated component of the score directory.
        hopSize: column step between consecutive chunk starts. Defaults to chunkSize // 2
            (50% overlap), but never less than 1 so that chunkSize == 1 works.

    Returns:
        frags: array of chunks, shape (numChunks, rows, chunkSize) when numChunks > 0.
        labels: array of composer indices, one per chunk.
        pieceDir2idxRange: dict mapping score directory -> (start, end) half-open range
            of that score's chunks inside frags; start == end when the score is too short.

    Raises:
        ValueError: if chunkSize or hopSize is smaller than 1.
    '''
    if chunkSize < 1:
        raise ValueError('chunkSize must be >= 1, got {}'.format(chunkSize))
    if hopSize is None:
        hopSize = max(1, chunkSize // 2)
    if hopSize < 1:
        raise ValueError('hopSize must be >= 1, got {}'.format(hopSize))

    frags = []
    labels = []
    pieceDir2idxRange = {}
    for pieceDir in d:
        startChunkIdx = len(frags)
        if len(d[pieceDir]) == 0: # nothing to concatenate; record an empty range
            pieceDir2idxRange[pieceDir] = (startChunkIdx, startChunkIdx)
            continue
        merged = np.hstack(d[pieceDir])
        composerIdx = c_to_i[pieceDir.split('/')[-3]]
        # last valid start is the one whose window ends exactly at the final column
        for startIdx in range(0, merged.shape[1] - chunkSize + 1, hopSize):
            frags.append(merged[:,startIdx:startIdx + chunkSize])
            labels.append(composerIdx)
        pieceDir2idxRange[pieceDir] = (startChunkIdx, len(frags))
    frags = np.array(frags)
    labels = np.array(labels)

    return frags, labels, pieceDir2idxRange

In [None]:
# Composer name <-> integer label, derived from the subdirectories of the feature directory.
composer2idx, idx2composer = getComposer2IndexMapping(score_feat_dir)

In [None]:
# Width (in feature columns) of each chunk; consecutive chunks overlap.
chunkSize = 64
# Chunk each split separately so that no score contributes to more than one split.
X_train, y_train, map_train = getChunkedData(d_train, chunkSize, composer2idx)
X_valid, y_valid, map_valid = getChunkedData(d_valid, chunkSize, composer2idx)
X_test, y_test, map_test = getChunkedData(d_test, chunkSize, composer2idx)

In [None]:
# Sanity check: display the shape of the chunk array of each split.
X_train.shape, X_valid.shape, X_test.shape

In [None]:
# Persist chunks, labels and score -> chunk-range maps for all three splits in one pickle.
save_chunks_file = '{}/data.chunks.pkl'.format(score_feat_dir)
saveToPickle([X_train, y_train, map_train, X_valid, y_valid, map_valid, X_test, y_test, map_test], save_chunks_file)