In [1]:
from magenta.scripts import convert_dir_to_note_sequences
import constants

import pandas as pd

import re

import sys
import os

### 1. This notebook will help you manage the data files

Instructions:
* Put all song `.xml` files under a single source folder (sub-folders are allowed; the folder is scanned recursively).
    * Each song must have at least two performance levels of difficulty. Files must follow the naming convention defined in the `SourceFolderManager` class.

In [299]:
class SourceFolderManager():
    """Manager for the folder which holds the raw .xml data files.

    Args:
        src_folder: Path to folder which holds .xml files. It is walked
            recursively when the manager is constructed.
        tgt_folder: Path to folder where to save the reorganized files.
            Only stored on the instance; nothing in this class reads it yet.

    Attributes:
        files_index: DataFrame with one row per recognised file, indexed by
            file name, with columns `name`, `level`, `segment`, `hand` and
            `path`, ordered by `segment`.

    Files inside `src_folder` are expected to be organized by song, such
        that all files for a song are in one folder and no two songs share
        a folder. (This layout is a convention; the code does not check it.)
    Files must follow the naming convention:
        [Song Name]_[Performance Level]-[Song Segment]-[Hand].xml, where:
            Song Name and Song Segment consist of letters and digits only
            Performance Level is one of ['beg', 'int', 'adv']
            Hand is one of ['lh', 'rh', 'bh']
            Song Segment must be a unique string given a song and a
                performance level. Additionally, Song Segments must have 1:1
                correspondence across the Performance Levels of a song.
    Files whose names do not follow the convention are silently ignored.
    Folders can be nested and do NOT need to follow a naming convention.

    """

    _LEVELS = ('adv', 'int', 'beg')
    _HANDS = ('lh', 'rh', 'bh')
    _INDEX_COLUMNS = ('name', 'level', 'segment', 'hand', 'path')
    # One pattern with named groups, so every field is taken from the same
    # match. The dot before "xml" is escaped so it only matches a literal ".".
    _FILE_PATTERN = re.compile(
        r'(?P<name>[A-Za-z0-9]+)_'
        r'(?P<level>adv|int|beg)-'
        r'(?P<segment>[A-Za-z0-9]+)-'
        r'(?P<hand>lh|rh|bh)\.xml'
    )

    def __init__(self, src_folder, tgt_folder=None):
        self.src_folder = src_folder
        self.tgt_folder = tgt_folder
        self.files_index = self._build_index()

    def _build_index(self):
        """Builds DataFrame which indexes and classifies the files in `src_folder`.

        Returns:
            A DataFrame indexed by file name with the columns listed in
            `_INDEX_COLUMNS`, sorted by `segment`. The frame is empty (but
            still has those columns) when no file matches the convention.
        """
        files_index = dict()

        for path, _directories, files in os.walk(self.src_folder):
            for file in files:
                file_match = self._FILE_PATTERN.fullmatch(file)
                if file_match is None:
                    continue
                # NOTE: rows are keyed by bare file name, so two files with the
                # same name in different folders collapse to the one seen last.
                files_index[file] = {
                    "name": file_match.group('name'),
                    "level": file_match.group('level'),
                    "segment": file_match.group('segment'),
                    "hand": file_match.group('hand'),
                    "path": os.path.join(path, file),
                }

        frame = pd.DataFrame.from_dict(files_index, orient='index')
        # Guarantee the columns exist even when nothing matched; otherwise the
        # sort below (and every filter in `collate`) raises KeyError.
        frame = frame.reindex(columns=list(self._INDEX_COLUMNS))
        # Sort by file name first and use a stable sort, so the row order does
        # not depend on the order in which os.walk happens to list files.
        return frame.sort_index().sort_values(by=['segment'], kind='mergesort')

    def _collate_stats(self, collated):
        """ Return statistics on the collated list.

            Placeholder: not implemented yet, currently returns None.
            Planned contents:
                - number of pairs total
                - number by type of pair
                - histogram of pairs by level
                - histogram of pairs by segment
                - number of files ignored with another extension
                - size of collated .xml files
        """
        pass

    def collate(self,
                hand='bh',
                level=[('int', 'adv'), ('beg', 'adv'), ('beg', 'int')],
                includeWholeSong=False
               ):
        """Collates source -> target .xml pairs from files in a data folder.

            Args:
                hand: One of `lh`, `rh`, `bh`.
                level: A list of 2-element (source, target) pairs of playing
                    difficulty, each element one of `beg`, `int`, `adv`.
                    Duplicate pairs are used once; pairs are processed in the
                    order given.
                includeWholeSong: `False` includes all segments except wholeSong, `True`
                    includes all segments including wholeSong
            Returns:
                A list of (`src`, `tgt`) tuples with paths to .xml files. Both paths point
                to .xml files of the same song, segment and hand but varying difficulty.
                A pairing is skipped for a song (with an INFO message printed) when
                the two levels do not have identical segment lists.

            Raises:
                AssertionError: if `hand` or `level` is not valid.

        """
        # De-duplicate while keeping the caller's order (a set would make the
        # order of the returned pairs vary from run to run).
        pairings = []
        for pairing in level:
            pairing = tuple(pairing)
            if pairing not in pairings:
                pairings.append(pairing)

        assert all(len(pairing) == 2 for pairing in pairings), \
            'each entry of `level` must be a (source, target) pair'
        requested_levels = {lvl for pairing in pairings for lvl in pairing}
        assert requested_levels.issubset(self._LEVELS), \
            '`level` entries must be drawn from {}'.format(self._LEVELS)
        assert hand in self._HANDS, '`hand` must be one of {}'.format(self._HANDS)

        collated = list()

        hand_df = self.files_index.loc[self.files_index['hand'] == hand]
        if not includeWholeSong:
            hand_df = hand_df.loc[hand_df['segment'] != 'wholeSong']

        # Iterate over all songs
        for song_name in self.files_index['name'].unique():
            song_df = hand_df.loc[hand_df['name'] == song_name]
            available_levels = set(song_df['level'])

            for src_level, tgt_level in pairings:
                if src_level not in available_levels or tgt_level not in available_levels:
                    continue
                src_rows = song_df.loc[song_df['level'] == src_level]
                tgt_rows = song_df.loc[song_df['level'] == tgt_level]
                # Rows are ordered by segment, so equal lists mean the two
                # levels line up segment-for-segment and can be zipped.
                if list(src_rows['segment']) != list(tgt_rows['segment']):
                    print('\n')
                    print('INFO: Skipping "{}" because of mismatching segments.'.format(song_name))
                    continue
                collated += list(zip(src_rows['path'], tgt_rows['path']))

        return collated


In [294]:
# Index the files under constants.SRC_FOLDER (the index is built inside the constructor).
manager_instance = SourceFolderManager(constants.SRC_FOLDER)

In [295]:
# manager_instance.files_index.loc[(manager_instance.files_index['segment'] == 'wholeSong') 
#                                 & (manager_instance.files_index['hand'] == 'bh')]

In [296]:
# _song_df = manager_instance.files_index
# _song_df.loc[(_song_df['name'] == 'YouveGotAFriendInMe')
#             & (_song_df['level'] == 'int')
#             & (_song_df['hand'] == 'bh')]

In [297]:
# Called with collate()'s defaults: hand='bh', all three level pairings
# (int->adv, beg->adv, beg->int), and 'wholeSong' segments excluded.
collated = manager_instance.collate()

In [298]:
# Number of (src, tgt) path pairs produced by the collate() call.
len(collated)

2339

### 2. Load XMLs to TFRecord

This will take the `.xml` files from the `INPUT_DIR` and convert them to a single `.tfrecord` file, which is a collection of `NoteSequence` protos.

In [52]:
# NOTE(review): INPUT_DIR and TFRECORD_FILE are not defined in any cell shown in this
# notebook; they have to exist in the kernel before this cell can run on a
# fresh "Restart & Run All" -- TODO define them in a config cell (or confirm where they come from).
# recursive=False presumably limits conversion to files directly inside INPUT_DIR -- verify against magenta.
convert_dir_to_note_sequences.convert_directory(INPUT_DIR, TFRECORD_FILE, recursive=False)

INFO:tensorflow:Converting files in './'.
INFO:tensorflow:Converted MusicXML file ./sevenyears_adv-wholeSong-bh.xml.
INFO:tensorflow:Converted MusicXML file ./sevenyears_adv-intro-bh.xml.


### 3. Useful scripts for reading TFRecord

We can read the contents of the TFRecord file holding NoteSequence records like this:

The following is a more general record iterator, which accepts as a second argument the protocol buffer class to be used for deserialization. Yields a generator.