This notebook helps you define and run pipelines that process your data, including data augmentation, slicing, stretching and encoding, among others. Before using it, you are expected to have already collated your original `.xml` files with the help of `1.1. Collate Files.ipynb`.

A pipeline is a data-processing module that transforms input data types into output data types. The idea, as well as bits & pieces of the code, is borrowed from [the Magenta project](https://github.com/tensorflow/magenta/tree/master/magenta/pipelines).


**INSTRUCTIONS**
 
First, adjust the definition of the pipelines inside `pipeline_graph_def`. Then run `build_dataset`. This will create 4 files in two sets, each set containing a train file and an evaluate file. The first set holds the inputs, and the second set holds the targets.

**DEPENDENCIES**

In [40]:
# Import the helper module and force a reload, so that edits made to
# arranger_pipelines.py are picked up without restarting the kernel.
import arranger_pipelines
import importlib
importlib.reload(arranger_pipelines)

<module 'arranger_pipelines' from '/Users/vesko/GitHub/UoE-dissertation/model/build_dataset/arranger_pipelines.py'>

In [41]:
# General
import os
import re 
import pandas as pd

# The processing manager which glues everything
import arranger_pipelines

# Augmentation Pipelines
from arranger_pipelines import TransposerToC, TransposerToRange, Reverser

# Processing Pipelines
from magenta.pipelines.note_sequence_pipelines import Quantizer, Splitter
from arranger_pipelines import PerformanceExtractor, MetadataExtractor, ParserToText, QuantizedSplitter

# Other
from magenta.protobuf import music_pb2
from magenta.pipelines import dag_pipeline

**PARAMETERS**

In [42]:
# Configuration dictionary read by the cells below. It starts with the two
# directory settings; further keys are added in the next cell.
pipeline_config = {
    'data_source_dir': "../assets/data/collated/M/",
    'data_target_dir': "../assets/data/processed/hummingbird/",
}

In [46]:
# Numeric settings consumed by `pipeline_graph_def`.
pipeline_config.update({
    # How many steps per quarter note
    'steps_per_quarter': 4,

    # Bounds handed to the PerformanceExtractor (min_events / max_events).
    'min_events': 1,
    'max_events': 9999999,

    # Pitch bounds handed to both transposer pipelines.
    'MIN_MIDI_PITCH': 0,  # Inclusive.
    'MAX_MIDI_PITCH': 126,  # Inclusive.
})

**DEFINITIONS**

In [47]:
def pipeline_graph_def(collection_name,
                       config):
    """Returns the Pipeline instance which creates the RNN dataset.

    Args:
        collection_name: Name of the collection being built, e.g.
            'train_inputs'. It is appended to every pipeline name and used
            as the name of the DAG output. When it starts with 'train_',
            the Reverser receives True and TransposerToRange receives
            range(-24, 24); otherwise they receive False and [0].
        config: dict() with configuration settings. Keys read here:
            'data_source_dir', 'steps_per_quarter', 'min_events',
            'max_events', 'MIN_MIDI_PITCH' and 'MAX_MIDI_PITCH'. If None,
            the notebook-level `pipeline_config` is used instead.

    Returns:
        A pipeline.Pipeline instance.
    """

    # Read settings from the argument the caller passed in, rather than
    # silently reaching for the notebook global; the global is only a
    # fallback for callers that pass None.
    cfg = pipeline_config if config is None else config

    # User Variables
    # NOTE(review): the index file name is spelled 'filex_index.csv' --
    # confirm it matches the file written by the collate notebook.
    metadata_df = pd.read_csv(os.path.join(cfg['data_source_dir'], 'filex_index.csv'), index_col=0)
    metadata_attr = []
    split_hop_size_seconds = 99999
    hop_bars = list(range(0, 500, 1))

    # Do Not Modify those
    # Truthy only when the collection name begins with 'train_'.
    train_mode = re.match(r'train(?=_)', collection_name)
    key = collection_name

    # Input must NOT be quantized.
    # This pipeline is constructed but is NOT wired into the DAG below; to
    # use it, place it between the DagInput and the Quantizer.
    splitter = Splitter(
        hop_size_seconds=split_hop_size_seconds,
        name='Splitter_' + key)

    # `Quantizer` takes note data in seconds and snaps, or quantizes,
    # everything to a discrete grid of timesteps. It maps `NoteSequence`
    # protocol buffers to `NoteSequence` protos with quantized times.
    quantizer = Quantizer(
        steps_per_quarter=cfg['steps_per_quarter'],
        name='Quantizer_' + key)
        # input_type=music_pb2.NoteSequence
        # output_type=music_pb2.NoteSequence

    # Input MUST BE quantized
    quant_splitter = QuantizedSplitter(
        hop_bars=hop_bars,
        metadata_df=metadata_df,
        name='QuantizedSplitter_' + key)

    reverser = Reverser(
        bool(train_mode),
        name='Reverser' + key)
        # input_type=music_pb2.NoteSequence
        # output_type=music_pb2.NoteSequence

    transposerToC = TransposerToC(
        name='TransposerToC' + key,
        min_valid_pitch=cfg['MIN_MIDI_PITCH'],
        max_valid_pitch=cfg['MAX_MIDI_PITCH'])

    transposer = TransposerToRange(
        range(-24, 24) if train_mode else [0],
        min_pitch=cfg['MIN_MIDI_PITCH'],
        max_pitch=cfg['MAX_MIDI_PITCH'],
        name='TransposerToRange_' + key)
        # input_type=music_pb2.NoteSequence
        # output_type=music_pb2.NoteSequence

    perf_extractor = PerformanceExtractor(
        min_events=cfg['min_events'],
        max_events=cfg['max_events'],
        num_velocity_bins=0,
        name='PerformanceExtractor_' + key)
        # input_type = music_pb2.NoteSequence
        # output_type = magenta.music.MetricPerformance

    meta_extractor = MetadataExtractor(
        metadata_df=metadata_df,
        attributes=metadata_attr,
        name='MetadataExtractor' + key)

    parser = ParserToText(
        name='ParserToText' + key)
        # input_type = magenta.music.MetricPerformance
        # output_type = str

    ### Pipelines Full Map (as wired below) ###
    #
    # DagInput > Quantizer > QuantizedSplitter > Reverser > TransposerToC > TransposerToRange > PerformanceExtractor > 'MetricPerformance'
    # DagInput > MetadataExtractor > 'metadata'
    #
    # {'MetricPerformance', 'metadata'} > ParserToText > DagOutput
    #
    # To skip the bar-level split, feed the Reverser directly from the
    # Quantizer instead of from the QuantizedSplitter.

    dag = {}

    # Main branch: note sequences to performance events.
    dag[quantizer] = dag_pipeline.DagInput(music_pb2.NoteSequence)
    dag[quant_splitter] = quantizer
    dag[reverser] = quant_splitter
    dag[transposerToC] = reverser
    dag[transposer] = transposerToC
    dag[perf_extractor] = transposer

    # Side branch: metadata taken straight from the DAG input.
    dag[meta_extractor] = dag_pipeline.DagInput(music_pb2.NoteSequence)

    # The parser consumes both branches under these two keys.
    dag[parser] = {'MetricPerformance': perf_extractor,
                   'metadata': meta_extractor}

    dag[dag_pipeline.DagOutput(key)] = parser

    return dag_pipeline.DAGPipeline(dag)




# Build Dataset

In [None]:
# Run the dataset build, handing it the configuration dict and the
# `pipeline_graph_def` function defined above.
arranger_pipelines.build_dataset(pipeline_config, pipeline_graph_def)

INFO: Target ../assets/data/processed/hummingbird/.
INFO: Collated data sourced from ../assets/data/collated/M/.

INFO: Building train_inputs dataset...
INFO: Augmenting by reversing.
INFO: Transposing all to C.
INFO: Transposition range(-24, 24)
INFO: Transposition pipeline will ignore Key Signatures, Pitch Names and Chord Symbols.


# Build Vocabulary

In [35]:
# Alternative call (kept commented out): name the source files explicitly via
# `source_vocab_from`. Uncomment if you want to prepend metadata tokens.
# arranger_pipelines.build_vocab(pipeline_config,
#                             source_vocab_from=['train_inputs.txt', 'train_targets.txt'])
# Default call: build the vocabulary with build_vocab's default arguments.
arranger_pipelines.build_vocab(pipeline_config)

INFO: Vocabulary built.
INFO: Tokens collected {'ON121', 'SHIFT12', 'ON75', 'OFF66', 'OFF123', 'OFF85', 'ON21', 'ON91', 'ON94', 'OFF4', 'SHIFT15', 'ON27', 'OFF33', 'OFF89', 'ON77', 'ON84', 'ON97', 'OFF72', 'ON69', 'ON1', 'SHIFT0', 'ON4', 'OFF58', 'OFF91', 'ON36', 'OFF55', 'OFF115', 'OFF81', 'ON6', 'OFF92', 'ON5', 'OFF106', 'SHIFT8', 'SHIFT16', 'ON13', 'OFF23', 'ON30', 'ON70', 'OFF94', 'OFF107', 'OFF24', 'OFF35', 'OFF44', 'OFF57', 'ON78', 'OFF45', 'OFF52', 'ON111', 'OFF64', 'OFF37', 'ON55', 'OFF29', 'OFF8', 'ON59', 'OFF61', 'ON95', 'ON12', 'ON98', 'OFF63', 'OFF95', 'OFF38', 'SHIFT7', 'ON18', 'ON22', 'ON57', 'ON60', 'ON40', 'ON62', 'ON108', 'ON119', 'OFF28', 'OFF30', 'OFF103', 'OFF110', 'ON16', 'OFF15', 'OFF118', 'OFF120', 'ON52', 'OFF9', 'OFF97', 'ON64', 'OFF102', 'ON25', 'ON89', 'OFF88', 'OFF42', 'ON96', 'ON54', 'ON104', 'OFF100', 'OFF68', 'SHIFT4', 'OFF5', 'OFF54', 'OFF71', 'ON42', 'ON11', 'OFF41', 'SHIFT13', 'ON38', 'OFF79', 'ON47', 'ON90', 'SHIFT1', 'ON82', 'OFF46', 'ON32', 'OFF11',

# Synchronously Remove Blank Lines in Two Files

Necessary only if splitting.

In [430]:
# Point the config at a different processed dataset. This overrides the
# 'data_target_dir' value set in the PARAMETERS cell for every cell run
# after this one.
pipeline_config['data_target_dir'] = "../assets/data/processed/magpie_2/"

In [431]:
import pandas as pd
# Drop every line pair in which the input line or the target line is blank,
# so the two files stay aligned line-for-line. The cleaned copies are written
# to <data_target_dir>/fixed/; the source files are only read.
fixed_dir = os.path.join(pipeline_config['data_target_dir'], 'fixed')
# open(..., 'w') does not create missing directories, so create the output
# folder up front.
os.makedirs(fixed_dir, exist_ok=True)

for dataset_type in ['eval', 'train', 'test']:
    inputs_file_name = dataset_type + '_inputs.txt'
    targets_file_name = dataset_type + '_targets.txt'

    inputs_path = os.path.join(pipeline_config['data_target_dir'], inputs_file_name)
    targets_path = os.path.join(pipeline_config['data_target_dir'], targets_file_name)

    with open(inputs_path, 'r') as inputs_file, open(targets_path, 'r') as targets_file:
        inputs = inputs_file.readlines()
        targets = targets_file.readlines()

    # The files are paired by line number, so a length mismatch means the
    # pairing is already broken and filtering would hide it.
    assert len(inputs) == len(targets), (
        '{} has {} lines but {} has {} lines'.format(
            inputs_file_name, len(inputs), targets_file_name, len(targets)))

    # A line counts as blank only when it is exactly a newline.
    kept_pairs = [(input_line, target_line)
                  for input_line, target_line in zip(inputs, targets)
                  if input_line != '\n' and target_line != '\n']

    print('INFO: {} Empty line indices found in {}.'.format(len(inputs) - len(kept_pairs), dataset_type))

    # Write to disk
    with open(os.path.join(fixed_dir, inputs_file_name), 'w') as f:
        f.write(''.join(input_line for input_line, _ in kept_pairs))
    print('INFO: Finished writing {}'.format(inputs_file_name))
    with open(os.path.join(fixed_dir, targets_file_name), 'w') as f:
        f.write(''.join(target_line for _, target_line in kept_pairs))
    print('INFO: Finished writing {}'.format(targets_file_name))

INFO: 0 Empty line indices found in eval.
INFO: Finished writing eval_inputs.txt
INFO: Finished writing eval_targets.txt
INFO: 0 Empty line indices found in train.
INFO: Finished writing train_inputs.txt
INFO: Finished writing train_targets.txt
INFO: 0 Empty line indices found in test.
INFO: Finished writing test_inputs.txt
INFO: Finished writing test_targets.txt
