This notebook will help you define and run the pipelines for preprocessing your data (which you should have loaded earlier with the help of `1.1. Collate Files.ipynb`).


**INSTRUCTIONS**

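First, adjust the pipeline definition inside `pipeline_graph_def`. Then run `build_dataset`. This creates four files: a train file and an eval file for each of two sets. The first set holds the inputs, and the second set holds the targets. `build_dataset` does not overwrite existing output files, so remove any earlier output from the target directory before re-running. Finally, run `build_vocab` to build the vocabulary.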

**DEPENDENCIES**

In [18]:
from magenta.protobuf import music_pb2
from magenta.pipelines import pipelines_common, dag_pipeline, note_sequence_pipelines

import process_data
from process_data import PerformanceExtractor, PerformanceParser

import os

**PARAMETERS**

In [19]:
# All tunable settings for this notebook live in this one dict, which is handed
# to process_data.build_dataset / process_data.build_vocab below.
pipeline_config = {
    # Input directory (presumably the NoteSequence protos produced by the
    # collate notebook -- confirm against process_data).
    'data_source_dir': "./data/note_seq_proto/",
    # Output directory for the generated text files and vocab.txt.
    'data_target_dir': "./data/performance_seq_text/",

    # Forwarded to the Quantizer stage in pipeline_graph_def.
    'steps_per_second': 100,

    # Forwarded to the PerformanceExtractor stage in pipeline_graph_def.
    'min_events': 1,
    'max_events': 10000,

    # Forwarded to RandomPartition as the probability of the eval partition.
    'eval_ratio': 0.1,
}

**DEFINITIONS**

In [20]:
def pipeline_graph_def(collection_name,
                       config):
    """Returns the Pipeline instance which creates the RNN dataset.

    The graph randomly splits incoming NoteSequences into an eval and a train
    partition, then runs the same chain on each partition:

        Quantizer -> PerformanceExtractor -> PerformanceParser -> DagOutput

    No time-stretch or transposition augmentation stage is part of the graph.

    Args:
        collection_name: String appended to every partition name, stage name
            and output key, e.g. 'eval_arrangement_<collection_name>'.
        config: dict() with configuration settings. The keys read here are
            'eval_ratio', 'steps_per_second', 'min_events' and 'max_events'.

    Returns:
        A pipeline.Pipeline instance.

    Raises:
        KeyError: if `config` lacks one of the keys listed above.
    """
    # Settings are read from the `config` argument, not from a notebook-level
    # global, so the function honours whatever configuration the caller passes.
    partition_suffix = '_arrangement' + '_' + collection_name

    # Only the eval probability is given; the remaining sequences fall into
    # the last listed partition (train).
    partitioner = pipelines_common.RandomPartition(
        music_pb2.NoteSequence,
        ['eval' + partition_suffix,
         'train' + partition_suffix],
        [config['eval_ratio']])
    dag = {partitioner: dag_pipeline.DagInput(music_pb2.NoteSequence)}

    for mode in ['eval', 'train']:
        key = mode + partition_suffix

        # Stage names carry the partition key so the two branches stay
        # distinguishable inside one DAG.
        quantizer = note_sequence_pipelines.Quantizer(
            steps_per_second=config['steps_per_second'],
            name='Quantizer_' + key)

        # input_type = music_pb2.NoteSequence
        # output_type = magenta.music.Performance
        # NOTE(review): num_velocity_bins=0 presumably disables velocity
        # events -- confirm in process_data.PerformanceExtractor.
        perf_extractor = PerformanceExtractor(
            min_events=config['min_events'],
            max_events=config['max_events'],
            num_velocity_bins=0,
            name='PerformanceExtractor_' + key)

        # input_type = magenta.music.Performance
        # output_type = str
        perf_parser = PerformanceParser(
            name='PerformanceParser_' + key)

        # Wire the branch: partition output -> quantizer -> extractor ->
        # parser -> named DAG output.
        dag[quantizer] = partitioner[key]
        dag[perf_extractor] = quantizer
        dag[perf_parser] = perf_extractor
        dag[dag_pipeline.DagOutput(key)] = perf_parser

    return dag_pipeline.DAGPipeline(dag)

# Build Dataset

In [21]:
# Run the preprocessing pipeline defined above. build_dataset does not
# overwrite earlier results: it raises FileExistsError when an output file is
# already present in the target directory, so clear old output before re-running.
process_data.build_dataset(
    pipeline_config,
    pipeline_graph_def,
)

FileExistsError: File ./data/performance_seq_text/eval_arrangement_inputs.txt already exists. Please remove and try again.

# Build Vocabulary

In [13]:
# Build the vocabulary file. As the captured log shows, an existing vocab.txt
# is removed and rebuilt, so this cell can be re-run safely.
process_data.build_vocab(
    pipeline_config,
)

INFO: File ./data/performance_seq_text/vocab.txt exists. Removing. Rebuilding vocabulary.
INFO: Vocabulary built.
