# Demonstration

In [9]:
# Helper functions

import json
import os

from click.testing import CliRunner
import yaml
from fastavro import writer, reader, parse_schema
from pprint import pprint

from pfb import cli
from pfb.cli import main as pfb

def read_yaml(filepath):
    """Parse the YAML document stored at ``filepath`` and return the result.

    Args:
        filepath: Path of the YAML file to open in text mode.

    Returns:
        Whatever ``yaml.load`` produces for the document.
    """
    # NOTE(review): FullLoader can construct more than plain data types.
    # If this helper is ever pointed at untrusted files, consider
    # yaml.safe_load instead.
    with open(filepath) as stream:
        parsed = yaml.load(stream, Loader=yaml.FullLoader)
    return parsed

def read_json(filepath, default=None):
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.
        default: Fallback value. When it is not ``None`` and ``filepath`` is
            not an existing file, it is returned instead of reading.

    Returns:
        The decoded JSON content, or ``default`` in the fallback case.

    Raises:
        OSError: If the file cannot be opened (e.g. it is missing and no
            ``default`` was supplied).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # The existence check only runs when a fallback was actually provided.
    use_fallback = default is not None and not os.path.isfile(filepath)
    if use_fallback:
        return default

    with open(filepath) as handle:
        return json.loads(handle.read())

def write_json(data, filepath, **kwargs):
    """Serialize ``data`` as JSON into the file at ``filepath``.

    By default the output is pretty-printed (``indent=4``) with sorted keys.
    Any keyword arguments are forwarded to ``json.dump`` and take precedence
    over those defaults.

    Args:
        data: JSON-serializable object to write.
        filepath: Destination path; the file is created or truncated.
        **kwargs: Extra options for ``json.dump`` (e.g. ``indent=2``).
    """
    # Start from the defaults, then let the caller override them. Previously
    # ``kwargs`` was overwritten, so caller options were silently ignored.
    dump_options = {'indent': 4, 'sort_keys': True}
    dump_options.update(kwargs)

    with open(filepath, 'w') as json_file:
        json.dump(data, json_file, **dump_options)

def minify(input_json_file, output_file):
    """Rewrite a JSON file without any optional whitespace.

    Args:
        input_json_file: Path of the JSON file to read.
        output_file: Path the compact JSON text is written to.
    """
    # separators=(',', ':') drops the spaces json.dumps inserts by default.
    compact = json.dumps(read_json(input_json_file), separators=(',', ':'))
    with open(output_file, 'w') as sink:
        sink.write(compact)
        
def pfb_invoke(*args, **kwargs):
    """Run the pfb Click command in-process and return the Click result.

    Args:
        *args: Command-line arguments passed to the pfb CLI, one token each
            (e.g. ``'from', '-o', 'out.avro', 'dict', 'dd.json'``).
        **kwargs: Extra keyword arguments forwarded to ``CliRunner.invoke``.

    Returns:
        The result object returned by ``CliRunner.invoke``. On a non-zero
        exit code the captured exception info is printed, but the result is
        still returned so the caller can inspect ``result.output``.
    """
    # Use CliRunner to call the Click cli from python
    runner = CliRunner()
    # The CLI entry point is imported as ``pfb`` (from pfb.cli import main as
    # pfb); the bare name ``main`` is never bound in this notebook.
    result = runner.invoke(pfb, args, **kwargs)

    # Explicit check rather than assert/except so it also runs under -O.
    if result.exit_code != 0:
        print(str(result.exc_info))

    return result

## Vanilla Avro

In [47]:
# Path of the Avro container file produced by this cell
data_file = 'data/kf-vanilla.avro'

# Avro record schema for the demo data: a required string id plus a
# nullable gender field
schema = {
    "namespace": "kf-vanilla.avro",
    "type": "record",
    "name": "Participant",
    "fields": [
        {"name": "external_id", "type": "string"},
        {"name": "gender", "type": ["null", "string"]},
    ],
}
# Keep a JSON copy of the schema next to the data for reference
write_json(schema, 'data/kf-vanilla-avro-schema.json')

# Parse once up front so fastavro does not have to re-parse it on each use
parsed_schema = parse_schema(schema)

# Two sample records that conform to the schema above
records = [
    {"external_id": "P1", "gender": "female"},
    {"external_id": "P2", "gender": "male"},
]

# The Avro file carries both the schema (in its header) and the records
with open(data_file, 'wb') as out:
    writer(out, parsed_schema, records)

# Read the binary Avro file back: first the header metadata, then the records
with open(data_file, 'rb') as fo:
    print(f'Avro file {data_file} schema:')
    header_reader = reader(fo)
    pprint(header_reader.metadata)

    # Rewind so a fresh reader starts again from the file header
    fo.seek(0)
    print(f'\nAvro file {data_file} data:')
    for record in reader(fo):
        pprint(record)
    

Avro file data/kf-vanilla.avro schema:
{'avro.codec': 'null',
 'avro.schema': '{"type": "record", "name": "kf-vanilla.avro.Participant", '
                '"fields": [{"name": "external_id", "type": "string"}, '
                '{"name": "gender", "type": ["null", "string"]}]}'}

Avro file data/kf-vanilla.avro data:
{'external_id': 'P1', 'gender': 'female'}
{'external_id': 'P2', 'gender': 'male'}


## PFB Avro - Suitable for relational data

In [2]:
# Paths and identifiers for the PFB demo.
# The test data is created with the gen3 data simulator (see the command at
# the bottom of this cell), which requires the gen3 data dictionary to be
# stored on s3.
data_dir = 'data/simulated/'            # directory holding the simulated JSON test data
gen3_dd = 'data/kf-gen3-datadict.json'  # local copy of the gen3 data dictionary
schema_avro = 'data/kf-pfb-schema.avro' # PFB file that will hold only the schema
output_avro = 'data/kf-pfb.avro'        # PFB file that will hold schema + data
program = 'kidsfirst'                   # passed to pfb as --program
project = 'drc'                         # passed to pfb as --project

# Uncomment and execute if you don't have any test data yet
# !data-simulator simulate --url https://s3.amazonaws.com/singhn4-data-dict-bucket/kf-gen3-datadict.json --path data/simulated --program kidsfirst --project drc
# !ls -l data/simulated

In [16]:
# Load the gen3 data dictionary into memory
kf_gen3_dd = read_yaml(gen3_dd)

# Step 1: build a schema-only PFB file from the gen3 data dictionary
print('******************* Writing PFB Schema *******************')
result = pfb_invoke('from', '-o', schema_avro, 'dict', gen3_dd)
print(result.output)

# Optional: show the avro schema stored in the pfb file
# print('******************* PFB Schema *******************')
# result = pfb_invoke('show', '-i', schema_avro, 'schema')
# pprint(json.loads(result.output))

# Step 2: write the simulated test data into the output PFB file, using the
# schema produced in step 1
print('******************* Writing data to PFB file *******************')
result = pfb_invoke(
    'from', '-o', output_avro,
    'json',
    '-s', schema_avro,
    '--program', program,
    '--project', project,
    data_dir,
)
print(result.output)

# Optional: read the data back out from the pfb file
# print('******************* PFB Data *******************')
# print('PFB nodes')
# result = pfb_invoke('show', '-i', output_avro, 'nodes')
# print(result.output)

# Optional: dump every avro record in the pfb file
# print('Avro data records')
# with open(output_avro, 'rb') as fo:
#     for record in reader(fo):
#         pprint(record)

******************* Writing PFB Schema *******************
Loading dictionary: data/kf-gen3-datadict.json
Parsing dictionary...
Writing PFB...
Done, created PFB file at: data/kf-pfb-schema.avro

******************* Writing data to PFB file *******************
Loading schema...
1/3: family
2/3: participant
3/3: project
Done!

