This is a pre-processing script to load the metadata.json file from http://jmcauley.ucsd.edu/data/amazon/ into BigQuery. 

The file cannot be loaded as is because some of the JSON elements contain unsupported syntax. 

Once the script has run, the resulting file is BigQuery-friendly: one valid JSON object per line.

In [54]:
import json
import ast

"""
Arguments:
   record (string): JSON record of the format described in the 'Metadata' file here: http://jmcauley.ucsd.edu/data/amazon/
   elements (list): the JSON elements to preserve from the record (the rest will be left out)
Returns:
   (dict): the record reduced to only the specified elements (an element missing from the record maps to None)
Example Usage:
   slim_record(single_json_record,['asin','title','imUrl'])
"""
def slim_record(record, elements):
    """Parse one metadata record and keep only the requested elements.

    Arguments:
       record (string): a single record written as a Python dict literal
       elements (iterable): names of the elements to keep
    Returns:
       (dict): maps each requested element, in the order given, to its value
       in the record, or to None when the record does not contain it
    """
    # The input is not strict JSON, so json.loads() fails on it;
    # ast.literal_eval() parses the record as a Python literal instead.
    parsed = ast.literal_eval(record)
    return {element: parsed.get(element) for element in elements}

"""
Arguments:
   input_path (string): path to the 'metadata' file from here: http://jmcauley.ucsd.edu/data/amazon/
Returns:
   A generator that yields one slimmed-down record per line. The file is read one line at a
   time instead of being loaded into memory all at once, which would cause an out-of-memory error for large data files.
"""
def parse_inline(input_path, elements=('asin', 'title', 'imUrl')):
    """Lazily yield a slimmed-down dict for every record in the input file.

    Arguments:
       input_path (string): path to a file holding one record per line
       elements (iterable): names of the elements to keep from each record;
          defaults to 'asin', 'title' and 'imUrl'
    Returns:
       A generator producing one slim_record() result per non-blank line, so
       the file is consumed line by line rather than loaded all at once.
    """
    with open(input_path) as records:
        for record in records:
            # A blank line is not a record; passing it on would make
            # ast.literal_eval() inside slim_record() raise SyntaxError.
            if not record.strip():
                continue
            yield slim_record(record, elements)

In [55]:
# Stream every slimmed record to the output file, one JSON object per line.
with open('metadata/output_sample.json', 'w') as output:
    output.writelines(
        json.dumps(entry) + '\n'
        for entry in parse_inline('metadata/metadata_sample.json')  # lazy generator
    )