In [1]:
import gzip
import json
import shutil
import os
from ast import literal_eval
import pandas as pd
from datetime import datetime

In [2]:
# For unzipping .gz files
def gunzip_json_files(directory='.'):
    """Decompress every ``*.json.gz`` archive found directly in ``directory``.

    Each archive ``name.json.gz`` is written next to itself as ``name.json``
    (an existing output file is overwritten). Only names ending in
    ``.json.gz`` are touched, so already-extracted ``.json`` files are never
    opened as gzip or truncated when this cell is re-run.

    Parameters
    ----------
    directory : str, default '.'
        Folder to scan (not recursive).

    Returns
    -------
    list of str
        Paths of the ``.json`` files that were written, in sorted order.
    """
    written = []
    for name in sorted(os.listdir(directory)):
        source_path = os.path.join(directory, name)
        if not name.endswith('.json.gz') or not os.path.isfile(source_path):
            continue
        # Strip only the trailing '.gz' so the target can never equal the source.
        target_path = source_path[:-len('.gz')]
        with gzip.open(source_path, 'rb') as f_in:
            with open(target_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        written.append(target_path)
    return written


gunzip_json_files()

---
## Data Wrangling

Due to invalid formatting found in the JSON files during data modeling, the code below was used to help validate and clean the JSON data for processing. First, run **python -m json.tool filename** on the command line to check whether the file is a valid JSON document. You should receive an error if it is invalid. Otherwise, the whole file should print.

If an error is confirmed, run the data pipeline below to produce a clean JSON file. The pipeline uses JSONDecoder.raw_decode() (and its undocumented second parameter, the start index) to walk through the data iteratively: it locates the next JSON structure, decodes it, and continues from where that structure ended, stopping at the first structure it cannot decode. A nice benefit of the built-in json module is that it will properly parse the data even if the concatenated JSON documents are not properly indented or the separators between them are missing.

Once all our JSON data has been parsed, the file will be written out, read again, and unnested at the first level. This should help us better identify which JSON objects need to be flattened even further.

**Please note that even after running the JSON files into the data pipeline, the data will still be structured as a JSON array (or list in Python) rather than the standard JSON object (or dict in Python)**

## FIX FUNCTION TMR
parser = json.JSONDecoder()


def jsonFormatter(filename):
    """Recover the JSON structures stored in ``<filename>.json`` as a DataFrame.

    The file is treated as a sequence of concatenated JSON structures. They are
    decoded one after another, written as a single JSON array to
    ``<filename>Clean.json`` and then flattened by one level.

    Parameters
    ----------
    filename : str
        Path of the input file without its ``.json`` extension.

    Returns
    -------
    pandas.DataFrame
        One row per decoded structure. Flattened column names of the form
        ``parent.child`` are shortened to ``parent`` (spaces removed).

    Notes
    -----
    Scanning stops at the first structure that cannot be decoded; anything
    after that point is not included in the result.
    """
    # JSON text is UTF-8, and the clean file is written as UTF-8 below, so read
    # with the same encoding instead of the platform default.
    with open('{filename}.json'.format(filename=filename), encoding='utf-8') as f:
        data = f.read()

    # Collected per call: a shared module-level list would leak the records of
    # a previously processed file into this result.
    records = []
    head = 0  # position in `data` where the next search starts
    while True:
        # Prefer the next '{'; fall back to '[' only when no '{' remains.
        start = data.find('{', head)
        if start == -1:
            start = data.find('[', head)
        if start == -1:
            break  # nothing left that could start a JSON structure
        try:
            struct, head = parser.raw_decode(data, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            break
        records.append(struct)

    with open('{filename}Clean.json'.format(filename=filename), 'w', encoding='utf-8') as jsonfile:
        json.dump(records, jsonfile, ensure_ascii=False, indent=2)

    # Unnest the first level only; deeper objects stay as dict/list cells.
    frame = pd.json_normalize(records, max_level=1)
    # Unnesting produces dotted names ('parent.child'); keep the parent part.
    return frame.rename(columns=lambda x: x.split('.')[0].replace(' ', '') if '.' in x else x)

In [None]:
# Load fetch rewards datasets.
#users = jsonFormatter('users')
#receipts = jsonFormatter('receipts')
#brands = jsonFormatter('brands')

In [None]:
parser = json.JSONDecoder()
parsed = []  # individually decoded JSON structures, in file order

# JSON text is UTF-8 and the clean copy below is written as UTF-8, so read the
# raw file with the same encoding rather than the platform default.
with open('receipts.json', encoding='utf-8') as f:
    data = f.read()

head = 0  # position in `data` where the next search starts
while True:
    # Prefer the next '{'; fall back to '[' only when no '{' remains.
    start = data.find('{', head)
    if start == -1:
        start = data.find('[', head)
    if start == -1:
        break  # nothing left that could start a JSON structure
    try:
        struct, head = parser.raw_decode(data, start)
    except (ValueError, json.JSONDecodeError):  # first undecodable structure ends the scan
        break
    parsed.append(struct)

# Persist the decoded structures as one JSON array.
with open('receiptsClean.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(parsed, jsonfile, ensure_ascii=False, indent=2)

# Unnest the first level only; deeper objects stay as dict/list cells.
receipts = pd.json_normalize(parsed, max_level=1)
# Unnesting produces dotted names ('parent.child'); keep the part before the
# first dot and strip spaces from it.
receipts = receipts.rename(columns=lambda x: x.split('.')[0].replace(' ', '') if '.' in x else x)

# Flattening Deeply Nested JSON Objects

After formatting our datasets into a desired state, our next step is to explode the deeply nested variables such as 'rewardsReceiptItemList' in an attempt to access the JSON arrays (or lists of receipts). From there, we ensure each item is embedded within a dictionary, convert them to a string, and feed our variable into the literal_eval function to detect the dictionaries within each receipt. Finally, we run json_normalize to unnest all keys and values and merge them back to their respective datasets by index. 

In [6]:
# One row per receipt item; reset_index() keeps the pre-explode index as a
# regular 'index' column.
receipts = receipts.explode('rewardsReceiptItemList').reset_index()

# Rows with no item get the text '{}'. Every entry is then rendered as a string
# and re-read with literal_eval, so each cell holds a Python literal again.
receipts = receipts.fillna({'rewardsReceiptItemList': '{}'})
receipts['rewardsReceiptItemList'] = (
    receipts['rewardsReceiptItemList']
    .apply(str)
    .apply(literal_eval)
)

In [7]:
# Flatten each receipt item into its own set of columns.
# NOTE(review): record_prefix and errors are passed without record_path/meta;
# confirm against the pandas docs that they have the intended effect here.
rewardsReceipts_flat = pd.json_normalize(
    receipts['rewardsReceiptItemList'],
    record_prefix='rewardsReceiptItemList',
    errors='ignore',
)

In [8]:
# Attach the flattened item columns to their receipt rows by row index.
receipts_clean = receipts.merge(
    rewardsReceipts_flat,
    how='outer',
    left_index=True,
    right_index=True,
)

In [9]:
# The nested item column has been expanded into its own columns, so drop it,
# then remove rows that are exact duplicates.
receipts_clean = (
    receipts_clean
    .drop(columns=['rewardsReceiptItemList'])
    .drop_duplicates()
)

In [1]:
#Read json files and make them dataframe for data wrangling
#users_test = pd.read_json('users.json', lines= True)
#receipts_test = pd.read_json('receipts.json', lines= True)
#brands_test = pd.read_json('brands.json', lines= True)