In [22]:
import gzip
import json
import shutil
import os
from ast import literal_eval
import pandas as pd
from datetime import datetime

In [23]:
# Unzip the gzipped JSON exports found in the current working directory.
# Only names that end in '.gz' (and mention 'json') are processed: a plain
# '.json' file would otherwise be opened for writing over itself and truncated,
# because removing '.gz' from its name leaves the name unchanged.
for f in os.listdir():
    if f.endswith('.gz') and 'json' in f:
        with gzip.open(f, 'rb') as f_in:
            # Strip just the trailing '.gz' so 'users.json.gz' becomes 'users.json'.
            with open(f[:-len('.gz')], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

---
## Data Wrangling

Because invalid formatting was found in the JSON files during data modeling, the code below was used to validate and clean the JSON data for processing. First, run **python -m json.tool filename** on the command line to check whether the file is a valid JSON document. If the file is invalid, the command reports an error; otherwise, the whole file is printed.

If an error is confirmed, run the data pipeline below to produce a clean JSON file. The pipeline uses JSONDecoder.raw_decode() (and its undocumented second parameter, the position at which decoding starts) to traverse the data iteratively, decoding one valid JSON structure at a time and collecting every structure it is able to decode. A nice benefit of the built-in json module is that it parses the data properly even if the concatenated JSON structures are not properly indented or the delimiters between them are missing.

Once all our JSON data has been parsed, the result is written out to a file and the parsed data is unnested at the first level. This should help us identify which JSON objects need to be flattened even further.

**Please note that even after running the JSON files through the data pipeline, the data will still be structured as a JSON array (a list in Python) rather than a single standard JSON object (a dict in Python).**

In [24]:
def jsonFormatter(filename, parsed= None, parser= None): 
    parser = json.JSONDecoder() 
    parsed = [] # a list to hold individually parsed JSON structures
    with open('{filename}.json'.format(filename = filename)) as f: 
        data = f.read() 
        head = 0 # hold the current position as we parse while True: 
        while True:
            head = (data.find('{', head) + 1 or data.find('[', head) + 1) - 1
            try:
                struct, head = parser.raw_decode(data, head)
                parsed.append(struct)
            except (ValueError, json.JSONDecodeError):  # no more valid JSON structures
                break

    with open('{filename}Clean.json'.format(filename = filename), 'w', encoding='utf-8') as jsonfile: # Parsed file is outputted for documentation
        json.dump(parsed, jsonfile, ensure_ascii=False, indent=2)

        df = pd.json_normalize(parsed, max_level = 1) # objects unnested
        df.rename(columns=lambda x: x.split('.')[0].replace(' ','') if '.' in x else x, inplace= True) #removing json keys in column name
        return df

In [25]:
# Load the Fetch Rewards datasets: users, receipts and brands, in that order.
users, receipts, brands = [
    jsonFormatter(datasetName) for datasetName in ('users', 'receipts', 'brands')
]

# Flattening Deeply Nested JSON Objects

After formatting our datasets into a desired state, our next step is to explode the deeply nested variables such as 'rewardsReceiptItemList' in an attempt to access the JSON arrays (or lists of receipts). From there, we ensure each item, specifically NAs, is embedded within lists, convert them to strings, and feed our variable into the literal_eval function to detect the individualized items within each receipt. Finally, we run json_normalize to unnest all keys and values and merge them back to their respective datasets by index. 

## Receipts

In [26]:
# Put the receipts columns in sorted order.
receipts = receipts.reindex(columns=sorted(receipts.columns))

In [27]:
# One row per receipt item: explode the nested item list, move the old index
# into a regular column, and give rows without any item the placeholder '{}'.
receipts = (
    receipts
    .explode('rewardsReceiptItemList')
    .reset_index()
    .fillna({'rewardsReceiptItemList': '{}'})
)

# Turn every entry into its string form and evaluate it back as a Python
# literal, so the '{}' placeholders become empty dictionaries like the rest.
receipts['rewardsReceiptItemList'] = (
    receipts['rewardsReceiptItemList'].apply(str).apply(literal_eval)
)

In [28]:
# Unnest the per-item dictionaries into columns (ideally this would be done
# with `meta`), rename the item-level 'pointsEarned' column, and put the
# resulting columns in sorted order.
rewardsReceiptsFlat = (
    pd.json_normalize(
        receipts['rewardsReceiptItemList'],
        errors='ignore',
        record_prefix='rewardsReceiptItemList',
    )
    .rename(columns={'pointsEarned': 'pointsEarnedReceipt'})
    .pipe(lambda flat: flat.reindex(columns=sorted(flat.columns)))
)

In [29]:
# Attach the flattened item columns to the receipts by index, then discard the
# nested column they were derived from.
receipts_clean = (
    receipts
    .merge(rewardsReceiptsFlat, left_index=True, right_index=True, how='outer')
    .drop(columns=['rewardsReceiptItemList'])
)

## Brands

In [30]:
# Make duplicated column names unique: the first occurrence keeps its name and
# each later occurrence gets its running count appended ('name', 'name1', ...).
s = pd.Series(brands.columns)
duplicateRank = s.groupby(s).cumcount().astype(str).replace('0', '')
brands.columns = brands.columns + duplicateRank

# Put the (now unique) brands columns in sorted order.
brands = brands.reindex(columns=sorted(brands.columns))

In [31]:
# Unnest the values held in brands['cpg'] into their own columns and prefix
# every resulting column name with 'cpgId'.
cpgFlatId = pd.json_normalize(
    brands['cpg'],
    errors='ignore',
    record_prefix='cpg',
    max_level= 1,
).add_prefix('cpgId')

In [32]:
# Attach the flattened cpg columns to the brands by index, then discard the
# nested 'cpg' column they were derived from.
brands_clean = (
    brands
    .merge(cpgFlatId, left_index=True, right_index=True, how='outer')
    .drop(columns=['cpg'])
)

## UNIX Timestamp to DateTime

In [516]:
def dateConverter(x):
    """Convert a UNIX timestamp in milliseconds to a UTC date-time string.

    Parameters
    ----------
    x : int or float
        Milliseconds since the UNIX epoch.

    Returns
    -------
    str or pandas.NaT
        The moment formatted as ``'%Y-%m-%d %H:%M:%S'`` in UTC, or ``pd.NaT``
        when ``x`` cannot be converted (missing value, non-numeric value, or a
        timestamp outside the supported range).
    """
    # Imported here so the function does not depend on an edit to the import cell.
    from datetime import timezone

    try:
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
        moment = datetime.fromtimestamp(x / 1000, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures are mapped to NaT; anything else (for
        # example KeyboardInterrupt) is allowed to propagate.
        return pd.NaT

In [517]:
# Run dateConverter over every receipts column whose name contains 'date'
# (case-insensitive).
for col in [name for name in receipts_clean.columns if 'date' in name.lower()]:
    receipts_clean[col] = receipts_clean[col].apply(dateConverter)

In [518]:
# Remove the columns in which every value is missing.
receipts_clean = receipts_clean.dropna(axis='columns', how='all')

In [519]:
# Remove fully duplicated rows, keeping the first occurrence of each.
receipts_clean = receipts_clean.drop_duplicates(keep='first')

# Users

In [520]:
# Put the users columns in sorted order.
users = users.reindex(columns=sorted(users.columns))

In [521]:
# Run dateConverter over every users column whose name contains 'date'
# (case-insensitive).
for col in [name for name in users.columns if 'date' in name.lower()]:
    users[col] = users[col].apply(dateConverter)

In [522]:
# Remove the users columns in which every value is missing.
user_clean = users.dropna(axis='columns', how='all')

In [411]:
# Deduplicate the frame produced by the previous cell (`user_clean`, the users
# table without its all-missing columns) rather than the raw `users` frame, so
# that the column clean-up is not discarded -- the same two-step clean-up that
# is applied to receipts_clean.
users_clean = user_clean.drop_duplicates()