In [1]:
import pandas as pd
import numpy as np
import random
import json
import re
import os

from datetime import datetime

# IDENTIFY ALL JSON FILES

In [2]:
files = []

for f in os.listdir():
    if re.findall('.json', f) != []:
        files.append(f)
    else:
        pass

files

['brands.json', 'receipts.json', 'users.json']

# PARSE brands.json FILE

In [3]:
# READ + VIEW DATASTRUCTURE
df_brands = pd.read_json('brands.json', lines=True)
df_brands.head(3)

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176


In [4]:
# STRIP OUT NESTED DICTIONARY VALUES
df_brands['_id'] = df_brands['_id'].apply(lambda i: i['$oid'])
df_brands['cpg_id'] = df_brands['cpg'].apply(lambda i: i['$id']['$oid'])
df_brands['ref'] = df_brands['cpg'].apply(lambda i: i['$ref'])

In [5]:
# FILTER OUT COLS
cols = df_brands.columns.tolist()
cols.remove('cpg')
df_brands = df_brands[cols]

In [6]:
# VIEW DATASTRUCTURE
df_brands.head(3)

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,ref
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,0.0,,601ac114be37ce2ead437550,Cogs
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,0.0,STARBUCKS,5332f5fbe4b03c9a25efd0ba,Cogs
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,601ac142be37ce2ead437559,Cogs


# PARSE receipts.json FILE

In [7]:
# READ + VIEW DATASTRUCTURE
df_receipts = pd.read_json('receipts.json', lines=True)
df_receipts.head(3)

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b


In [8]:
# STRIP OUT NESTED DICTIONARY VALUES
df_receipts['_id'] = df_receipts['_id'].apply(lambda i: i['$oid'])

for c in [c for c in df_receipts.columns if re.findall('date', c.lower())!=[]]:
    df_receipts[c] = df_receipts[c].apply(lambda i: datetime.fromtimestamp(i['$date'] // 1000) if str(i) != str(np.nan) else i)

In [9]:
# GET LIST OF ALL DICTIONARY KEYS WITHIN rewardsReceiptItemList COL

keys = []

for i in df_receipts[~df_receipts.rewardsReceiptItemList.isnull()].reset_index(drop=True).rewardsReceiptItemList:
    keys.extend(i[0].keys())
    
keys = list(set(keys))

In [10]:
# CREATE SUBTABLE FOR rewardsReceiptItemList VALUES
df_receipts_sbtbl = pd.DataFrame(columns=['receipt_id']+keys, dtype='object')

In [11]:
# ASSIGN receipts.json '_id' AS DICTIONARY VALUE FOR rewardsReceiptItemList COL VALUES
for i in list(range(len(df_receipts))):
    if isinstance(df_receipts.rewardsReceiptItemList[i], list):
        df_receipts.rewardsReceiptItemList[i][0]['receipt_id'] = df_receipts._id[i]
    elif str(df_receipts.rewardsReceiptItemList[i]) == str(np.nan):
        df_receipts.rewardsReceiptItemList[i] = [{'receipt_id': df_receipts._id[i]}]
    else:
        print(df_receipts.rewardsReceiptItemList[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
# STRIP OUT NESTED DICTIONARY VALUES for rewardsReceiptItemList COL
for i in list(range(len(df_receipts))):
    for k in (keys + ['receipt_id']):
        try:
            df_receipts.rewardsReceiptItemList[i][0][k] =  [df_receipts.rewardsReceiptItemList[i][0][k]]
        except:
            pass
        
    df_hold = pd.DataFrame.from_dict(df_receipts.rewardsReceiptItemList[i][0])
    df_receipts_sbtbl = df_receipts_sbtbl.append(df_hold, sort=False).reset_index(drop=True)

In [13]:
# FILTER OUT COLS
cols = df_receipts.columns.tolist()
cols.remove('rewardsReceiptItemList')
df_receipts = df_receipts[cols]

In [14]:
# VIEW DATASTRUCTURE
df_receipts.head(3)

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 10:25:31,2021-01-03 10:25:31,2021-01-03 10:25:31,2021-01-03 10:25:36,2021-01-03 10:25:31,500.0,2021-01-02 19:00:00,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 10:24:43,2021-01-03 10:24:43,2021-01-03 10:24:43,2021-01-03 10:24:48,2021-01-03 10:24:43,150.0,2021-01-02 10:24:43,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 10:25:37,2021-01-03 10:25:37,NaT,2021-01-03 10:25:42,NaT,5.0,2021-01-02 19:00:00,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b


# PARSE users .json FILE

In [15]:
# READ + VIEW DATASTRUCTURE
df_users = pd.read_json('users.json', lines=True)
df_users.head(3)

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [16]:
# STRIP OUT NESTED DICTIONARY VALUES
df_users['_id'] = df_users['_id'].apply(lambda i: i['$oid'])
df_users['createdDate'] = df_users['createdDate'].apply(lambda i: datetime.fromtimestamp(i['$date'] // 1000) if str(i) != str(np.nan) else i)
df_users['lastLogin'] = df_users['lastLogin'].apply(lambda i: datetime.fromtimestamp(i['$date'] // 1000) if str(i) != str(np.nan) else i)

In [17]:
# REMOVE DUP '_id' RECORDS ('createdDate' takes precedence)

In [18]:
df_users = df_users.sort_values(by=['createdDate', '_id'], ascending=True).reset_index(drop=True)

In [19]:
df_users = df_users.drop_duplicates(subset='_id', keep='first')

In [20]:
# VIEW DATASTRUCTURE
df_users.head(3)

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,54943462e4b07e684157a532,True,2014-12-19 09:21:22,2021-03-05 11:52:23,fetch-staff,,
20,55308179e4b0eabd8f99caa2,True,2015-04-16 23:43:53,2018-05-07 13:23:40,consumer,,WI
21,5964eb07e4b03efd0c0f267b,True,2017-07-11 11:13:11,2021-03-04 14:07:49,fetch-staff,,IL


# REVIEW FINAL PARSED DATAFRAMES

In [21]:
[var for var in dir() if (isinstance(eval(var), pd.core.frame.DataFrame) and re.findall('df', var))]

['df_brands', 'df_hold', 'df_receipts', 'df_receipts_sbtbl', 'df_users']

In [22]:
df_brands.head(3)

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,ref
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,0.0,,601ac114be37ce2ead437550,Cogs
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,0.0,STARBUCKS,5332f5fbe4b03c9a25efd0ba,Cogs
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,601ac142be37ce2ead437559,Cogs


In [23]:
df_receipts.head(3)

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 10:25:31,2021-01-03 10:25:31,2021-01-03 10:25:31,2021-01-03 10:25:36,2021-01-03 10:25:31,500.0,2021-01-02 19:00:00,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 10:24:43,2021-01-03 10:24:43,2021-01-03 10:24:43,2021-01-03 10:24:48,2021-01-03 10:24:43,150.0,2021-01-02 10:24:43,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 10:25:37,2021-01-03 10:25:37,NaT,2021-01-03 10:25:42,NaT,5.0,2021-01-02 19:00:00,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b


In [24]:
df_receipts_sbtbl.head(3)

Unnamed: 0,receipt_id,originalMetaBriteQuantityPurchased,brandCode,partnerItemId,barcode,itemPrice,preventTargetGapPoints,originalReceiptItemText,needsFetchReviewReason,itemNumber,...,metabriteCampaignId,userFlaggedPrice,targetPrice,originalMetaBriteDescription,competitiveProduct,finalPrice,quantityPurchased,originalMetaBriteBarcode,rewardsProductPartnerId,deleted
0,5ff1e1eb0a720f0523000575,,,1,4011.0,26.0,True,,,,...,,26.0,,,,26.0,5.0,,,
1,5ff1e1bb0a720f052300056b,,,1,4011.0,1.0,,,,,...,,,,,,1.0,1.0,,,
2,5ff1e1f10a720f052300057a,,,1,,,True,,,,...,,26.0,,,,,,,,


In [25]:
df_users.head(3)

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,54943462e4b07e684157a532,True,2014-12-19 09:21:22,2021-03-05 11:52:23,fetch-staff,,
20,55308179e4b0eabd8f99caa2,True,2015-04-16 23:43:53,2018-05-07 13:23:40,consumer,,WI
21,5964eb07e4b03efd0c0f267b,True,2017-07-11 11:13:11,2021-03-04 14:07:49,fetch-staff,,IL


# WRITE DATAFRAMES TO CSV

In [26]:
if not os.path.exists('outputs'):
    os.makedirs('outputs')

In [27]:
df_brands.to_csv('./outputs/brands.csv', index=False)

In [28]:
df_receipts.to_csv('./outputs/receipts.csv', index=False)

In [31]:
df_receipts_sbtbl.to_csv('./outputs/receipts_v.csv', index=False)

In [30]:
df_users.to_csv('./outputs/users.csv', index=False)