# Data Exploration for brands.json, users.json, receipts.json

## Library Imports and Helper Code

In [33]:
import gzip
import json
import pandas as pd
import random
from collections import defaultdict


#Helper Functions to load data from gzip json
def read_json_from_gzip(file_path):
    """Load a gzip-compressed, newline-delimited JSON file into a list.

    Two passes are attempted:

    1. Strict pass: every non-blank line must be a valid JSON document.
    2. Lenient pass (only if the strict pass fails for any reason): any
       characters preceding the first ``{`` on the first line are dropped,
       and lines that still cannot be parsed are printed and skipped
       instead of aborting the load.

    Both passes decode the file as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: Path of the ``.gz`` file to read.

    Returns:
        A list with one parsed JSON value per successfully parsed line, or
        ``None`` if the file could not be opened/read at all (the error is
        printed in that case).
    """
    # Strict pass: succeed only if the whole file is clean NDJSON.
    try:
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]
    except Exception:
        # Any failure (bad line, bad header, missing file, ...) is retried
        # below with the lenient parser, which also reports the error.
        pass

    # Lenient pass: tolerate junk before the first record and bad lines.
    try:
        data = []
        with gzip.open(file_path, 'rt', encoding='utf-8') as f2:
            for line_no, line in enumerate(f2):
                if line_no == 0:
                    # Drop anything that precedes the first JSON object.
                    start = line.find('{')
                    if start > 0:
                        line = line[start:]
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except ValueError:
                    # json.JSONDecodeError is a ValueError; show the
                    # offending line and keep going.
                    print(line)
        return data
    except Exception as e:
        # Do not reference `line` here: it is unbound when opening fails.
        print(f"Error reading JSON from {file_path}: {e}")
        return None
    

## Exploring Brands

In [34]:
# Load the brands records (one parsed JSON object per line of the gzip file)
# and build a DataFrame with one row per record
brands_json= read_json_from_gzip('../data/raw_data/brands.json.gz')
brands_df = pd.DataFrame(brands_json)

In [35]:
# View 5 random elements in json.
# random.randrange excludes its upper bound, so every drawn index is a valid
# position in brands_json (randint(0, len(...)) could return len(...) itself
# and raise IndexError).
for i in range(5):
    ind = random.randrange(len(brands_json))
    print("Index", ind)
    print(brands_json[ind])
    print()

Index 104
{'_id': {'$oid': '57c0829ce4b0718ff5fcb03a'}, 'name': 'Victoria', 'cpg': {'$ref': 'Cpgs', '$id': {'$oid': '5332f7a7e4b03c9a25efd134'}}, 'barcode': '511111002437'}

Index 1116
{'_id': {'$oid': '5d66e07da3a018093ab3472d'}, 'name': 'Sierra Mist', 'cpg': {'$ref': 'Cogs', '$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}}, 'category': 'Beverages', 'barcode': '511111205500', 'brandCode': 'SIERRA MIST'}

Index 577
{'_id': {'$oid': '53501d29b9237fd7718f9e15'}, 'name': 'Tombstone', 'cpg': {'$ref': 'Cpgs', '$id': {'$oid': '5332fa58e4b03c9a25efd215'}}, 'barcode': '511111302858'}

Index 180
{'_id': {'$oid': '5ffe3a86be37ce7aab2d4fd7'}, 'name': 'test brand @1610496646289', 'cpg': {'$ref': 'Cogs', '$id': {'$oid': '5ffe3a85be37ce7aab2d4fd4'}}, 'category': 'Baking', 'categoryCode': 'BAKING', 'barcode': '511111719519', 'brandCode': 'TEST BRANDCODE @1610496646289', 'topBrand': False}

Index 757
{'_id': {'$oid': '5fb7f9c2be37ce522e165cc8'}, 'barcode': '511111517511', 'brandCode': 'ABSOLUT ELYX', 'cat

In [36]:
# Display the brands DataFrame (nested fields such as _id and cpg are still dicts here)
brands_df

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,False,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,False,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,False,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,False,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,False,TEST BRANDCODE @1612366146827
...,...,...,...,...,...,...,...,...
1162,{'$oid': '5f77274dbe37ce6b592e90c0'},511111116752,Baking,BAKING,"{'$ref': 'Cogs', '$id': {'$oid': '5f77274dbe37...",test brand @1601644365844,,
1163,{'$oid': '5dc1fca91dda2c0ad7da64ae'},511111706328,Breakfast & Cereal,,"{'$ref': 'Cogs', '$id': {'$oid': '53e10d6368ab...",Dippin DotsÂ® Cereal,,DIPPIN DOTS CEREAL
1164,{'$oid': '5f494c6e04db711dd8fe87e7'},511111416173,Candy & Sweets,CANDY_AND_SWEETS,"{'$ref': 'Cogs', '$id': {'$oid': '5332fa12e4b0...",test brand @1598639215217,,TEST BRANDCODE @1598639215217
1165,{'$oid': '5a021611e4b00efe02b02a57'},511111400608,Grocery,,"{'$ref': 'Cogs', '$id': {'$oid': '5332f5f6e4b0...",LIPTON TEA Leaves,False,LIPTON TEA Leaves


In [37]:
# Summarise the brands columns: name, non-null count and dtype of each
brands_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   _id           1167 non-null   object
 1   barcode       1167 non-null   object
 2   category      1012 non-null   object
 3   categoryCode  517 non-null    object
 4   cpg           1167 non-null   object
 5   name          1167 non-null   object
 6   topBrand      555 non-null    object
 7   brandCode     933 non-null    object
dtypes: object(8)
memory usage: 73.1+ KB


In [38]:
# Count the missing (null) entries in each brands column
brands_df.isna().sum()

_id               0
barcode           0
category        155
categoryCode    650
cpg               0
name              0
topBrand        612
brandCode       234
dtype: int64

## Exploring Users

In [39]:
# Load the users records (one parsed JSON object per line of the gzip file)
# and build a DataFrame with one row per record
users_json= read_json_from_gzip('../data/raw_data/users.json.gz')
users_df = pd.DataFrame(users_json)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [40]:
# View 5 random elements in json.
# random.randrange excludes its upper bound, so every drawn index is a valid
# position in users_json (randint(0, len(...)) could return len(...) itself
# and raise IndexError).
for i in range(5):
    ind = random.randrange(len(users_json))
    print("Index", ind)
    print(users_json[ind])
    print()

Index 263
{'_id': {'$oid': '600992d17d983a11f63d11ad'}, 'active': True, 'createdDate': {'$date': 1611240145107}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}

Index 151
{'_id': {'$oid': '5fff0f4fb3348b03eb45abb0'}, 'active': True, 'createdDate': {'$date': 1610551119070}, 'lastLogin': {'$date': 1610551322153}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}

Index 146
{'_id': {'$oid': '5fff55dabd4dff11dda8f5f1'}, 'active': True, 'createdDate': {'$date': 1610569178721}, 'lastLogin': {'$date': 1610569406975}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}

Index 243
{'_id': {'$oid': '60088a46b6310511daa4ec97'}, 'active': True, 'createdDate': {'$date': 1611172422902}, 'lastLogin': {'$date': 1611172422948}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}

Index 382
{'_id': {'$oid': '60186237c8b50e11d8454d5f'}, 'active': True, 'createdDate': {'$date': 1612210743551}, 'role': 'consumer', 'signUpSource': 'Email'}



In [50]:
# Display the users DataFrame.
# NOTE(review): the saved output shows _id and the date columns already
# flattened, and this cell's execution count is higher than the flattening
# cells that follow it, so it appears to have been run after them.
users_df

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1.609688e+12,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1.609688e+12,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1.609688e+12,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1609687530554,1.609688e+12,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1.609688e+12,consumer,Email,WI
...,...,...,...,...,...,...,...
490,54943462e4b07e684157a532,True,1418998882381,1.614963e+12,fetch-staff,,
491,54943462e4b07e684157a532,True,1418998882381,1.614963e+12,fetch-staff,,
492,54943462e4b07e684157a532,True,1418998882381,1.614963e+12,fetch-staff,,
493,54943462e4b07e684157a532,True,1418998882381,1.614963e+12,fetch-staff,,


In [43]:
# Replace the {'$oid': ...} wrapper in _id with the plain id string.
# The isinstance guard leaves already-flattened values untouched, so the
# cell can be re-run without raising on string ids.
users_df['_id'] = users_df['_id'].apply(
    lambda x: x['$oid'] if isinstance(x, dict) and '$oid' in x else x
)

In [49]:
# Replace the {'$date': ...} wrappers in both date columns with the bare
# value they hold. createdDate must be flattened too: dict cells are
# unhashable, which breaks the later drop_duplicates() on a fresh kernel.
# The guard passes through missing values and already-flattened numbers,
# so the cell is safe to re-run.
for date_col in ('createdDate', 'lastLogin'):
    users_df[date_col] = users_df[date_col].apply(
        lambda x: x['$date'] if isinstance(x, dict) and '$date' in x else x
    )


In [51]:
# Preview the users with fully duplicated rows removed; the result is only
# displayed, not assigned, so users_df itself keeps its duplicates
users_df.drop_duplicates()


Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1.609688e+12,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1609687530554,1.609688e+12,consumer,Email,WI
6,5ff1e1e8cfcf6c399c274ad9,True,1609687528354,1.609688e+12,consumer,Email,WI
7,5ff1e1b7cfcf6c399c274a5a,True,1609687479626,1.609687e+12,consumer,Email,WI
9,5ff1e1f1cfcf6c399c274b0b,True,1609687537564,1.609688e+12,consumer,Email,WI
...,...,...,...,...,...,...,...
435,5fc961c3b8cfca11a077dd33,True,1607033283936,1.614379e+12,fetch-staff,Email,NH
455,5fa41775898c7a11a6bcef3e,True,1604589429396,1.614874e+12,fetch-staff,Email,
456,5fa32b4d898c7a11a6bcebce,True,1604528973309,1.614843e+12,fetch-staff,Google,AL
462,5964eb07e4b03efd0c0f267b,True,1499785991771,1.614885e+12,fetch-staff,,IL


In [16]:
# Summarise the users columns: name, non-null count and dtype of each
users_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   _id           495 non-null    object
 1   active        495 non-null    bool  
 2   createdDate   495 non-null    object
 3   lastLogin     433 non-null    object
 4   role          495 non-null    object
 5   signUpSource  447 non-null    object
 6   state         439 non-null    object
dtypes: bool(1), object(6)
memory usage: 23.8+ KB


In [21]:
# Tally how often each (category, categoryCode) pair occurs across the brand
# records; "__" is the placeholder for a key that is absent from a record
s1 = defaultdict(int)
for brand in brands_json:
    pair = (brand.get("category", "__"), brand.get("categoryCode", "__"))
    s1[pair] += 1

dict(s1)

{('Baking', 'BAKING'): 359,
 ('Beverages', 'BEVERAGES'): 1,
 ('Candy & Sweets', 'CANDY_AND_SWEETS'): 71,
 ('Condiments & Sauces', '__'): 27,
 ('Canned Goods & Soups', '__'): 12,
 ('Baking', '__'): 10,
 ('__', '__'): 155,
 ('Magazines', '__'): 43,
 ('Breakfast & Cereal', '__'): 40,
 ('Beer Wine Spirits', '__'): 59,
 ('Health & Wellness', 'HEALTHY_AND_WELLNESS'): 14,
 ('Beauty', '__'): 9,
 ('Baby', '__'): 11,
 ('Frozen', '__'): 23,
 ('Grocery', '__'): 28,
 ('Snacks', '__'): 75,
 ('Household', '__'): 5,
 ('Beverages', '__'): 62,
 ('Personal Care', '__'): 16,
 ('Health & Wellness', '__'): 30,
 ('Grocery', 'GROCERY'): 11,
 ('Dairy', '__'): 33,
 ('Personal Care', 'PERSONAL_CARE'): 4,
 ('Cleaning & Home Improvement', 'CLEANING_AND_HOME_IMPROVEMENT'): 6,
 ('Deli', '__'): 6,
 ('Beer Wine Spirits', 'BEER_WINE_SPIRITS'): 31,
 ('Beauty & Personal Care', '__'): 6,
 ('Baby', 'BABY'): 7,
 ('Bread & Bakery', 'BREAD_AND_BAKERY'): 5,
 ('Outdoor', 'OUTDOOR'): 1,
 ('Dairy & Refrigerated', 'DAIRY_AND_REFRI

In [24]:
# Print the first brand record whose 'cpg' value has more than one key
for dat in brands_json:
    if 'cpg' not in dat or len(dat["cpg"]) <= 1:
        continue
    print(dat)
    break

{'_id': {'$oid': '601ac115be37ce2ead437551'}, 'barcode': '511111019862', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'}, 'name': 'test brand @1612366101024', 'topBrand': False}


## Exploring Receipts

In [22]:
# Load the receipts records (one parsed JSON object per line of the gzip file)
# and build a DataFrame with one row per record
receipts_json= read_json_from_gzip('../data/raw_data/receipts.json.gz')
receipts_df = pd.DataFrame(receipts_json)

In [23]:
# Inspect the first receipt record to see its nested structure
receipts_json[0]

{'_id': {'$oid': '5ff1e1eb0a720f0523000575'},
 'bonusPointsEarned': 500,
 'bonusPointsEarnedReason': 'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)',
 'createDate': {'$date': 1609687531000},
 'dateScanned': {'$date': 1609687531000},
 'finishedDate': {'$date': 1609687531000},
 'modifyDate': {'$date': 1609687536000},
 'pointsAwardedDate': {'$date': 1609687531000},
 'pointsEarned': '500.0',
 'purchaseDate': {'$date': 1609632000000},
 'purchasedItemCount': 5,
 'rewardsReceiptItemList': [{'barcode': '4011',
   'description': 'ITEM NOT FOUND',
   'finalPrice': '26.00',
   'itemPrice': '26.00',
   'needsFetchReview': False,
   'partnerItemId': '1',
   'preventTargetGapPoints': True,
   'quantityPurchased': 5,
   'userFlaggedBarcode': '4011',
   'userFlaggedNewItem': True,
   'userFlaggedPrice': '26.00',
   'userFlaggedQuantity': 5}],
 'rewardsReceiptStatus': 'FINISHED',
 'totalSpent': '26.00',
 'userId': '5ff1e1eacfcf6c399c274ae6'}

In [25]:
# List the top-level receipt columns
receipts_df.columns

Index(['_id', 'bonusPointsEarned', 'bonusPointsEarnedReason', 'createDate',
       'dateScanned', 'finishedDate', 'modifyDate', 'pointsAwardedDate',
       'pointsEarned', 'purchaseDate', 'purchasedItemCount',
       'rewardsReceiptItemList', 'rewardsReceiptStatus', 'totalSpent',
       'userId'],
      dtype='object')

In [26]:
# Display the receipts DataFrame (ids, dates and the item list are still nested here)
receipts_df

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.00,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.00,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.00,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.00,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,{'$oid': '603cc0630a720fde100003e6'},25.0,COMPLETE_NONPARTNER_RECEIPT,{'$date': 1614594147000},{'$date': 1614594147000},,{'$date': 1614594148000},,25.0,{'$date': 1597622400000},2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33
1115,{'$oid': '603d0b710a720fde1000042a'},,,{'$date': 1614613361873},{'$date': 1614613361873},,{'$date': 1614613361873},,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1116,{'$oid': '603cf5290a720fde10000413'},,,{'$date': 1614607657664},{'$date': 1614607657664},,{'$date': 1614607657664},,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1117,{'$oid': '603ce7100a7217c72c000405'},25.0,COMPLETE_NONPARTNER_RECEIPT,{'$date': 1614604048000},{'$date': 1614604048000},,{'$date': 1614604049000},,25.0,{'$date': 1597622400000},2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33


In [31]:
# Count, for every field name, how many receipt line items contain it
keysMap = defaultdict(lambda: 0)
for receipt in receipts_json:
    for item in receipt.get('rewardsReceiptItemList', ()):
        for field in item:
            keysMap[field] += 1
keysMap

defaultdict(<function __main__.<lambda>()>,
            {'barcode': 3090,
             'description': 6560,
             'finalPrice': 6767,
             'itemPrice': 6767,
             'needsFetchReview': 813,
             'partnerItemId': 6941,
             'preventTargetGapPoints': 358,
             'quantityPurchased': 6767,
             'userFlaggedBarcode': 337,
             'userFlaggedNewItem': 323,
             'userFlaggedPrice': 299,
             'userFlaggedQuantity': 299,
             'needsFetchReviewReason': 219,
             'pointsNotAwardedReason': 340,
             'pointsPayerId': 1267,
             'rewardsGroup': 1731,
             'rewardsProductPartnerId': 2269,
             'userFlaggedDescription': 205,
             'originalMetaBriteBarcode': 71,
             'originalMetaBriteDescription': 10,
             'brandCode': 2600,
             'competitorRewardsGroup': 275,
             'discountedItemPrice': 5769,
             'originalReceiptItemText': 5760,
   