### Load and Normalize the data.

In [51]:
import pandas as pd
import json

# Load and normalize users data
users_df = pd.read_json('/Users/vishruti/workspace/fetch-rewards-analysis/data_folder/users.json', lines=True)
users_df = pd.json_normalize(users_df.to_dict(orient='records'), sep='_')

# Load and normalize brands data
brands_df = pd.read_json('/Users/vishruti/workspace/fetch-rewards-analysis/data_folder/brands.json', lines=True)
brands_df = pd.json_normalize(brands_df.to_dict(orient='records'), sep='_')

# Load and normalize receipt data
receipts_df = pd.read_json('/Users/vishruti/workspace/fetch-rewards-analysis/data_folder/receipts.json', lines=True)
receipts_df = pd.json_normalize(receipts_df.to_dict(orient='records'), sep='_')

# Expand 'rewardsReceiptItemList' into separate rows
receipt_items_df = receipts_df.explode('rewardsReceiptItemList')
receipt_items_df = pd.json_normalize(receipt_items_df['rewardsReceiptItemList'])



### Structural overview of the data:

In [49]:
print("Users table:")
print(users_df.info())
print("\nBrands table:")
print(brands_df.info())
print("\nReceipts table:")
print(receipts_df)

Users table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   active             495 non-null    bool   
 1   role               495 non-null    object 
 2   signUpSource       447 non-null    object 
 3   state              439 non-null    object 
 4   _id_$oid           495 non-null    object 
 5   createdDate_$date  495 non-null    int64  
 6   lastLogin_$date    433 non-null    float64
 7   lastLogin          0 non-null      float64
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 27.7+ KB
None

Brands table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   barcode       1167 non-null   int64  
 1   category      1012 non-null   object 
 2   categoryCode  517 non-null  

### Missing values across all the datasets

In [50]:
print("\nMissing values for Users:", users_df.isnull().sum() )
print("\nMissing values for brands Data:", brands_df.isnull().sum() )
print("\nMissing values for Receipts Data:", receipts_df.isnull().sum())


Missing values for Users: active                 0
role                   0
signUpSource          48
state                 56
_id_$oid               0
createdDate_$date      0
lastLogin_$date       62
lastLogin            495
dtype: int64

Missing values for brands Data: barcode           0
category        155
categoryCode    650
name              0
topBrand        612
brandCode       234
_id_$oid          0
cpg_$id_$oid      0
cpg_$ref          0
dtype: int64

Missing values for Receipts Data: bonusPointsEarned           575
bonusPointsEarnedReason     575
pointsEarned                510
purchasedItemCount          484
rewardsReceiptItemList      440
rewardsReceiptStatus          0
totalSpent                  435
userId                        0
_id_$oid                      0
createDate_$date              0
dateScanned_$date             0
finishedDate_$date          551
modifyDate_$date              0
pointsAwardedDate_$date     582
purchaseDate_$date          448
finishedDate       

### Duplicate values across all datasets:

In [63]:
print("Duplicate values in Users Table:", users_df.duplicated().sum())
print("Duplicate values in Brands Table:", brands_df.duplicated().sum())
print("Duplicate values in Receipts Taable:", receipt_items_df.duplicated().sum())


Duplicate values in Users Table: 283
Duplicate values in Brands Table: 0
Duplicate values in Receipts Taable: 1358


### Evaluate any inconsistencies in date format

In [None]:
# converting columns to appropriate datetime format
users_df['createdDate.$date'] = pd.to_datetime(users_df['createdDate.$date'], errors='coerce')
receipts_df['dateScanned.$date'] = pd.to_datetime(receipts_df['dateScanned.$date'], errors='coerce')
receipts_df['createDate.$date'] = pd.to_datetime(receipts_df['createDate.$date'], errors='coerce')
receipts_df['finishedDate.$date'] = pd.to_datetime(receipts_df['finishedDate.$date'], errors='coerce')

# Find rows where date conversion failed
invalid_dates_users = users_df[users_df['createdDate.$date'].isna()]
invalid_dates_receipts = receipts_df[receipts_df['dateScanned.$date'].isna()]
invalid_dates_receipts = receipts_df[receipts_df['createDate.$date'].isna()]
invalid_dates_receipts = receipts_df[receipts_df['finishedDate.$date'].isna()]

if invalid_dates_users.empty:
    print("No inconsistent date formats found in users")
else:
    print("Inconsistent date formats found in users")
    print(invalid_dates_users)

if invalid_dates_receipts.empty:
    print("No inconsistent date formats found in receipts")
else:
    print("Inconsistent date formats found in receipts")
    print(invalid_dates_receipts)

No inconsistent date formats found in users
Inconsistent date formats found in receipts
      bonusPointsEarned                            bonusPointsEarnedReason  \
2                   5.0                         All-receipts receipt bonus   
12                150.0  Receipt number 5 completed, bonus point schedu...   
13                750.0  Receipt number 1 completed, bonus point schedu...   
15                  NaN                                                NaN   
17                750.0  Receipt number 1 completed, bonus point schedu...   
...                 ...                                                ...   
1114               25.0                        COMPLETE_NONPARTNER_RECEIPT   
1115                NaN                                                NaN   
1116                NaN                                                NaN   
1117               25.0                        COMPLETE_NONPARTNER_RECEIPT   
1118                NaN                               