## 1. Data Cleaning

In [1]:
import gzip
import json
import uuid
import warnings
from functools import partial
from io import StringIO

import pandas as pd
from pandas import json_normalize
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

### create brandTable

In [2]:
#read data
# The brands export is gzip-compressed with one JSON document per line.
filename = 'brands.json.gz'

with gzip.open(filename, 'rb') as gzip_file:
    # Strip trailing whitespace and ignore lines that end up empty.
    stripped_lines = (raw.rstrip() for raw in gzip_file)
    json_content = [json.loads(text) for text in stripped_lines if text]

# Re-serialise all parsed records as one JSON array string.
data = json.dumps(json_content)

In [3]:
#clean brand data
# Wrap the JSON text in StringIO: handing a literal JSON string to
# pd.read_json is deprecated in recent pandas releases.
df_brand = pd.read_json(StringIO(data), orient='records')
# Pull the scalar values out of the nested {'$id': {'$oid': ...}, '$ref': ...}
# and {'$oid': ...} wrappers.
df_brand['cpg_oid'] = df_brand['cpg'].apply(lambda x: x['$id']['$oid'])
df_brand['cpg_ref'] = df_brand['cpg'].apply(lambda x: x['$ref'])
df_brand['_id'] = df_brand['_id'].apply(lambda x: x['$oid'])

#prepare brandTable
# Take an explicit copy and assign the filled column back. An in-place fillna
# on an attribute of a column subset may act on a temporary and leave
# brandTable unchanged.
brandTable = df_brand[['_id', 'brandCode', 'topBrand', 'name']].copy()
# A missing brandCode falls back to the upper-cased brand name.
brandTable['brandCode'] = brandTable['brandCode'].fillna(brandTable['name'].str.upper())
brandTable.to_csv('brandTable.csv')

### create CategoriesTable

In [4]:
#clean brand data
# Explicit copy so the column assignments below modify this table and not a
# temporary view of df_brand.
PartnerItemTable = df_brand[['barcode', 'cpg_oid', 'cpg_ref', 'categoryCode', 'category', '_id']].copy()
PartnerItemTable['barcode'] = PartnerItemTable['barcode'].astype(str)
# A missing categoryCode falls back to the upper-cased category name, then every
# '&' inside the code is replaced by '_'. Series.replace('&', '_') would only
# touch cells that are exactly '&', so the string accessor is required here.
PartnerItemTable['categoryCode'] = (
    PartnerItemTable['categoryCode']
    .fillna(PartnerItemTable['category'].str.upper())
    .str.replace('&', '_', regex=False)
)

#prepare CategoriesTable
CategoriesTable = PartnerItemTable[['categoryCode', 'category']].drop_duplicates()
# Drop the pair in which both the code and the name are missing.
CategoriesTable = CategoriesTable.dropna(how='all')
CategoriesTable.to_csv('categories.csv')

In [5]:
#prepare CategoriesTable
# Distinct (categoryCode, category) pairs, without the pair where both values
# are missing, written to categories.csv.
category_pairs = PartnerItemTable[['categoryCode', 'category']]
CategoriesTable = category_pairs.drop_duplicates().dropna(how='all')
CategoriesTable.to_csv('categories.csv')

### create PartnerItemTable

In [6]:
#create a primary key:itemid for PartnerItemTable
# uuid5 is deterministic: the same "<brand _id>_<barcode>" text always yields
# the same identifier.
p = partial(uuid.uuid5, uuid.NAMESPACE_DNS)
item_key = PartnerItemTable['_id'] + '_' + PartnerItemTable['barcode']

#prepare PartnerItemTable
PartnerItemTable = (
    PartnerItemTable
    .assign(id=item_key.apply(p))
    .rename(columns={'id': 'itemID', '_id': 'brandID'})
)
export_columns = ['barcode', 'cpg_oid', 'cpg_ref', 'categoryCode', 'itemID', 'brandID']
PartnerItemTable = PartnerItemTable[export_columns]
PartnerItemTable.to_csv('PartnerItemTable.csv')

### create ReceiptTable 

In [7]:
#read receipt data
# The receipts export is gzip-compressed with one JSON document per line.
filename = 'receipts.json.gz'  # Sample file.

with gzip.open(filename, 'rb') as gzip_file:
    # Strip trailing whitespace and ignore lines that end up empty.
    stripped_lines = (raw.rstrip() for raw in gzip_file)
    json_content = [json.loads(text) for text in stripped_lines if text]

# Re-serialise all parsed records as one JSON array string.
data = json.dumps(json_content)

In [8]:
#clean receipt data
def _extract_date(value):
    """Return value['$date'], or value unchanged when it is null."""
    return value if pd.isnull(value) else value['$date']


# Wrap the JSON text in StringIO: handing a literal JSON string to
# pd.read_json is deprecated in recent pandas releases.
df_receipt = pd.read_json(StringIO(data), orient='records')
df_receipt['_id'] = df_receipt['_id'].apply(lambda x: x['$oid'])
# Counts repeated receipt ids; the result is not stored or used.
df_receipt.duplicated(subset=['_id']).sum()

# Unwrap every {'$date': ...} column with the same null-safe helper so a
# missing value in any of them is passed through instead of raising.
for date_column in ('createDate', 'dateScanned', 'finishedDate',
                    'modifyDate', 'purchaseDate', 'pointsAwardedDate'):
    df_receipt[date_column] = df_receipt[date_column].apply(_extract_date)

#prepare receiptTable
receiptTable = df_receipt[['_id', 'createDate', 'dateScanned', 'finishedDate', 'modifyDate', 'pointsAwardedDate',
            'bonusPointsEarnedReason', 'purchaseDate', 'rewardsReceiptStatus', 'totalSpent', 'purchasedItemCount', 'userId']]
receiptTable.to_csv('receiptTable.csv')

### create ScanneditemTable

In [9]:
#get the records whose rewardsReceiptItemList is not null
df_receipt['is_emptyitemList'] = df_receipt['rewardsReceiptItemList'].isnull()
temp = df_receipt[['_id', 'rewardsReceiptItemList', 'is_emptyitemList', 'rewardsReceiptStatus']]
# Copy so that columns added to itemList later do not write into a view.
itemList = temp[~temp['is_emptyitemList']].copy()

#get all the columns that appear in the 'rewardsReceiptItemList'
# Every item of every receipt is inspected (not only the first item), and no
# positional or label lookup is used, so an empty item list or a receipt
# without index label 0 cannot raise. dict.fromkeys keeps first-seen order
# while removing repeats.
keyList = list(dict.fromkeys(
    key
    for row in itemList['rewardsReceiptItemList']
    for item in row
    for key in item
))

# Empty frame that fixes the column set before the per-receipt frames are added.
df_all = pd.DataFrame(columns=keyList)

In [10]:
# convert the 'rewardsReceiptItemList' which is in json format to pandas dataframe
itemList_index = itemList.index.tolist()
# Build one frame per receipt, then concatenate them in a single call.
# Concatenating inside the loop re-copies the accumulated frame on every
# iteration.
# assumes json_content still holds the receipt records and that the index
# labels of itemList are positions into it - TODO confirm
receiptList = [
    json_normalize(json_content[index]['rewardsReceiptItemList'])
    for index in itemList_index
]
df_all = pd.concat([df_all, *receiptList])

In [11]:
#get the receiptids for each record
# Number of items on each receipt, and the receipt ids in the same row order.
itemList['lenght'] = itemList['rewardsReceiptItemList'].apply(len)
numList = itemList['lenght'].tolist()
idList = itemList['_id'].tolist()

# Repeat every receipt id once per item on that receipt.
allidList = [
    receipt_id
    for receipt_id, item_count in zip(idList, numList)
    for _ in range(item_count)
]

In [12]:
# merge the brandTable and df_all on 'brandCode'.
df_all.reset_index(drop=True, inplace=True)
df_all['receipt_id'] = pd.Series(allidList)

# Rows without a brandCode are kept out of the merge and re-attached afterwards
# with an empty brand id.
has_brand_code = df_all['brandCode'].notnull()
df_all_brandCode = df_all[has_brand_code]
# Copy so the '_id' assignment below does not write into a view of df_all.
df_all_nullbrandCode = df_all[~has_brand_code].copy()

# One brand id per brandCode: if brandTable lists a code more than once, a left
# merge would emit the scanned item once per matching brand row. The first
# brand row for a code is the one that is kept.
brand_lookup = brandTable[['_id', 'brandCode']].drop_duplicates(subset='brandCode')
df_all = pd.merge(df_all_brandCode, brand_lookup, how='left', on='brandCode')

df_all_nullbrandCode['_id'] = None
df_all = pd.concat([df_all, df_all_nullbrandCode])

In [13]:
#prepare scanneditemTable
scanneditemTable = df_all
# itemID is derived from the row position after sorting, so use a stable sort:
# rows sharing a receipt_id keep their existing relative order and the ids are
# reproducible between runs.
scanneditemTable.sort_values('receipt_id', kind='mergesort', inplace=True)
scanneditemTable.reset_index(drop=True, inplace=True)
# 1-based surrogate key.
scanneditemTable['itemID'] = scanneditemTable.index + 1
scanneditemTable.rename(columns={'_id': 'brandID'}, inplace=True)
scanneditemTable.to_csv('scanneditemTable.csv')

### create usersTable

In [14]:
#read user data
# The users export is gzip-compressed with one JSON document per line.
filename = 'users.json.gz'

with gzip.open(filename, 'rb') as gzip_file:
    # Strip trailing whitespace and ignore lines that end up empty.
    stripped_lines = (raw.rstrip() for raw in gzip_file)
    json_content = [json.loads(text) for text in stripped_lines if text]

data = json.dumps(json_content)
# Wrap the JSON text in StringIO: handing a literal JSON string to
# pd.read_json is deprecated in recent pandas releases.
df_users = pd.read_json(StringIO(data), orient='records')

In [15]:
#clean users data
# Replace the {'$oid': ...} / {'$date': ...} wrappers by the values they hold.
df_users['_id'] = df_users['_id'].apply(lambda x: x['$oid'])
df_users['createdDate'] = df_users['createdDate'].apply(lambda x: x['$date'])
# lastLogin may be missing; null values are passed through unchanged.
df_users['lastLogin'] = df_users['lastLogin'].apply(lambda x: x if pd.isnull(x) else x['$date'])

#prepare userTable
# Remove rows that are identical in every column.
UsersTable = df_users.drop_duplicates()
# A missing lastLogin is stored as 0 so the column can be cast to int64.
login_filled = UsersTable['lastLogin'].fillna(0)
UsersTable['lastLogin'] = login_filled
UsersTable['lastLogin'] = UsersTable['lastLogin'].astype('int64')
UsersTable.to_csv('usersTable.csv')

## 2. Create a relational database

In [None]:
import sqlite3
from sqlite3 import Error
import pandas as pd
import numpy as np

def create_connection(db_file):
    """Open a connection to the SQLite database stored in ``db_file``.

    :param db_file: path of the database file
    :return: the Connection object, or None when connecting raised a
        sqlite3 Error (the error is printed)
    """
    try:
        return sqlite3.connect(db_file)
    except Error as e:
        print(e)
        return None

def create_table(conn, create_table_sql):
    """Execute a CREATE TABLE statement on the given connection.

    A sqlite3 Error raised while executing the statement is printed and
    not propagated.

    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return: None
    """
    try:
        cursor = conn.cursor()
        cursor.execute(create_table_sql)
    except Error as e:
        print(e)



def main(): ##changed database name
    """Create the SQLite schema and load the six exported CSV files into it.

    The tables are created in dependency order (referenced tables first),
    every CSV is read with its first column as the index, and each frame is
    appended to its table without the index. The connection is closed even
    when reading or inserting raises.

    :return: None
    """
    database = "fetchrewards.db"

    brand_table = """ CREATE TABLE IF NOT EXISTS brand ( 
                                _id TEXT PRIMARY KEY, 
                                brandCode TEXT, 
                                topBrand INTEGER, 
                                name TEXT
                             ); """

    categories_table = """ CREATE TABLE IF NOT EXISTS categories (
                            categoryCode TEXT PRIMARY KEY, 
                            category TEXT 
                      ); """

    partneritem_table = """ CREATE TABLE IF NOT EXISTS partner_item ( 
                                itemID TEXT PRIMARY KEY, 
                                cpg_oid TEXT,
                                cpg_ref TEXT,
                                barcode TEXT,
                                categoryCode TEXT,
                                brandID TEXT,
                                FOREIGN KEY(brandID) REFERENCES brand(_id),
                                FOREIGN KEY(categoryCode) REFERENCES categories(categoryCode)
                           ); """

    users_table = """ CREATE TABLE IF NOT EXISTS users ( 
                            _id TEXT PRIMARY KEY, 
                            active BOOLEAN,
                            createdDate INTEGER,
                            lastLogin INTEGER,
                            role TEXT,
                            signUPSource TEXT,
                            state TEXT
                         ); """

    receipts_table = """ CREATE TABLE IF NOT EXISTS receipts ( 
                            _id TEXT PRIMARY KEY, 
                            createDate INT,
                            dateScanned INT,
                            finishedDate REAL,
                            modifyDate INT,
                            pointsAwardedDate REAL,
                            bonusPointsEarnedReason TEXT,
                            purchaseDate REAL,
                            rewardsReceiptStatus TEXT,
                            totalSpent REAL, 
                            purchasedItemCount REAL,
                            userId TEXT,
                            FOREIGN KEY(userId) REFERENCES users(_id)
                          ); """

    # Fixes relative to the earlier statement:
    #  * brandID was declared twice (INTEGER and TEXT); SQLite rejects a
    #    duplicate column name, so the table was never created from this DDL.
    #    Only the TEXT declaration is kept, matching brand._id.
    #  * the receipt_id foreign key pointed at "receipt"; the table created
    #    above is named "receipts".
    #  * "orginalMetaBriteBarcode" is spelled like its sibling original*
    #    columns, and "pointsNotAwadedReason" is spelled "Awarded".
    #    NOTE(review): confirm both names against the scanneditemTable.csv
    #    header, because an append fails on a column the table lacks.
    scanneditem_table = """ CREATE TABLE IF NOT EXISTS scanned_item ( 
                           itemID INTEGER PRIMARY KEY,
                           barcode TEXT, 
                           description TEXT, 
                           finalPrice REAL, 
                           itemPrice REAL, 
                           needsFetchReview BOOLEAN, 
                           partnerItemId INT, 
                           preventTargetGapPoints BOOLEAN, 
                           quantityPurchased INT, 
                           userflaggedBarcode TEXT, 
                           userFlaggedNewItem BOOLEAN, 
                           userFlaggedPrice REAL, 
                           userFlaggedQuantity INT, 
                           originalMetaBriteBarcode TEXT, 
                           originalMetaBriteDescription TEXT, 
                           pointsNotAwardedReason TEXT, 
                           pointsPayerId TEXT, 
                           rewardsGroup TEXT,  
                           rewardsProductPartnerId TEXT, 
                           brandCode TEXT, 
                           competitorRewardsGroup TEXT, 
                           discountedItemPrice REAL,
                           originalReceiptItemText TEXT,
                           itemNumber INT,
                           needsFetchReviewReason TEXT,
                           originalMetaBriteQuantityPurchased INT,
                           pointsEarned REAL,
                           targetPrice REAL,
                           competitiveProduct TEXT,
                           userFlaggedDescription TEXT,
                           deleted BOOLEAN,
                           priceAfterCoupon REAL,
                           metabriteCampaignId TEXT,
                           originalFinalPrice REAL,
                           originalMetaBriteItemPrice REAL,
                           receipt_id TEXT,
                           brandID TEXT,
                           FOREIGN KEY(receipt_id) REFERENCES receipts(_id),
                           FOREIGN KEY(brandID) REFERENCES brand(_id)
                           ); """

    # Referenced tables come before the tables that reference them.
    ddl_statements = (
        brand_table,
        categories_table,
        partneritem_table,
        users_table,
        receipts_table,
        scanneditem_table,
    )

    # (target table, CSV file) pairs in the same order as the DDL above.
    csv_sources = (
        ('brand', 'brandTable.csv'),
        ('categories', 'categories.csv'),
        ('partner_item', 'PartnerItemTable.csv'),
        ('users', 'usersTable.csv'),
        ('receipts', 'receiptTable.csv'),
        ('scanned_item', 'scanneditemTable.csv'),
    )

    # create a database connection
    conn = create_connection(database)
    if conn is None:
        print("Error! cannot create the database connection.")
        return

    try:
        # create tables
        for ddl in ddl_statements:
            create_table(conn, ddl)

        # Read every CSV before inserting anything, so a missing file is
        # detected before any table receives rows.
        frames = [
            (table_name, pd.read_csv(csv_file, index_col=0))
            for table_name, csv_file in csv_sources
        ]

        for table_name, frame in frames:
            frame.to_sql(table_name, con=conn, if_exists='append', index=False)
    finally:
        # Release the database file even when loading fails part-way.
        conn.close()

    print('SQL insert process finished')


if __name__ == '__main__':
    main()