# Importing Yelp Dataset

Author(s): Brian Lin

This notebook imports the Yelp JSON data files into a SQLite database. It assumes
it is run from `preprocessing/`. Importing all of the following takes about 5 minutes on my machine.

The resulting database file is around 8.3 GB.

In [None]:
import json
from pathlib import Path
import sqlite3

In [None]:
# All paths are anchored on the current working directory
# (the notebook is expected to be run from preprocessing/).
CWD = Path.cwd()
ROOT = CWD.parent
# directory holding the raw JSON files
DATA_DIR = CWD.joinpath("raw_data")
# filename prefix shared by every raw data file
DATA_PREFIX = "yelp_academic_dataset_"
# SQLite database file, located under the parent of the working directory
DB_PATH = ROOT.joinpath("database/YelpData.db")

In [None]:
# Auto-close / commit (or roll back on error) when used in "with" statement
# https://stackoverflow.com/questions/19522505/using-sqlite3-in-python-with-with-keyword
class SQLite():
    """Context manager that opens a SQLite database and yields a cursor.

    On a clean exit from the ``with`` block the pending changes are
    committed. If the block raised, they are rolled back so a failed run
    does not leave a partial write behind. The connection is closed in
    both cases, and any exception from the block propagates to the caller.

    Args:
        file: Path of the database file passed to ``sqlite3.connect``.
    """

    def __init__(self, file='sqlite.db'):
        self.file = file
        # the connection is only opened once the "with" block is entered
        self.conn = None

    def __enter__(self):
        self.conn = sqlite3.connect(self.file)
        # rows support access by column name, e.g. row["col"] or dict(row)
        self.conn.row_factory = sqlite3.Row
        return self.conn.cursor()

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                # the block failed: discard its uncommitted changes
                self.conn.rollback()
        finally:
            # close even if commit/rollback itself raised
            self.conn.close()

In [None]:
def getFilePath(name):
    """Return the path of the raw JSON file for dataset ``name``.

    The filename is DATA_PREFIX + name + ".json", located in DATA_DIR.
    """
    fileName = DATA_PREFIX + name + ".json"
    return DATA_DIR / fileName

def jsonToStr(data, field):
    """Replace ``data[field]`` in place with its JSON string form.

    A value of None is left untouched. Nothing is returned; ``data`` is
    mutated. Raises KeyError if ``field`` is not present in ``data``.
    """
    value = data[field]
    if value is None:
        # nothing nested to serialize
        return
    data[field] = json.dumps(value)

def _iterRecords(file, nestedFields):
    # Yield one parsed record per non-blank line, with nested fields serialized.
    for line in file:
        if not line.strip():
            # tolerate blank lines (e.g. a stray newline at the end of the file)
            continue
        data = json.loads(line)
        # convert nested objs to json str
        for field in nestedFields:
            jsonToStr(data, field)
        yield data


def importData(name, nestedFields=()):
    """Import a line-delimited JSON file into the table of the same name.

    Each non-blank line of the file returned by ``getFilePath(name)`` is
    parsed as one JSON object and inserted as one row of table ``name``.
    The column list is taken from the keys of the first record, so every
    later record must provide at least those keys. An empty file inserts
    nothing.

    Args:
        name: Dataset name; used for both the file lookup and the table
            name. It is interpolated into the SQL text, so it must be a
            trusted identifier.
        nestedFields: Names of fields whose values are nested objects;
            they are stored as JSON strings.
    """
    with SQLite(DB_PATH) as cur:
        # JSON text is UTF-8; do not depend on the platform's default encoding
        with open(getFilePath(name), 'r', encoding='utf-8') as file:
            records = _iterRecords(file, nestedFields)
            firstObj = next(records, None)
            if firstObj is not None:
                # extract field names
                keys = list(firstObj)
                # must be in same order for sql
                colStr = ", ".join(keys)
                keyStr = ", ".join(":" + key for key in keys)

                # built once: the statement is the same for every row
                sqlstr = f'''
                INSERT INTO {name} ({colStr})
                VALUES ({keyStr})
                '''

                cur.execute(sqlstr, firstObj)
                # remaining rows are streamed lazily from the file
                cur.executemany(sqlstr, records)
    print(f"Finished importing: {name}")

def printHead(name):
    """Print the first row of table ``name`` as a dict, followed by a blank line.

    If the table has no rows, a short notice is printed instead.

    Args:
        name: Table name; interpolated into the SQL text, so it must be a
            trusted identifier.
    """
    with SQLite(DB_PATH) as cur:
        res = cur.execute(f"SELECT * FROM {name} LIMIT 1").fetchone()
        if res is None:
            # fetchone() returns None when there is no row; dict(None) would raise
            print(f"No rows in table: {name}", end='\n\n')
        else:
            print(dict(res), end='\n\n')

In [None]:
# "attributes" and "hours" are passed as nested fields, so their values are
# serialized to JSON text before insertion
importData(name="business", nestedFields=["attributes", "hours"])
# show the first stored row as a quick check
printHead(name="business")

In [None]:
# no nested fields are declared for the user records
importData(name="user")
# printHead(name="user")  # suppressed since the output is too long

In [None]:
# no nested fields are declared for the review records
importData(name="review")
# show the first stored row as a quick check
printHead(name="review")