# MVP - JSON in sqlite

Instead of shoehorning the JSON responses into a traditional relational schema, store each response as-is and leverage SQLite's JSON functions to extract our delay data.

1. call aviationstack API
2. save json response to sqlite3 db
3. extract data from response
4. tweet data

In [30]:
import os
import requests
from urllib3.util import Retry
from requests import Session, HTTPError
from requests.adapters import HTTPAdapter
from requests.exceptions import ReadTimeout
import sqlite3
import json
from pathlib import Path
from dotenv import load_dotenv
from datetime import datetime, timezone, timedelta
from time import sleep
import logging
from sys import stdout
import tomllib
import jinja2

In [33]:
# Route log records to stdout, prefixed with a timestamp, the level and the
# name of the calling function.
_stdout_handler = logging.StreamHandler(stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    datefmt="%Y/%m/%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(funcName)s: %(message)s",
)

# Module logger; emits everything from DEBUG upwards.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [3]:
# Load environment variables from the .env file one directory up.
env_path = Path("..") / ".env"
load_dotenv(env_path)

# API key; falls back to an empty string when the variable is not set.
AV_API_KEY = os.environ.get("AVIATION_API_KEY", "")

# NOTE(review): plain HTTP sends the access key unencrypted -- confirm whether
# the subscription in use supports the HTTPS endpoint.
AV_API_URL = "http://api.aviationstack.com/v1/"
FLIGHT_API_URL = f"{AV_API_URL}flights"

In [21]:
# Read the project configuration; tomllib requires the file in binary mode.
toml_path = Path("..") / "pyproject.toml"
with toml_path.open("rb") as toml_file:
    config = tomllib.load(toml_file)

# SQLite settings live under the [sqlite] table of pyproject.toml.
sqlite_cfg = config["sqlite"]
DB_NAME = sqlite_cfg["db_name"]
TBL_NAME = sqlite_cfg["tbl_name"]
JSON_COL = sqlite_cfg["json_col"]

In [4]:
def write_local_json(
    api_response: dict,
    json_dir: Path,
    str_date: str | None = None,
    offset: int = 0,
    limit: int = 100,
) -> Path:
    """
    Save one flight API response as a JSON file, to be uploaded to a data lake.

    The file is named ``flight-<str_date>-<offset>-<offset+limit>.json`` and
    is written inside ``json_dir``, which is created if it does not exist.

    Args:
        api_response: Decoded API response to serialise.
        json_dir: Directory that receives the file (a ``str`` is accepted too).
        str_date: Date label used in the file name. Defaults to today's UTC
            date, evaluated at call time.
        offset: Pagination offset of the first record in this response.
        limit: Page size requested; only used to build the file name.

    Returns:
        Path of the JSON file that was written.
    """
    # Resolve the default at call time: a default expression in the signature
    # is evaluated once at definition time and would go stale in a
    # long-running kernel or process.
    if str_date is None:
        str_date = str(datetime.now(tz=timezone.utc).date())

    json_dir = Path(json_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    json_dir.mkdir(parents=True, exist_ok=True)

    local_json_path = json_dir / f"flight-{str_date}-{offset}-{offset + limit}.json"
    logger.info(f"saving to {local_json_path}")
    with open(local_json_path, "w", encoding="utf-8") as j:
        json.dump(api_response, j)
    # Log after the file is closed, so "saved" means the data has been flushed.
    logger.debug(f"saved to {local_json_path}")
    return local_json_path

In [12]:
def get_all_delays(
    json_dir: str | Path,
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    str_date: str | None = None,
) -> list[dict]:
    """
    Page through the flights endpoint and collect every delayed-flight response.

    Each page is appended to the returned list and also written to
    ``json_dir`` via ``write_local_json``. Pagination stops when the number of
    records retrieved reaches the total reported by the first page, when the
    API reports zero records, when a page comes back empty, or when a request
    fails.

    Args:
        json_dir: Directory that receives one JSON file per page.
        limit: Page size requested from the API.
        airline: Value sent as the ``airline_name`` filter.
        min_delay: Value sent as the ``min_delay_arr`` filter.
        str_date: Date label used in the log line and the saved file names.
            Defaults to yesterday's UTC date, evaluated at call time.

    Returns:
        The decoded JSON pages retrieved so far, in request order. The list is
        partial (possibly empty) when a request failed part-way through.
    """
    # Resolve the default at call time; a default expression in the signature
    # is frozen when the function is defined.
    if str_date is None:
        str_date = str(datetime.now(tz=timezone.utc).date() - timedelta(days=1))
    # NOTE(review): str_date only labels the log line and the saved files; it
    # is not sent to the API as a filter -- confirm this is intended.
    json_dir = Path(json_dir)

    # Retry transient server errors up to 3 times with a short backoff.
    adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
        )
    )
    responses = []
    retrieved = total = 0
    logger.info(f"Retrieving delayed flights for {str_date}")

    # The context manager closes the session's pooled connections on exit.
    with Session() as sesh:
        sesh.mount(AV_API_URL, adapter)
        while not total or retrieved < total:
            sleep(0.5)  # pause between successive requests
            logger.info(f"retrieving {retrieved}th to {retrieved + limit}th")
            params = {
                "access_key": AV_API_KEY,  # retrieved from .env, global scope
                "offset": retrieved,
                "limit": limit,
                "airline_name": airline,
                "min_delay_arr": min_delay,
            }
            try:
                response = sesh.get(
                    url=FLIGHT_API_URL,
                    params=params,
                    timeout=30.0,
                )
                response.raise_for_status()
            except HTTPError as exc:
                logger.error(f"HTTP Error: \n{exc}")
                # Stop here: there is no valid page to parse, and falling
                # through would reuse a stale or undefined `response`.
                break
            except ReadTimeout as e:
                logger.error(
                    f"Timeout retrieving {retrieved}th to {retrieved + limit}th:\n{e}"
                )
                break

            # save response
            logger.debug(f"retrieved {retrieved}th to {retrieved + limit}th")
            page = response.json()
            responses.append(page)
            write_local_json(
                page,
                json_dir=json_dir,
                str_date=str_date,
                offset=retrieved,
                limit=limit,  # keep the file name in sync with the page size
            )

            count = page["pagination"]["count"]
            retrieved += count
            if not total:
                # First request; get total count
                total = page["pagination"]["total"]
                logger.info(f"Total records count: {total}")
                if total == 0:
                    # prevent infinite loop if there are no records retrieved
                    logger.error("Zero records retrieved; exiting")
                    break
            if count == 0:
                # An empty page never advances `retrieved`; bail out instead
                # of requesting the same offset forever.
                logger.error(
                    f"Empty page at offset {retrieved} before reaching total "
                    f"{total}; exiting"
                )
                break
    return responses

## 1. Fetch the responses

In [14]:
# Fetch every page of delayed flights, saving the raw JSON under data/responses.
responses_dir = Path("..") / "data" / "responses"
responses = get_all_delays(responses_dir)

2023/10/16 13:17:25 [ERROR] get_all_delays: Zero records retrieved; exiting


In [32]:
def execute_template_sql(
    db_conn: sqlite3.Connection,
    env: jinja2.Environment,
    template: str,
    params: dict,
):
    """
    Render a Jinja SQL template and run it as a script on ``db_conn``.

    Args:
        db_conn: Open SQLite connection the script is executed on.
        env: Jinja environment used to look up ``template``.
        template: Name of the template to load from ``env``.
        params: Values passed to the template when rendering.

    Returns:
        Whatever ``fetchall()`` yields on the cursor returned by
        ``executescript``.
        NOTE(review): ``executescript`` does not expose result rows, so this
        is expected to always be an empty list -- confirm before relying on it
        for queries.
    """
    sql_template = env.get_template(template)
    rendered_sql = sql_template.render(params)

    # The connection context manager commits on success, rolls back on error.
    with db_conn:
        cursor = db_conn.executescript(rendered_sql)

    rows = cursor.fetchall()
    return rows

## 2. Insert JSON to sqlite

In [None]:
# app args
# data_dir must be a Path: it is joined with "/" below.
data_dir = Path("../data")
template_dir = "../templates"
params = dict(
    tbl_name=TBL_NAME,
    json_col=JSON_COL,
)
# instantiate db conn and jinja env
env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
# search for existing db
db_path = data_dir / f"{DB_NAME}.db"
# Check for the file BEFORE connecting: sqlite3.connect() creates a missing
# database file, so testing afterwards would always report an existing db and
# the schema would never be created.
db_is_new = not db_path.exists()
# sqlite3.connect() does not create missing parent directories.
data_dir.mkdir(parents=True, exist_ok=True)
db_conn = sqlite3.connect(db_path)
if db_is_new:
    logger.info(f"{db_path} does not exist, initializing...")
    execute_template_sql(db_conn, env, "create.sql", params)
else:
    logger.info(f"Using existing db @ {db_path}")

# UPSERT data
# Flatten the flights of every retrieved page into one-column rows.
flights = [
    (json.dumps(flight),)
    for response in responses
    for flight in response.get("data", [])
]
# Table and column names cannot be bound as SQL parameters; they come from the
# project's own pyproject.toml rather than from user input.
with db_conn:  # commits on success, rolls back on error
    db_conn.executemany(
        f"INSERT OR REPLACE INTO {TBL_NAME} ({JSON_COL}) VALUES( ? )", flights
    )