# Putting it all together

1. call aviationstack API
2. save json response to sqlite3 db
3. extract data from response
4. tweet data

## Calling `aviationstack` API

In [1]:
import os
import requests
import sqlite3
import json
from pathlib import Path
from dotenv import load_dotenv
from datetime import date
from time import sleep

In [2]:
env_path = Path("../.env")
load_dotenv(env_path)
av_api_key = os.getenv("AVIATION_API_KEY", "")
av_api_url = "http://api.aviationstack.com/v1/"
flight_api_url = av_api_url + "flights"

In [12]:
def get_flight_api(
    offset: int = 0,
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    flight_api_url = "http://api.aviationstack.com/v1/flights",
) -> dict:
    """
    Requests aviationstack API for flight data
    Returns responses in a dict
    """
    params = {
        "access_key": av_api_key,  # retrieved from .env, global scope
        "offset": offset,
        "limit": limit,
        "airline_name": airline,
        "min_delay_arr": min_delay,
    }
    result = requests.get(flight_api_url, params, timeout=5)
    return result.json()


def write_local_json(api_response: dict, offset: int = 0, limit: int = 100):
    """
    Saves the flight api response as json, to be uploaded to a data lake
    """
    local_json_path = Path(
        f"../data/responses/flight-{str(date.today())}-{offset}-{offset+limit}.json"
    )
    with open(local_json_path, "w") as j:
        json.dump(api_response, j)
        print(f"saved to {local_json_path}")
    return local_json_path


def get_all_delays(
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    flight_api_url="http://api.aviationstack.com/v1/flights",
):
    responses = []
    retrieved = total = 0
    while not total or retrieved < total:
        sleep(0.5)
        print(f"retrieving {retrieved}th to {retrieved + limit}th")
        responses.append(get_flight_api(offset=retrieved, limit=limit))
        # save response
        json_path = write_local_json(responses[-1], offset=retrieved)
        retrieved += responses[-1]["pagination"]["count"]
        if not total:
            total = responses[0]['pagination']['total']
            print(f'Total records count: {total}')
    return responses

## 1. Fetch the responses

In [None]:
responses = get_all_delays() 

## Insert response into sqlite

1. Flatten the json response 
1. create a table if it doesn't already exist.
    - primary keys:
      - `flight__iata` 
      - `departure__iata`
      - `departure__scheduled`
      - `arrival__iata`
    - considered `hashlib.sha` for generating a unique ID from the entire entry, but if the above fields are enough, then let's just use the built-in `PRIMARY KEY` function
      - downside is that we're now presuming the fields and separator in our fields
      - perhaps just force sep as `__`
    - Create schema by comparing the keys of all the entries
1. Upsert entries into the database
    

In [4]:
from collections.abc import MutableMapping


def json_flatten(data: dict, parent_key="", sep="_"):
    """
    Normalizes json, if nested
    """
    items = []
    for key, val in data.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(val, MutableMapping):
            items.extend(json_flatten(val, parent_key=new_key, sep=sep).items())
        else:
            items.append((new_key, val))

    # creates {key: val} from (key, val) tuple
    return dict(items)


def issubstring(text: str, checklist, sep="__") -> bool:
    """
    Returns True for overlapped keys
    """
    for check in checklist:
        if text + sep in check:
            return True
    return False


def find_json_schema(entries: list[dict]) -> list:
    fields = set()
    for entry in entries:
        fields.update(entry.keys())

    fields_uniq = [field for field in fields if not issubstring(field, fields)]
    return fields_uniq

## 2 Insert to sqlite db

### 2.1 flatten responses

In [15]:
# load previously saved local json
json_paths = Path("../data/responses").glob("flight-2023-09-16-*.json")

# flatten the nested dicts in the response jsons
entries = []
for json_file in json_paths:
    with open(json_file) as j:
        flight_page = json.load(j)
        flat = [json_flatten(nested, sep="__") for nested in flight_page["data"]]
        entries.extend(flat)



In [62]:
def create_table(
    schema: list, 
    db_conn: sqlite3.Connection,
    tbl_name: str = 'import_flight_records', sep = '__'):
    """
    Creates the table in sqlite if it doesn't already exist
    """
    pk = [["flight", "iata"],
          ["departure", "iata"],
          ["departure", "scheduled"],
          ["arrival", "iata"]]
    pk = [sep.join(field) for field in pk]
    if not all([key in schema for key in pk]):
        raise ValueError(f"one of primary keys: {pk} not in schema list")

    ddl = f"""
    CREATE TABLE IF NOT EXISTS {tbl_name} (
        {", ".join([f"{field} TEXT DEFAULT NULL" for field in schema])},
        PRIMARY KEY ({", ".join(pk)})
    )"""
    db_conn.execute(ddl)


### 2.2 Create table [optional]

In [63]:
# create table if not exists
# find schema
schema = find_json_schema(entries)
db_conn = sqlite3.connect("../data/flights.db")

create_table(schema, db_conn)

In [61]:
sep = '__'
pk = [["flight", "iata"],
          ["departure", "iata"],
          ["departure", "scheduled"],
          ["arrival", "iata"]]
pk = [sep.join(field) for field in pk]
[key in schema for key in pk]

[True, True, True, True]

In [40]:
%load_ext sql

In [41]:
%sql sqlite:///../data/flights.db

In [42]:
%%sql
select * from sqlite_master;

 * sqlite:///../data/flights.db
Done.


type,name,tbl_name,rootpage,sql
table,import_flight_records,import_flight_records,2,"CREATE TABLE import_flight_records (  arrival__scheduled TEXT DEFAULT NULL, arrival__iata TEXT DEFAULT NULL, live__speed_horizontal TEXT DEFAULT NULL, flight__codeshared__flight_iata TEXT DEFAULT NULL, departure__delay TEXT DEFAULT NULL, departure__estimated_runway TEXT DEFAULT NULL, arrival__actual TEXT DEFAULT NULL, flight__number TEXT DEFAULT NULL, arrival__baggage TEXT DEFAULT NULL, departure__terminal TEXT DEFAULT NULL, departure__actual TEXT DEFAULT NULL, live__speed_vertical TEXT DEFAULT NULL, flight__codeshared__airline_icao TEXT DEFAULT NULL, departure__icao TEXT DEFAULT NULL, flight__icao TEXT DEFAULT NULL, airline__name TEXT DEFAULT NULL, flight__codeshared__airline_name TEXT DEFAULT NULL, departure__estimated TEXT DEFAULT NULL, departure__gate TEXT DEFAULT NULL, arrival__gate TEXT DEFAULT NULL, departure__actual_runway TEXT DEFAULT NULL, flight_status TEXT DEFAULT NULL, departure__timezone TEXT DEFAULT NULL, arrival__airport TEXT DEFAULT NULL, aircraft__icao TEXT DEFAULT NULL, live__latitude TEXT DEFAULT NULL, arrival__icao TEXT DEFAULT NULL, aircraft__iata TEXT DEFAULT NULL, flight__codeshared__flight_icao TEXT DEFAULT NULL, flight__codeshared__flight_number TEXT DEFAULT NULL, flight__codeshared__airline_iata TEXT DEFAULT NULL, departure__iata TEXT DEFAULT NULL, departure__scheduled TEXT DEFAULT NULL, arrival__estimated_runway TEXT DEFAULT NULL, airline__iata TEXT DEFAULT NULL, live__altitude TEXT DEFAULT NULL, live__direction TEXT DEFAULT NULL, arrival__delay TEXT DEFAULT NULL, flight__iata TEXT DEFAULT NULL, live__longitude TEXT DEFAULT NULL, arrival__terminal TEXT DEFAULT NULL, arrival__estimated TEXT DEFAULT NULL, airline__icao TEXT DEFAULT NULL, arrival__actual_runway TEXT DEFAULT NULL, live__updated TEXT DEFAULT NULL, live__is_ground TEXT DEFAULT NULL, arrival__timezone TEXT DEFAULT NULL, aircraft__icao24 TEXT DEFAULT NULL, aircraft__registration TEXT DEFAULT NULL, flight_date TEXT DEFAULT NULL, departure__airport TEXT DEFAULT NULL,  PRIMARY KEY (  flight__iata,  departure__iata,  departure__scheduled,  arrival__iata  )  )"
index,sqlite_autoindex_import_flight_records_1,import_flight_records,3,


In [73]:
from sqlite3 import ProgrammingError

def upsert_entries(
    entries: dict, 
    schema: list,
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records"
):
    """
    UPSERT data into the import table
    """
    entries_expanded = [
        {field: entry.get(field) for field in schema} 
        for entry in entries
    ]
    vals_placeholder = ", ".join(len(schema) * "?")
    try:
        with db_conn:
            db_conn.executemany(
                f"INSERT INTO {tbl_name} VALUES({vals_placeholder})", entries_expanded.values()
            )
    except ProgrammingError as e:
        print(e)    

In [56]:
len(entries)

714

### 2.3 UPSERT into table

In [74]:
upsert_entries(entries,schema, db_conn)

AttributeError: 'list' object has no attribute 'values'