# Putting it all together

1. call aviationstack API
2. save json response to sqlite3 db
3. extract data from response
4. tweet data

## Calling `aviationstack` API

In [4]:
import os
import requests
from urllib3.util import Retry
from requests import Session, HTTPError
from requests.adapters import HTTPAdapter
from requests.exceptions import ReadTimeout
import sqlite3
import json
from pathlib import Path
from dotenv import load_dotenv
from datetime import date
from time import sleep

In [6]:
import logging
from sys import stdout

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(funcName)s: %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    handlers=[logging.StreamHandler(stdout)],
)
logger = logging.getLogger(__name__)

In [5]:
env_path = Path("../.env")
load_dotenv(env_path)
AV_API_KEY = os.getenv("AVIATION_API_KEY", "")
AV_API_URL = "http://api.aviationstack.com/v1/"
FLIGHT_API_URL = AV_API_URL + "flights"

In [11]:
def write_local_json(
    api_response: dict,
    json_dir: Path,
    str_date: str = str(date.today()),
    offset: int = 0,
    limit: int = 100,
):
    """
    Saves the flight api response as json, to be uploaded to a data lake
    """
    if not json_dir.exists():
        json_dir.mkdir(parents=True)
    local_json_path = json_dir / f"flight-{str_date}-{offset}-{offset+limit}.json"
    logger.info(f"saving to {local_json_path}")
    with open(local_json_path, "w") as j:
        json.dump(api_response, j)
        logger.debug(f"saved to {local_json_path}")
    return local_json_path


def get_all_delays(
    json_dir: str,
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    str_date: str = str(date.today()),
):
    sesh = Session()
    adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
            # allowed_methods={"POST"},
        )
    )
    sesh.mount(AV_API_URL, adapter)
    responses = []
    retrieved = total = 0
    while not total or retrieved < total:
        sleep(0.5)
        logger.info(f"retrieving {retrieved}th to {retrieved + limit}th")
        params = {
            "access_key": AV_API_KEY,  # retrieved from .env, global scope
            "offset": retrieved,
            "limit": limit,
            "airline_name": airline,
            "min_delay_arr": min_delay,
        }
        try:
            response = sesh.get(
                url=FLIGHT_API_URL,
                params=params,
                timeout=30.0,
            )
            response.raise_for_status()
        except HTTPError as exc:
            logger.error(f"HTTP Error: \n{exc}")

        except ReadTimeout as e:
            logger.error(
                f"Timeout retrieving {retrieved}th to {retrieved + limit}th:\n{e}"
            )
        # save response
        logger.debug(f"retrieved {retrieved}th to {retrieved + limit}th")
        responses.append(response.json())
        json_path = write_local_json(
            responses[-1], json_dir=json_dir, str_date=str_date, offset=retrieved
        )
        retrieved += responses[-1]["pagination"]["count"]
        if not total:
            total = responses[0]["pagination"]["total"]
            logger.info(f"Total records count: {total}")
    return responses

## 1. Fetch the responses

In [13]:
responses = get_all_delays(Path("../data/responses/"))

KeyboardInterrupt: 

## Insert response into sqlite

1. Flatten the json response 
1. create a table if it doesn't already exist.
    - primary keys:
      - `flight__iata` 
      - `departure__iata`
      - `departure__scheduled`
      - `arrival__iata`
    - considered `hashlib.sha` for generating a unique ID from the entire entry, but if the above fields are enough, then let's just use the built-in `PRIMARY KEY` function
      - downside is that we're now presuming the fields and separator in our fields
      - perhaps just force sep as `__`
    - Create schema by comparing the keys of all the entries
1. Upsert entries into the database
    

In [3]:
from collections.abc import MutableMapping


def json_flatten(data: dict, parent_key="", sep="_"):
    """
    Normalizes json, if nested
    """
    items = []
    for key, val in data.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(val, MutableMapping):
            items.extend(json_flatten(val, parent_key=new_key, sep=sep).items())
        else:
            items.append((new_key, val))

    # creates {key: val} from (key, val) tuple
    return dict(items)


def issubstring(text: str, checklist, sep="__") -> bool:
    """
    Returns True for overlapped keys
    """
    for check in checklist:
        if text + sep in check:
            return True
    return False


def find_json_schema(entries: list[dict]) -> list:
    fields = set()
    for entry in entries:
        fields.update(entry.keys())

    fields_uniq = [field for field in fields if not issubstring(field, fields)]
    return fields_uniq

## 2 Insert to sqlite db

### 2.1 flatten responses

In [4]:
# load previously saved local json
json_paths = Path("../data/responses").glob("flight-2023-09-16-*.json")

# flatten the nested dicts in the response jsons
entries = []
responses = []
for json_file in json_paths:
    with open(json_file) as j:
        flight_page = json.load(j)
        flat = [json_flatten(nested, sep="__") for nested in flight_page["data"]]
        entries.extend(flat)
        responses.append(flight_page)

In [6]:
def create_table(
    schema: list,
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
    sep="__",
):
    """
    Creates the table in sqlite if it doesn't already exist
    """
    pk = [
        ["flight", "iata"],
        ["departure", "iata"],
        ["departure", "scheduled"],
        ["arrival", "iata"],
    ]
    pk = [sep.join(field) for field in pk]
    if not all([key in schema for key in pk]):
        raise ValueError(f"one of primary keys: {pk} not in schema list")

    ddl = f"""
    CREATE TABLE IF NOT EXISTS {tbl_name} (
        {", ".join([f"{field} TEXT DEFAULT NULL" for field in schema])},
        PRIMARY KEY ({", ".join(pk)})
    )"""
    db_conn.execute(ddl)

### 2.2 Create table [optional]

In [5]:
# create table if not exists
# find schema
schema = find_json_schema(entries)
db_conn = sqlite3.connect("../data/flights.db")

In [None]:
create_table(schema, db_conn)

In [6]:
%load_ext sql

In [7]:
%sql sqlite:///../data/flights.db

In [8]:
%%sql
pragma table_info('import_flight_records')

 * sqlite:///../data/flights.db
Done.


cid,name,type,notnull,dflt_value,pk
0,arrival__scheduled,TEXT,0,,0
1,arrival__iata,TEXT,0,,4
2,live__speed_horizontal,TEXT,0,,0
3,flight__codeshared__flight_iata,TEXT,0,,0
4,departure__delay,TEXT,0,,0
5,departure__estimated_runway,TEXT,0,,0
6,arrival__actual,TEXT,0,,0
7,flight__number,TEXT,0,,0
8,arrival__baggage,TEXT,0,,0
9,departure__terminal,TEXT,0,,0


In [10]:
from sqlite3 import ProgrammingError


def dict_factory(cursor, row):
    """
    cursor: sqlite3 cursor object
    row: tuple from query result
    returns the tuple row as dict
    """
    # .description attr returns a 7-tuple; only 1st is the col name
    fields = [descr[0] for descr in cursor.description]
    return {field: val for field, val in zip(fields, row)}


def upsert_entries(
    entries: dict,
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
):
    """
    UPSERT data into the import table
    There is no UPSERT function that replaces INSERT;
    rather it's an ON CONFLICT DO clause that can be added to an INSERT
    statement to handle unique key constraints
    """
    # retrieve the table column order for correct insertion
    with db_conn:
        curs = db_conn.execute(f"pragma table_info('{tbl_name}')")
        tbl_info = curs.fetchall()
    col_names = [col_info["name"] for col_info in tbl_info]

    # inserting nulls where the field is empty
    entries_expanded = (
        {field: entry.get(field) for field in col_names} for entry in entries
    )
    cols_placeholder = ", ".join([f":{field}" for field in col_names])
    try:
        with db_conn:
            db_conn.executemany(
                f"INSERT OR REPLACE INTO {tbl_name} VALUES({cols_placeholder})",
                entries_expanded,
            )
    except ProgrammingError as e:
        print(e)

### 2.3 UPSERT into table

In [11]:
db_conn.row_factory = dict_factory
upsert_entries(entries, db_conn)

In [9]:
%%sql
select
    departure__iata,
    arrival__iata,
    CAST(arrival__delay AS INTEGER) arrival_delay,
    ROW_NUMBER() OVER (ORDER BY CAST(arrival__delay AS INTEGER) DESC) delay_rank
    --avg(departure__delay) avg_delay
from import_flight_records
group by departure__iata, arrival__iata
order by arrival_delay desc
limit 10;

 * sqlite:///../data/flights.db
Done.


departure__iata,arrival__iata,arrival_delay,delay_rank
DOH,HEL,246,1
LHE,CMB,152,2
CMB,LHE,148,3
KUL,NRT,142,4
PVG,KUL,135,5
KUL,AKL,126,6
CTS,HND,125,7
LHR,MIA,107,8
ORD,PHL,105,9
ORD,SEA,96,10


In [11]:
%%sql
select count(*) num_delayed,
max(departure__delay) max_delay,
avg(departure__delay) avg_delay
from import_flight_records
where date(departure__scheduled) = '2023-09-24';


 * sqlite:///../data/flights.db
Done.


num_delayed,max_delay,avg_delay
0,,


## Tweeting it out

In [23]:
import tweepy

In [25]:
tbl_name = "import_flight_records"
sep = "__"
str_date = "2023-09-16"

flight_num = f"flight{sep}iata"
a_port = f"arrival{sep}airport"
a_delay = f"arrival{sep}delay"
d_port = f"departure{sep}airport"
d_delay = f"departure{sep}delay"
d_sched = f"departure{sep}scheduled"

agg_sql = f"""
        SELECT COUNT(*) num_delayed,
        AVG({a_delay}) avg_delay
        FROM {tbl_name}
        WHERE DATE({d_sched}) = '{str_date}'
    """
with db_conn:
    curs = db_conn.execute(query_sql)
    result = curs.fetchall()
result

[{'num_delayed': 204, 'max_delay': '98', 'avg_delay': 36.6551724137931}]

In [None]:
agg_sql = f"""
        SELECT COUNT(*) num_delayed,
        AVG({a_delay}) avg_delay
        FROM {tbl_name}
        WHERE DATE({d_sched}) = '{str_date}'
    """

In [28]:
most_delayed_sql = f"""
        select 
            ROW_NUMBER() OVER (ORDER BY CAST({a_delay} AS INTEGER) DESC) delay_rank,
            {flight_num},
            {a_port},
            {d_port},
            {a_delay}
        from {tbl_name}
        where date({d_sched}) = '{str_date}' 
        order by delay_rank
        limit 3;
    """
with db_conn:
    curs = db_conn.execute(most_delayed_sql)
    result = curs.fetchall()
result

[{'delay_rank': 1,
  'flight__iata': 'MH133',
  'arrival__airport': 'Auckland International',
  'departure__airport': 'Kuala Lumpur International Airport (klia)',
  'arrival__delay': '126'},
 {'delay_rank': 2,
  'flight__iata': 'MH9210',
  'arrival__airport': 'John F Kennedy International',
  'departure__airport': 'Hong Kong International',
  'arrival__delay': '91'},
 {'delay_rank': 3,
  'flight__iata': 'MH9885',
  'arrival__airport': 'Heathrow',
  'departure__airport': 'Newcastle Airport',
  'arrival__delay': '89'}]

In [69]:
%%sql
WITH RECURSIVE 
    ranks(d_delay, d_delay_rank) AS
        (SELECT
            CAST(departure__delay AS INTEGER),
            ROW_NUMBER() OVER (ORDER BY CAST(departure__delay AS INTEGER) DESC)
        FROM import_flight_records)
SELECT
    d_delay,
    100 * ROUND(
        CAST(d_delay_rank AS FLOAT) / (
            select max(d_delay_rank) from ranks
        ), 2) as percentile
FROM ranks
WHERE percentile in (0, 25, 50, 75, 100);

 * sqlite:///../data/flights.db
Done.


d_delay,percentile
178.0,0.0
169.0,0.0
49.0,25.0
49.0,25.0
49.0,25.0
49.0,25.0
49.0,25.0
48.0,25.0
34.0,50.0
34.0,50.0


In [62]:
%%sql
WITH RECURSIVE
    a(arrdelay, rank) AS 
        (SELECT 
             arrival__delay, 
             ROW_NUMBER() OVER (ORDER BY CAST(arrival__delay AS INTEGER) DESC) rank
        FROM import_flight_records)
SELECT arrdelay, rank FROM a
ORDER BY rank;

 * sqlite:///../data/flights.db
Done.


arrdelay,rank
142,1
135,2
126,3
125,4
109,5
105,6
96,7
95,8
91,9
91,10


In [57]:
def write_flight_tweet(
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
    str_date: str = str(date.today()),
    sep: str = "__",
) -> str:
    """
    Queries the flight records database to write the tweet
    Prepared queries makes some assumption about the table schema
    1. follows aviationstack flights endpoint
    1. flattened, with the same sep character
    """
    flight_num = f"flight{sep}iata"
    a_port = f"arrival{sep}airport"
    a_delay = f"arrival{sep}delay"
    d_port = f"departure{sep}airport"
    d_delay = f"departure{sep}delay"
    d_sched = f"departure{sep}scheduled"

    agg_sql = f"""
        SELECT COUNT(*) num_delayed,
        AVG({a_delay}) avg_delay
        FROM {tbl_name}
        WHERE DATE({d_sched}) = '{str_date}'
    """

    most_delayed_sql = f"""
        SELECT 
            ROW_NUMBER() OVER (ORDER BY CAST({a_delay} AS INTEGER) DESC) delay_rank,
            {flight_num},
            REPLACE(
            REPLACE(
            REPLACE({a_port}, ' International Airport', '')
            , ' International', '')
            , ' Airport', '') AS {a_port},
            REPLACE(
            REPLACE(
            REPLACE({d_port}, ' International Airport', '')
            , ' International', '')
            , ' Airport', '') AS {d_port},
            {a_delay}
        FROM {tbl_name}
        WHERE DATE({d_sched}) = '{str_date}'
        ORDER BY delay_rank
        LIMIT 3;
    """
    with db_conn:
        curs = db_conn.execute(agg_sql)
        num_delay, avg_delay = curs.fetchall()[0].values()
        curs = db_conn.execute(most_delayed_sql)
        delays = curs.fetchall()

    delays_in_sentences = "\n" + "\n".join(
        [
            f"{d[flight_num]} from {d[a_port]} to {d[d_port]} by {int(d[a_delay])} min"
            for d in delays
        ]
    )
    pt1 = f"On {str_date}, {num_delay} MH flights were delayed"
    pt2 = f"by an average of {avg_delay:.0f} min."
    pt3 = f"Most delayed flights: {delays_in_sentences}"
    return " ".join([pt1, pt2, pt3])

### 3.1 writing the tweet

In [58]:
payload = write_flight_tweet(db_conn, str_date="2023-09-15")
print(payload, f"\n{len(payload)}")

On 2023-09-15, 396 MH flights were delayed by an average of 20 min. Most delayed flights: 
MH88 from Narita to Kuala Lumpur (klia) by 142 min
MH389 from Kuala Lumpur (klia) to Shanghai Pudong by 135 min
MH4050 from Haneda to Chitose by 125 min 
243


In [59]:
TWITTER_API_KEY = os.getenv("TWITTER_API_KEY")
TWITTER_API_SECRET = os.getenv("TWITTER_API_SECRET")
TWITTER_ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_SECRET = os.getenv("TWITTER_ACCESS_SECRET")

### 3.2 tweeting

In [60]:
oauth1_client = tweepy.Client(
    consumer_key=TWITTER_API_KEY,
    consumer_secret=TWITTER_API_SECRET,
    access_token=TWITTER_ACCESS_TOKEN,
    access_token_secret=TWITTER_ACCESS_SECRET,
)

t_response = oauth1_client.create_tweet(text=payload, user_auth=True)
print(f"https://twitter.com/user/status/{t_response.data['id']}")

https://twitter.com/user/status/1704761063189921835
