# Putting it all together

1. call aviationstack API
2. save json response to sqlite3 db
3. extract data from response
4. tweet data

## Calling `aviationstack` API

In [1]:
import os
import requests
import sqlite3
import json
from pathlib import Path
from dotenv import load_dotenv
from datetime import date
from time import sleep

In [2]:
env_path = Path("../.env")
load_dotenv(env_path)
av_api_key = os.getenv("AVIATION_API_KEY", "")
av_api_url = "http://api.aviationstack.com/v1/"
flight_api_url = av_api_url + "flights"

In [3]:
def get_flight_api(
    offset: int = 0,
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    flight_api_url="http://api.aviationstack.com/v1/flights",
) -> dict:
    """
    Requests aviationstack API for flight data
    Returns responses in a dict
    """
    params = {
        "access_key": av_api_key,  # retrieved from .env, global scope
        "offset": offset,
        "limit": limit,
        "airline_name": airline,
        "min_delay_arr": min_delay,
    }
    result = requests.get(flight_api_url, params, timeout=5)
    return result.json()


def write_local_json(api_response: dict, offset: int = 0, limit: int = 100):
    """
    Saves the flight api response as json, to be uploaded to a data lake
    """
    local_json_path = Path(
        f"../data/responses/flight-{str(date.today())}-{offset}-{offset+limit}.json"
    )
    with open(local_json_path, "w") as j:
        json.dump(api_response, j)
        print(f"saved to {local_json_path}")
    return local_json_path


def get_all_delays(
    limit: int = 100,
    airline: str = "Malaysia Airlines",
    min_delay: int = 1,
    flight_api_url="http://api.aviationstack.com/v1/flights",
):
    responses = []
    retrieved = total = 0
    while not total or retrieved < total:
        sleep(0.5)
        print(f"retrieving {retrieved}th to {retrieved + limit}th")
        responses.append(get_flight_api(offset=retrieved, limit=limit))
        # save response
        json_path = write_local_json(responses[-1], offset=retrieved)
        retrieved += responses[-1]["pagination"]["count"]
        if not total:
            total = responses[0]["pagination"]["total"]
            print(f"Total records count: {total}")
    return responses

## 1. Fetch the responses

In [None]:
responses = get_all_delays()

## Insert response into sqlite

1. Flatten the json response 
1. create a table if it doesn't already exist.
    - primary keys:
      - `flight__iata` 
      - `departure__iata`
      - `departure__scheduled`
      - `arrival__iata`
    - considered `hashlib.sha` for generating a unique ID from the entire entry, but if the above fields are enough, then let's just use the built-in `PRIMARY KEY` function
      - downside is that we're now presuming the fields and separator in our fields
      - perhaps just force sep as `__`
    - Create schema by comparing the keys of all the entries
1. Upsert entries into the database
    

In [4]:
from collections.abc import MutableMapping


def json_flatten(data: dict, parent_key="", sep="_"):
    """
    Normalizes json, if nested
    """
    items = []
    for key, val in data.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(val, MutableMapping):
            items.extend(json_flatten(val, parent_key=new_key, sep=sep).items())
        else:
            items.append((new_key, val))

    # creates {key: val} from (key, val) tuple
    return dict(items)


def issubstring(text: str, checklist, sep="__") -> bool:
    """
    Returns True for overlapped keys
    """
    for check in checklist:
        if text + sep in check:
            return True
    return False


def find_json_schema(entries: list[dict]) -> list:
    fields = set()
    for entry in entries:
        fields.update(entry.keys())

    fields_uniq = [field for field in fields if not issubstring(field, fields)]
    return fields_uniq

## 2 Insert to sqlite db

### 2.1 flatten responses

In [5]:
# load previously saved local json
json_paths = Path("../data/responses").glob("flight-2023-09-16-*.json")

# flatten the nested dicts in the response jsons
entries = []
for json_file in json_paths:
    with open(json_file) as j:
        flight_page = json.load(j)
        flat = [json_flatten(nested, sep="__") for nested in flight_page["data"]]
        entries.extend(flat)

In [6]:
def create_table(
    schema: list,
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
    sep="__",
):
    """
    Creates the table in sqlite if it doesn't already exist
    """
    pk = [
        ["flight", "iata"],
        ["departure", "iata"],
        ["departure", "scheduled"],
        ["arrival", "iata"],
    ]
    pk = [sep.join(field) for field in pk]
    if not all([key in schema for key in pk]):
        raise ValueError(f"one of primary keys: {pk} not in schema list")

    ddl = f"""
    CREATE TABLE IF NOT EXISTS {tbl_name} (
        {", ".join([f"{field} TEXT DEFAULT NULL" for field in schema])},
        PRIMARY KEY ({", ".join(pk)})
    )"""
    db_conn.execute(ddl)

### 2.2 Create table [optional]

In [10]:
# create table if not exists
# find schema
schema = find_json_schema(entries)
db_conn = sqlite3.connect("../data/flights.db")

create_table(schema, db_conn)

In [12]:
%load_ext sql

In [13]:
%sql sqlite:///../data/flights.db

In [14]:
%%sql
pragma table_info('import_flight_records')

 * sqlite:///../data/flights.db
Done.


cid,name,type,notnull,dflt_value,pk
0,arrival__scheduled,TEXT,0,,0
1,arrival__iata,TEXT,0,,4
2,live__speed_horizontal,TEXT,0,,0
3,flight__codeshared__flight_iata,TEXT,0,,0
4,departure__delay,TEXT,0,,0
5,departure__estimated_runway,TEXT,0,,0
6,arrival__actual,TEXT,0,,0
7,flight__number,TEXT,0,,0
8,arrival__baggage,TEXT,0,,0
9,departure__terminal,TEXT,0,,0


In [7]:
from sqlite3 import ProgrammingError


def dict_factory(cursor, row):
    """
    cursor: sqlite3 cursor object
    row: tuple from query result
    returns the tuple row as dict
    """
    # .description attr returns a 7-tuple; only 1st is the col name
    fields = [descr[0] for descr in cursor.description]
    return {field: val for field, val in zip(fields, row)}


def upsert_entries(
    entries: dict,
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
):
    """
    UPSERT data into the import table
    There is no UPSERT function that replaces INSERT;
    rather it's an ON CONFLICT DO clause that can be added to an INSERT
    statement to handle unique key constraints
    """
    # retrieve the table column order for correct insertion
    with db_conn:
        curs = db_conn.execute(f"pragma table_info('{tbl_name}')")
        tbl_info = curs.fetchall()
    col_names = [col_info["name"] for col_info in tbl_info]

    # inserting nulls where the field is empty
    entries_expanded = (
        {field: entry.get(field) for field in col_names} for entry in entries
    )
    cols_placeholder = ", ".join([f":{field}" for field in col_names])
    try:
        with db_conn:
            db_conn.executemany(
                f"INSERT OR REPLACE INTO {tbl_name} VALUES({cols_placeholder})",
                entries_expanded,
            )
    except ProgrammingError as e:
        print(e)

### 2.3 UPSERT into table

In [52]:
upsert_entries(entries, db_conn)

In [15]:
%%sql
select
    departure__iata,
    arrival__iata,
    avg(departure__delay) avg_delay,
    count(*) num_flights
from import_flight_records
group by departure__iata, arrival__iata
order by avg_delay desc
limit 10;

 * sqlite:///../data/flights.db
Done.


departure__iata,arrival__iata,avg_delay,num_flights
KUL,NRT,178.0,1
CTS,HND,144.0,1
ORD,SEA,139.0,1
ORD,PHL,130.0,1
HND,HNL,125.5,2
KUL,IST,122.0,1
HND,SFO,121.0,1
TPE,KUL,120.0,1
HKG,JFK,118.0,2
NCL,LHR,116.0,1


In [16]:
%%sql
select count(*) num_delayed,
max(departure__delay) max_delay,
avg(departure__delay) avg_delay
from import_flight_records
where date(departure__scheduled) = '2023-09-15';


 * sqlite:///../data/flights.db
Done.


num_delayed,max_delay,avg_delay
396,98,42.86111111111112


## Tweeting it out

In [8]:
import tweepy

In [21]:
tbl_name = "import_flight_records"
sep = "__"
str_date = "2023-09-15"
query_sql = f"""
        select count(*) num_delayed,
        max(departure__delay) max_delay,
        avg(departure__delay) avg_delay
        from import_flight_records
        where date(departure__scheduled) = '{str_date}'
    """
with db_conn:
    curs = db_conn.execute(query_sql)
    result = curs.fetchall()
result

[(396, '98', 42.861111111111114)]

In [29]:
most_delayed_sql = f"""
        select 
            flight{sep}iata,
            arrival{sep}airport,
            departure{sep}airport,
            departure{sep}delay,
            ROW_NUMBER() over (order by departure{sep}delay desc) delay_rank
        from {tbl_name}
        where date(departure{sep}scheduled) = '{str_date}' 
        order by delay_rank
        limit 5"""
db_conn.row_factory = dict_factory
with db_conn:
    curs = db_conn.execute(most_delayed_sql)
    result = curs.fetchall()
result

[{'flight__iata': 'MH9341',
  'arrival__airport': 'John F Kennedy International',
  'departure__airport': 'Miami International Airport',
  'departure__delay': '98',
  'delay_rank': 1},
 {'flight__iata': 'MH4338',
  'arrival__airport': 'Istanbul Airport',
  'departure__airport': 'St-Exupéry',
  'departure__delay': '98',
  'delay_rank': 2},
 {'flight__iata': 'MH4283',
  'arrival__airport': 'Istanbul Airport',
  'departure__airport': 'Adnan Menderes Airport',
  'departure__delay': '97',
  'delay_rank': 3},
 {'flight__iata': 'MH9343',
  'arrival__airport': 'Dallas/Fort Worth International',
  'departure__airport': 'Philadelphia International',
  'departure__delay': '97',
  'delay_rank': 4},
 {'flight__iata': 'MH9684',
  'arrival__airport': 'Dyce',
  'departure__airport': 'Heathrow',
  'departure__delay': '96',
  'delay_rank': 5}]

In [69]:
def write_flight_tweet(
    db_conn: sqlite3.Connection,
    tbl_name: str = "import_flight_records",
    str_date: str = str(date.today()),
    sep: str = "__",
) -> str:
    """
    Queries the flight records database to write the tweet
    """
    agg_sql = f"""
        select count(*) num_delayed,
        max(departure{sep}delay) max_delay,
        avg(departure{sep}delay) avg_delay
        from {tbl_name}
        where date(departure{sep}scheduled) = '{str_date}'
    """
    most_delayed_sql = f"""
        select 
            flight{sep}iata,
            arrival{sep}airport,
            departure{sep}airport,
            departure{sep}delay,
            dense_rank() over (order by departure{sep}delay) desc) delay_rank
        from {tbl_name}
        where date(departure{sep}scheduled) = '{str_date}' 
        order by delay_rank
        limit 5;
    """
    with db_conn:
        curs = db_conn.execute(agg_sql)
        num_delay, max_delay, avg_delay = curs.fetchall()[0]

    return f"On {str_date}, {result[0]} MH flights were delayed, by an average pf {result[3]} minutes"

In [56]:
TWITTER_API_KEY = os.getenv("TWITTER_API_KEY")
TWITTER_API_SECRET = os.getenv("TWITTER_API_SECRET")
TWITTER_ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_SECRET = os.getenv("TWITTER_ACCESS_SECRET")

In [70]:
payload = write_flight_tweet(db_conn, "2023-09-16")

IndexError: list index out of range

In [None]:
oauth1_client = tweepy.Client(
    consumer_key=TWITTER_API_KEY,
    consumer_secret=TWITTER_API_SECRET,
    access_token=TWITTER_ACCESS_TOKEN,
    access_token_secret=TWITTER_ACCESS_SECRET,
)

t_response = oauth1_client.create_tweet(text=payload, user_auth=True)
print(f"https://twitter.com/user/status/{t_response.data['id']}")