# sqlite3

## Basics - creating and writing to db

`sqlite3.connect` implicitly creates a .db file if not existent

In [1]:
import sqlite3
import json
from pprint import pprint
from collections.abc import MutableMapping

In [2]:
with open("../tests/data/sample_flight_response.json", "r") as j:
    response = json.load(j)

print(response.keys())

dict_keys(['pagination', 'data'])


In [3]:
con = sqlite3.connect("../tests/sample.db")

from the `con` object we create a cursor with which to execute our statements

In [4]:
cur = con.cursor()

In [8]:
cols = [
    "flight_date",
    "flight_status",
    "departure",
    "arrival",
    "airline",
    "flight",
    "aircraft",
    "live",
]
cols = ", ".join(cols)
cols
create_table = f"CREATE TABLE flights({cols})"
cur.execute(create_table)

<sqlite3.Cursor at 0x7fd030284840>

In [9]:
res = cur.execute("SELECT name FROM sqlite_master")
res.fetchone()

('flights',)

When inserting `VALUES`, data must be in a list, with the number of items corresponding exactly to number of columns.

Alternatively we can supply a `dict`, with the keys mapped to column names

In [17]:
data = [
    (
        "2000-01-01",
        "active",
        "{'airport': 'KLIA'}",
        "{'airport': 'Seeb'}",
        "Malaysian Airlines",
        None,
        None,
        None,
    ),
    (
        "2023-08-23",
        "scheduled",
        "{'airport': 'KIX'}",
        "{'airport': 'HND'}",
        "Malaysian AIrlines",
        None,
        None,
        None,
    ),
]
placeholders = ", ".join(len(cols.split(",")) * "?")
cur.executemany(f"INSERT INTO flights VALUES({placeholders})", data)
con.commit()

In [19]:
for row in cur.execute(
    "SELECT flight_date, departure FROM flights order by flight_date"
):
    print(row)

('2000-01-01', "{'airport': 'KLIA'}")
('2023-08-23', "{'airport': 'KIX'}")


## JSON in sqlite

### Import JSON into sqlite

Using the sample response json to insert into our sample db. Sqlite3 does have a built-in json function, but it still treats it as a simple string, with unnecessary whitespaces removed to preserve storage

In [5]:
tblname_json = "flights_json"
colname_json = "flights_response"
cur.execute(f"DROP TABLE IF EXISTS {tblname_json}")
cur.execute(
    f"CREATE TABLE {tblname_json}(id INTEGER PRIMARY KEY, {colname_json} JSON)"
)

<sqlite3.Cursor at 0x7fa01073b240>

In [6]:
# dumping into a json str, and putting it into a list
cur.execute(
    f"INSERT INTO {tblname_json} ({colname_json}) VALUES( ? )",
    [json.dumps(response["data"][0])],
)
con.commit()

In [7]:
# convert dict to str, then put into a tuple by itself (hence the comma)
# result is a list of tuple, where each tuple is one json str
flights = [(json.dumps(flight),) for flight in response["data"]]
cur.executemany(
    f"INSERT INTO {tblname_json} ({colname_json}) VALUES( ? )", flights
)
con.commit()

Doesn't look like `->` operator is available here.


In [54]:
cur.close()
con.close()

### Extract JSON from sqlite

Insert

```sql
INSERT INTO users (id, data) VALUES (1, json_encode({
  "name": "Alice",
  "age": 25,
  "email": "alice@example.com"

```

Query:

```sql
SELECT json_extract(data, '$.email') AS email
FROM users
WHERE id = 1```;
}));

In [8]:
res = cur.execute(f"SELECT * FROM {tblname_json}")
read_flights = res.fetchall()
read_flights[99]

(100,
 '{"flight_date": "2023-08-28", "flight_status": "scheduled", "departure": {"airport": "Kuala Lumpur International Airport (klia)", "timezone": "Asia/Kuala_Lumpur", "iata": "KUL", "icao": "WMKK", "terminal": "1", "gate": null, "delay": 65, "scheduled": "2023-08-28T14:55:00+00:00", "estimated": "2023-08-28T14:55:00+00:00", "actual": "2023-08-28T15:59:00+00:00", "estimated_runway": "2023-08-28T15:59:00+00:00", "actual_runway": "2023-08-28T15:59:00+00:00"}, "arrival": {"airport": null, "timezone": null, "iata": "KNO", "icao": "WIMM", "terminal": "1", "gate": null, "baggage": null, "delay": 40, "scheduled": "2023-08-28T15:00:00+00:00", "estimated": "2023-08-28T15:00:00+00:00", "actual": null, "estimated_runway": null, "actual_runway": null}, "airline": {"name": "Malaysia Airlines", "iata": "MH", "icao": "MAS"}, "flight": {"number": "864", "iata": "MH864", "icao": "MAS864", "codeshared": null}, "aircraft": null, "live": null}')

In [14]:
delay_sql = f"""
SELECT
    json_extract({colname_json},'$.flight_date') as date,
    json_extract({colname_json},'$.departure.airport') as start,
    json_extract({colname_json},'$.arrival.airport') as destination,
    CAST(json_extract({colname_json},'$.arrival.delay') AS INTEGER) as delay
FROM {tblname_json}
ORDER BY delay DESC
LIMIT 5
"""
res = cur.execute(delay_sql)
rows = res.fetchall()
rows

[('2023-08-28',
  'Sydney Kingsford Smith Airport',
  'Kuala Lumpur International Airport (klia)',
  390),
 ('2023-08-28',
  'Kuala Lumpur International Airport (klia)',
  'Seoul (Incheon)',
  357),
 ('2023-08-28',
  'Penang International',
  'Kota-Kinabalu International Airport',
  187),
 ('2023-08-28',
  'Kota-Kinabalu International Airport',
  'Penang International',
  177),
 ('2023-08-28',
  'Miami International Airport',
  'Philadelphia International',
  142)]

In [16]:
cte_sql = f"""
WITH RECURSIVE 
delays(flight_date, start, dest, delay) as (
    SELECT
        json_extract({colname_json},'$.flight_date') as date,
        json_extract({colname_json},'$.departure.airport') as start,
        json_extract({colname_json},'$.arrival.airport') as dest,
        CAST(json_extract({colname_json},'$.arrival.delay') AS INTEGER) as delay
    FROM {tblname_json}
)
SELECT
    flight_date,
    REPLACE(
    REPLACE(
    REPLACE(start, ' International Airport', ''), 
    ' International', ''),
    ' Airport', '') AS start,
    REPLACE(
    REPLACE(
    REPLACE(dest, ' International Airport', ''), 
    ' International', ''),
    ' Airport', '') AS dest,
    delay
FROM delays
ORDER BY delay DESC
LIMIT 3;
"""
res = cur.execute(cte_sql)
rows = res.fetchall()
rows

[('2023-08-28', 'Sydney Kingsford Smith', 'Kuala Lumpur (klia)', 390),
 ('2023-08-28', 'Kuala Lumpur (klia)', 'Seoul (Incheon)', 357),
 ('2023-08-28', 'Penang', 'Kota-Kinabalu', 187)]

### Indexing JSON columns

Generate another column using expressions on the JSON column, and index that. This allows us to index fields within the JSON

In [21]:
add_flightnum_sql = f"""
ALTER TABLE {tblname_json}
ADD COLUMN flight_num 
AS (JSON_EXTRACT({colname_json}, '$.flight.iata'));
"""
add_start_sql = f"""
ALTER TABLE {tblname_json}
ADD COLUMN start 
AS (JSON_EXTRACT({colname_json}, '$.departure.iata'));
"""
add_sched_sql = f"""
ALTER TABLE {tblname_json}
ADD COLUMN scheduled 
AS (JSON_EXTRACT({colname_json}, '$.departure.scheduled'));
"""
add_end_sql = f"""
ALTER TABLE {tblname_json}
ADD COLUMN end 
AS (JSON_EXTRACT({colname_json}, '$.arrival.iata'));
"""

add_index_sql = f"""
CREATE INDEX flight_index 
ON {tblname_json}(flight_num, start, end, scheduled)"""

for sql in [add_flightnum_sql, add_start_sql, add_sched_sql, add_end_sql]:
    cur.execute(sql)

In [22]:
res = cur.execute(f"select * from {tblname_json} where end = 'KUL'")
rows = res.fetchall()
rows[:3]

[(4,
  '{"flight_date": "2023-08-29", "flight_status": "scheduled", "departure": {"airport": "Guangzhou Baiyun International", "timezone": "Asia/Shanghai", "iata": "CAN", "icao": "ZGGG", "terminal": "2", "gate": null, "delay": 45, "scheduled": "2023-08-29T08:15:00+00:00", "estimated": "2023-08-29T08:15:00+00:00", "actual": "2023-08-29T09:36:00+00:00", "estimated_runway": "2023-08-29T09:36:00+00:00", "actual_runway": "2023-08-29T09:36:00+00:00"}, "arrival": {"airport": "Kuala Lumpur International Airport (klia)", "timezone": "Asia/Kuala_Lumpur", "iata": "KUL", "icao": "WMKK", "terminal": "1", "gate": null, "baggage": null, "delay": 54, "scheduled": "2023-08-29T12:00:00+00:00", "estimated": "2023-08-29T12:00:00+00:00", "actual": null, "estimated_runway": null, "actual_runway": null}, "airline": {"name": "Malaysia Airlines", "iata": "MH", "icao": "MAS"}, "flight": {"number": "4481", "iata": "MH4481", "icao": "MAS4481", "codeshared": {"airline_name": "china southern airlines", "airline_iat

#### Add primary key on generated columns?

sqlite doesn't allow `ALTER TABLE` statements to modify primary keys. Create new table instead

While we're recreating the table, specify the new index columns as `GENERATED COLUMNS` so it knows that its values are only provided by other columns, i.e. our JSON column.

We also cannot add GENERATED COLUMNs as primary keys, so simply add them as unique index

```sql
CREATE INDEX flight_id ON flights_json(list_of_gen_columns);
```

#### VIRTUAL vs STORED

Generated columns have two types, and *neither can be PRIMARY KEYs*

- VIRTUAL - computed when read;
    - less space and more compute
    - can be added with ALTER TABLE ADD COLUMN
- STORED - computed when row is written;
    - more space and less compute
    - *cannot* be added with ALTER TABLE ADD COLUMN
 


In [24]:
cur.execute(f"DROP TABLE IF EXISTS {tblname_json}")
create_sql = f"""
CREATE TABLE {tblname_json}(
    {colname_json} JSON,
    flight_num TEXT GENERATED ALWAYS AS (JSON_EXTRACT({colname_json}, '$.flight.iata')) VIRTUAL,
    start TEXT GENERATED ALWAYS AS (JSON_EXTRACT({colname_json}, '$.departure.iata')) VIRTUAL,
    dest TEXT GENERATED ALWAYS AS (JSON_EXTRACT({colname_json}, '$.arrival.iata')) VIRTUAL,
    ts_takeoff TEXT GENERATED ALWAYS AS (JSON_EXTRACT({colname_json}, '$.arrival.iata')) VIRTUAL
);
"""
index_sql = """
CREATE UNIQUE INDEX flight_id 
ON flights_json(
    flight_num,
    start,
    dest,
    ts_takeoff
);
"""
cur.execute(create_sql)
cur.execute(index_sql)

<sqlite3.Cursor at 0x7fa01073b240>

Only create the table if `.db` doesn't already exist

By storing `json` as is, no need to go through each json to create an imperfect schema that could be broken later on

## Using %sql magic

requires `ipython-sql` module

In [1]:
%load_ext sql

In [30]:
%sql sqlite:///../tests/data/tutorial.db

In [5]:
%%sql
select * from sqlite_master;

 * sqlite:///../tests/data/tutorial.db
Done.


type,name,tbl_name,rootpage,sql
table,flights,flights,2,"CREATE TABLE flights(flight_date, flight_status, departure, arrival, airline, flight, aircraft, live)"
table,flights_json,flights_json,3,CREATE TABLE flights_json(flights_response json)


In [None]:
%%sql
select * from $tblname_json limit 1;

## generating DDL from json schema

In [3]:
sample_data = response["data"][0]
pprint(sample_data)

{'aircraft': None,
 'airline': {'iata': 'MH', 'icao': 'MAS', 'name': 'Malaysia Airlines'},
 'arrival': {'actual': None,
             'actual_runway': None,
             'airport': 'Cape Town International',
             'baggage': '1.4',
             'delay': 31,
             'estimated': '2023-08-29T10:50:00+00:00',
             'estimated_runway': None,
             'gate': 'A5',
             'iata': 'CPT',
             'icao': 'FACT',
             'scheduled': '2023-08-29T10:50:00+00:00',
             'terminal': 'B',
             'timezone': 'Africa/Johannesburg'},
 'departure': {'actual': '2023-08-29T03:10:00+00:00',
               'actual_runway': '2023-08-29T03:10:00+00:00',
               'airport': 'Doha International',
               'delay': 25,
               'estimated': '2023-08-29T02:05:00+00:00',
               'estimated_runway': '2023-08-29T03:10:00+00:00',
               'gate': 'C37',
               'iata': 'DOH',
               'icao': 'OTHH',
               'sched

I expect the generator to collect all possible keys in this json, and group them as required, e.g. arrival_actual, flight_codeshared_airline_iata for nested data.

Naive approach - depth first search

- store `key1`
- check type of `data[key1]`
    - if not `dict`, return `key1` and proceed to `key2`
    - if `dict`, search through `data[key1][key11]`
        - if not `dict`, return `key1_key11` and proceed to `key1_key12`
        - if `dict`, search through `data[key1][key11][key111]`
     
Framed as a recursion problem:

base:

- if `data[key]` is `not dict`: return `key`

All others:

- if `data[key]` is `dict`: pass `data[key]` into the function again

In [9]:
from collections.abc import MutableMapping


def json_flatten(data: dict, parent_key="", sep="_"):
    """
    Normalizes json, if nested
    """
    items = []
    for key, val in data.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(val, MutableMapping):
            items.extend(
                json_flatten(val, parent_key=new_key, sep=sep).items()
            )
        else:
            items.append((new_key, val))

    # creates {key: val} from (key, val) tuple
    return dict(items)

In [10]:
flat = json_flatten(sample_data, sep="__")
flat

{'flight_date': '2023-08-29',
 'flight_status': 'scheduled',
 'departure__airport': 'Doha International',
 'departure__timezone': 'Asia/Qatar',
 'departure__iata': 'DOH',
 'departure__icao': 'OTHH',
 'departure__terminal': None,
 'departure__gate': 'C37',
 'departure__delay': 25,
 'departure__scheduled': '2023-08-29T02:05:00+00:00',
 'departure__estimated': '2023-08-29T02:05:00+00:00',
 'departure__actual': '2023-08-29T03:10:00+00:00',
 'departure__estimated_runway': '2023-08-29T03:10:00+00:00',
 'departure__actual_runway': '2023-08-29T03:10:00+00:00',
 'arrival__airport': 'Cape Town International',
 'arrival__timezone': 'Africa/Johannesburg',
 'arrival__iata': 'CPT',
 'arrival__icao': 'FACT',
 'arrival__terminal': 'B',
 'arrival__gate': 'A5',
 'arrival__baggage': '1.4',
 'arrival__delay': 31,
 'arrival__scheduled': '2023-08-29T10:50:00+00:00',
 'arrival__estimated': '2023-08-29T10:50:00+00:00',
 'arrival__actual': None,
 'arrival__estimated_runway': None,
 'arrival__actual_runway': No

In [7]:
flat.keys()

dict_keys(['flight_date', 'flight_status', 'departure_airport', 'departure_timezone', 'departure_iata', 'departure_icao', 'departure_terminal', 'departure_gate', 'departure_delay', 'departure_scheduled', 'departure_estimated', 'departure_actual', 'departure_estimated_runway', 'departure_actual_runway', 'arrival_airport', 'arrival_timezone', 'arrival_iata', 'arrival_icao', 'arrival_terminal', 'arrival_gate', 'arrival_baggage', 'arrival_delay', 'arrival_scheduled', 'arrival_estimated', 'arrival_actual', 'arrival_estimated_runway', 'arrival_actual_runway', 'airline_name', 'airline_iata', 'airline_icao', 'flight_number', 'flight_iata', 'flight_icao', 'flight_codeshared_airline_name', 'flight_codeshared_airline_iata', 'flight_codeshared_airline_icao', 'flight_codeshared_flight_number', 'flight_codeshared_flight_iata', 'flight_codeshared_flight_icao', 'aircraft', 'live'])

For each key, create a `TEXT` field in the import table

In [23]:
fields_sql = ", ".join([f"{field} TEXT" for field in flat.keys()])
create_sql = f"CREATE TABLE import_flights_response ({fields_sql})"

In [24]:
create_sql

'CREATE TABLE import_flights_response (flight_date TEXT, flight_status TEXT, departure__airport TEXT, departure__timezone TEXT, departure__iata TEXT, departure__icao TEXT, departure__terminal TEXT, departure__gate TEXT, departure__delay TEXT, departure__scheduled TEXT, departure__estimated TEXT, departure__actual TEXT, departure__estimated_runway TEXT, departure__actual_runway TEXT, arrival__airport TEXT, arrival__timezone TEXT, arrival__iata TEXT, arrival__icao TEXT, arrival__terminal TEXT, arrival__gate TEXT, arrival__baggage TEXT, arrival__delay TEXT, arrival__scheduled TEXT, arrival__estimated TEXT, arrival__actual TEXT, arrival__estimated_runway TEXT, arrival__actual_runway TEXT, airline__name TEXT, airline__iata TEXT, airline__icao TEXT, flight__number TEXT, flight__iata TEXT, flight__icao TEXT, flight__codeshared__airline_name TEXT, flight__codeshared__airline_iata TEXT, flight__codeshared__airline_icao TEXT, flight__codeshared__flight_number TEXT, flight__codeshared__flight_iat

In [26]:
con = sqlite3.connect("../tests/data/tutorial.db")
curs = con.cursor()
curs.execute(create_sql)

<sqlite3.Cursor at 0x7f4589f0ac40>

In [28]:
curs.execute("select name from sqlite_master")
curs.fetchall()

[('flights',), ('flights_json',), ('import_flights_response',)]

In [31]:
%%sql
select * from sqlite_master
where name = 'import_flights_response';

 * sqlite:///../tests/data/tutorial.db
Done.


type,name,tbl_name,rootpage,sql
table,import_flights_response,import_flights_response,64,"CREATE TABLE import_flights_response (flight_date TEXT, flight_status TEXT, departure__airport TEXT, departure__timezone TEXT, departure__iata TEXT, departure__icao TEXT, departure__terminal TEXT, departure__gate TEXT, departure__delay TEXT, departure__scheduled TEXT, departure__estimated TEXT, departure__actual TEXT, departure__estimated_runway TEXT, departure__actual_runway TEXT, arrival__airport TEXT, arrival__timezone TEXT, arrival__iata TEXT, arrival__icao TEXT, arrival__terminal TEXT, arrival__gate TEXT, arrival__baggage TEXT, arrival__delay TEXT, arrival__scheduled TEXT, arrival__estimated TEXT, arrival__actual TEXT, arrival__estimated_runway TEXT, arrival__actual_runway TEXT, airline__name TEXT, airline__iata TEXT, airline__icao TEXT, flight__number TEXT, flight__iata TEXT, flight__icao TEXT, flight__codeshared__airline_name TEXT, flight__codeshared__airline_iata TEXT, flight__codeshared__airline_icao TEXT, flight__codeshared__flight_number TEXT, flight__codeshared__flight_iata TEXT, flight__codeshared__flight_icao TEXT, aircraft TEXT, live TEXT)"


In [38]:
vals = [
    list(json_flatten(entry, sep="__").values()) for entry in response["data"]
]

In [39]:
vals[3]

['2023-08-29',
 'scheduled',
 'Doha International',
 'Asia/Qatar',
 'DOH',
 'OTHH',
 None,
 'C36',
 22,
 '2023-08-29T02:10:00+00:00',
 '2023-08-29T02:10:00+00:00',
 '2023-08-29T05:18:00+00:00',
 '2023-08-29T05:18:00+00:00',
 '2023-08-29T05:18:00+00:00',
 'Leonardo Da Vinci (Fiumicino)',
 'Europe/Rome',
 'FCO',
 'LIRF',
 '3',
 None,
 '12',
 131,
 '2023-08-29T07:15:00+00:00',
 '2023-08-29T07:15:00+00:00',
 None,
 None,
 None,
 'Malaysia Airlines',
 'MH',
 'MAS',
 '9237',
 'MH9237',
 'MAS9237',
 'qatar airways',
 'qr',
 'qtr',
 '115',
 'qr115',
 'qtr115',
 None,
 None]

In [37]:
f"INSERT INTO import_flights_response VALUES({vals_placeholder})"

'INSERT INTO import_flights_response VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'

In [42]:
len(vals[0])

41

In [89]:
from sqlite3 import ProgrammingError

vals_placeholder = ", ".join(len(vals[0]) * "?")
try:
    curs.executemany(
        f"INSERT INTO import_flights_response VALUES({vals_placeholder})", vals
    )
except ProgrammingError as e:
    print(e)

Incorrect number of bindings supplied. The current statement uses 41, and there are 36 supplied.


Can't just use the first entry as the field guide for all entries.

Collect fields from all entries, and take the union as the table schema.

For cases with insufficient entries, need to map the available entry to the superset, and default `None` for empty fields

In [60]:
def compare_first(l):
    """
    compares all elements of iterable, l, vs the first
    It doesn't answer which elems are different, only
    if all are equal; if all are equal to 1st, then they're all equal
    """
    cmps = [elem == l[0] for elem in l]
    return all(cmps)

In [61]:
[compare_first(l) for l in [fa, fb, fc]]

[True, True, True]

In [67]:
from itertools import groupby


def all_equal(l):
    """
    If all elements in l are equal, g should only have 1 element,
    thus, the first next(g, True) should return True, and
    the second next(g, False) should return StopIteration, and
    default to False; True && not False equates to True
    If there are multiple elements in g (not all in l are equal)
    the second next(g, False) will return an actual element,
    evaluating to True; thus True && not True equates to False
    """
    g = groupby(l)
    return next(g, True) and not next(g, False)

In [70]:
g = groupby(fields)

In [81]:
len(list(g))

27

In [82]:
with open("../tests/data/sample_flights_api_response.json") as j:
    sample_resp = json.load(j)

sample_resp.keys()

dict_keys(['pagination', 'data'])

In [85]:
sample_flat = json_flatten(sample_resp["data"][0], sep="__")
len(sample_flat)

46

In [87]:
a = ["foo", "bar", "boo"]
b = ["foo", "baz"]
c = ["bar", "fuu"]
abc = set()
for names in [a, b, c]:
    abc |= set(names)

abc

{'bar', 'baz', 'boo', 'foo', 'fuu'}

In [97]:
a = list(sample_data[0].keys())
b = list(sample_data[44].keys())

In [105]:
set(a).update(sample_data[44].keys())

In [108]:
sample_data = [json_flatten(entry, sep="__") for entry in response["data"]]

In [109]:
fields = set()
for entry in sample_data:
    fields.update(entry.keys())

fields

{'aircraft',
 'aircraft__iata',
 'aircraft__icao',
 'aircraft__icao24',
 'aircraft__registration',
 'airline__iata',
 'airline__icao',
 'airline__name',
 'arrival__actual',
 'arrival__actual_runway',
 'arrival__airport',
 'arrival__baggage',
 'arrival__delay',
 'arrival__estimated',
 'arrival__estimated_runway',
 'arrival__gate',
 'arrival__iata',
 'arrival__icao',
 'arrival__scheduled',
 'arrival__terminal',
 'arrival__timezone',
 'departure__actual',
 'departure__actual_runway',
 'departure__airport',
 'departure__delay',
 'departure__estimated',
 'departure__estimated_runway',
 'departure__gate',
 'departure__iata',
 'departure__icao',
 'departure__scheduled',
 'departure__terminal',
 'departure__timezone',
 'flight__codeshared',
 'flight__codeshared__airline_iata',
 'flight__codeshared__airline_icao',
 'flight__codeshared__airline_name',
 'flight__codeshared__flight_iata',
 'flight__codeshared__flight_icao',
 'flight__codeshared__flight_number',
 'flight__iata',
 'flight__icao',
 '

In [110]:
len(fields)

54

In [112]:
fields.difference(sample_flat.keys())

{'aircraft',
 'flight__codeshared__airline_iata',
 'flight__codeshared__airline_icao',
 'flight__codeshared__airline_name',
 'flight__codeshared__flight_iata',
 'flight__codeshared__flight_icao',
 'flight__codeshared__flight_number',
 'live'}

In [128]:
def issubstring(text: str, checklist, sep="__") -> bool:
    """
    Returns True for overlapped keys
    """
    for check in checklist:
        if text + sep in check:
            return True
    return False


fields_uniq = [field for field in fields if not issubstring(field, fields)]

In [129]:
sorted(fields_uniq)

['aircraft__iata',
 'aircraft__icao',
 'aircraft__icao24',
 'aircraft__registration',
 'airline__iata',
 'airline__icao',
 'airline__name',
 'arrival__actual',
 'arrival__actual_runway',
 'arrival__airport',
 'arrival__baggage',
 'arrival__delay',
 'arrival__estimated',
 'arrival__estimated_runway',
 'arrival__gate',
 'arrival__iata',
 'arrival__icao',
 'arrival__scheduled',
 'arrival__terminal',
 'arrival__timezone',
 'departure__actual',
 'departure__actual_runway',
 'departure__airport',
 'departure__delay',
 'departure__estimated',
 'departure__estimated_runway',
 'departure__gate',
 'departure__iata',
 'departure__icao',
 'departure__scheduled',
 'departure__terminal',
 'departure__timezone',
 'flight__codeshared__airline_iata',
 'flight__codeshared__airline_icao',
 'flight__codeshared__airline_name',
 'flight__codeshared__flight_iata',
 'flight__codeshared__flight_icao',
 'flight__codeshared__flight_number',
 'flight__iata',
 'flight__icao',
 'flight__number',
 'flight_date',
 'f

In [127]:
# using .get() avoids KeyError for missing keys; defaults None
sample_entry = sample_data[0]
expanded_entry = {field: sample_entry.get(field) for field in fields_uniq}
pprint(expanded_entry)

{'aircraft__iata': None,
 'aircraft__icao': None,
 'aircraft__icao24': None,
 'aircraft__registration': None,
 'airline__iata': 'MH',
 'airline__icao': 'MAS',
 'airline__name': 'Malaysia Airlines',
 'arrival__actual': None,
 'arrival__actual_runway': None,
 'arrival__airport': 'Cape Town International',
 'arrival__baggage': '1.4',
 'arrival__delay': 31,
 'arrival__estimated': '2023-08-29T10:50:00+00:00',
 'arrival__estimated_runway': None,
 'arrival__gate': 'A5',
 'arrival__iata': 'CPT',
 'arrival__icao': 'FACT',
 'arrival__scheduled': '2023-08-29T10:50:00+00:00',
 'arrival__terminal': 'B',
 'arrival__timezone': 'Africa/Johannesburg',
 'departure__actual': '2023-08-29T03:10:00+00:00',
 'departure__actual_runway': '2023-08-29T03:10:00+00:00',
 'departure__airport': 'Doha International',
 'departure__delay': 25,
 'departure__estimated': '2023-08-29T02:05:00+00:00',
 'departure__estimated_runway': '2023-08-29T03:10:00+00:00',
 'departure__gate': 'C37',
 'departure__iata': 'DOH',
 'departu

In [130]:
sample_data_exp = [
    {field: entry.get(field) for field in fields_uniq} for entry in sample_data
]

In [133]:
curs.execute("drop table if exists import_flights_response")

<sqlite3.Cursor at 0x7f4589f0ac40>

In [134]:
fields_sql = ", ".join([f"{field} TEXT DEFAULT NULL" for field in fields_uniq])
create_sql = f"CREATE TABLE import_flights_response ({fields_sql})"

In [135]:
curs.execute(create_sql)

<sqlite3.Cursor at 0x7f4589f0ac40>

In [136]:
%%sql
select * from sqlite_master
where name = 'import_flights_response';

 * sqlite:///../tests/data/tutorial.db
Done.


type,name,tbl_name,rootpage,sql
table,import_flights_response,import_flights_response,64,"CREATE TABLE import_flights_response (flight_date TEXT, flight_status TEXT, departure__airport TEXT, departure__timezone TEXT, departure__iata TEXT, departure__icao TEXT, departure__terminal TEXT, departure__gate TEXT, departure__delay TEXT, departure__scheduled TEXT, departure__estimated TEXT, departure__actual TEXT, departure__estimated_runway TEXT, departure__actual_runway TEXT, arrival__airport TEXT, arrival__timezone TEXT, arrival__iata TEXT, arrival__icao TEXT, arrival__terminal TEXT, arrival__gate TEXT, arrival__baggage TEXT, arrival__delay TEXT, arrival__scheduled TEXT, arrival__estimated TEXT, arrival__actual TEXT, arrival__estimated_runway TEXT, arrival__actual_runway TEXT, airline__name TEXT, airline__iata TEXT, airline__icao TEXT, flight__number TEXT, flight__iata TEXT, flight__icao TEXT, flight__codeshared__airline_name TEXT, flight__codeshared__airline_iata TEXT, flight__codeshared__airline_icao TEXT, flight__codeshared__flight_number TEXT, flight__codeshared__flight_iata TEXT, flight__codeshared__flight_icao TEXT, aircraft TEXT, live TEXT)"


In [139]:
vals = [list(map(str, entry.values())) for entry in sample_data_exp]
vals_placeholder = ", ".join(len(vals[0]) * "?")
try:
    curs.executemany(
        f"INSERT INTO import_flights_response VALUES({vals_placeholder})", vals
    )
except ProgrammingError as e:
    print(e)

In [151]:
con.commit()

In [155]:
%%sql
select
    arrival__iata,
    departure__iata,
    CAST(departure__delay AS INTEGER) AS delay
from import_flights_response
where delay > 50
ORDER BY delay DESC, arrival__iata, departure__iata;

 * sqlite:///../tests/data/tutorial.db
Done.


arrival__iata,departure__iata,delay
ICN,KUL,391
KUL,SYD,336
PEN,BKI,208
BKI,PEN,206
PHL,MIA,181
PHL,LHR,165
LAX,ORD,164
DOH,EDI,155
BCN,LHR,153
MIA,JFK,150


In [146]:
len(vals[0])

51

In [None]:
len(vals_placeholder.split("?"))

### Inserting data using named placeholder

sqlite uses this format for named placeholders

```py
# This is the named style used with executemany():
data = (
    {"name": "C", "year": 1972},
    {"name": "Fortran", "year": 1957},
    {"name": "Python", "year": 1991},
    {"name": "Go", "year": 2009},
)
cur.executemany("INSERT INTO lang VALUES(:name, :year)"
```

However the order in which we reference the columns in `VALUES(...)` is still important because SQL does not inherently recognize column tables. If for example the `lang` table was created thus:

```sql
CREATE TABLE lang (
    year integer,
    name text
)
```

the `.executemany()` statement would actually insert `name` into `year` and vice versa, because the order of `INSERT` does not match order of `CREATE TABLE`. The naming in `.executemany()` allows the statement to retreive the correct value from `data` only, and does not bind to the correct column name.

Use `PRAGMA table_info('tbl_name');` to retrieve the order of the columns, data)

In [2]:
%sql sqlite:///../data/flights.db

In [5]:
%%sql
select * from sqlite_master

 * sqlite:///../data/flights.db
Done.


type,name,tbl_name,rootpage,sql
table,import_flight_records,import_flight_records,2,"CREATE TABLE import_flight_records (  arrival__terminal TEXT DEFAULT NULL, departure__actual TEXT DEFAULT NULL, arrival__baggage TEXT DEFAULT NULL, departure__actual_runway TEXT DEFAULT NULL, arrival__actual TEXT DEFAULT NULL, flight__codeshared__flight_icao TEXT DEFAULT NULL, flight__number TEXT DEFAULT NULL, live TEXT DEFAULT NULL, arrival__estimated TEXT DEFAULT NULL, arrival__scheduled TEXT DEFAULT NULL, airline__iata TEXT DEFAULT NULL, airline__name TEXT DEFAULT NULL, flight__iata TEXT DEFAULT NULL, departure__terminal TEXT DEFAULT NULL, flight__codeshared__airline_iata TEXT DEFAULT NULL, flight_date TEXT DEFAULT NULL, departure__scheduled TEXT DEFAULT NULL, arrival__delay TEXT DEFAULT NULL, arrival__gate TEXT DEFAULT NULL, flight_status TEXT DEFAULT NULL, arrival__airport TEXT DEFAULT NULL, departure__timezone TEXT DEFAULT NULL, departure__airport TEXT DEFAULT NULL, flight__codeshared__flight_iata TEXT DEFAULT NULL, arrival__iata TEXT DEFAULT NULL, departure__delay TEXT DEFAULT NULL, arrival__estimated_runway TEXT DEFAULT NULL, departure__icao TEXT DEFAULT NULL, flight__codeshared__airline_icao TEXT DEFAULT NULL, arrival__icao TEXT DEFAULT NULL, flight__codeshared__airline_name TEXT DEFAULT NULL, arrival__actual_runway TEXT DEFAULT NULL, arrival__timezone TEXT DEFAULT NULL, departure__estimated TEXT DEFAULT NULL, airline__icao TEXT DEFAULT NULL, departure__iata TEXT DEFAULT NULL, flight__codeshared__flight_number TEXT DEFAULT NULL, flight__icao TEXT DEFAULT NULL, aircraft TEXT DEFAULT NULL, departure__gate TEXT DEFAULT NULL, departure__estimated_runway TEXT DEFAULT NULL,  PRIMARY KEY (flight__iata, departure__iata, departure__scheduled, arrival__iata)  )"
index,sqlite_autoindex_import_flight_records_1,import_flight_records,3,


In [3]:
%%sql
select avg(arrival__delay)
from import_flight_records
where flight_date = '2023-10-12';

 * sqlite:///../data/flights.db
Done.


avg(arrival__delay)
17.91304347826087


In [16]:
%%sql
select
    arrival__iata,
    departure__iata,
    arrival__delay,
    row_number() over (partition by flight_date order by CAST(arrival__delay AS INTEGER) desc) delay_rank
from import_flight_records
where flight_date = '2023-10-13'
order by CAST(arrival__delay AS INTEGER) desc
limit 10

 * sqlite:///../data/flights.db
Done.


arrival__iata,departure__iata,arrival__delay,delay_rank
KUL,COK,,1
KUL,MEL,,2
MNL,CEB,,3
KUL,AKL,,4
KUL,PER,,5
MNL,DVO,,6
KUL,BLR,,7
DOH,KUL,,8
KUL,HYD,,9
JFK,HKG,,10


In [15]:
%%sql
SELECT
     ROW_NUMBER() OVER (ORDER BY CAST(arrival__delay AS INTEGER) DESC) delay_rank,
     flight__iata,
     REPLACE(
     REPLACE(
     REPLACE(arrival__airport, ' International Airport', ''),
     ' International', ''),
     ' Airport', '') AS arrival__airport,
     REPLACE(
     REPLACE(
     REPLACE(departure__airport, ' International Airport', ''),
     ' International', ''),
     ' Airport', '') AS departure__airport,
     arrival__delay
 FROM import_flight_records
 WHERE DATE(arrival__scheduled) = '2023-10-12'
 AND arrival__delay IS NOT NULL
 ORDER BY delay_rank
 LIMIT 3;

 * sqlite:///../data/flights.db
Done.


delay_rank,flight__iata,arrival__airport,departure__airport,arrival__delay
1,MH9789,Washington Dulles,Doha,258
2,MH9014,Kuala Lumpur (klia),Bandaranaike,232
3,MH5735,Singapore Changi,Kuala Lumpur (klia),66


In [20]:
%%sql
SELECT 
    substr(flight_iata_number, 1, 2) airline_iata,
    flight_date,
    COUNT(*) total 
FROM import_flight_records
WHERE flight_date = '2023-11-21'
AND substr(flight_iata_number, 1, 2) = 'AK'


 * sqlite:///../data/flights.db
Done.


airline_iata,flight_date,total
AK,2023-11-21,842


In [23]:
%%sql
SELECT 
    substr(flight_iata_number, 1, 2) airline_iata,
    COUNT(*) num_delayed,
    AVG(CAST(arr_delay AS INTEGER)) avg_delay
FROM import_flight_records 
WHERE substr(flight_iata_number, 1, 2) = 'AK' 
AND flight_date = '2023-11-21'
AND arr_delay != '0';

 * sqlite:///../data/flights.db
Done.


airline_iata,num_delayed,avg_delay
AK,281,16.83274021352313


In [24]:
%%sql
WITH RECURSIVE 
    t(flight_date, total) AS (
        SELECT 
            flight_date,
            COUNT(*) total 
        FROM import_flight_records
        WHERE flight_date = '2023-11-21'
        AND substr(flight_iata_number, 1, 2) = 'AK'
    )
SELECT 
    t.total total,
    COUNT(arr_delay) num_delayed,
    AVG(arr_delay) avg_delay
FROM import_flight_records d LEFT JOIN t
USING (flight_date)
WHERE substr(flight_iata_number, 1, 2) = 'AK'
AND arr_delay > 0;

 * sqlite:///../data/flights.db
Done.


total,num_delayed,avg_delay
,297,16.34006734006734


In [35]:
%%timeit
%%sql
explain QUERY PLAN
WITH RECURSIVE
    a(airline, flight, delay) AS (
        SELECT
            substr(flight_iata_number, 1, 2) airline,
            flight_iata_number flight,
            arr_delay delay
        FROM import_flight_records
        WHERE airline = 'AK' 
        AND flight_date = '2023-11-21'        
    )
SELECT 
    count(flight) total,
    count(delay) num_delayed,
    avg(delay) avg_delayed
FROM a

 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite://

In [36]:
%%timeit
%%sql
SELECT 
    count(flight) total,
    count(delay) num_delayed,
    avg(delay) avg_delayed
FROM (
        SELECT
            substr(flight_iata_number, 1, 2) airline,
            flight_iata_number flight,
            arr_delay delay
        FROM import_flight_records
        WHERE airline = 'AK' 
        AND flight_date = '2023-11-21'        
    )

 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite:///../data/flights.db
Done.
 * sqlite://

CTE (1.3 ms) is 20x faster than a subquery (26 ms) approach