In [1]:
import polars as pl
import duckdb

# Read the five parquet files into polars DataFrames in one unpacking
# assignment; the order of the paths matches the order of the target names.
airlines, airports, flights, planes, weather = map(
    pl.read_parquet,
    (
        './parquet/airlines.parquet',
        './parquet/airports.parquet',
        './parquet/flights.parquet',
        './parquet/planes.parquet',
        './parquet/weather.parquet',
    ),
)

## add new calculated variables (mutate)

In [17]:
# Mutate-style query: keep every existing column and append three derived ones.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
select
    *,
    gain: arr_delay - dep_delay,
    hours: air_time / 60,
    gain_per_hour: gain / hours -- note: created column can be used immediately
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result.head(4))

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute,gain,hours,gain_per_hour
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40,-19,5.216667,-3.642173
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57,13,3.6,3.611111
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59,12,6.266667,1.914894
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41,-30,2.25,-13.333333


In [18]:
# Append the derived columns to the full flights frame. `gain_per_hour` needs a
# second `with_columns` step because it reads columns created in the first one.
(
    flights
    .with_columns(
        (pl.col('arr_delay') - pl.col('dep_delay')).alias('gain'),
        (pl.col('air_time') / 60).alias('hours'),
    )
    .with_columns(
        (pl.col('gain') / pl.col('hours')).alias('gain_per_hour'),
    )
    .head(4)
)

year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute,gain,hours,gain_per_hour
i64,i64,i64,i64,i64,i64,i64,str,str,i64,str,str,i64,i64,i64,i64,i64,f64,f64
2013,6,30,940,15,1216,-4,"""VX""","""N626VA""",407,"""JFK""","""LAX""",313,2475,9,40,-19,5.216667,-3.642173
2013,5,7,1657,-3,2104,10,"""DL""","""N3760C""",329,"""JFK""","""SJU""",216,1598,16,57,13,3.6,3.611111
2013,12,8,859,-1,1238,11,"""DL""","""N712TW""",422,"""JFK""","""LAX""",376,2475,8,59,12,6.266667,1.914894
2013,5,14,1841,-4,2122,-34,"""DL""","""N914DL""",2391,"""JFK""","""TPA""",135,1005,18,41,-30,2.25,-13.333333


## keep only calculated variables (transmute)

In [19]:
# Transmute-style query: return only the three derived columns.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
select
    gain: arr_delay - dep_delay,
    hours: air_time / 60,
    gain_per_hour: gain / hours -- note: created column can be used immediately
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result.head(4))

Unnamed: 0,gain,hours,gain_per_hour
0,-19,5.216667,-3.642173
1,13,3.6,3.611111
2,12,6.266667,1.914894
3,-30,2.25,-13.333333


In [20]:
# Keep only the derived columns: `select` builds `gain` and `hours`, then
# `with_columns` adds `gain_per_hour`, which reads the two just created.
(
    flights
    .select(
        (pl.col('arr_delay') - pl.col('dep_delay')).alias('gain'),
        (pl.col('air_time') / 60).alias('hours'),
    )
    .with_columns(
        (pl.col('gain') / pl.col('hours')).alias('gain_per_hour'),
    )
    .head(4)
)

gain,hours,gain_per_hour
i64,f64,f64
-19,5.216667,-3.642173
13,3.6,3.611111
12,6.266667,1.914894
-30,2.25,-13.333333


## sorting

In [25]:
# Ten flights with the largest arrival delay; ties on arr_delay are broken by
# dep_delay (the original listed arr_delay twice, so the second sort key had
# no effect).
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
order by arr_delay desc, dep_delay desc
limit 10
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,1,9,641,1301,1242,1272,HA,N384HA,51,JFK,HNL,640,4983,6,41
1,2013,12,19,734,849,1046,847,DL,N375NC,1223,EWR,SLC,290,1969,7,34
2,2013,6,27,753,803,937,802,AA,N571AA,2019,LGA,STL,134,888,7,53
3,2013,6,27,615,790,853,769,DL,N372DA,503,JFK,SAN,312,2446,6,15
4,2013,6,28,121,502,329,490,DL,N360NB,2042,EWR,ATL,106,746,1,21
5,2013,9,2,2218,473,2349,444,EV,N744EV,4949,LGA,GSO,69,461,22,18
6,2013,4,18,2200,427,106,435,UA,N811UA,479,LGA,IAH,208,1416,22,0
7,2013,11,4,1822,413,2143,434,AA,N3GRAA,1139,LGA,DFW,210,1389,18,22
8,2013,4,24,2048,423,2207,422,WN,N931WN,2226,LGA,MDW,117,725,20,48
9,2013,7,10,2054,355,102,421,9E,N937XJ,3325,JFK,DFW,191,1391,20,54


In [26]:
# Sort by arrival delay, then departure delay, both descending (a single
# boolean applies to every sort key), and keep the first ten rows.
flights.sort(by=['arr_delay', 'dep_delay'], descending=True).head(10)

year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
i64,i64,i64,i64,i64,i64,i64,str,str,i64,str,str,i64,i64,i64,i64
2013,1,9,641,1301,1242,1272,"""HA""","""N384HA""",51,"""JFK""","""HNL""",640,4983,6,41
2013,12,19,734,849,1046,847,"""DL""","""N375NC""",1223,"""EWR""","""SLC""",290,1969,7,34
2013,6,27,753,803,937,802,"""AA""","""N571AA""",2019,"""LGA""","""STL""",134,888,7,53
2013,6,27,615,790,853,769,"""DL""","""N372DA""",503,"""JFK""","""SAN""",312,2446,6,15
2013,6,28,121,502,329,490,"""DL""","""N360NB""",2042,"""EWR""","""ATL""",106,746,1,21
2013,9,2,2218,473,2349,444,"""EV""","""N744EV""",4949,"""LGA""","""GSO""",69,461,22,18
2013,4,18,2200,427,106,435,"""UA""","""N811UA""",479,"""LGA""","""IAH""",208,1416,22,0
2013,11,4,1822,413,2143,434,"""AA""","""N3GRAA""",1139,"""LGA""","""DFW""",210,1389,18,22
2013,4,24,2048,423,2207,422,"""WN""","""N931WN""",2226,"""LGA""","""MDW""",117,725,20,48
2013,7,10,2054,355,102,421,"""9E""","""N937XJ""",3325,"""JFK""","""DFW""",191,1391,20,54


## summaries

In [31]:
# Single-row summary: mean departure delay over all flights.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
select delay: mean(dep_delay)
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result)

Unnamed: 0,delay
0,12.705147


In [30]:
# Mean departure delay over all flights, returned as a one-row frame with a
# single `delay` column.
flights.select(pl.col('dep_delay').mean().alias('delay'))

delay
f64
12.705147


## grouped summaries

In [36]:
# Mean departure delay per calendar day; ordered by date and limited to the
# first eight days.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
select year, month, day, delay: mean(dep_delay)
group by year, month, day
order by year, month, day
limit 8
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result)

Unnamed: 0,year,month,day,delay
0,2013,1,1,8.377778
1,2013,1,2,8.653333
2,2013,1,3,13.366667
3,2013,1,4,9.24
4,2013,1,5,8.144928
5,2013,1,6,6.808219
6,2013,1,7,3.595745
7,2013,1,8,3.553191


In [37]:
# Mean departure delay per (year, month, day). The explicit sort is what fixes
# the row order before taking the first eight days.
(
    flights
    .group_by('year', 'month', 'day')
    .agg(pl.col('dep_delay').mean().alias('delay'))
    .sort('year', 'month', 'day')
    .head(8)
)

year,month,day,delay
i64,i64,i64,f64
2013,1,1,8.377778
2013,1,2,8.653333
2013,1,3,13.366667
2013,1,4,9.24
2013,1,5,8.144928
2013,1,6,6.808219
2013,1,7,3.595745
2013,1,8,3.553191


## missing values

In [None]:
# Count NULLs per column; only the first five columns are spelled out here.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from flights
select
    year: count_if(year is null),
    month: count_if(month is null),
    day: count_if(day is null),
    dep_time: count_if(dep_time is null),
    dep_delay: count_if(dep_delay is null)
    -- ...
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result)

Unnamed: 0,year,month,day,dep_time,dep_delay
0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Number of nulls in every column of flights, as a one-row frame.
flights.null_count()

year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## joins

In [17]:
# Left-join flights to airports (by destination), planes (by tailnum) and
# airlines (by carrier), drop a few lookup columns, and return 8 sampled rows.
# Plain string (no f-prefix) because nothing is interpolated into the SQL.
query = '''
from
    flights f
    left join airports a on f.dest = a.faa
    left join planes p using (tailnum)
    left join airlines c using (carrier)
select
    * exclude(a.faa, a.tz, a.dst, p.type, p.speed)
using sample 8
'''

# The context manager closes the connection even when the query raises, so a
# failing cell does not leak an open connection to the database file.
with duckdb.connect('./db/nycflights13.duckdb') as con:
    # Materialise to a pandas DataFrame while the connection is still open.
    result = con.sql(query).df()

display(result)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,...,lon,alt,tzone,year_1,manufacturer,model,engines,seats,engine,name_1
0,2013,7,25,1432,-9,1726,-29,DL,N367NW,1779,...,-80.15275,9,America/New_York,1999,AIRBUS INDUSTRIE,A320-212,2,182,Turbo-fan,Delta Air Lines Inc.
1,2013,2,9,1433,17,1649,-1,FL,N945AT,349,...,-84.428067,1026,America/New_York,1999,BOEING,717-200,2,100,Turbo-fan,AirTran Airways Corporation
2,2013,6,9,1351,-8,1620,-15,UA,N529UA,244,...,-118.408075,126,America/Los_Angeles,1991,BOEING,757-222,2,178,Turbo-jet,United Air Lines Inc.
3,2013,3,24,1139,49,1534,92,DL,N957DL,1903,...,-82.554389,30,America/New_York,1990,MCDONNELL DOUGLAS AIRCRAFT CO,MD-88,2,142,Turbo-fan,Delta Air Lines Inc.
4,2013,9,29,1206,-4,1443,-16,9E,N605LR,3508,...,-81.687861,30,America/New_York,2008,BOMBARDIER INC,CL-600-2D24,2,95,Turbo-fan,Endeavor Air Inc.
5,2013,11,11,829,-6,1027,-18,EV,N13908,4638,...,-84.667822,896,America/New_York,2001,EMBRAER,EMB-145LR,2,55,Turbo-fan,ExpressJet Airlines Inc.
6,2013,3,18,1940,75,2244,109,WN,N285WN,2805,...,-90.258028,4,America/Chicago,2007,BOEING,737-7H4,2,140,Turbo-fan,Southwest Airlines Co.
7,2013,12,13,1745,15,1916,2,B6,N229JB,1185,...,-78.787472,435,America/New_York,2006,EMBRAER,ERJ 190-100 IGW,2,20,Turbo-fan,JetBlue Airways


In [16]:
# Left-join flights to airports (by destination), planes (by tailnum) and
# airlines (by carrier), then drop a few lookup columns and sample 8 rows.
# `how='left'` is spelled out because polars joins default to inner, which
# would silently drop flights that have no match in a lookup table.
(flights
  .join(airports, left_on='dest', right_on='faa', how='left')
  .join(planes, on='tailnum', how='left')
  .join(airlines, on='carrier', how='left')
  .drop('tz', 'dst', 'type', 'speed')
).sample(8)

year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute,name,lat,lon,alt,tzone,year_right,manufacturer,model,engines,seats,engine,name_right
i64,i64,i64,i64,i64,i64,i64,str,str,i64,str,str,i64,i64,i64,i64,str,f64,f64,i64,str,str,str,str,i64,i64,str,str
2013,3,9,808,-2,1024,-19,"""FL""","""N899AT""",361,"""LGA""","""ATL""",105,762,8,8,"""Hartsfield Jackson Atlanta Int…",33.636719,-84.428067,1026,"""America/New_York""","""2005""","""BOEING""","""717-200""",2,100,"""Turbo-fan""","""AirTran Airways Corporation"""
2013,5,31,1157,-3,1449,-9,"""DL""","""N704X""",863,"""JFK""","""LAX""",323,2475,11,57,"""Los Angeles Intl""",33.942536,-118.408075,126,"""America/Los_Angeles""","""1997""","""BOEING""","""757-2Q8""",2,178,"""Turbo-fan""","""Delta Air Lines Inc."""
2013,3,15,1103,11,1330,-33,"""B6""","""N535JB""",1471,"""LGA""","""FLL""",132,1076,11,3,"""Fort Lauderdale Hollywood Intl""",26.072583,-80.15275,9,"""America/New_York""","""2002""","""AIRBUS INDUSTRIE""","""A320-232""",2,200,"""Turbo-fan""","""JetBlue Airways"""
2013,11,10,1320,-9,1427,-13,"""B6""","""N236JB""",308,"""JFK""","""PWM""",49,273,13,20,"""Portland Intl Jetport""",43.646161,-70.309281,77,"""America/New_York""","""2006""","""EMBRAER""","""ERJ 190-100 IGW""",2,20,"""Turbo-fan""","""JetBlue Airways"""
2013,3,6,1014,14,1138,1,"""UA""","""N26210""",1460,"""LGA""","""ORD""",109,733,10,14,"""Chicago Ohare Intl""",41.978603,-87.904842,668,"""America/Chicago""","""1998""","""BOEING""","""737-824""",2,149,"""Turbo-fan""","""United Air Lines Inc."""
2013,10,15,1039,-7,1303,-27,"""B6""","""N587JB""",483,"""JFK""","""MCO""",122,944,10,39,"""Orlando Intl""",28.429394,-81.308994,96,"""America/New_York""","""2004""","""AIRBUS""","""A320-232""",2,200,"""Turbo-fan""","""JetBlue Airways"""
2013,5,10,1358,-1,1719,34,"""UA""","""N519UA""",682,"""EWR""","""IAH""",221,1400,13,58,"""George Bush Intercontinental""",29.984433,-95.341442,97,"""America/Chicago""","""1990""","""BOEING""","""757-222""",2,178,"""Turbo-jet""","""United Air Lines Inc."""
2013,5,28,853,-7,1111,-15,"""UA""","""N12109""",1643,"""EWR""","""DEN""",232,1605,8,53,"""Denver Intl""",39.861656,-104.673178,5431,"""America/Denver""","""1994""","""BOEING""","""757-224""",2,178,"""Turbo-jet""","""United Air Lines Inc."""
