## DuckDB

- [DuckDB-Web](https://duckdb.org/)
- [GitHub](https://github.com/duckdb/duckdb)
- [Getting started: Python example script](https://github.com/duckdb/duckdb/blob/master/examples/python/duckdb-python.py)

- [The Guide to Data Analysis with DuckDB](https://www.analyticsvidhya.com/blog/2021/12/the-guide-to-data-analysis-with-duckdb/)

```
$ pip install duckdb
```

In [1]:
import duckdb
import pandas as pd

In [2]:
# basic SQL API

# connect to a database: with no argument, DuckDB uses an in-memory temporary database;
# when a file name is given (as here), the database file is created (if new) and a connection is established
conn = duckdb.connect("test.duckdb")

# if you want, you can create a cursor() as described in PEP 249, but it's fully redundant
cursor = conn.cursor()

### base table & dataframe

#### Create

In [4]:
# run arbitrary SQL commands
conn.execute("CREATE TABLE if not exists test_table (i INTEGER, j STRING)")

<duckdb.DuckDBPyConnection at 0x1c7e6ce07b0>

In [5]:
# add some data
conn.execute("INSERT INTO test_table VALUES (1, 'one')")

# we can use placeholders for parameters
conn.execute("INSERT INTO test_table VALUES (?, ?)", [2, 'two'])

# we can provide multiple sets of parameters to executemany()
conn.executemany("INSERT INTO test_table VALUES (?, ?)", [[3, 'three'], [4, 'four']])

<duckdb.DuckDBPyConnection at 0x24436fbb330>

In [5]:
# fetch as pandas data frame
df = conn.execute("SELECT * FROM test_table").fetchdf()
df

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,6,


In [6]:
# fetch as a dict of numpy arrays keyed by column name (masked where a column contains NULLs), cleaner when handling NULLs
res = conn.execute("SELECT * FROM test_table").fetchnumpy()
res

{'i': array([1, 2, 3, 4, 6]),
 'j': array(['one', 'two', 'three', 'four', ''], dtype=object)}

#### Insert

In [7]:
conn.executemany("INSERT INTO test_table VALUES (?, ?)", [[5, None], [6, '']])

# fetch as dataframe
df = conn.execute("SELECT * FROM test_table").fetchdf()
df

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,6,
5,5,
6,6,


In [8]:
# fetch as a dict of numpy arrays keyed by column name (masked where a column contains NULLs), cleaner when handling NULLs
res = conn.execute("SELECT * FROM test_table").fetchnumpy()
res

{'i': array([1, 2, 3, 4, 6, 5, 6]),
 'j': masked_array(data=['one', 'two', 'three', 'four', '', --, ''],
              mask=[False, False, False, False, False,  True, False],
        fill_value='?',
             dtype=object)}

#### Update

In [9]:
test_df_3 = conn.execute("update test_table set j = 'NULL' where i=5;").fetchdf()
test_df_3

Unnamed: 0,Count
0,1


In [10]:
test_df_3a = conn.execute("SELECT * from test_table;").fetchdf()
test_df_3a

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,6,
5,5,
6,6,


#### Delete

In [11]:
test_df_4 = conn.execute("delete from test_table where i=5;").fetchdf()
test_df_4

Unnamed: 0,Count
0,1


In [12]:
test_df_4a = conn.execute("SELECT * from test_table;").fetchdf()
test_df_4a

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,6,
5,6,


In [13]:
test_df_5 = conn.execute("SELECT * from test_table where j = '';").fetchdf()
test_df_5

Unnamed: 0,i,j
0,6,
1,6,


#### Support for Common Table Expressions

https://github.com/duckdb/duckdb/pull/404

In [19]:
sql_stmt = """
--with recursive t as (select 42 as x union all select 42) select * from t;
with recursive t as (select 1 as x union all select x+1 from t where x < 5) select min(a1.x), max(a2.x) from t a1, t a2;
"""

test_df_cte = conn.execute(sql_stmt).fetchdf()
test_df_cte

Unnamed: 0,min(a1.x),max(a2.x)
0,1,5


### dataframe & view

In [14]:
# we can query pandas data frames as if they were SQL views
# create a sample pandas data frame

test_df = pd.DataFrame.from_dict({"i":[1, 2, 3, 4,5,6], "j":["one", "two", "three", "four", None, ""]})
test_df

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,5,
5,6,


In [15]:
# make this data frame available as a view in duckdb
conn.register("test_df", test_df)
test_df_2 = conn.execute("SELECT * from test_df;").fetchdf()
test_df_2

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,5,
5,6,


A DataFrame registered as a view cannot be updated; only base tables can:

In [13]:
test_df_3 = conn.execute("update test_df set j = 'NULL' where i=5;").fetchdf()
test_df_3

BinderException: Binder Error: Can only update base table!

### Relation API

In [16]:
# relation API, programmatic querying. relations are lazily evaluated chains of relational operators

# create a "relation" from a pandas data frame with an existing connection
rel = conn.from_df(test_df)
print(rel)

---------------------
--- Relation Tree ---
---------------------
pandas_scan(0x2cfb9088df0)

---------------------
-- Result Columns  --
---------------------
- i (BIGINT)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
BIGINT	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [17]:
# alternative shorthand, use a built-in default connection to create a relation from a pandas data frame
rel = duckdb.df(test_df)
print(rel)

---------------------
--- Relation Tree ---
---------------------
pandas_scan(0x2cfb9088a60)

---------------------
-- Result Columns  --
---------------------
- i (BIGINT)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
BIGINT	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [18]:
# create a relation from a CSV file

# first create a CSV file from our pandas example 
import tempfile, os
temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))
test_df.to_csv(temp_file_name, index=False)

In [20]:
temp_file_name

'C:\\Users\\w_gon\\AppData\\Local\\Temp\\tmpprkfi2ih\\pr4pgac8'

<img src=tmpcsv.png>

In [21]:
# now create a relation from it
rel = duckdb.from_csv_auto(temp_file_name)
print(rel)

---------------------
--- Relation Tree ---
---------------------
read_csv_auto(C:\Users\w_gon\AppData\Local\Temp\tmpprkfi2ih\pr4pgac8)

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	NULL





In [22]:
# create a relation from an existing table
rel = conn.table("test_table")
print(rel)

---------------------
--- Relation Tree ---
---------------------
Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [25]:
# a relation has an alias (like a table name)
print(rel.alias)

test_table


In [26]:
# we can change the alias, useful for (self)joins for example
rel2 = rel.set_alias('new_alias')
print(rel2.alias)

new_alias


In [27]:
# we can inspect the type of a relation
print(rel.type)

TABLE_RELATION


In [28]:
# or the column names that are in it
print(rel.columns)

['i', 'j']


In [29]:
# or the types of those columns
print(rel.types)

['INTEGER', 'VARCHAR']


In [30]:
# now we can apply some operators to the relation
# filter the relation
print(rel.filter('i > 1'))

---------------------
--- Relation Tree ---
---------------------
Filter [(i > 1)]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 5]
2	two
3	three
4	four
5	NULL
6	





In [31]:
rel.filter("j is not null")

---------------------
--- Relation Tree ---
---------------------
Filter [(j IS NOT NULL)]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 5]
1	one
2	two
3	three
4	four
6	



In [32]:
# project the relation, get some columns
print(rel.project('i, j'))

---------------------
--- Relation Tree ---
---------------------
Projection [i as , j as ]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [33]:
# or transform them
print(rel.project('i + 1'))

---------------------
--- Relation Tree ---
---------------------
Projection [(i + 1) as ]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- (i + 1) (INTEGER)

---------------------
-- Result Preview  --
---------------------
(i + 1)	
INTEGER	
[ Rows: 6]
2
3
4
5
6
7





In [34]:
# order the relation
print(rel.order('j'))

---------------------
--- Relation Tree ---
---------------------
Order [j DESC]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
5	NULL
6	
4	four
1	one
3	three
2	two





In [35]:
# limit the rows returned
print(rel.limit(2))

---------------------
--- Relation Tree ---
---------------------
Limit 2
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 2]
1	one
2	two





In [36]:
# skip the first row and limit the number of results
print(rel.limit(2, offset=1))

---------------------
--- Relation Tree ---
---------------------
Limit 2 Offset 1
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 2]
2	two
3	three





In [39]:
# of course these things can be chained
print(rel.filter('i > 1').project('i + 1, j').order('j').limit(2))

---------------------
--- Relation Tree ---
---------------------
Limit 2
  Order [j DESC]
    Projection [(i + 1) as , j as ]
      Filter [(i > 1)]
        Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- (i + 1) (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
(i + 1)	j	
INTEGER	VARCHAR	
[ Rows: 2]
6	NULL
7	





In [40]:
# aggregate the relation
print(rel.aggregate("sum(i)"))

---------------------
--- Relation Tree ---
---------------------
Aggregate [sum(i)]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- sum(i) (HUGEINT)

---------------------
-- Result Preview  --
---------------------
sum(i)	
HUGEINT	
[ Rows: 1]
21





In [41]:
# non-aggregated columns create implicit grouping
print(rel.aggregate("j, sum(i)"))

---------------------
--- Relation Tree ---
---------------------
Aggregate [j, sum(i)]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- j (VARCHAR)
- sum(i) (HUGEINT)

---------------------
-- Result Preview  --
---------------------
j	sum(i)	
VARCHAR	HUGEINT	
[ Rows: 6]
one	1
two	2
three	3
four	4
NULL	5
	6





In [42]:
# we can also explicitly group the relation before aggregating
print(rel.aggregate("sum(i)", "j"))

---------------------
--- Relation Tree ---
---------------------
Aggregate [sum(i)]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- sum(i) (HUGEINT)

---------------------
-- Result Preview  --
---------------------
sum(i)	
HUGEINT	
[ Rows: 6]
1
2
3
4
5
6





In [43]:
# distinct values
print(rel.distinct())

---------------------
--- Relation Tree ---
---------------------
Distinct
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [44]:
# multi-relation operators are also supported, e.g. union
print(rel.union(rel))

---------------------
--- Relation Tree ---
---------------------
Union
  Scan Table [test_table]  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 10]
1	one
2	two
3	three
4	four
5	NULL
6	
1	one
2	two
3	three
4	four





In [45]:
# join rel with rel2 (a relation built from test_df) on the shared column i
rel2 = conn.from_df(test_df)
print(rel.join(rel2, 'i'))

---------------------
--- Relation Tree ---
---------------------
Join INNER
  Scan Table [test_table]
  pandas_scan(0x2cfb90b0f70)

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	j	
INTEGER	VARCHAR	VARCHAR	
[ Rows: 6]
1	one	one
2	two	two
3	three	three
4	four	four
5	NULL	NULL
6		





In [46]:
# for explicit join conditions the relations can be named using set_alias()
print(rel.set_alias('a').join(rel.set_alias('b'), 'a.i=b.i'))

---------------------
--- Relation Tree ---
---------------------
Join INNER (a.i = b.i)
  Scan Table [test_table]
  Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	i	j	
INTEGER	VARCHAR	INTEGER	VARCHAR	
[ Rows: 6]
1	one	1	one
2	two	2	two
3	three	3	three
4	four	4	four
5	NULL	5	NULL
6		6	





In [49]:
# there are also shorthand methods to directly create a relation and apply an operator from pandas data frame objects
print(duckdb.filter(test_df, 'i > 1'))

---------------------
--- Relation Tree ---
---------------------
Filter [(i > 1)]
  pandas_scan(0x2cfb90b0b20)

---------------------
-- Result Columns  --
---------------------
- i (BIGINT)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
BIGINT	VARCHAR	
[ Rows: 5]
2	two
3	three
4	four
5	NULL
6	





In [50]:
print(duckdb.project(test_df, 'i +1'))
print(duckdb.order(test_df, 'j'))
print(duckdb.limit(test_df, 2))

print(duckdb.aggregate(test_df, "sum(i)"))
print(duckdb.distinct(test_df))

# when chaining only the first call needs to include the data frame parameter
print(duckdb.filter(test_df, 'i > 1').project('i + 1,j').order('j').limit(2))

# turn the relation into something else again

---------------------
--- Relation Tree ---
---------------------
Projection [(i + 1) as ]
  pandas_scan(0x2cfb90dd700)

---------------------
-- Result Columns  --
---------------------
- (i + 1) (BIGINT)

---------------------
-- Result Preview  --
---------------------
(i + 1)	
BIGINT	
[ Rows: 6]
2
3
4
5
6
7



---------------------
--- Relation Tree ---
---------------------
Order [j DESC]
  pandas_scan(0x2cfb90a4fa0)

---------------------
-- Result Columns  --
---------------------
- i (BIGINT)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
BIGINT	VARCHAR	
[ Rows: 6]
5	NULL
6	
4	four
1	one
3	three
2	two



---------------------
--- Relation Tree ---
---------------------
Limit 2
  pandas_scan(0x2cfb097d490)

---------------------
-- Result Columns  --
---------------------
- i (BIGINT)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
BIGINT	VARCHAR	
[ Rows: 2]
1	one
2	two



---------------------
--- Rela

In [51]:
# compute the query result from the relation 
res = rel.execute()
print(res)
# res is a query result, you can call fetchdf() or fetchnumpy() or fetchone() on it
print(res.fetchone())
print(res.fetchall())

# convert a relation back to a pandas data frame
print(rel.to_df())

# df() is shorthand for to_df() on relations
print(rel.df())

<duckdb.DuckDBPyResult object at 0x000002CFB90AEBB0>
(1, 'one')
[(2, 'two'), (3, 'three'), (4, 'four'), (5, None), (6, '')]
   i      j
0  1    one
1  2    two
2  3  three
3  4   four
4  5    NaN
5  6       
   i      j
0  1    one
1  2    two
2  3  three
3  4   four
4  5    NaN
5  6       


In [52]:
# create a table in duckdb from the relation
print(rel.create("test_table2"))

None


In [53]:
# insert the relation's data into an existing table
conn.execute("CREATE TABLE test_table3 (i INTEGER, j STRING)")
print(rel.insert_into("test_table3"))

None


In [54]:
# insert additional rows into test_table3
print(conn.values([5, 'five']).insert_into("test_table3"))
rel_3 = conn.table("test_table3")
rel_3.insert([6,'six'])

# create a SQL-accessible view of the relation
print(rel.create_view('test_view'))


# we can also directly run SQL queries on relation objects without explicitly creating a view
# the first parameter gives the rel object a view name so we can refer to it in queries
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel')
print(res)

None
---------------------
--- Relation Tree ---
---------------------
Scan Table [test_table]

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	



---------------------
--- Relation Tree ---
---------------------
Subquery

---------------------
-- Result Columns  --
---------------------
- i (INTEGER)
- j (VARCHAR)

---------------------
-- Result Preview  --
---------------------
i	j	
INTEGER	VARCHAR	
[ Rows: 6]
1	one
2	two
3	three
4	four
5	NULL
6	





In [56]:
# res is a query result, we can fetch with the methods described above, e.g.
print(res.fetchone())
print(res.fetchall())
# or just use df(), a shorthand for fetchdf() on query results
print(res.df())

(1, 'one')
[(1, 'one'), (2, 'two'), (3, 'three'), (4, 'four'), (5, None), (6, '')]
   i      j
0  1    one
1  2    two
2  3  three
3  4   four
4  5    NaN
5  6       


In [57]:
test_df

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three
3,4,four
4,5,
5,6,


In [60]:
conn.close()