In [1]:
# ruff: noqa: F401

In [2]:
# Load the IPython extensions this notebook relies on. `pyinstrument` supplies
# the `%%pyinstrument` cell magic used by the profiling cell further down.
%load_ext autoreload
%load_ext pyinstrument

# Autoreload mode 2, so edits to imported modules are picked up live.
%autoreload 2

In [3]:
import sys

# Put the parent directory first on the module search path.
# NOTE(review): presumably so the `tsdb_benchmarks` package imported in the
# following cell resolves from the repository root — confirm the layout.
sys.path.insert(0, "..")

In [4]:
from datetime import datetime, timedelta
from pathlib import Path

import hvplot.polars
import numpy as np
import polars as pl

In [5]:
from tsdb_benchmarks.monetdb import MonetDB
from tsdb_benchmarks.monetdb.fetch import fetch_binary, fetch_pymonetdb
from tsdb_benchmarks.monetdb.insert import insert
from tsdb_benchmarks.monetdb.utils import drop_table

# Shared MonetDB handle; later cells call `db.connect()` each time they need a connection.
db = MonetDB()

In [6]:
# Build the test frame: load the benchmark parquet file and derive one column
# per dtype from `col_1`, each named `col_1_<dtype>`.
scaled_col_1 = 100 * pl.col.col_1 - 50  # shared input for every integer width
col_1_as_text = pl.col.col_1.cast(pl.String)

typed_columns = [
    *(
        scaled_col_1.cast(int_dtype).alias(f"col_1_{suffix}")
        for suffix, int_dtype in (
            ("int8", pl.Int8),
            ("int16", pl.Int16),
            ("int32", pl.Int32),
            ("int64", pl.Int64),
        )
    ),
    pl.col.col_1.cast(pl.Float32).alias("col_1_float32"),
    pl.col.col_1.cast(pl.Float64).alias("col_1_float64"),
    col_1_as_text.alias("col_1_str"),
    (pl.col.col_1 > 0.5).cast(pl.Boolean).alias("col_1_bool"),
    pl.col.col_1.cast(pl.Binary).alias("col_1_blob"),
    ('{ "val": ' + col_1_as_text + "}").alias("col_1_json"),
    # NOTE(review): these two names contain a single underscore, so the
    # two-underscore filter at the end of this cell drops them again —
    # confirm whether Time/Date were meant to be part of the test.
    pl.col.time.cast(pl.Time).alias("time_time"),
    pl.col.time.cast(pl.Date).alias("time_date"),
]
df = pl.read_parquet("../data/input/data_0.2M_0.5k.parquet").with_columns(typed_columns)

# Inject nulls: a row keeps its values only when its time is later than the
# fifth timestamp or equal to the first one; every other row is nulled in all
# non-time columns (rows 1-4 in the displayed output).
timestamps = df.get_column("time")
keep_row = (pl.col.time > timestamps[4]) | (pl.col.time == timestamps[0])
value_columns = [name for name in df.columns if name != "time"]
df = df.with_columns(
    [pl.when(keep_row).then(pl.col(name)).otherwise(None) for name in value_columns]
)

# Keep `time` plus the derived `col_1_<dtype>` columns, i.e. names with exactly
# two underscores; the original input columns are discarded here.
selected_columns = [
    name for name in df.columns if name == "time" or name.count("_") == 2
]
df = df.select(selected_columns)

df

time,col_1_int8,col_1_int16,col_1_int32,col_1_int64,col_1_float32,col_1_float64,col_1_str,col_1_bool,col_1_blob,col_1_json
datetime[ms],i8,i16,i32,i64,f32,f64,str,bool,binary,str
2024-08-15 02:41:00,2,2,2,2,0.524554,0.524554,"""0.52455395""",true,"b""0.52455395""","""{ ""val"": 0.52455395}"""
2024-08-15 02:42:00,,,,,,,,,,
2024-08-15 02:43:00,,,,,,,,,,
2024-08-15 02:44:00,,,,,,,,,,
2024-08-15 02:45:00,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…
2024-12-31 23:56:00,-35,-35,-35,-35,0.140222,0.140222,"""0.14022161""",false,"b""0.14022161""","""{ ""val"": 0.14022161}"""
2024-12-31 23:57:00,5,5,5,5,0.551554,0.551554,"""0.55155426""",true,"b""0.55155426""","""{ ""val"": 0.55155426}"""
2024-12-31 23:58:00,21,21,21,21,0.718039,0.718039,"""0.71803874""",true,"b""0.71803874""","""{ ""val"": 0.71803874}"""
2024-12-31 23:59:00,32,32,32,32,0.82746,0.82746,"""0.82746017""",true,"b""0.82746017""","""{ ""val"": 0.82746017}"""


In [7]:
# Drop the `test_insert` table through the project's `drop_table` helper, using a new connection.
# NOTE(review): presumably tolerates a missing table on a fresh database — confirm in `drop_table`.
drop_table("test_insert", db.connect())

  import pkg_resources


In [8]:
# Write `df` to the `test_insert` table with the project's `insert` helper,
# passing `time` as `primary_key` and `col_1_json` as `json_columns`.
insert(df, "test_insert", db.connect(), primary_key="time", json_columns="col_1_json")

In [9]:
%%pyinstrument

# Profile the insert path with pyinstrument. The table is dropped first and then
# `df` is written again, so both calls fall inside the profiled cell.
drop_table("test_insert", db.connect())
insert(df, "test_insert", db.connect(), primary_key="time", json_columns="col_1_json")

* JSON columns are read differently by `fetch_binary` and `fetch_pymonetdb` because of a bug in pymonetdb, so `col_1_json` is dropped before comparing the results.


In [10]:
# Both fetch paths should return the same frame for the same query.
# `col_1_json` is removed from each side before the comparison.
query = "select * from test_insert order by time"
via_pymonetdb = fetch_pymonetdb(query, db.connect()).drop("col_1_json")
via_binary = fetch_binary(query, db.connect()).drop("col_1_json")

via_pymonetdb.equals(via_binary)

Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - s.map_elements(json.loads)
with this one instead:
  + s.str.json_decode()

  return s.map_elements(json.loads, pl.Object)


True

In [11]:
# Round-trip check: the binary fetch, ordered by time, should equal the
# inserted frame sorted by time. `col_1_json` is removed from each side first.
fetched = fetch_binary("select * from test_insert order by time", db.connect()).drop("col_1_json")
expected = df.drop("col_1_json").sort("time")

fetched.equals(expected)

True