In [3]:
# Imports & Inits
import os
import polars as pl

# Root directory of all notebook inputs and outputs, taken from the environment.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    # Without this check the failure would surface later as an opaque
    # TypeError from os.path.join(None, ...).
    raise RuntimeError(
        "The DATA_DIRECTORY environment variable is not set; "
        "point it at the directory that contains the 'download' folder."
    )

# Lazy handle on the full raw export; data is only read once a query is collected.
df = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet"))

In [None]:
# Split the 2023 rows of the raw export into one parquet file per month.
year = 2023

# Make sure the per-year output directory exists so the write below does not
# depend on it having been created by hand beforehand.
output_directory = os.path.join(DATA_DIRECTORY, "download", "measurements", str(year))
os.makedirs(output_directory, exist_ok=True)

for month in range(6, 13):  # June through December
    (
        df.filter(
            (pl.col("creation_timestamp").dt.month() == month)
            & (pl.col("creation_timestamp").dt.year() == year)
        )
        # system_id is the first run of digits found in system_name, as an integer.
        .with_columns(
            pl.col("system_name").str.extract(r'(\d+)', 1).str.to_integer().alias("system_id")
        )
        .sort("creation_timestamp")
        .collect()
        .write_parquet(os.path.join(output_directory, f"{year}_{month}_acropolis.parquet"))
    )

In [None]:
# Split the 2024 rows of the raw export into one parquet file per month.
year = 2024

# Make sure the per-year output directory exists so the write below does not
# depend on it having been created by hand beforehand.
output_directory = os.path.join(DATA_DIRECTORY, "download", "measurements", str(year))
os.makedirs(output_directory, exist_ok=True)

# NOTE(review): range(1, 12) covers January through November only; December is
# not written. Left unchanged because the export may simply not contain
# December data -- confirm, and use range(1, 13) if it should be included.
for month in range(1, 12):
    (
        df.filter(
            (pl.col("creation_timestamp").dt.month() == month)
            & (pl.col("creation_timestamp").dt.year() == year)
        )
        # system_id is the first run of digits found in system_name, as an integer.
        .with_columns(
            pl.col("system_name").str.extract(r'(\d+)', 1).str.to_integer().alias("system_id")
        )
        .sort("creation_timestamp")
        .collect()
        .write_parquet(os.path.join(output_directory, f"{year}_{month}_acropolis.parquet"))
    )

In [None]:
import glob

# All monthly measurement files, across every year sub-directory.
measurements = glob.glob(os.path.join(DATA_DIRECTORY, "download", "measurements", "*", "*.parquet"))
if not measurements:
    # sorted([])[-1] would raise a bare IndexError; say what is actually missing.
    raise FileNotFoundError(
        "No monthly measurement files found under "
        + os.path.join(DATA_DIRECTORY, "download", "measurements")
    )

# "Latest" means most recently modified on disk, not latest by file name.
latest_acropolis_file = sorted(measurements, key=os.path.getmtime)[-1]

# Lazy handles on the latest monthly file and on the sensor metadata table.
acropolis = pl.scan_parquet(latest_acropolis_file)
sensors = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "metadata", "sensors.parquet"))

In [None]:
# Most recently modified monthly measurement file (ordering is by mtime).
latest_acropolis_file = sorted(
    glob.glob(os.path.join(DATA_DIRECTORY, "download", "measurements", "*", "*.parquet")),
    key=os.path.getmtime,
)[-1]

# Show the last creation_timestamp in that file as a one-element tuple.
(
    pl.scan_parquet(latest_acropolis_file)
    .sort("creation_timestamp")
    .select("creation_timestamp")
    .last()
    .collect()
    .row(0)
)

In [None]:
# Every parquet file in the download/chunks directory.
paths = glob.glob(os.path.join(DATA_DIRECTORY, "download", "chunks", "*.parquet"))

# One lazy frame per chunk: the sensor name is looked up via the sensor
# identifier and exposed as system_name, the identifier column is dropped, and
# creation_timestamp is cast to microsecond precision. The latest monthly file
# (acropolis) goes first in the list.
pivots = [acropolis] + [
    pl.scan_parquet(path)
    .join(
        sensors.select("identifier", "name"),
        how="left",
        left_on="sensor_identifier",
        right_on="identifier",
    )
    .drop("sensor_identifier")
    .rename({"name": "system_name"})
    .with_columns(pl.col("creation_timestamp").dt.cast_time_unit("us"))
    for path in paths
]

# Diagonal concat: frames are stacked by column name and columns missing from a
# frame are filled with nulls.
result = pl.concat(pivots, how="diagonal").collect()

In [None]:
# Distinct calendar months and years that occur in the merged result.
months = result["creation_timestamp"].dt.month().unique().to_list()

years = result["creation_timestamp"].dt.year().unique().to_list()

In [None]:
# Write the merged result back as one parquet file per (year, month).
#
# `years` and `months` are collected independently of each other, so their cross
# product can contain pairs with no rows (for example when the data spans a
# year boundary). Such pairs are skipped; writing them would replace an existing
# monthly file with an empty one. Pairs are processed in chronological order so
# that the newest month is written last -- other cells select the "latest" file
# by modification time.
for year in sorted(y for y in years if y is not None):
    output_directory = os.path.join(DATA_DIRECTORY, "download", "measurements", str(year))
    for month in sorted(m for m in months if m is not None):
        monthly = (
            result.filter(
                (pl.col("creation_timestamp").dt.month() == month)
                & (pl.col("creation_timestamp").dt.year() == year)
            )
            # system_id is the first run of digits found in system_name, as an integer.
            .with_columns(
                pl.col("system_name").str.extract(r'(\d+)', 1).str.to_integer().alias("system_id")
            )
            .sort("creation_timestamp")
        )
        if monthly.height == 0:
            # No data for this (year, month) pair: leave any existing file untouched.
            continue
        # Create the per-year directory on demand (e.g. first data of a new year).
        os.makedirs(output_directory, exist_ok=True)
        monthly.write_parquet(os.path.join(output_directory, f"{year}_{month}_acropolis.parquet"))

In [None]:
# Preview the first five rows of the latest monthly file.
acropolis.head(5).collect()

In [5]:
import glob
year = 2024

# Monthly files for `year`, ordered by modification time (oldest first).
paths = sorted(
    glob.glob(os.path.join(DATA_DIRECTORY, "download", "measurements", str(year), "*.parquet")),
    key=os.path.getmtime,
)

# One lazy frame per monthly file.
measurement_months = [pl.scan_parquet(path) for path in paths]

# Stack all months by column name and materialise the result in memory.
measurements = pl.concat(measurement_months, how="diagonal").collect()