# Baseline model
## Introduction
This notebook contains the baseline model that we need to outperform. The baseline is a seasonal, weighted-average time-series forecast for the next four quarters.

In [None]:
import polars as pl
import polars.selectors as cs
from sqlalchemy import create_engine
from pathlib import Path
from config import DIR_DB_SILVER

# --- Database Connection ---
# The notebook can be started from the project root or from the 'code'
# directory. When the configured path is missing, retry one level up and
# fail loudly if the database is not there either.
if not DIR_DB_SILVER.exists():
    if not (Path("..") / DIR_DB_SILVER).exists():
        raise FileNotFoundError(f"❌ Database not found at {DIR_DB_SILVER}. Ensure you are in the project root.")
    DIR_DB_SILVER = Path("..") / DIR_DB_SILVER

# SQLAlchemy engine pointing at the (possibly re-resolved) Silver SQLite file
engine = create_engine(f"sqlite:///{DIR_DB_SILVER}")

# SQL query to extract and format absenteeism data.
#
# `Perioden` is decoded positionally: characters 1-4 are the year and
# characters 7-8 are a quarter number ('01'..'04'), which is mapped to the
# first month of that quarter (presumably codes such as '2016KW01' -- confirm
# against the Silver table). Rows whose period contains 'JJ' are excluded, as
# are periods that start before 2016-01-01.
#
# The quarter start date is computed once in a CTE so that the end date and
# the date filter reuse it instead of repeating the CASE expression, and so
# the filter no longer relies on SQLite resolving a SELECT alias in WHERE.
# Output columns (names and order) are unchanged:
#   Timeperiod_text, Period_startdate, Period_enddate, Absenteeism_perc, SBI_code
query = """
WITH quarterly AS (
    SELECT
        Perioden,
        printf('%s-%s-01',
            substr(Perioden, 1, 4),
            CASE substr(Perioden, 7, 2)
                WHEN '01' THEN '01'
                WHEN '02' THEN '04'
                WHEN '03' THEN '07'
                WHEN '04' THEN '10'
            END
        ) AS Period_startdate,
        Ziekteverzuimpercentage_1,
        BedrijfskenmerkenSBI2008_CategoryGroupID
    FROM "80072ned_silver"
    WHERE Perioden NOT LIKE '%JJ%'
)
SELECT
    Perioden AS Timeperiod_text,
    Period_startdate,
    DATE(Period_startdate, '+3 months', '-1 day') AS Period_enddate,
    CAST(Ziekteverzuimpercentage_1 AS REAL) AS Absenteeism_perc,
    BedrijfskenmerkenSBI2008_CategoryGroupID AS SBI_code
FROM quarterly
WHERE Period_startdate >= '2016-01-01'
"""

# Run the query over a short-lived connection and collect the result as a
# Polars DataFrame; the connection is closed when the `with` block exits.
with engine.connect() as conn:
    df_org = pl.read_database(query, conn)

# Report how many rows came back, then preview the first few
print(f"✅ Success! Loaded {df_org.height} rows.")
df_org.head()

In [None]:
# Parse every column whose name ends in 'date' (ISO 'YYYY-MM-DD' text) into
# a proper Date column; all other columns are passed through untouched.
df_modified = df_org.with_columns(
    cs.ends_with("date").str.to_date(format="%Y-%m-%d"),
)
df_modified.head()