# Baseline model
## Introduction
This notebook contains the baseline model that we need to outperform. The baseline model is a weighted-average (seasonal) time series forecast for the next 4 quarters.

In [40]:
import sqlite3
import polars as pl
import polars.selectors as cs
import os

# Load the quarterly absenteeism figures from the silver SQLite database into
# a Polars DataFrame named `df_org`.

# 1. Manually define the path to your local clone of the git repo to make sure it's correct
# Using a raw string (r"") handles the backslashes correctly on Windows
# NOTE(review): this absolute path is machine-specific; adjust it to your own clone.
db_path = r"C:\Git\eaisi-uwv\data\2_silver\silver_data.db"

# 2. Define the query up front so it can be inspected even when the database is missing.
# `Perioden` values look like '2016KW01': characters 1-4 are the year and
# characters 7-8 the quarter number. Rows whose period contains 'JJ' are excluded,
# as are quarters starting before 2016-01-01.
query = """
    SELECT Perioden AS Timeperiod_text,
           -- Extract Year and Map Quarter to the first month of that quarter
           printf('%s-%s-01',
               substr(Perioden, 1, 4),
               CASE substr(Perioden, 7, 2)
                   WHEN '01' THEN '01' -- Q1 -> January
                   WHEN '02' THEN '04' -- Q2 -> April
                   WHEN '03' THEN '07' -- Q3 -> July
                   WHEN '04' THEN '10' -- Q4 -> October
               END
           ) AS Period_startdate,
           -- Last day of the quarter: first day + 3 months - 1 day
           DATE(
               printf('%s-%s-01',
                   substr(Perioden, 1, 4),
                   CASE substr(Perioden, 7, 2)
                       WHEN '01' THEN '01'
                       WHEN '02' THEN '04'
                       WHEN '03' THEN '07'
                       WHEN '04' THEN '10'
                   END
               ),
               '+3 months',
               '-1 day'
           ) AS Period_enddate,
           CAST(Ziekteverzuimpercentage_1 AS Decimal) AS Absenteeism_perc,
           BedrijfskenmerkenSBI2008_CategoryGroupID AS SBI_code
    FROM "80072ned_silver"
    WHERE Perioden NOT LIKE '%JJ%'
      AND Period_startdate >= '2016-01-01'
"""

# 3. Check if the file actually exists before trying to open it
if not os.path.exists(db_path):
    print(f"❌ Error: File not found at {db_path}")
    print("Check if you are currently synced to the correct branch in Git!")
else:
    conn = None
    try:
        # 4. Open the database through a SQLite URI; `mode=ro` makes the
        # connection read-only to avoid any locking issues.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)

        # 5. Load into Polars
        df_org = pl.read_database(query=query, connection=conn)

        # Report on the frame that was just loaded (`df_org`), not on a
        # leftover variable from an earlier kernel session.
        print(f"✅ Success! Loaded {len(df_org)} rows.")
        print(df_org.head())

    except Exception as e:
        print(f"❌ Connection error: {e}")
    finally:
        # Always release the connection, even when connecting or querying failed.
        if conn is not None:
            conn.close()

✅ Success! Loaded 4641 rows.
shape: (5, 5)
┌─────────────────┬──────────────────┬────────────────┬──────────────────┬──────────┐
│ Timeperiod_text ┆ Period_startdate ┆ Period_enddate ┆ Absenteeism_perc ┆ SBI_code │
│ ---             ┆ ---              ┆ ---            ┆ ---              ┆ ---      │
│ str             ┆ str              ┆ str            ┆ f64              ┆ str      │
╞═════════════════╪══════════════════╪════════════════╪══════════════════╪══════════╡
│ 2016KW01        ┆ 2016-01-01       ┆ 2016-03-31     ┆ 4.3              ┆ 1        │
│ 2016KW02        ┆ 2016-04-01       ┆ 2016-06-30     ┆ 3.8              ┆ 1        │
│ 2016KW03        ┆ 2016-07-01       ┆ 2016-09-30     ┆ 3.5              ┆ 1        │
│ 2016KW04        ┆ 2016-10-01       ┆ 2016-12-31     ┆ 4.1              ┆ 1        │
│ 2017KW01        ┆ 2017-01-01       ┆ 2017-03-31     ┆ 4.3              ┆ 1        │
└─────────────────┴──────────────────┴────────────────┴──────────────────┴──────────┘


In [None]:
# Parse every column whose name ends in "date" (picked with a polars selector)
# from ISO "YYYY-MM-DD" text into a proper Date column; other columns pass through.
df_modified = df_org.with_columns(
    cs.ends_with("date").str.strptime(pl.Date, format="%Y-%m-%d")
)
# Show the first five rows to check the result.
print(df_modified.head(5))

shape: (5, 5)
┌─────────────────┬──────────────────┬────────────────┬──────────────────┬──────────┐
│ Timeperiod_text ┆ Period_startdate ┆ Period_enddate ┆ Absenteeism_perc ┆ SBI_code │
│ ---             ┆ ---              ┆ ---            ┆ ---              ┆ ---      │
│ str             ┆ str              ┆ str            ┆ f64              ┆ str      │
╞═════════════════╪══════════════════╪════════════════╪══════════════════╪══════════╡
│ 2016KW01        ┆ 2016-01-01       ┆ 2016-03-31     ┆ 4.3              ┆ 1        │
│ 2016KW02        ┆ 2016-04-01       ┆ 2016-06-30     ┆ 3.8              ┆ 1        │
│ 2016KW03        ┆ 2016-07-01       ┆ 2016-09-30     ┆ 3.5              ┆ 1        │
│ 2016KW04        ┆ 2016-10-01       ┆ 2016-12-31     ┆ 4.1              ┆ 1        │
│ 2017KW01        ┆ 2017-01-01       ┆ 2017-03-31     ┆ 4.3              ┆ 1        │
└─────────────────┴──────────────────┴────────────────┴──────────────────┴──────────┘
