In [1]:
import pandas as pd

FILES = {
    "kiwo": "kiwo.csv",
    "ferien": "Ferien_SH.csv",
    "feiertage": "Feiertage_holidays_sh_2013_2019.csv",
}


In [2]:
def detect_date_col(df):
    for c in ["Datum", "date", "Date", "DATE"]:
        if c in df.columns:
            return c
    return df.columns[0]  # fallback

def detect_indicator_col(df, date_col, key):
    preferred = {
        "kiwo": ["KielerWoche", "kiwo"],
        "ferien": ["Ferien", "school_holiday", "Schulferien"],
        "feiertage": ["Feiertag", "holiday", "public_holiday", "is_holiday"],
    }
    for c in preferred.get(key, []):
        if c in df.columns:
            return c

    candidates = [c for c in df.columns if c != date_col]
    return candidates[0] if candidates else None  # dates-only case


In [4]:
def expand_and_fill(df, date_col, ind_col):
    df = df.copy()

    # parse dates
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col])

    # natural min / max of the file
    start = df[date_col].min()
    end = df[date_col].max()

    # full daily calendar
    calendar = pd.DataFrame({
        date_col: pd.date_range(start=start, end=end, freq="D")
    })

    # dates-only files (e.g. Ferien_SH)
    if ind_col is None:
        df["_indicator_"] = 1
        ind_col = "_indicator_"
    else:
        df[ind_col] = pd.to_numeric(df[ind_col], errors="coerce")
        if df[ind_col].isna().all():
            df[ind_col] = 1

    # collapse duplicates → any 1 wins
    df_daily = (
        df[[date_col, ind_col]]
        .groupby(date_col, as_index=False)[ind_col]
        .max()
    )

    # merge and fill missing dates with 0
    out = calendar.merge(df_daily, on=date_col, how="left")
    out[ind_col] = out[ind_col].fillna(0).astype(int)

    # standardize date column name
    out = out.rename(columns={date_col: "date"})

    return out, start, end


In [5]:
for key, path in FILES.items():
    print(f"\n================ {key.upper()} ================")

    df = pd.read_csv(path)

    date_col = detect_date_col(df)
    ind_col = detect_indicator_col(df, date_col, key)

    df_clean, start, end = expand_and_fill(df, date_col, ind_col)

    # rename indicator nicely
    if key == "kiwo":
        df_clean = df_clean.rename(columns={df_clean.columns[1]: "KielerWoche"})
        indicator = "KielerWoche"
    elif key == "ferien":
        df_clean = df_clean.rename(columns={df_clean.columns[1]: "school_holiday"})
        indicator = "school_holiday"
    else:
        df_clean = df_clean.rename(columns={df_clean.columns[1]: "public_holiday"})
        indicator = "public_holiday"

    # reporting
    expected = len(pd.date_range(start=start, end=end, freq="D"))
    actual = len(df_clean)

    print(f"Date range: {start.date()} → {end.date()}")
    print(f"Expected rows (full calendar): {expected}")
    print(f"Actual rows: {actual}")
    print(f"Missing rows added: {expected - df[df[date_col].notna()].shape[0]}")
    print(f"NaNs remaining: {df_clean.isna().sum().sum()}")
    print(f"Value counts:\n{df_clean[indicator].value_counts()}")

    # save
    out_name = f"{key}_expanded.csv"
    df_clean.to_csv(out_name, index=False)
    print(f"Saved: {out_name}")



Date range: 2012-06-16 → 2019-06-30
Expected rows (full calendar): 2571
Actual rows: 2571
Missing rows added: 2499
NaNs remaining: 0
Value counts:
KielerWoche
0    2499
1      72
Name: count, dtype: int64
Saved: kiwo_expanded.csv

Date range: 2013-01-31 → 2019-12-31
Expected rows (full calendar): 2526
Actual rows: 2526
Missing rows added: 1876
NaNs remaining: 0
Value counts:
school_holiday
0    1877
1     649
Name: count, dtype: int64
Saved: ferien_expanded.csv

Date range: 2013-01-01 → 2019-12-31
Expected rows (full calendar): 2556
Actual rows: 2556
Missing rows added: 0
NaNs remaining: 0
Value counts:
public_holiday
0    2490
1      66
Name: count, dtype: int64
Saved: feiertage_expanded.csv


  df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
