# In [3]:
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 0. CONFIG
CSV_IN        = "dataset.csv"   # file the raw data is read from
CSV_OUT       = "dataset.csv"   # NOTE(review): same path as CSV_IN, so the
                                # input file is overwritten - confirm intended
N_FOLDS       = 5               # number of K-fold splits used to pick H
H_GRID_POINTS = 50              # number of candidate thresholds H to evaluate
EPS           = 1e-6            # added to the denominator of the relative difference

# 1. LOAD & PREPARE TIMESTAMPS
df = pd.read_csv(CSV_IN)
df["OccupancyDateTime"] = pd.to_datetime(
    df["OccupancyDateTime"], format="%Y-%m-%d %H:%M:%S"
)

# Midnight-normalised timestamps, reused for the holiday range and the lookup.
occupancy_day = df["OccupancyDateTime"].dt.normalize()

# weekday is 0 (Monday) .. 6 (Sunday), so Saturday/Sunday are the values >= 5.
df["is_weekend"] = (df["OccupancyDateTime"].dt.weekday >= 5).astype(int)

# US federal holidays falling inside the date range covered by the data.
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=occupancy_day.min(), end=occupancy_day.max())
df["is_holiday"] = occupancy_day.isin(holidays).astype(int)

# 2. BASE FEATURES & TARGET
FEATS = [
    "is_weekend",
    "is_holiday",
    "Max Temp",
    "Min Temp",
    "Avg Temp",
    "Precipitation",
]
TARGET = "rate"

X_base = df[FEATS].values
y      = df[TARGET].values

# 3. BASELINE MODEL
# Fitted on every row; the predictions stored in "d" below are therefore
# in-sample predictions for the same rows the model was trained on.
base_params = {
    "objective": "reg:squarederror",
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "tree_method": "hist",
}
base = XGBRegressor(**base_params)
base.fit(X_base, y)

df["d"] = base.predict(X_base)

# 4. RELATIVE DIFFERENCE k
# k = (prediction - target) / (target + EPS); EPS keeps the denominator
# away from zero when the target itself is zero.
residual = df["d"] - df[TARGET]
df["k"] = residual / (df[TARGET] + EPS)

# 5. PICK THRESHOLD H VIA CV
# Shuffled K-fold with a fixed seed, shared by every threshold evaluation.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Per-fold baseline results as (train_idx, test_idx, k_train, k_test).
# They do not depend on H, so they are computed on the first cv_mse call and
# reused afterwards instead of re-fitting the baseline for every threshold.
# Re-running this block resets the cache.
_FOLD_CACHE: list = []


def _new_cv_model():
    """Return an unfitted XGBRegressor with the hyper-parameters used in CV."""
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
    )


def _fold_residuals() -> list:
    """Fit the baseline once per fold and cache its relative differences k."""
    if not _FOLD_CACHE:
        computed = []
        for tr, te in kf.split(X_base):
            Xt, Xv = X_base[tr], X_base[te]
            yt, yv = y[tr], y[te]
            # recompute d inside the fold, using training rows only for fitting
            b = _new_cv_model().fit(Xt, yt)
            k_tr = (b.predict(Xt) - yt) / (yt + EPS)
            k_te = (b.predict(Xv) - yv) / (yv + EPS)
            computed.append((tr, te, k_tr, k_te))
        # Publish only after every fold succeeded, so an exception part-way
        # through never leaves a partially filled cache behind.
        _FOLD_CACHE.extend(computed)
    return _FOLD_CACHE


def cv_mse(H: float) -> float:
    """Return the mean K-fold validation MSE of the augmented model for H.

    For each fold, the indicator feature is the fold baseline's relative
    difference k where ``|k| > H`` and 0.0 elsewhere.  A second regressor is
    trained on the base features plus that indicator and scored on the
    held-out rows.

    NOTE(review): the indicator for the held-out rows is computed from their
    true targets ``yv``, so the validation features carry label information.
    The reported MSE is therefore not an out-of-sample estimate, and smaller
    H exposes more of that information to the model.  Confirm whether this
    feature can be available at prediction time.

    Args:
        H: Threshold applied to the absolute relative difference.

    Returns:
        The average over folds of the validation mean squared error.
    """
    fold_mse = []
    for tr, te, k_tr, k_te in _fold_residuals():
        Xt, Xv = X_base[tr], X_base[te]
        yt, yv = y[tr], y[te]
        ind_tr = np.where(np.abs(k_tr) > H, k_tr, 0.0)
        ind_te = np.where(np.abs(k_te) > H, k_te, 0.0)
        aug = _new_cv_model().fit(np.hstack([Xt, ind_tr[:, None]]), yt)
        yhat = aug.predict(np.hstack([Xv, ind_te[:, None]]))
        fold_mse.append(((yv - yhat) ** 2).mean())
    return float(np.mean(fold_mse))

# Candidate thresholds: evenly spaced from 0 up to the largest |k| in the data.
# The threshold with the lowest cross-validated MSE wins (first one on ties).
abs_k   = df["k"].abs()
H_grid  = np.linspace(0, abs_k.max(), H_GRID_POINTS)
best_H  = min(H_grid, key=cv_mse)

# 6. BUILD INDICATOR FEATURE
# Keep k where it exceeds the chosen threshold in magnitude, otherwise 0.
df["indicator"] = df["k"].where(abs_k > best_H, 0.0)

# 7. Z-SCORE NORMALISATION
# Population standard deviation (ddof=0); the 1e-9 guards against a zero
# sigma when every indicator value is identical.
mu    = df["indicator"].mean()
sigma = df["indicator"].std(ddof=0)
centered = df["indicator"] - mu
df["indicator_norm"] = centered / (sigma + 1e-9)

# 8. SAVE & SUMMARY
# NOTE(review): CSV_OUT is configured to the same path as CSV_IN, so this
# write replaces the input file - confirm that is intended.
df.to_csv(CSV_OUT, index=False)

print(f"Best H                   : {best_H:.6f}")
print(f"Rows with indicator = 0  : {(df['indicator']==0).sum():,}/{len(df):,}")
print(f"Augmented data written to: {CSV_OUT}")


# Recorded output:
# Best H                   : 0.000000
# Rows with indicator = 0  : 0/15,000
# Augmented data written to: dataset.csv
