In [13]:
import os
import numpy as np
import pandas as pd

# Path of the prediction-log CSV; this notebook reads it and later replaces it on disk.
LOG_CSV = "data/pred_log.csv"
# Time-zone name used when converting/localizing `tweeted_at` timestamps.
TZ = "Europe/Dublin"

In [2]:
# Raw load for inspection only: no date parsing is requested here.
df = pd.read_csv(LOG_CSV)

In [3]:
# Preview the first rows of the raw log.
df.head()

Unnamed: 0,date,yhat,tweeted_at,err_abs,actual
0,2025-08-19,15.391441,2025-08-18 20:44:14.464306+01:00,6.0,21.0
1,2025-08-20,14.0,2025-08-19 20:53:24.762552+01:00,5.0,9.0
2,2025-08-21,13.0,2025-08-20 13:35:15.607435+01:00,1.0,14.0
3,2025-08-22,15.0,2025-08-21 14:43:55.525867+01:00,2.0,13.0
4,2025-08-25,10.0,2025-08-24 15:26:13.716259+01:00,5.0,5.0


In [4]:
# Display the whole raw frame.
df

Unnamed: 0,date,yhat,tweeted_at,err_abs,actual
0,2025-08-19,15.391441,2025-08-18 20:44:14.464306+01:00,6.0,21.0
1,2025-08-20,14.0,2025-08-19 20:53:24.762552+01:00,5.0,9.0
2,2025-08-21,13.0,2025-08-20 13:35:15.607435+01:00,1.0,14.0
3,2025-08-22,15.0,2025-08-21 14:43:55.525867+01:00,2.0,13.0
4,2025-08-25,10.0,2025-08-24 15:26:13.716259+01:00,5.0,5.0
5,2025-08-26,15.0,2025-08-25 15:26:13.716259+01:00,7.0,22.0
6,2025-08-27,14.0,2025-08-26 15:26:13.716259+01:00,2.0,16.0
7,2025-08-28,12.0,2025-08-27 15:26:13.716259+01:00,1.0,13.0
8,2025-08-29,16.0,2025-08-28 15:26:13.716259+01:00,4.0,20.0
9,2025-09-01,11.0,2025-08-31 15:26:13.716259+01:00,2.0,9.0


In [18]:
# Observed counts, predictions and their target dates, aligned by position.
# The update cell pairs them with zip(), which silently stops at the shortest
# list, so all three must have the same length.
counts = [9, 36, 25, 22, 15, 27, 12, 12, 18, 8, 23, 16, 12]
# The final prediction (18, for 2025-09-18) was missing from this list; the
# notebook's recorded outputs show 13 predictions and yhat 18.0 for that date.
preds = [15, 14, 16, 11, 21, 19, 15, 17, 10, 17, 16, 13, 18]
dates = [
    "2025-09-02", "2025-09-03", "2025-09-04", "2025-09-05",
    "2025-09-08", "2025-09-09", "2025-09-10", "2025-09-11", "2025-09-12",
    "2025-09-15", "2025-09-16", "2025-09-17", "2025-09-18",
]
assert len(counts) == len(preds) == len(dates), "counts, preds and dates must align one-to-one"

In [19]:
# Sanity check on the number of predictions; the update cell zips preds with counts and dates.
len(preds)

13

In [20]:
# Reload the log with typed date columns. Malformed lines are still skipped, but
# "warn" reports each one: this cell later overwrites LOG_CSV, so a silently
# skipped row would otherwise be lost for good without any trace.
log = pd.read_csv(LOG_CSV, parse_dates=["date", "tweeted_at"], on_bad_lines="warn")

# Make sure the numeric columns exist so later assignments have a target.
for c in ("yhat", "err_abs", "actual"):
    if c not in log.columns:
        log[c] = np.nan

# Re-coerce both date columns: unparseable values become NaT, and `date` is
# truncated to midnight.
log["date"] = pd.to_datetime(log["date"], errors="coerce").dt.normalize()
log["tweeted_at"] = pd.to_datetime(log["tweeted_at"], errors="coerce")
# NOTE(review): Europe/Dublin changes UTC offset with DST, so stored stamps may
# eventually mix +01:00 and +00:00 -- confirm this parse still yields the
# intended dtype once that happens.

# Reference time of day: wall-clock time (in TZ) of the last non-null
# `tweeted_at` in file order, or 17:00 when the log has no stamps at all.
stamped = log["tweeted_at"].dropna()
if stamped.empty:
    ref_time = pd.Timestamp("17:00").time()
else:
    ref_ts = stamped.iloc[-1]
    # Aware stamps are converted to TZ; naive ones are taken to already be local.
    ref_local = ref_ts.tz_convert(TZ) if ref_ts.tzinfo is not None else ref_ts.tz_localize(TZ)
    ref_time = ref_local.time()

def tweeted_at_for(date_ts: pd.Timestamp, at_time=None, tz=None) -> pd.Timestamp:
    """Return the day before ``date_ts`` at a fixed wall-clock time, tz-aware.

    Args:
        date_ts: Timestamp whose calendar date is used; its time of day is ignored.
        at_time: ``datetime.time`` to stamp. Defaults to the notebook-level ``ref_time``.
        tz: Time zone to localize into. Defaults to the notebook-level ``TZ``.

    Returns:
        The previous calendar day at ``at_time``, localized to ``tz``.

    Raises:
        Whatever ``Timestamp.tz_localize`` raises if that wall-clock time is
        ambiguous or nonexistent in ``tz`` on the previous day.
    """
    wall_time = ref_time if at_time is None else at_time
    zone = TZ if tz is None else tz
    # Step back one day on the naive date *before* localizing. Subtracting a
    # Timedelta from a tz-aware stamp moves by an absolute 24 hours, which
    # shifts the wall-clock time by an hour whenever a DST change falls in between.
    previous_day = (pd.Timestamp(date_ts.date()) - pd.Timedelta(days=1)).date()
    return pd.Timestamp.combine(previous_day, wall_time).tz_localize(zone)

# zip() silently stops at the shortest input, which would drop trailing dates
# from the update without any error; fail loudly instead.
if not (len(dates) == len(counts) == len(preds)):
    raise ValueError(
        "dates/counts/preds lengths differ: "
        f"{len(dates)}/{len(counts)}/{len(preds)}"
    )

# Midnight Timestamp -> (actual count, predicted value).
records = {
    pd.to_datetime(d).normalize(): (int(a), float(y))
    for d, a, y in zip(dates, counts, preds)
}
# Date -> row position in `log` (positions equal labels for the default index
# produced by read_csv). NaT dates can never match a record, so they are skipped.
idx_by_date = {d.normalize(): i for i, d in enumerate(log["date"]) if pd.notna(d)}

# Update rows that already exist; collect unseen dates and append them in one
# concat instead of re-copying the frame once per new row.
new_rows = []
for d, (actual, yhat) in records.items():
    err = abs(actual - yhat)
    if d in idx_by_date:
        i = idx_by_date[d]
        log.loc[i, "yhat"] = yhat
        log.loc[i, "actual"] = actual
        log.loc[i, "err_abs"] = err
        # Keep an existing stamp; only fill the gap when none was recorded.
        if pd.isna(log.loc[i, "tweeted_at"]):
            log.loc[i, "tweeted_at"] = tweeted_at_for(d)
    else:
        new_rows.append({
            "date": d,
            "yhat": yhat,
            "tweeted_at": tweeted_at_for(d),
            "err_abs": err,
            "actual": float(actual),
        })

if new_rows:
    log = pd.concat([log, pd.DataFrame(new_rows)], ignore_index=True)

log = log.sort_values("date").reset_index(drop=True)

# Write to a sibling temp file first, then swap it over the real log with a
# single os.replace so the log is never left half-written by to_csv.
tmp = f"{LOG_CSV}.tmp"
log.to_csv(tmp, index=False)
os.replace(tmp, LOG_CSV)

# Summary: latest date in this update batch, total row count, last ten rows.
latest_date = max(records).date()
row_count = len(log)
print("Updated through:", latest_date, "Rows:", row_count)
print(log.tail(10))

Updated through: 2025-09-18 Rows: 23
         date  yhat                       tweeted_at  err_abs  actual
13 2025-09-05  11.0 2025-09-04 15:26:13.716259+01:00     11.0    22.0
14 2025-09-08  21.0 2025-09-07 15:26:13.716259+01:00      6.0    15.0
15 2025-09-09  19.0 2025-09-08 15:26:13.716259+01:00      8.0    27.0
16 2025-09-10  15.0 2025-09-09 15:26:13.716259+01:00      3.0    12.0
17 2025-09-11  17.0 2025-09-10 15:26:13.716259+01:00      5.0    12.0
18 2025-09-12  10.0 2025-09-11 15:26:13.716259+01:00      8.0    18.0
19 2025-09-15  17.0 2025-09-14 15:26:13.716259+01:00      9.0     8.0
20 2025-09-16  16.0 2025-09-15 15:26:13.716259+01:00      7.0    23.0
21 2025-09-17  13.0 2025-09-16 15:26:13.716259+01:00      3.0    16.0
22 2025-09-18  18.0 2025-09-17 15:26:13.716259+01:00      6.0    12.0


In [16]:
# Echo the temp-file path assigned during the write step (inspection only).
tmp

'data/pred_log.csv.tmp'