In [1]:
import pandas as pd
from pathlib import Path

# ---------------------------------------------------------
# Load the merged full-calendar dataset
# ---------------------------------------------------------
# parse_dates converts the "date" column to datetimes while reading,
# so the min/max below operate on real dates rather than strings.
df = pd.read_csv("merged_expanded_data_fullcalendar.csv", parse_dates=["date"])

# Quick overview of what was loaded: covered date span and row count.
print("Full data range:", df["date"].min(), "→", df["date"].max())
print("Total rows:", df.shape[0])


# ---------------------------------------------------------
# Split boundaries (agreed date ranges, ISO "YYYY-MM-DD" strings)
# ---------------------------------------------------------
# Training window
train_start = "2013-07-01"
train_end = "2017-07-31"

# Validation window
val_start = "2017-08-01"
val_end = "2018-07-31"

# Test window
test_start = "2018-08-01"
test_end = "2019-07-31"


# ---------------------------------------------------------
# Time-based split into train / validation / test
# ---------------------------------------------------------
# Series.between is inclusive on both ends by default, so each window
# keeps rows dated exactly on its start and end boundary.
# .copy() gives every split its own frame, independent of df.
train_df = df.loc[df["date"].between(train_start, train_end)].copy()
val_df = df.loc[df["date"].between(val_start, val_end)].copy()
test_df = df.loc[df["date"].between(test_start, test_end)].copy()


# ---------------------------------------------------------
# Drop rows without sales from TRAIN & VALIDATION only
# (store-closure days)
# ---------------------------------------------------------
# Keep only rows whose "umsatz" value is present.
train_df = train_df[train_df["umsatz"].notna()].copy()
val_df = val_df[val_df["umsatz"].notna()].copy()
# test_df is deliberately left unfiltered


# ---------------------------------------------------------
# Sanity checks
# ---------------------------------------------------------
# Diagnostics are printed first so they are visible even if one of the
# assertions further down stops the cell.
print("\nSplit sizes AFTER exclusion:")
print("Train rows:", len(train_df))
print("Validation rows:", len(val_df))
print("Test rows:", len(test_df))

print("\nNaN umsatz counts (should be 0 for train/val):")
print("Train:", train_df["umsatz"].isna().sum())
print("Val:  ", val_df["umsatz"].isna().sum())
print("Test: ", test_df["umsatz"].isna().sum(), "(expected: all NaN)")


print("\nTrain date range:", train_df["date"].min(), "→", train_df["date"].max())
print("Validation date range:", val_df["date"].min(), "→", val_df["date"].max())
print("Test date range:", test_df["date"].min(), "→", test_df["date"].max())

# Enforce the expectations stated in the printed messages above, so that a
# broken split halts the cell instead of passing unnoticed.
assert len(train_df) > 0 and len(val_df) > 0 and len(test_df) > 0, \
    "at least one split is empty - check the date boundaries and the input file"
assert train_df["umsatz"].notna().all(), "train split still contains NaN umsatz"
assert val_df["umsatz"].notna().all(), "validation split still contains NaN umsatz"
assert test_df["umsatz"].isna().all(), "test split unexpectedly contains non-NaN umsatz"


# ---------------------------------------------------------
# Preview the first rows of the test set
# (the umsatz column is part of the frame; it is expected to be NaN here)
# ---------------------------------------------------------
# One print call: heading and frame are separated by a newline.
print("\nFirst rows of test set:", test_df.head(), sep="\n")


# ---------------------------------------------------------
# Save splits
# ---------------------------------------------------------
# Output filename -> split frame, in the order train, validation, test.
split_outputs = {
    "train_split_merged_expanded_data.csv": train_df,
    "val_split_merged_expanded_data.csv": val_df,
    "test_split_merged_expanded_data.csv": test_df,
}

# Write every split without the DataFrame index column.
for out_name, split_frame in split_outputs.items():
    split_frame.to_csv(out_name, index=False)

# Report the files that were written.
print("\nSplit files saved:")
for out_name in split_outputs:
    print(out_name)


Full data range: 2012-01-01 00:00:00 → 2019-12-31 00:00:00
Total rows: 15098

Split sizes AFTER exclusion:
Train rows: 7475
Validation rows: 1840
Test rows: 1886

NaN umsatz counts (should be 0 for train/val):
Train: 0
Val:   0
Test:  1886 (expected: all NaN)

Train date range: 2013-07-01 00:00:00 → 2017-07-31 00:00:00
Validation date range: 2017-08-01 00:00:00 → 2018-07-31 00:00:00
Test date range: 2018-08-01 00:00:00 → 2019-07-31 00:00:00

First rows of test set:
            date  warengruppe       id  umsatz  Bewoelkung  Temperatur  \
12386 2018-08-01            1  1808011     NaN         0.0     23.7625   
12387 2018-08-01            2  1808012     NaN         0.0     23.7625   
12388 2018-08-01            3  1808013     NaN         0.0     23.7625   
12389 2018-08-01            4  1808014     NaN         0.0     23.7625   
12390 2018-08-01            5  1808015     NaN         0.0     23.7625   

       Windgeschwindigkeit  Wettercode  KielerWoche  school_holiday  \
12386         