In [1]:
%run "C:/Users/vince/user-behavior-predictor/notebooks/01_eda.ipynb"
import pandas as pd

# 1) FIRST-ORDER METRICS (earliest transaction row per customer)
# Two reproducibility/correctness points:
#   * kind="stable" keeps df_clean's row order among rows that share an
#     InvoiceDate, so the selected row is the same on every run.
#   * drop_duplicates(keep="first") returns that row unchanged, whereas
#     GroupBy.first() returns the first non-null value of each column and can
#     therefore combine values taken from different rows.
# NOTE(review): the metrics describe a single transaction row, not the total
# of the customer's first invoice -- confirm that is the intended definition.
first_tx = (
    df_clean
      .dropna(subset=["CustomerID"])              # groupby ignored NaN keys as well
      .sort_values("InvoiceDate", kind="stable")
      .drop_duplicates(subset="CustomerID", keep="first")
      .sort_values("CustomerID", kind="stable")   # one row per customer, ordered by id
      .reset_index(drop=True)
)
first_tx["first_quantity"] = first_tx["Quantity"]
first_tx["first_revenue"]  = first_tx["Quantity"] * first_tx["Price"]
first_features = first_tx[["CustomerID","first_quantity","first_revenue"]]

# 2) COUNTRY ONE-HOT FLAGS
# One country per customer: the first non-null Country value in df_clean's
# existing row order (no sorting is applied here).
_first_country = df_clean.groupby("CustomerID")["Country"].first()

# One indicator column per distinct country, named "country_<value>";
# reset_index() turns the CustomerID index back into a regular column.
country_dummies = pd.get_dummies(_first_country, prefix="country").reset_index()

# 3) PRODUCT DIVERSITY ON THE FIRST DAY
# Attach each customer's first_date (taken from user_agg) to every transaction row.
_first_dates = user_agg[["CustomerID", "first_date"]]
first_day = df_clean.merge(_first_dates, on="CustomerID", how="left")

# normalize() resets the time to midnight, so the comparison is on calendar date only.
_tx_date = first_day["InvoiceDate"].dt.normalize()
_start_date = first_day["first_date"].dt.normalize()
mask_fd = _tx_date.eq(_start_date)

# Number of distinct StockCode values each customer has on that first date.
diversity_first_day = (
    first_day.loc[mask_fd]
      .groupby("CustomerID")["StockCode"]
      .nunique()
      .reset_index(name="diversity_first_day")
)

# 4) RECENCY TO 2ND PURCHASE
# A purchase is identified by its Invoice (the same unit freq_3d counts).
# Taking the second *row* per customer is wrong whenever an invoice spans
# several rows: that row still belongs to the first purchase, so the recency
# collapses to 0.  Collapse to one row per (customer, invoice) first.
_invoice_dates = (
    df_clean
      .groupby(["CustomerID", "Invoice"], as_index=False, sort=False)["InvoiceDate"]
      .min()
      .sort_values(["CustomerID", "InvoiceDate"])
)

# cumcount() numbers each customer's invoices 0, 1, 2, ... in date order, so
# position 1 is the second purchase.  Customers with a single invoice get no
# row here and end up with a missing second_date after the left merge below.
second_tx = (
    _invoice_dates[_invoice_dates.groupby("CustomerID").cumcount() == 1]
      [["CustomerID", "InvoiceDate"]]
      .rename(columns={"InvoiceDate": "second_date"})
      .reset_index(drop=True)
)

recency = (
    user_agg[["CustomerID", "first_date"]]
      .merge(second_tx, on="CustomerID", how="left")
)
# Whole days between first and second purchase; 999 is the sentinel for
# customers that never made a second purchase.
# NOTE(review): this value is derived from the repurchase itself; if the
# modelling target is did_repurchase_7d it may leak the label -- confirm.
recency["recency_2nd"] = (
    recency["second_date"] - recency["first_date"]
).dt.days.fillna(999).astype(int)

# 5) 3-DAY FREQUENCY & MONETARY (RFM)
window = 3

# Inner join (merge default): only rows whose customer appears in user_agg are kept.
df_with_first = pd.merge(
    df_clean,
    user_agg[["CustomerID", "first_date"]],
    on="CustomerID",
)

# Whole-day component of the time elapsed since the customer's first_date.
_elapsed = df_with_first["InvoiceDate"].sub(df_with_first["first_date"])
df_with_first["days_since_first"] = _elapsed.dt.days

# Rows that fall inside the window (inclusive upper bound).
mask_3d = df_with_first["days_since_first"].le(window)

_early_tx = df_with_first.loc[mask_3d].copy()
_early_tx["revenue"] = _early_tx["Quantity"] * _early_tx["Price"]

# freq_3d: distinct invoices in the window; monetary_3d: summed row revenue.
agg_3d = _early_tx.groupby("CustomerID", as_index=False).agg(
    freq_3d=("Invoice", "nunique"),
    monetary_3d=("revenue", "sum"),
)

# 6) TIME-OF-FIRST PURCHASE (hour & weekday)
# Built from first_tx, i.e. the timestamp of each customer's earliest row.
time_feats = (
    first_tx[["CustomerID", "InvoiceDate"]]
      .rename(columns={"InvoiceDate": "first_date_time"})
      .assign(
          # hour of day, 0-23
          first_hour=lambda t: t["first_date_time"].dt.hour,
          # day of week, Monday=0 ... Sunday=6
          first_dow=lambda t: t["first_date_time"].dt.dayofweek,
      )
)

# 7) PRODUCT DIVERSITY IN THE FIRST 7 DAYS
# Reuses days_since_first computed on df_with_first; the bound is inclusive.
mask_7d = df_with_first["days_since_first"].le(7)

# Distinct StockCode values per customer among the rows selected above.
div7 = (
    df_with_first.loc[mask_7d]
      .groupby("CustomerID")["StockCode"]
      .nunique()
      .reset_index(name="diversity_7d")
)

# 8) MERGE EVERYTHING INTO ONE FEATURE TABLE
# Start from the per-customer base columns and left-join each feature table
# in turn, so every customer in user_agg is kept.
features = user_agg[["CustomerID", "total_orders", "total_quantity"]]
_feature_tables = (
    first_features,
    diversity_first_day,
    country_dummies,
    recency[["CustomerID", "recency_2nd"]],
    agg_3d,
    time_feats[["CustomerID", "first_hour", "first_dow"]],
    div7,
)
for _table in _feature_tables:
    features = features.merge(_table, on="CustomerID", how="left")

# Customers absent from a feature table have NaN there; replace with 0.
features = features.fillna(0)
print("✅ Extended feature table shape:", features.shape)

# 9) BUILD FINAL MODELLING DATASET
# Label/metadata columns from user_final, with the feature table left-joined
# on CustomerID.
_label_cols = [
    "CustomerID",
    "did_repurchase_7d",
    "first_date",
    "last_date",
    "days_to_repurchase",
]
dataset = user_final[_label_cols].merge(features, on="CustomerID", how="left")

# NOTE(review): fillna(0) is applied to the whole frame, including the
# user_final columns, so a missing days_to_repurchase becomes 0 -- confirm
# that this is intended.
dataset = dataset.fillna(0)
print("✅ Modelling dataset shape:", dataset.shape)
print(dataset.head())


(1067371, 8)
Cleaned data shape: (805620, 8)
User-level agg shape: (5881, 5)
Final user table table: (5881, 7)
✅ Extended feature table shape: (5881, 53)
✅ Modelling dataset shape: (5881, 57)
   CustomerID  did_repurchase_7d          first_date           last_date  \
0       12346               True 2009-12-14 08:34:00 2011-01-18 10:01:00   
1       12347              False 2010-10-31 14:20:00 2011-12-07 15:52:00   
2       12348              False 2010-09-27 14:59:00 2011-09-25 13:13:00   
3       12349              False 2010-04-29 13:20:00 2011-11-21 09:51:00   
4       12350              False 2011-02-02 16:01:00 2011-02-02 16:01:00   

   days_to_repurchase  total_orders  total_quantity  first_quantity  \
0                 4.0            12           74285              10   
1                37.0             8            3286              10   
2                80.0             5            2714              24   
3                18.0             4            1624               4