# 02 — Model & Business Insights
One notebook to train, evaluate, and extract insights.

### Using src modules for clean structure

In [None]:
from sklearn.model_selection import train_test_split
from src.io import load_cleaned_master, detect_datetime, add_time_features, build_promo_flag
from src.features import build_feature_matrix
from src.modeling import train_evaluate
from src.insights import promotion_uplift, top_groups, write_quick_insights

# Load the cleaned master table and pass it through the datetime,
# time-feature and promo-flag helpers, in that order.
df = build_promo_flag(add_time_features(detect_datetime(load_cleaned_master())))

# Feature matrix and target ("amount"), then an 80/20 hold-out split
# with a fixed seed so the split is reproducible.
X, y, num_cols, cat_cols = build_feature_matrix(df, target_col="amount")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model and report its evaluation metrics.
model, metrics = train_evaluate(
    X_train, X_test, y_train, y_test, num_cols, cat_cols
)
print(metrics)

# Insights: promotion uplift plus the top-5 groups for each dimension.
uplift = promotion_uplift(df)
prov, stores, items = (
    top_groups(df, column, n=5)
    for column in ("province", "supermarket_no", "code")
)

# Write the summary report; the helper returns the value printed below.
report_sections = [
    ("Top provinces (by revenue)", prov),
    ("Top stores (by revenue)", stores),
    ("Top items (by revenue)", items),
]
p = write_quick_insights("../report", metrics, uplift, report_sections)
print("Wrote:", p)

In [None]:
from pathlib import Path
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

DATA = Path("../data/cleaned_master.csv")
df = pd.read_csv(DATA, low_memory=False)

# 1) Build features/target
if "amount" not in df.columns:
    raise ValueError("Expected 'amount' column as target.")

# Make a flexible datetime: use the first candidate column that exists and
# derive hour / day-of-week / month from it. Unparseable values become NaT
# (errors="coerce"), so the derived columns may contain NaN.
candidates = ["transaction_time","time_of_transactions","transaction_date","date","datetime","timestamp","time"]
src = next((c for c in candidates if c in df.columns), None)
if src:
    df["transaction_time"] = pd.to_datetime(df[src], errors="coerce")
    df["hour"]  = df["transaction_time"].dt.hour
    df["dow"]   = df["transaction_time"].dt.dayofweek
    df["month"] = df["transaction_time"].dt.month
# NOTE(review): when `src` is not "transaction_time", the raw source column
# stays in the frame and is one-hot encoded below as a categorical — confirm
# that is intended.

# Promo flag: 1 when "feature" or "display" holds anything other than a
# zero/missing token. The columns are normalised to stripped strings in place.
# "0.0" is included because a 0/1 column with blanks is parsed as float, so
# zero stringifies as "0.0"; "" covers whitespace-only cells after strip().
df["promo_flag"] = 0
for c in ["feature","display"]:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip().replace(
            {"nan": "0", "None": "0", "": "0", "0.0": "0"}
        )
        df.loc[df[c].ne("0"), "promo_flag"] = 1

y = df["amount"].astype(float)
X = df.drop(columns=["amount"], errors="ignore")

# Numeric columns are scaled; everything else except the parsed datetime is
# treated as categorical.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = [c for c in X.columns if c not in num_cols + ["transaction_time"]]

for c in num_cols:
    # Median imputation; an all-NaN column has a NaN median, so fall back to
    # 0.0 to keep NaN out of the estimator.
    median = X[c].median()
    X[c] = X[c].fillna(0.0 if pd.isna(median) else median)
for c in cat_cols:
    # Fill BEFORE casting: astype(str) turns NaN into the literal "nan",
    # after which fillna("NA") would never match anything.
    X[c] = X[c].fillna("NA").astype(str)

X_train, X_test, y_train, y_test = train_test_split(X[num_cols + cat_cols], y, test_size=0.2, random_state=42)

# Preprocessing: scale numeric columns without centering (with_mean=False)
# and one-hot encode categoricals; categories unseen during fit are ignored
# at predict time.
pre = ColumnTransformer([
    ("num", StandardScaler(with_mean=False), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
])

# Ridge regression on the preprocessed features, evaluated on the hold-out set.
model = Pipeline([("pre", pre), ("reg", Ridge(alpha=1.0))])
model.fit(X_train, y_train)
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
print(f"MAE: {mae:,.2f}\nRMSE: {rmse:,.2f}\nR2: {r2: .3f}")

# 2) Business insights
insights = []

# Insight A: promotion uplift — percentage difference between the mean
# amount of promo rows and non-promo rows. Stays NaN (reported as "N/A")
# when either group is empty or the baseline mean is not positive.
uplift = np.nan
if "promo_flag" in X.columns:
    is_base = X["promo_flag"] == 0
    is_promo = X["promo_flag"] == 1
    mean_base = y[is_base].mean() if is_base.any() else np.nan
    mean_promo = y[is_promo].mean() if is_promo.any() else np.nan
    if pd.notna(mean_base) and mean_base > 0 and pd.notna(mean_promo):
        uplift = (mean_promo - mean_base) / mean_base * 100
uplift_text = "N/A" if pd.isna(uplift) else f"{uplift:.2f}%"
insights.append(f"Promotion uplift on mean amount: {uplift_text}")


# Insight B: top drivers by group (only for columns that exist)
def _top5_by_amount(column):
    """Return the five `column` groups with the largest summed amount."""
    totals = df.groupby(column, as_index=False)["amount"].sum()
    return totals.sort_values("amount", ascending=False).head(5)


summaries = []
if "province" in df.columns:
    prov = _top5_by_amount("province")
    summaries.append(("Top provinces (by revenue)", prov))
if "supermarket_no" in df.columns:
    stores = _top5_by_amount("supermarket_no")
    summaries.append(("Top stores (by revenue)", stores))
if "code" in df.columns:
    items = _top5_by_amount("code")
    summaries.append(("Top items (by revenue)", items))

# Print each table under its title, without the index column.
for title, tbl in summaries:
    print("\n" + title)
    print(tbl.to_string(index=False))

# Save a quick markdown summary: one model-performance line, a blank line,
# then the collected insight lines.
from pathlib import Path

REPORT = Path("../report")
REPORT.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing

report_file = REPORT / "quick_insights.md"
header = f"**Model performance** — MAE: {mae:,.2f}, RMSE: {rmse:,.2f}, R²: {r2: .3f}"
lines = [header, ""] + insights
report_file.write_text("\n".join(lines), encoding="utf-8")
print("\nWrote:", report_file.resolve())