In [25]:
"""
Consolidated Example: Numeric Data Pre-processing Pipeline
Dataset: retail_sales_numeria_week4.csv

What this program demonstrates (end-to-end):
1) Load dataset
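2) Split train/test before fitting any learned transform (avoid leakage)
3) Detect + fix invalid numeric entries (non-positive income, negative marketing spend);
   these fixed validity rules are applied before the split and learn nothing from the data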
4) Numeric pipeline:
   - Median imputation
   - Power transform (Yeo-Johnson) to handle skewness robustly
   - Scaling (StandardScaler)
5) Train a baseline model vs pipeline model
6) Compare performance + generalization gap

Note:
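- We use scikit-learn Pipeline objects (production-style); ColumnTransformer is imported
  but not used in the cells shown here, since every feature is numeric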
- We keep it numeric-only for this module
"""

import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge


In [26]:
# -----------------------------
# 1) Load dataset
# -----------------------------
# The default below is the original machine-specific location. Set the
# RETAIL_SALES_CSV environment variable to read the CSV from somewhere else
# without editing this cell; an unset or empty variable falls back to the default.
DATA_PATH = (
    os.environ.get("RETAIL_SALES_CSV")
    or "D:/datasets/dpp/retail_sales_numeria_week4.csv"
)
df = pd.read_csv(DATA_PATH)

# Column roles used by the later cells: the regression target, and identifier
# columns that must never be fed to the model as features.
TARGET = "target_sales"
ID_COLS = ["customer_id", "store_id"]



In [27]:
# -----------------------------
# 2) Minimal "data cleaning" BEFORE splitting (safe rules that don't use target)
#    These are pure business rules / validity rules, not learned from data distribution.
# -----------------------------
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run.
df_clean = df.copy()

# Series.mask returns a NEW column with the flagged entries replaced by NaN and
# upcasts to float when required. Assigning NaN through .loc into a column that
# was parsed as integers instead triggers pandas' incompatible-dtype setitem
# deprecation, so whole-column replacement is the safer form.

# Annual income: 0 or negative -> treat as missing
df_clean["annual_income"] = df_clean["annual_income"].mask(
    df_clean["annual_income"] <= 0
)

# Marketing spend: negative -> treat as missing
df_clean["marketing_spend"] = df_clean["marketing_spend"].mask(
    df_clean["marketing_spend"] < 0
)


In [28]:
# -----------------------------
# 3) Define features (numeric only)
# -----------------------------
# Every numeric column is a candidate feature, except the identifier columns
# and the target itself.
_excluded_cols = ID_COLS + [TARGET]
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
numeric_features = [col for col in numeric_cols if col not in _excluded_cols]

# Feature matrix and target vector used by every model below.
X, y = df_clean[numeric_features], df_clean[TARGET]

print("Total rows:", df_clean.shape[0])
print("Numeric features used:", numeric_features)


Total rows: 8000
Numeric features used: ['annual_income', 'monthly_spend', 'avg_basket_value', 'marketing_spend']


In [29]:
# -----------------------------
# 4) Split train/test FIRST (avoid leakage)
# -----------------------------
# Hold out 20% of the rows before any imputer / transformer / scaler is fitted,
# so every learned statistic comes from the training rows only.
_split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = _split_parts



In [30]:
# -----------------------------
# 5) Baseline model (NO preprocessing except simple median imputation)
#    This is just to compare and show why full preprocessing matters.
# -----------------------------
# Median imputation is the bare minimum here: Ridge cannot fit on NaN values.
_baseline_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("model", Ridge(alpha=1.0, random_state=42)),
]
baseline_pipe = Pipeline(steps=_baseline_steps).fit(X_train, y_train)

# Predict on both splits (train first, then test) for the generalization comparison.
pred_train_base, pred_test_base = (
    baseline_pipe.predict(_split) for _split in (X_train, X_test)
)


In [31]:
# -----------------------------
# 6) Full preprocessing pipeline (Ely the Engineer style)
#    Impute -> PowerTransform -> Scale -> Model
# -----------------------------
_preprocess_steps = [
    # Fill missing values with the per-feature median of the training rows.
    ("imputer", SimpleImputer(strategy="median")),
    # Yeo-Johnson reduces skew and accepts zeros/negatives; standardization is
    # switched off here because the next step does the scaling.
    ("power", PowerTransformer(method="yeo-johnson", standardize=False)),
    ("scaler", StandardScaler()),
]
numeric_preprocess = Pipeline(steps=_preprocess_steps)

# Same Ridge settings as the baseline, so preprocessing is the only difference.
full_pipe = Pipeline(steps=[
    ("preprocess", numeric_preprocess),
    ("model", Ridge(alpha=1.0, random_state=42)),
]).fit(X_train, y_train)

# Predict on both splits (train first, then test).
pred_train_full, pred_test_full = (
    full_pipe.predict(_split) for _split in (X_train, X_test)
)



In [32]:
# -----------------------------
# 7) Evaluation helper
# -----------------------------
def report(name, y_tr, yhat_tr, y_te, yhat_te):
    """Print train/test MAE and R² for one model, plus the gap between the splits.

    Parameters
    ----------
    name : label shown in the section header of the printed report.
    y_tr, yhat_tr : true and predicted target values for the training split.
    y_te, yhat_te : true and predicted target values for the test split.

    Returns
    -------
    None. The report is written to stdout only.
    """
    train_pair, test_pair = (y_tr, yhat_tr), (y_te, yhat_te)

    mae_tr, mae_te = (mean_absolute_error(*pair) for pair in (train_pair, test_pair))
    r2_tr, r2_te = (r2_score(*pair) for pair in (train_pair, test_pair))

    # "Generalization gap": both differences are oriented so that a positive
    # value means the model does worse on the test split than on train.
    gap_mae, gap_r2 = mae_te - mae_tr, r2_tr - r2_te

    lines = (
        f"\n=== {name} ===",
        f"Train MAE: {mae_tr:,.2f} | Test MAE: {mae_te:,.2f} | (Test-Train MAE): {gap_mae:,.2f}",
        f"Train R² : {r2_tr:,.4f} | Test R² : {r2_te:,.4f} | (Train-Test R²): {gap_r2:,.4f}",
    )
    print(*lines, sep="\n")


In [34]:
# -----------------------------
# 8) Print results
# -----------------------------
# One report per model, baseline first, using the same train/test targets for both.
for _label, _train_preds, _test_preds in (
    ("Baseline (Median Impute + Ridge)", pred_train_base, pred_test_base),
    ("Full Pipeline (Impute + Power + Scale + Ridge)", pred_train_full, pred_test_full),
):
    report(_label, y_train, _train_preds, y_test, _test_preds)




=== Baseline (Median Impute + Ridge) ===
Train MAE: 609.26 | Test MAE: 597.74 | (Test-Train MAE): -11.52
Train R² : 0.6178 | Test R² : 0.6215 | (Train-Test R²): -0.0038

=== Full Pipeline (Impute + Power + Scale + Ridge) ===
Train MAE: 56.54 | Test MAE: 57.65 | (Test-Train MAE): 1.11
Train R² : 0.9850 | Test R² : 0.9811 | (Train-Test R²): 0.0039


In [37]:
# -----------------------------
# 9) Inspect skewness BEFORE preprocessing (train only)
# -----------------------------
# Rank the training features by absolute skewness, largest first, while
# keeping the signed skew values for display.
_feature_skew = X_train.skew(numeric_only=True)
skew_train = _feature_skew.sort_values(ascending=False, key=np.abs)

print(
    "\nTop 10 skewed features in TRAIN split (|skew|):",
    skew_train.head(10).round(3),
    sep="\n",
)



Top 10 skewed features in TRAIN split (|skew|):
marketing_spend     5.209
monthly_spend       5.137
annual_income       4.150
avg_basket_value    2.679
dtype: float64
