In [11]:
"""
Automating Pre-processing with Pipelines in scikit-learn (Numeric-focused)

Dataset: retail_sales_numeria_week4.csv
Target : target_sales

What this program demonstrates:
1) Load dataset + apply simple validity rules (income <= 0 -> missing, negative marketing spend -> missing)
2) Train/Test split FIRST (avoid leakage)
3) Automatically detect skewed numeric features from TRAIN split only
4) Build a ColumnTransformer that:
   - Applies (Impute -> PowerTransform -> Scale) to skewed numeric columns
   - Applies (Impute -> Scale) to non-skewed numeric columns
5) Wrap everything inside a single Pipeline with a model
6) Evaluate baseline vs automated pipeline
7) Show how to reuse the same pipeline for future data (predict)

Why PowerTransformer?
- Works for many skew patterns
- Doesn't require strictly positive values (Yeo-Johnson)
- Great for "automated" preprocessing across many columns
"""

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge


In [12]:
# -----------------------------
# 1) Read the raw retail sales table
# -----------------------------
df = pd.read_csv("D:/datasets/dpp/retail_sales_numeria_week4.csv")

TARGET = "target_sales"
ID_COLS = ["customer_id", "store_id"]

# -----------------------------
# 2) Validity rules
# -----------------------------
# Each rule compares a value against a fixed constant, row by row, so nothing
# is learned from the data and the rules can be applied before splitting.
# A copy is used so the raw frame `df` is left untouched.
df_clean = df.copy()
df_clean.loc[df_clean["annual_income"].le(0), "annual_income"] = np.nan      # non-positive income -> missing
df_clean.loc[df_clean["marketing_spend"].lt(0), "marketing_spend"] = np.nan  # negative spend -> missing

# -----------------------------
# 3) Numeric predictors
# -----------------------------
# Every numeric column except the identifiers and the target.
numeric_cols = list(df_clean.select_dtypes(include=[np.number]).columns)
numeric_features = [
    col for col in numeric_cols
    if col != TARGET and col not in ID_COLS
]

X = df_clean[numeric_features]
y = df_clean[TARGET]

print("Rows:", df_clean.shape[0])
print("Numeric features:", numeric_features)


Rows: 8000
Numeric features: ['annual_income', 'monthly_spend', 'avg_basket_value', 'marketing_spend']


In [13]:
# -----------------------------
# 4) Hold out a test set BEFORE anything is fitted (avoids leakage)
# -----------------------------
# 20% of the rows go to the test split; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# -----------------------------
# 5) Automatically detect skewed columns (TRAIN only)
# -----------------------------
# Rule: |skew| >= SKEW_THRESHOLD => "skewed"
SKEW_THRESHOLD = 1.0

# Skewness is computed on the training split only, so the column grouping
# used by the preprocessing step never sees test rows.
train_skew = X_train.skew(numeric_only=True)
skewed_cols = train_skew[train_skew.abs() >= SKEW_THRESHOLD].index.tolist()
# Everything not flagged as skewed (keeps the order of numeric_features).
non_skewed_cols = [c for c in numeric_features if c not in skewed_cols]

print("\nSkewness (train) top 10 by |skew|:")
# Reorder the signed skew values by descending magnitude before truncating.
print(train_skew.reindex(train_skew.abs().sort_values(ascending=False).index).head(10).round(3))

# The label is built from SKEW_THRESHOLD so it cannot drift from the rule
# applied above if the constant is tuned.
print(f"\nSkewed cols (|skew| >= {SKEW_THRESHOLD}):", skewed_cols)
print("Non-skewed cols:", non_skewed_cols)



Skewness (train) top 10 by |skew|:
marketing_spend     5.209
monthly_spend       5.137
annual_income       4.150
avg_basket_value    2.679
dtype: float64

Skewed cols (|skew| >= 1.0): ['annual_income', 'monthly_spend', 'avg_basket_value', 'marketing_spend']
Non-skewed cols: []


In [16]:
# -----------------------------
# 6) Per-group preprocessing
# -----------------------------
# Skewed columns: median-impute, apply a Yeo-Johnson power transform, then
# standardise. The power step is created with standardize=False and is
# followed by an explicit scaler step.
skewed_num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer(method="yeo-johnson", standardize=False)),
    ("scaler", StandardScaler()),
])

# Remaining numeric columns: median-impute, then standardise.
normal_num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own sub-pipeline; any column that is
# in neither group is dropped from the output.
preprocess = ColumnTransformer(
    [
        ("skewed", skewed_num_pipe, skewed_cols),
        ("normal", normal_num_pipe, non_skewed_cols),
    ],
    remainder="drop",
)



In [17]:
# -----------------------------
# 7) Skew-aware preprocessing + Ridge regressor in a single estimator
# -----------------------------
model = Ridge(alpha=1.0, random_state=42)
full_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

# -----------------------------
# 8) Reference pipeline: the same Ridge settings, but every numeric column
#    only gets median imputation + standard scaling (no power transform)
# -----------------------------
baseline_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
baseline_pipeline = Pipeline([
    ("preprocess", baseline_preprocess),
    ("model", Ridge(alpha=1.0, random_state=42)),
])


In [18]:
# -----------------------------
# 9) Evaluation helper
# -----------------------------
def evaluate(pipe, name: str, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Fit ``pipe`` on a training split and print train/test MAE and R².

    Parameters
    ----------
    pipe
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Heading printed above the metrics.
    X_tr, y_tr, X_te, y_te : optional
        Training and test features/targets. Any argument left as ``None``
        falls back to the notebook-level ``X_train`` / ``y_train`` /
        ``X_test`` / ``y_test``, so existing two-argument calls behave
        exactly as before.

    Returns
    -------
    The same ``pipe`` object that was passed in, after ``fit`` has been
    called on it.
    """
    # Resolve the split: explicit arguments win, notebook globals are the default.
    X_tr = X_train if X_tr is None else X_tr
    y_tr = y_train if y_tr is None else y_tr
    X_te = X_test if X_te is None else X_te
    y_te = y_test if y_te is None else y_te

    pipe.fit(X_tr, y_tr)

    pred_tr = pipe.predict(X_tr)
    pred_te = pipe.predict(X_te)

    mae_tr = mean_absolute_error(y_tr, pred_tr)
    mae_te = mean_absolute_error(y_te, pred_te)

    r2_tr = r2_score(y_tr, pred_tr)
    r2_te = r2_score(y_te, pred_te)

    # Gaps are oriented so that a positive number means "worse on test":
    # MAE gap is test - train, R² gap is train - test.
    print(f"\n=== {name} ===")
    print(f"Train MAE: {mae_tr:,.2f} | Test MAE: {mae_te:,.2f} | Gap(Test-Train): {mae_te - mae_tr:,.2f}")
    print(f"Train R² : {r2_tr:,.4f} | Test R² : {r2_te:,.4f} | Gap(Train-Test): {r2_tr - r2_te:,.4f}")

    return pipe


In [20]:
# -----------------------------
# 10) Fit and score both pipelines: baseline first, then the skew-aware one
# -----------------------------
baseline_pipeline = evaluate(
    pipe=baseline_pipeline,
    name="Baseline: Impute+Scale (All Numeric)",
)
full_pipeline = evaluate(
    pipe=full_pipeline,
    name="Automated: Skew-aware Pipelines (Power+Scale for skewed cols)",
)



=== Baseline: Impute+Scale (All Numeric) ===
Train MAE: 609.29 | Test MAE: 597.77 | Gap(Test-Train): -11.52
Train R² : 0.6178 | Test R² : 0.6215 | Gap(Train-Test): -0.0037

=== Automated: Skew-aware Pipelines (Power+Scale for skewed cols) ===
Train MAE: 56.54 | Test MAE: 57.65 | Gap(Test-Train): 1.11
Train R² : 0.9850 | Test R² : 0.9811 | Gap(Train-Test): 0.0039


In [21]:
# -----------------------------
# 11) Reusing the fitted pipeline on unseen rows
# -----------------------------
# Five test rows stand in for "new incoming" records; the fitted pipeline
# is called directly on the raw feature frame.
new_data = X_test.sample(n=5, random_state=7)
pred_new = full_pipeline.predict(new_data)

print("\nExample predictions on new data (5 rows):")
print(pd.DataFrame({"predicted_target_sales": np.round(pred_new, 2)}))

print("\nDone. This pipeline can now be saved and reused consistently in production.")



Example predictions on new data (5 rows):
   predicted_target_sales
0                18550.45
1                17296.60
2                19180.62
3                20677.09
4                19819.52

Done. This pipeline can now be saved and reused consistently in production.
