[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/USER/REPO/blob/main/crisp_dm_project/notebooks/CRISP_DM_Walmart_Sales.ipynb)

# CRISP‑DM: Walmart Sales Forecasting

> Business understanding → Data understanding → Data preparation → Modeling → Evaluation → Deployment

**Dataset**: Kaggle — Walmart Sales Forecast (`aslanahmedov/walmart-sales-forecast`), downloaded by the Kaggle cell below.

**Deliverables**: problem statement, success criteria, data audit report, EDA report, feature plan, model card, evaluation memo, deployment plan.

In [None]:
!pip -q install pandas numpy scikit-learn matplotlib seaborn xgboost lightgbm imbalanced-learn joblib gradio fastapi uvicorn kaggle

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score, classification_report, confusion_matrix
import joblib
# Create the working folders used by later cells; existing folders are left alone.
for folder in ('data', 'models', 'reports'):
    Path(folder).mkdir(exist_ok=True)

In [None]:
# --- Kaggle download (make sure you've set up your Kaggle token) ---
    # In Colab:
    # from google.colab import files
    # files.upload()  # upload kaggle.json
    # !mkdir -p ~/.kaggle
    # !cp kaggle.json ~/.kaggle/
    # !chmod 600 ~/.kaggle/kaggle.json
    !kaggle datasets download -d aslanahmedov/walmart-sales-forecast -p data
    !ls -lh data
    # If a zip is downloaded, unzip it:
    !python - << 'PY'
import zipfile, glob, os
zips = glob.glob('data/*.zip')
for z in zips:
    with zipfile.ZipFile(z) as f:
        f.extractall('data')
        print('unzipped', z)
PY

## 1. Business Understanding
- Objective & KPIs (e.g., MAE ≤ X, or MAPE ≤ X %, on the holdout set)
- Stakeholders & constraints
- Risks & ethics (bias, leakage)
- Project plan & assumptions

## 2. Data Understanding
- Inventory datasets, schemas
- Data quality checks (missingness, ranges, duplicates)
- Initial EDA and hypotheses

In [None]:
# Point this at your CSV (adjust the filename). When the file is absent,
# df is set to None so the cells below can skip their work.
csv_path = Path('data/Walmart.csv')
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    df = None
print('Rows/cols:', df.shape if df is not None else None)
# Last expression of the cell: shows the first rows when data was loaded.
None if df is None else df.head()

In [None]:
# Quick EDA
def quick_eda(df):
    """Show a first-look audit of ``df``: head, summary stats, missingness.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. Tables are rendered with IPython's ``display`` when IPython is
        importable and printed as plain text otherwise.
    """
    # Import explicitly instead of relying on the ``display`` builtin that
    # only exists inside an IPython kernel; fall back to print elsewhere.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    show(df.head())
    show(df.describe(include='all').T)
    print('Missing (%) by column:')
    # isna().mean() is a 0-1 fraction; scale by 100 so the numbers match the
    # "(%)" heading and the ``missing_pct`` column name.
    missing_pct = df.isna().mean().mul(100).sort_values(ascending=False)
    show(missing_pct.to_frame('missing_pct'))

# Run the audit only when the CSV was found and loaded (df is None otherwise).
if df is not None:
    quick_eda(df)

## 3. Data Preparation
- Define target & features
- Split strategy & time‑series guards if needed
- Feature engineering
- Leakage checks & pipelines

In [None]:
# Example preprocessing & split (edit to match columns)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Baseline: one-hot encode non-numeric columns, pass numeric ones through,
# fit a random forest, report holdout MAE / R^2, and persist the pipeline.
# Skipped entirely when no CSV was loaded.
if df is not None:
    # Use 'Weekly_Sales' when present; otherwise treat the last column as target.
    target = 'Weekly_Sales' if 'Weekly_Sales' in df.columns else df.columns[-1]
    X = df.drop(columns=[target])
    y = df[target]
    num_cols = X.select_dtypes(include=np.number).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]

    pre = ColumnTransformer([
        ('num', 'passthrough', num_cols),
        # Categories unseen at fit time are encoded as all-zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

    # n_jobs=-1 builds the 300 trees on all available cores; random_state
    # keeps the fitted forest reproducible.
    model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    pipe = Pipeline([('pre', pre), ('model', model)])
    # NOTE(review): this is a shuffled random split. For a forecasting task a
    # chronological holdout is usually needed to avoid leakage — confirm
    # against the data's date column before trusting these metrics.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
    pipe.fit(X_tr, y_tr)
    preds = pipe.predict(X_te)
    print('MAE:', mean_absolute_error(y_te, preds))
    print('R^2:', r2_score(y_te, preds))
    # Re-create the folder here so this cell also works when run on its own.
    Path('models').mkdir(exist_ok=True)
    joblib.dump(pipe, 'models/model.joblib')

## 4. Modeling
- Baselines vs advanced models
- Hyperparameters & CV design
- Model selection rationale
- Model card

## 5. Evaluation
- Business‑aligned metrics
- Backtesting (if time series)
- Error analysis & fairness checks
- Sign‑off memo

## 6. Deployment
- Save model, inputs schema, versioning
- FastAPI/Gradio demo
- Monitoring plan

In [None]:
# Minimal FastAPI app is in src/app.py
# Example: from a notebook, build a quick Gradio demo around the saved pipeline.
import gradio as gr
import pandas as pd

try:
    m = joblib.load('models/model.joblib')
except Exception as e:
    # Best-effort: the model file only exists after the training cell has run.
    print('Load model later after training:', e)
else:
    # Assumes the pipeline saved by the training cell in this notebook: a
    # 'pre' ColumnTransformer whose first entry holds the numeric columns.
    pre_step = m.named_steps['pre']
    feature_names = list(pre_step.feature_names_in_)
    numeric_names = set(pre_step.transformers_[0][2])

    def predict_stub(*features):
        """Predict for one row given feature values in training column order."""
        # The ColumnTransformer selects columns by name, so the row must be a
        # DataFrame with the training column names rather than a bare array.
        X = pd.DataFrame([features], columns=feature_names)
        return float(m.predict(X)[0])

    # One input widget per training feature: numeric columns get a number
    # box, everything else a text box.
    demo = gr.Interface(
        fn=predict_stub,
        inputs=['number' if name in numeric_names else 'text' for name in feature_names],
        outputs='number',
        title='Walmart Sales Predictor (demo)',
    )
    # Call demo.launch() in a following cell to start the UI.