In [13]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [15]:
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

In [16]:
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [17]:
# !pip install wandb -qU
# !pip uninstall -y pmdarima numpy scipy statsmodels
# !pip install numpy==1.24.4 scipy==1.10.1 statsmodels==0.13.5 pmdarima==2.0.3

In [18]:
import wandb
import random
import math
import pandas as pd
import numpy as np
import warnings
from datetime import datetime

In [None]:
# -*- coding: utf-8 -*-
"""Enhanced Prophet Time Series Forecasting for Walmart Sales

Rewritten to focus on time series analysis with Prophet only.
"""

# Setup and installations
!pip install kaggle wandb prophet dill -Uq
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

# Data download (uncomment if needed)
# !kaggle competitions download -c walmart-recruiting-store-sales-forecasting
# !unzip /content/walmart-recruiting-store-sales-forecasting.zip
# !unzip /content/train.csv.zip
# !unzip /content/test.csv.zip
# !unzip /content/features.csv.zip

# Imports
import os
import sys
import pandas as pd
import numpy as np
import wandb
import dill
import logging
from datetime import datetime, timedelta
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

# Suppress warnings and logging
warnings.filterwarnings('ignore')
logging.getLogger('prophet').setLevel(logging.ERROR)
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)

# WandB setup
wandb.init(project="walmart-sales-forecasting", name="Prophet_TimeSeries_Optimized")

# =============================================================================
# Block 1: Data Loading and Initial Setup
# =============================================================================

print("Loading data...")
train_df = pd.read_csv("/content/train.csv")
features_df = pd.read_csv("/content/features.csv")
stores_df = pd.read_csv("/content/stores.csv")
test_df = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sampleSubmission.csv")

# Convert dates
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])
features_df['Date'] = pd.to_datetime(features_df['Date'])

print(f"Data loaded: Train {train_df.shape}, Test {test_df.shape}")
print(f"Train columns: {list(train_df.columns)}")
print(f"Features columns: {list(features_df.columns)}")
print(f"Date range: {train_df['Date'].min()} to {train_df['Date'].max()}")

# Log basic info
wandb.log({
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "n_stores": train_df['Store'].nunique(),
    "n_departments": train_df['Dept'].nunique(),
    "date_range_days": (train_df['Date'].max() - train_df['Date'].min()).days
})

# =============================================================================
# Block 2: Time Series Feature Engineering
# =============================================================================

class TimeSeriesFeatureEngineer(BaseEstimator, TransformerMixin):
    """Focused time series feature engineering for Prophet"""

    def __init__(self):
        self.fitted = False

    def fit(self, X, y=None):
        self.fitted = True
        return self

    def transform(self, X):
        df = X.copy()
        print(f"Input shape: {df.shape}")
        print(f"Input columns: {list(df.columns)}")

        # Merge external features with proper suffix handling
        print("Merging with features...")
        df = df.merge(features_df, on=['Store', 'Date'], how='left', suffixes=('', '_feat'))
        print(f"After features merge: {df.shape}")

        print("Merging with stores...")
        df = df.merge(stores_df, on='Store', how='left')
        print(f"After stores merge: {df.shape}")
        print(f"Columns after merge: {list(df.columns)}")

        # Handle IsHoliday column conflicts
        if 'IsHoliday_feat' in df.columns:
            # Use the original IsHoliday, fill missing with features version
            df['IsHoliday'] = df['IsHoliday'].fillna(df['IsHoliday_feat'])
            df = df.drop('IsHoliday_feat', axis=1)

        # Ensure IsHoliday exists and is properly formatted
        if 'IsHoliday' in df.columns:
            df['IsHoliday'] = df['IsHoliday'].fillna(False).astype(int)
        else:
            print("Warning: IsHoliday column not found, creating default")
            df['IsHoliday'] = 0

        # Fill missing values efficiently
        numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())
                print(f"Filled {col}: {df[col].isnull().sum()} missing values")
            else:
                print(f"Warning: {col} not found in data")

        # Markdown columns (promotional effects)
        markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
        for col in markdown_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0)
            else:
                print(f"Creating {col} with default values")
                df[col] = 0

        # Store type encoding
        if 'Type' in df.columns:
            df['Type'] = df['Type'].fillna('A')
            print(f"Store types: {df['Type'].value_counts()}")
        else:
            print("Warning: Type column not found")
            df['Type'] = 'A'

        if 'Size' in df.columns:
            df['Size'] = df['Size'].fillna(df['Size'].median())
        else:
            print("Warning: Size column not found")
            df['Size'] = 151315  # Approximate median from typical Walmart data

        # Time-based features for Prophet regressors
        df['Month'] = df['Date'].dt.month
        df['Quarter'] = df['Date'].dt.quarter
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['Week'] = df['Date'].dt.isocalendar().week

        # Seasonal indicators (key for retail)
        df['IsQ4'] = (df['Quarter'] == 4).astype(int)  # Holiday season
        df['IsBackToSchool'] = (df['Month'] == 8).astype(int)  # August
        df['IsSpringCleaning'] = (df['Month'] == 4).astype(int)  # April
        df['IsSummer'] = df['Month'].isin([6, 7, 8]).astype(int)  # Summer

        # Create total markdown effect
        df['TotalMarkDown'] = sum(df[col] for col in markdown_cols if col in df.columns)

        # Economic indicators
        if 'CPI' in df.columns and 'Unemployment' in df.columns:
            df['EconomicIndex'] = df['CPI'] * df['Unemployment']
        else:
            df['EconomicIndex'] = 0

        # Store size category
        if 'Size' in df.columns:
            df['StoreSizeCategory'] = pd.cut(df['Size'], bins=3, labels=[0, 1, 2]).astype(int)
        else:
            df['StoreSizeCategory'] = 1

        print(f"Final processed shape: {df.shape}")
        print(f"Missing values check:")
        for col in ['IsHoliday', 'Temperature', 'Fuel_Price', 'TotalMarkDown']:
            if col in df.columns:
                print(f"  {col}: {df[col].isnull().sum()} missing")

        return df

# =============================================================================
# Block 3: Prophet Model for Time Series
# =============================================================================

class SuppressOutput:
    """Context manager to suppress Prophet output"""
    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr

class WalmartProphetModel(BaseEstimator):
    """Optimized Prophet model for Walmart sales forecasting"""

    def __init__(self,
                 changepoint_prior_scale=0.05,
                 seasonality_prior_scale=10.0,
                 holidays_prior_scale=10.0,
                 seasonality_mode='multiplicative',
                 min_samples=20):
        self.changepoint_prior_scale = changepoint_prior_scale
        self.seasonality_prior_scale = seasonality_prior_scale
        self.holidays_prior_scale = holidays_prior_scale
        self.seasonality_mode = seasonality_mode
        self.min_samples = min_samples
        self.models = {}
        self.global_median = None

    def _create_holidays(self, df):
        """Create holiday dataframe for Prophet"""
        # Create base holidays
        base_holidays = {
            'thanksgiving': ['2010-11-25', '2011-11-24', '2012-11-22', '2013-11-28'],
            'christmas': ['2010-12-25', '2011-12-25', '2012-12-25', '2013-12-25'],
            'newyear': ['2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01'],
            'superbowl': ['2011-02-06', '2012-02-05', '2013-02-03', '2014-02-02'],
            'laborday': ['2010-09-06', '2011-09-05', '2012-09-03', '2013-09-02']
        }

        holidays_list = []
        for holiday_name, dates in base_holidays.items():
            for date_str in dates:
                holidays_list.append({
                    'holiday': holiday_name,
                    'ds': pd.to_datetime(date_str),
                    'lower_window': 0,
                    'upper_window': 0
                })

        return pd.DataFrame(holidays_list)

    def fit(self, X, y=None):
        """Fit Prophet models for each store-department combination"""
        print(f"Training Prophet models...")

        # Calculate global median for fallback
        if 'Weekly_Sales' in X.columns:
            self.global_median = X['Weekly_Sales'].median()
        else:
            self.global_median = 15000  # Reasonable default

        # Create holidays
        holidays = self._create_holidays(X)

        # Key regressors for Prophet (only use columns that exist)
        potential_regressors = [
            'Temperature', 'Fuel_Price', 'IsHoliday', 'IsQ4',
            'IsBackToSchool', 'TotalMarkDown', 'EconomicIndex', 'IsSummer'
        ]

        # Filter to only existing columns
        regressors = [col for col in potential_regressors if col in X.columns]
        print(f"Using regressors: {regressors}")

        trained_count = 0
        total_groups = len(X.groupby(['Store', 'Dept']))

        for i, ((store, dept), group) in enumerate(X.groupby(['Store', 'Dept'])):
            if i % 50 == 0:
                print(f"Progress: {i}/{total_groups} ({100*i/total_groups:.1f}%)")

            # Skip if insufficient data
            if len(group) < self.min_samples:
                continue

            # Prepare data for Prophet
            prophet_data = pd.DataFrame({
                'ds': group['Date'],
                'y': group['Weekly_Sales'] if 'Weekly_Sales' in group.columns else group.iloc[:, -1]  # fallback
            })

            # Add regressors
            for regressor in regressors:
                if regressor in group.columns:
                    prophet_data[regressor] = group[regressor].values
                else:
                    prophet_data[regressor] = 0  # Default value

            # Remove any remaining NaN values
            prophet_data = prophet_data.dropna()

            if len(prophet_data) < self.min_samples:
                continue

            try:
                with SuppressOutput():
                    # Initialize Prophet model
                    model = Prophet(
                        changepoint_prior_scale=self.changepoint_prior_scale,
                        seasonality_prior_scale=self.seasonality_prior_scale,
                        holidays_prior_scale=self.holidays_prior_scale,
                        seasonality_mode=self.seasonality_mode,
                        holidays=holidays,
                        daily_seasonality=False,
                        weekly_seasonality=True,
                        yearly_seasonality=True
                    )

                    # Add regressors
                    for regressor in regressors:
                        if regressor in prophet_data.columns:
                            model.add_regressor(regressor)

                    # Fit model
                    model.fit(prophet_data)

                    # Store model and info
                    self.models[(store, dept)] = {
                        'model': model,
                        'regressors': regressors,
                        'median_sales': prophet_data['y'].median()
                    }
                    trained_count += 1

            except Exception as e:
                # Skip problematic series
                if i < 5:  # Show first few errors for debugging
                    print(f"Error training model for Store {store}, Dept {dept}: {str(e)}")
                continue

        print(f"Successfully trained {trained_count} models out of {total_groups} store-dept combinations")
        return self

    def predict(self, X):
        """Generate predictions using trained Prophet models"""
        predictions = []

        for (store, dept), group in X.groupby(['Store', 'Dept']):
            if (store, dept) in self.models:
                model_info = self.models[(store, dept)]
                model = model_info['model']

                # Prepare future dataframe
                future = pd.DataFrame({'ds': group['Date']})

                # Add regressors
                for regressor in model_info['regressors']:
                    if regressor in group.columns:
                        future[regressor] = group[regressor].values
                    else:
                        future[regressor] = 0

                try:
                    with SuppressOutput():
                        forecast = model.predict(future)
                    predictions.extend(forecast['yhat'].values)
                except Exception as e:
                    # Fallback to median
                    median_pred = model_info['median_sales']
                    predictions.extend([median_pred] * len(group))
            else:
                # Use global median for unseen store-dept combinations
                # Apply seasonal adjustment
                seasonal_multiplier = 1.0
                if 'Month' in group.columns:
                    month = group['Month'].iloc[0]
                    if month in [11, 12]:  # Holiday season
                        seasonal_multiplier = 1.5
                    elif month in [1, 2]:  # Post holiday
                        seasonal_multiplier = 0.8

                pred_value = self.global_median * seasonal_multiplier
                predictions.extend([pred_value] * len(group))

        return np.array(predictions)

# =============================================================================
# Block 4: Pipeline Training and Evaluation
# =============================================================================

# Feature engineering
print("Applying feature engineering...")
feature_engineer = TimeSeriesFeatureEngineer()
feature_engineer.fit(train_df)

processed_train = feature_engineer.transform(train_df)
processed_test = feature_engineer.transform(test_df)

print(f"Features added. Train shape: {processed_train.shape}")

# Time-based train/validation split
print("Creating time-based validation split...")
max_date = processed_train['Date'].max()
val_split_date = max_date - timedelta(weeks=8)

train_data = processed_train[processed_train['Date'] <= val_split_date].copy()
val_data = processed_train[processed_train['Date'] > val_split_date].copy()

print(f"Train: {len(train_data)} samples, Val: {len(val_data)} samples")

# Train Prophet model
print("Training Prophet model...")
prophet_model = WalmartProphetModel(
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10.0,
    seasonality_mode='multiplicative'
)

prophet_model.fit(train_data)

# Validate
print("Validating model...")
val_predictions = prophet_model.predict(val_data)
val_mae = mean_absolute_error(val_data['Weekly_Sales'], val_predictions)
print(f"Validation MAE: {val_mae:.2f}")

# Log validation results
wandb.log({
    'validation_mae': val_mae,
    'models_trained': len(prophet_model.models),
    'val_samples': len(val_data)
})

# =============================================================================
# Block 5: Final Training and Prediction
# =============================================================================

print("Training final model on full dataset...")
final_model = WalmartProphetModel(
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10.0,
    seasonality_mode='multiplicative'
)

final_model.fit(processed_train)

# Generate test predictions
print("Generating test predictions...")
test_predictions = final_model.predict(processed_test)

# Basic sanity check
print(f"Test predictions stats:")
print(f"  Mean: {np.mean(test_predictions):.2f}")
print(f"  Std: {np.std(test_predictions):.2f}")
print(f"  Min: {np.min(test_predictions):.2f}")
print(f"  Max: {np.max(test_predictions):.2f}")

# Create submission
submission = sample_submission.copy()
submission['Weekly_Sales'] = test_predictions

# Save results
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'prophet_submission_{timestamp}.csv'
submission.to_csv(submission_filename, index=False)

print(f"Submission saved: {submission_filename}")

# =============================================================================
# Block 6: Pipeline Saving and Artifact Creation
# =============================================================================

class WalmartProphetPipeline(BaseEstimator):
    """Complete pipeline for Walmart sales forecasting"""

    def __init__(self):
        self.feature_engineer = TimeSeriesFeatureEngineer()
        self.model = WalmartProphetModel()
        self.fitted = False

    def fit(self, X, y=None):
        print("Fitting complete pipeline...")
        processed_data = self.feature_engineer.fit_transform(X)
        self.model.fit(processed_data)
        self.fitted = True
        return self

    def predict(self, X):
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before prediction")
        processed_data = self.feature_engineer.transform(X)
        return self.model.predict(processed_data)

# Create and save pipeline
print("Creating complete pipeline...")
pipeline = WalmartProphetPipeline()
pipeline.fit(train_df)

# Save pipeline
pipeline_filename = f'walmart_prophet_pipeline_{timestamp}.pkl'
with open(pipeline_filename, 'wb') as f:
    dill.dump(pipeline, f)

print(f"Pipeline saved: {pipeline_filename}")

# Create WandB artifacts
pipeline_artifact = wandb.Artifact(
    name="walmart_prophet_pipeline",
    type="model",
    description="Complete Prophet pipeline for Walmart sales forecasting",
    metadata={
        "model_type": "Prophet",
        "validation_mae": val_mae,
        "models_trained": len(final_model.models),
        "timestamp": timestamp
    }
)

pipeline_artifact.add_file(pipeline_filename)
wandb.log_artifact(pipeline_artifact)

submission_artifact = wandb.Artifact(
    name="walmart_prophet_submission",
    type="dataset",
    description=f"Prophet submission for Walmart sales - {timestamp}"
)
submission_artifact.add_file(submission_filename)
wandb.log_artifact(submission_artifact)

# Final logging
wandb.log({
    'pipeline_saved': True,
    'submission_created': True,
    'test_predictions_mean': np.mean(test_predictions),
    'test_predictions_std': np.std(test_predictions),
    'final_models_count': len(final_model.models)
})

print("Walmart Prophet forecasting completed successfully!")
print(f"Validation MAE: {val_mae:.2f}")
print(f"Models trained: {len(final_model.models)}")
print("Pipeline and submission saved to WandB!")

wandb.finish()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.0.0 requires dill<0.3.9,>=0.3.0, but you have dill 0.4.0 which is incompatible.[0m[31m
[0mDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/root/.kaggle’: File exists


0,1
n_departments,▁
n_stores,▁
test_samples,▁
train_samples,▁

0,1
n_departments,81
n_stores,45
test_samples,115064
train_samples,421570


Loading data...
Data loaded: Train (421570, 5), Test (115064, 4)
Train columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Features columns: ['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday']
Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
Applying feature engineering...
Input shape: (421570, 5)
Input columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Merging with features...
After features merge: (421570, 15)
Merging with stores...
After stores merge: (421570, 17)
Columns after merge: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_feat', 'Type', 'Size']
Filled Temperature: 0 missing values
Filled Fuel_Price: 0 missing values
Filled CPI: 0 missing values
Filled Unemployment: 0 missing values
Store types: Type
A    215478
B   

In [19]:
wandb.login()



True

In [20]:
import pandas as pd
import numpy as np
import wandb
import pickle
import joblib
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
warnings.filterwarnings('ignore')

# Initialize WandB
wandb.init(project="walmart-sales-forecasting", name="Prophet_Experiment")

# Load data
print("Loading data...")
train_df = pd.read_csv("/content/train.csv")
features_df = pd.read_csv("/content/features.csv")
stores_df = pd.read_csv("/content/stores.csv")
test_df = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sampleSubmission.csv")

print("Data loaded successfully!")
print(f"Train shape: {train_df.shape}")
print(f"Features shape: {features_df.shape}")
print(f"Stores shape: {stores_df.shape}")
print(f"Test shape: {test_df.shape}")

# Log dataset info to WandB
wandb.log({
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "n_stores": train_df['Store'].nunique(),
    "n_departments": train_df['Dept'].nunique()
})

0,1
n_departments,▁
n_stores,▁
test_samples,▁
train_samples,▁

0,1
n_departments,81
n_stores,45
test_samples,115064
train_samples,421570


Loading data...
Data loaded successfully!
Train shape: (421570, 5)
Features shape: (8190, 12)
Stores shape: (45, 3)
Test shape: (115064, 4)


In [21]:
# =================================================
# Block 2: Enhanced Time Series Data Preprocessing
# =================================================
import logging

logging.getLogger('prophet').setLevel(logging.ERROR)
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)
logging.getLogger('prophet.forecaster').setLevel(logging.ERROR)
logging.getLogger('prophet.plot').setLevel(logging.ERROR)
logging.basicConfig(level=logging.ERROR)
logging.disable(logging.DEBUG)
logging.disable(logging.INFO)
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)
logging.getLogger().setLevel(logging.WARNING)


class TimeSeriesFeatureEngineer(BaseEstimator, TransformerMixin):
    """Focused time series feature engineering for Prophet"""

    def __init__(self):
        self.fitted = False

    def fit(self, X, y=None):
        self.fitted = True
        return self

    def transform(self, X):
        df = X.copy()

        # Merge external features
        df = df.merge(features_df, on=['Store', 'Date'], how='left')
        df = df.merge(stores_df, on='Store', how='left')

        # Fill missing values efficiently
        numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())

        # Markdown columns (promotional effects)
        markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
        for col in markdown_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0)

        # Holiday handling
        df['IsHoliday'] = df['IsHoliday'].fillna(False).astype(int)

        # Store type encoding
        if 'Type' in df.columns:
            df['Type'] = df['Type'].fillna('A')
            df['Size'] = df['Size'].fillna(df['Size'].median())

        # Time-based features for Prophet regressors
        df['Month'] = df['Date'].dt.month
        df['Quarter'] = df['Date'].dt.quarter
        df['DayOfWeek'] = df['Date'].dt.dayofweek

        # Seasonal indicators (key for retail)
        df['IsQ4'] = (df['Quarter'] == 4).astype(int)  # Holiday season
        df['IsBackToSchool'] = (df['Month'] == 8).astype(int)  # August
        df['IsSpringCleaning'] = (df['Month'] == 4).astype(int)  # April

        # Create total markdown effect
        df['TotalMarkDown'] = sum(df[col] for col in markdown_cols if col in df.columns)

        # Economic indicators
        df['EconomicIndex'] = df['CPI'] * df['Unemployment'] if 'CPI' in df.columns and 'Unemployment' in df.columns else 0

        return df

In [22]:
# =============================================================================
# Block 3: Prophet Model for Time Series
# =============================================================================

class SuppressOutput:
    """Context manager to suppress Prophet output"""
    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr

class WalmartProphetModel(BaseEstimator):
    """Optimized Prophet model for Walmart sales forecasting"""

    def __init__(self,
                 changepoint_prior_scale=0.05,
                 seasonality_prior_scale=10.0,
                 holidays_prior_scale=10.0,
                 seasonality_mode='multiplicative',
                 min_samples=20):
        self.changepoint_prior_scale = changepoint_prior_scale
        self.seasonality_prior_scale = seasonality_prior_scale
        self.holidays_prior_scale = holidays_prior_scale
        self.seasonality_mode = seasonality_mode
        self.min_samples = min_samples
        self.models = {}
        self.global_median = None

    def _create_holidays(self, df):
        """Create holiday dataframe for Prophet"""
        # Create base holidays
        base_holidays = {
            'thanksgiving': ['2010-11-25', '2011-11-24', '2012-11-22', '2013-11-28'],
            'christmas': ['2010-12-25', '2011-12-25', '2012-12-25', '2013-12-25'],
            'newyear': ['2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01'],
            'superbowl': ['2011-02-06', '2012-02-05', '2013-02-03', '2014-02-02'],
            'laborday': ['2010-09-06', '2011-09-05', '2012-09-03', '2013-09-02']
        }

        holidays_list = []
        for holiday_name, dates in base_holidays.items():
            for date_str in dates:
                holidays_list.append({
                    'holiday': holiday_name,
                    'ds': pd.to_datetime(date_str),
                    'lower_window': 0,
                    'upper_window': 0
                })

        return pd.DataFrame(holidays_list)

    def fit(self, X, y=None):
        """Fit Prophet models for each store-department combination"""
        print(f"Training Prophet models...")

        # Calculate global median for fallback
        if 'Weekly_Sales' in X.columns:
            self.global_median = X['Weekly_Sales'].median()
        else:
            self.global_median = 15000  # Reasonable default

        # Create holidays
        holidays = self._create_holidays(X)

        # Key regressors for Prophet (only use columns that exist)
        potential_regressors = [
            'Temperature', 'Fuel_Price', 'IsHoliday', 'IsQ4',
            'IsBackToSchool', 'TotalMarkDown', 'EconomicIndex', 'IsSummer'
        ]

        # Filter to only existing columns
        regressors = [col for col in potential_regressors if col in X.columns]
        print(f"Using regressors: {regressors}")

        trained_count = 0
        total_groups = len(X.groupby(['Store', 'Dept']))

        for i, ((store, dept), group) in enumerate(X.groupby(['Store', 'Dept'])):
            if i % 50 == 0:
                print(f"Progress: {i}/{total_groups} ({100*i/total_groups:.1f}%)")

            # Skip if insufficient data
            if len(group) < self.min_samples:
                continue

            # Prepare data for Prophet
            prophet_data = pd.DataFrame({
                'ds': group['Date'],
                'y': group['Weekly_Sales'] if 'Weekly_Sales' in group.columns else group.iloc[:, -1]  # fallback
            })

            # Add regressors
            for regressor in regressors:
                if regressor in group.columns:
                    prophet_data[regressor] = group[regressor].values
                else:
                    prophet_data[regressor] = 0  # Default value

            # Remove any remaining NaN values
            prophet_data = prophet_data.dropna()

            if len(prophet_data) < self.min_samples:
                continue

            try:
                with SuppressOutput():
                    # Initialize Prophet model
                    model = Prophet(
                        changepoint_prior_scale=self.changepoint_prior_scale,
                        seasonality_prior_scale=self.seasonality_prior_scale,
                        holidays_prior_scale=self.holidays_prior_scale,
                        seasonality_mode=self.seasonality_mode,
                        holidays=holidays,
                        daily_seasonality=False,
                        weekly_seasonality=True,
                        yearly_seasonality=True
                    )

                    # Add regressors
                    for regressor in regressors:
                        if regressor in prophet_data.columns:
                            model.add_regressor(regressor)

                    # Fit model
                    model.fit(prophet_data)

                    # Store model and info
                    self.models[(store, dept)] = {
                        'model': model,
                        'regressors': regressors,
                        'median_sales': prophet_data['y'].median()
                    }
                    trained_count += 1

            except Exception as e:
                # Skip problematic series
                if i < 5:  # Show first few errors for debugging
                    print(f"Error training model for Store {store}, Dept {dept}: {str(e)}")
                continue

        print(f"Successfully trained {trained_count} models out of {total_groups} store-dept combinations")
        return self

    def predict(self, X):
        """Generate predictions using trained Prophet models"""
        predictions = []

        for (store, dept), group in X.groupby(['Store', 'Dept']):
            if (store, dept) in self.models:
                model_info = self.models[(store, dept)]
                model = model_info['model']

                # Prepare future dataframe
                future = pd.DataFrame({'ds': group['Date']})

                # Add regressors
                for regressor in model_info['regressors']:
                    if regressor in group.columns:
                        future[regressor] = group[regressor].values
                    else:
                        future[regressor] = 0

                try:
                    with SuppressOutput():
                        forecast = model.predict(future)
                    predictions.extend(forecast['yhat'].values)
                except Exception as e:
                    # Fallback to median
                    median_pred = model_info['median_sales']
                    predictions.extend([median_pred] * len(group))
            else:
                # Use global median for unseen store-dept combinations
                # Apply seasonal adjustment
                seasonal_multiplier = 1.0
                if 'Month' in group.columns:
                    month = group['Month'].iloc[0]
                    if month in [11, 12]:  # Holiday season
                        seasonal_multiplier = 1.5
                    elif month in [1, 2]:  # Post holiday
                        seasonal_multiplier = 0.8

                pred_value = self.global_median * seasonal_multiplier
                predictions.extend([pred_value] * len(group))

        return np.array(predictions)

In [24]:
# =============================================================================
# Block 4: Pipeline Training and Evaluation
# =============================================================================

# Feature engineering
print("Applying feature engineering...")
feature_engineer = TimeSeriesFeatureEngineer()
feature_engineer.fit(train_df)

processed_train = feature_engineer.transform(train_df)
processed_test = feature_engineer.transform(test_df)

print(f"Features added. Train shape: {processed_train.shape}")

# Time-based train/validation split
print("Creating time-based validation split...")
max_date = processed_train['Date'].max()
val_split_date = max_date - timedelta(weeks=8)

train_data = processed_train[processed_train['Date'] <= val_split_date].copy()
val_data = processed_train[processed_train['Date'] > val_split_date].copy()

print(f"Train: {len(train_data)} samples, Val: {len(val_data)} samples")

# Train Prophet model
print("Training Prophet model...")
prophet_model = WalmartProphetModel(
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10.0,
    seasonality_mode='multiplicative'
)

prophet_model.fit(train_data)

# Validate
print("Validating model...")
val_predictions = prophet_model.predict(val_data)
val_mae = mean_absolute_error(val_data['Weekly_Sales'], val_predictions)
print(f"Validation MAE: {val_mae:.2f}")

# Log validation results
wandb.log({
    'validation_mae': val_mae,
    'models_trained': len(prophet_model.models),
    'val_samples': len(val_data)
})

Applying feature engineering...


KeyError: 'IsHoliday'

In [None]:
# =============================================================================
# Block 5: Final Training and Prediction
# =============================================================================

print("Training final model on full dataset...")
final_model = WalmartProphetModel(
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10.0,
    seasonality_mode='multiplicative'
)

final_model.fit(processed_train)

# Generate test predictions
print("Generating test predictions...")
test_predictions = final_model.predict(processed_test)

# Create submission
submission = sample_submission.copy()
submission['Weekly_Sales'] = test_predictions

# Save results
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'prophet_submission_{timestamp}.csv'
submission.to_csv(submission_filename, index=False)

print(f"Submission saved: {submission_filename}")


In [None]:
# =============================================================================
# Block 6: Pipeline Saving and Artifact Creation
# =============================================================================

class WalmartProphetPipeline(BaseEstimator):
    """Complete pipeline for Walmart sales forecasting"""

    def __init__(self):
        self.feature_engineer = TimeSeriesFeatureEngineer()
        self.model = WalmartProphetModel()
        self.fitted = False

    def fit(self, X, y=None):
        print("Fitting complete pipeline...")
        processed_data = self.feature_engineer.fit_transform(X)
        self.model.fit(processed_data)
        self.fitted = True
        return self

    def predict(self, X):
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before prediction")
        processed_data = self.feature_engineer.transform(X)
        return self.model.predict(processed_data)

# Create and save pipeline
print("Creating complete pipeline...")
pipeline = WalmartProphetPipeline()
pipeline.fit(train_df)

# Save pipeline
pipeline_filename = f'walmart_prophet_pipeline_{timestamp}.pkl'
with open(pipeline_filename, 'wb') as f:
    dill.dump(pipeline, f)

print(f"Pipeline saved: {pipeline_filename}")

# Create WandB artifacts
pipeline_artifact = wandb.Artifact(
    name="walmart_prophet_pipeline",
    type="model",
    description="Complete Prophet pipeline for Walmart sales forecasting",
    metadata={
        "model_type": "Prophet",
        "validation_mae": val_mae,
        "models_trained": len(final_model.models),
        "timestamp": timestamp
    }
)

pipeline_artifact.add_file(pipeline_filename)
wandb.log_artifact(pipeline_artifact)

submission_artifact = wandb.Artifact(
    name="walmart_prophet_submission",
    type="dataset",
    description=f"Prophet submission for Walmart sales - {timestamp}"
)
submission_artifact.add_file(submission_filename)
wandb.log_artifact(submission_artifact)

# Final logging
wandb.log({
    'pipeline_saved': True,
    'submission_created': True,
    'test_predictions_mean': np.mean(test_predictions),
    'test_predictions_std': np.std(test_predictions),
    'final_models_count': len(final_model.models)
})

print("Walmart Prophet forecasting completed successfully!")
print(f"Validation MAE: {val_mae:.2f}")
print(f"Models trained: {len(final_model.models)}")
print("Pipeline and submission saved to WandB!")

wandb.finish()