In [12]:
!pip install kaggle wandb onnx pmdarima -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [14]:
# !pip install wandb -qU
# !pip uninstall -y pmdarima numpy scipy statsmodels
# !pip install numpy==1.24.4 scipy==1.10.1 statsmodels==0.13.5 pmdarima==2.0.3

In [15]:
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [16]:
import logging
import math
import os
import random
import sys
import time
import warnings
from datetime import datetime, timedelta

import dill
import numpy as np
import pandas as pd
import pmdarima as pm
import wandb
from pmdarima import auto_arima
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Suppress warnings and logging
# The blanket 'ignore' filter hides every warning category for the rest of the
# session, and the root logger is raised to ERROR so INFO/WARNING records are dropped.
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.ERROR)

# WandB setup
# Starts the Weights & Biases run that the later wandb.log(...) calls report to.
wandb.init(project="walmart-sales-forecasting", name="ARIMA_TimeSeries_Optimized")

0,1
cluster_models,▁
date_range_days,▁
feature_engineering_time,▁
individual_models,▁
models_trained,▁
n_departments,▁
n_stores,▁
negative_predictions_cleaned,▁
predictions_per_second,▁
submission_creation_time,▁

0,1
cluster_models,50.0
date_range_days,994.0
feature_engineering_time,2.67433
individual_models,0.0
models_trained,50.0
n_departments,81.0
n_stores,45.0
negative_predictions_cleaned,2.0
predictions_per_second,2381.47355
submission_creation_time,0.00488


In [17]:
# =============================================================================
# Block 1: Data Loading and Initial Setup
# =============================================================================

# Directory holding the unzipped competition CSVs; change this one constant to
# point the notebook at a different location.
DATA_DIR = "/content"

print("Loading data...")
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
features_df = pd.read_csv(os.path.join(DATA_DIR, "features.csv"))
stores_df = pd.read_csv(os.path.join(DATA_DIR, "stores.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sampleSubmission.csv"))

# Convert dates (stores_df and sample_submission have no 'Date' column to convert)
for _dated_frame in (train_df, test_df, features_df):
    _dated_frame['Date'] = pd.to_datetime(_dated_frame['Date'])

print(f"Data loaded: Train {train_df.shape}, Test {test_df.shape}")
print(f"Train columns: {list(train_df.columns)}")
print(f"Features columns: {list(features_df.columns)}")
print(f"Date range: {train_df['Date'].min()} to {train_df['Date'].max()}")

# Log basic info
wandb.log({
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "n_stores": train_df['Store'].nunique(),
    "n_departments": train_df['Dept'].nunique(),
    "date_range_days": (train_df['Date'].max() - train_df['Date'].min()).days
})

# Re-apply output suppression after loading. `logging`, `os`, `sys` and
# `warnings` are already imported in the notebook's import cell, and the blanket
# 'ignore' filter already covers UserWarning/FutureWarning/DeprecationWarning.
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

Loading data...
Data loaded: Train (421570, 5), Test (115064, 4)
Train columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Features columns: ['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday']
Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00


In [18]:
# =============================================================================
# Block 2: Time Series Feature Engineering for ARIMA
# =============================================================================

class ARIMATimeSeriesFeatureEngineer(BaseEstimator, TransformerMixin):
    """Build calendar, promotion, economic, weather and store features.

    The transformer left-joins the per-store/per-date feature table and the
    per-store table onto the input rows and then derives model features.

    ``fit`` learns the fill medians, the normalisation means/standard
    deviations, the fuel-price 75th percentile and the store-size bin edges
    from the data it is given, and ``transform`` re-uses them. Train and test
    frames are therefore scaled and binned identically. Calling ``transform``
    on an unfitted instance falls back to statistics computed from the frame
    being transformed.

    Parameters
    ----------
    features_df : pandas.DataFrame, optional
        Table joined on ``['Store', 'Date']``. When ``None`` the module-level
        ``features_df`` is used.
    stores_df : pandas.DataFrame, optional
        Table joined on ``'Store'``. When ``None`` the module-level
        ``stores_df`` is used.
    """

    _NUMERIC_COLS = ('Temperature', 'Fuel_Price', 'CPI', 'Unemployment')
    _MARKDOWN_COLS = ('MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5')
    # Fallback store size used when no 'Size' column is available after merging.
    _DEFAULT_STORE_SIZE = 151315

    def __init__(self, features_df=None, stores_df=None):
        self.features_df = features_df
        self.stores_df = stores_df
        self.fitted = False
        self.stats_ = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _merge_sources(self, X, verbose=True):
        """Return a copy of ``X`` left-joined with the feature and store tables."""
        # Explicit tables win; otherwise fall back to the notebook globals.
        feature_table = self.features_df if self.features_df is not None else features_df
        store_table = self.stores_df if self.stores_df is not None else stores_df

        df = X.copy()
        if verbose:
            print(f"Input shape: {df.shape}")
            print(f"Input columns: {list(df.columns)}")
            print("Merging with features...")
        df = df.merge(feature_table, on=['Store', 'Date'], how='left', suffixes=('', '_feat'))
        if verbose:
            print(f"After features merge: {df.shape}")
            print("Merging with stores...")
        df = df.merge(store_table, on='Store', how='left')
        if verbose:
            print(f"After stores merge: {df.shape}")
        return df

    def _learn_stats(self, merged):
        """Collect fill/scale statistics from a merged, not-yet-filled frame.

        Returns a flat dict keyed by ``(statistic, column)``.
        """
        learned = {}
        for col in self._NUMERIC_COLS + ('Size',):
            if col not in merged.columns:
                continue
            median = merged[col].median()
            # Means/stds are taken after the median fill, as transform applies them.
            filled = merged[col].fillna(median)
            learned[('median', col)] = median
            learned[('mean', col)] = filled.mean()
            learned[('std', col)] = filled.std()
            if col == 'Fuel_Price':
                learned[('q75', col)] = filled.quantile(0.75)
            elif col == 'Size':
                smallest, largest = filled.min(), filled.max()
                # Two interior edges split [min, max] into three equal-width bins;
                # None marks a degenerate (single-size) range.
                learned[('inner_edges', col)] = (
                    np.linspace(smallest, largest, 4)[1:3] if largest > smallest else None
                )
        return learned

    @staticmethod
    def _zscore(values, mean, std):
        """Standardise ``values``; return zeros when the scale is missing or zero."""
        if mean is None or std is None or not np.isfinite(std) or std == 0:
            return pd.Series(0.0, index=values.index)
        return (values - mean) / std

    # ------------------------------------------------------------------
    # scikit-learn API
    # ------------------------------------------------------------------
    def fit(self, X, y=None):
        """Learn fill and scaling statistics from ``X``.

        ``X`` must carry the ``'Store'`` and ``'Date'`` join keys, exactly as
        ``transform`` requires.
        """
        merged = self._merge_sources(X, verbose=False)
        self.stats_ = self._learn_stats(merged)
        self.fitted = True
        return self

    def transform(self, X):
        """Return ``X`` enriched with the merged columns and engineered features."""
        df = self._merge_sources(X, verbose=True)

        # Statistics of this frame act as a fallback; fitted statistics override them.
        frame_stats = self._learn_stats(df)
        fitted_stats = getattr(self, 'stats_', None)
        col_stats = {**frame_stats, **fitted_stats} if fitted_stats else frame_stats

        # Handle IsHoliday column conflicts
        if 'IsHoliday_feat' in df.columns:
            df['IsHoliday'] = df['IsHoliday'].fillna(df['IsHoliday_feat'])
            df = df.drop('IsHoliday_feat', axis=1)

        # Ensure IsHoliday exists and is properly formatted
        if 'IsHoliday' in df.columns:
            df['IsHoliday'] = df['IsHoliday'].fillna(False).astype(int)
        else:
            print("Warning: IsHoliday column not found, creating default")
            df['IsHoliday'] = 0

        # Fill missing numeric values with the (fitted) median
        for col in self._NUMERIC_COLS:
            if col in df.columns:
                df[col] = df[col].fillna(col_stats[('median', col)])
                print(f"Filled {col}: {df[col].isnull().sum()} missing values")
            else:
                print(f"Warning: {col} not found in data")

        # Markdown columns (promotional effects)
        markdown_cols = list(self._MARKDOWN_COLS)
        for col in markdown_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0)
            else:
                print(f"Creating {col} with default values")
                df[col] = 0

        # Store type and size handling
        if 'Type' in df.columns:
            df['Type'] = df['Type'].fillna('A')
        else:
            df['Type'] = 'A'

        if 'Size' in df.columns:
            df['Size'] = df['Size'].fillna(col_stats[('median', 'Size')])
        else:
            df['Size'] = self._DEFAULT_STORE_SIZE

        # Time-based features
        dates = df['Date'].dt
        df['Month'] = dates.month
        df['Quarter'] = dates.quarter
        df['DayOfWeek'] = dates.dayofweek
        # isocalendar() yields a nullable UInt32; cast to a plain int so the
        # trigonometric encodings below are ordinary float64 columns.
        df['Week'] = dates.isocalendar().week.astype(int)
        df['DayOfMonth'] = dates.day
        df['WeekOfMonth'] = (dates.day - 1) // 7 + 1

        # Cyclical encoding
        df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
        df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
        df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)

        # Seasonal indicators
        df['IsQ4'] = (df['Quarter'] == 4).astype(int)
        df['IsHolidaySeason'] = df['Month'].isin([11, 12]).astype(int)
        df['IsBackToSchool'] = df['Month'].isin([8, 9]).astype(int)
        df['IsSpring'] = df['Month'].isin([3, 4, 5]).astype(int)
        df['IsSummer'] = df['Month'].isin([6, 7, 8]).astype(int)
        df['IsFall'] = df['Month'].isin([9, 10, 11]).astype(int)
        df['IsWinter'] = df['Month'].isin([12, 1, 2]).astype(int)

        # Day-of-week flags
        # NOTE(review): if every Date is a weekly stamp on the same weekday, these
        # flags and the exact-day holiday flags below are (near-)constant - confirm.
        df['IsWeekend'] = (df['DayOfWeek'].isin([5, 6])).astype(int)
        df['IsMonday'] = (df['DayOfWeek'] == 0).astype(int)
        df['IsFriday'] = (df['DayOfWeek'] == 4).astype(int)

        # Holiday-specific features (exact calendar-day matches)
        df['IsNewYear'] = ((df['Month'] == 1) & (df['DayOfMonth'] == 1)).astype(int)
        df['IsChristmas'] = ((df['Month'] == 12) & (df['DayOfMonth'] == 25)).astype(int)
        df['IsThanksgiving'] = ((df['Month'] == 11) & (df['WeekOfMonth'] == 4) & (df['DayOfWeek'] == 3)).astype(int)

        # Promotional features. The log features clip at zero first: log1p is
        # NaN below -1 and -inf at exactly -1, which poisons downstream models.
        df['TotalMarkDown'] = df[markdown_cols].sum(axis=1)
        df['HasPromotion'] = (df['TotalMarkDown'] > 0).astype(int)
        df['PromotionIntensity'] = np.log1p(df['TotalMarkDown'].clip(lower=0))

        # Individual markdown effects
        for i, col in enumerate(markdown_cols, 1):
            df[f'HasMarkDown{i}'] = (df[col] > 0).astype(int)
            df[f'MarkDown{i}_log'] = np.log1p(df[col].clip(lower=0))

        # Economic indicators
        if 'CPI' in df.columns and 'Unemployment' in df.columns:
            ratio = df['CPI'] / df['Unemployment']
            # A zero unemployment value would otherwise leave +/-inf in the feature.
            df['EconomicIndex'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0)
            df['CPI_normalized'] = self._zscore(
                df['CPI'], col_stats.get(('mean', 'CPI')), col_stats.get(('std', 'CPI')))
            df['Unemployment_normalized'] = self._zscore(
                df['Unemployment'],
                col_stats.get(('mean', 'Unemployment')),
                col_stats.get(('std', 'Unemployment')))
        else:
            df['EconomicIndex'] = 1
            df['CPI_normalized'] = 0
            df['Unemployment_normalized'] = 0

        # Weather effects
        if 'Temperature' in df.columns:
            df['Temperature_normalized'] = self._zscore(
                df['Temperature'],
                col_stats.get(('mean', 'Temperature')),
                col_stats.get(('std', 'Temperature')))
            df['IsExtremeCold'] = (df['Temperature'] < 32).astype(int)  # Below freezing
            df['IsExtremeHot'] = (df['Temperature'] > 85).astype(int)   # Very hot
        else:
            df['Temperature_normalized'] = 0
            df['IsExtremeCold'] = 0
            df['IsExtremeHot'] = 0

        # Fuel price effects
        if 'Fuel_Price' in df.columns:
            df['Fuel_Price_normalized'] = self._zscore(
                df['Fuel_Price'],
                col_stats.get(('mean', 'Fuel_Price')),
                col_stats.get(('std', 'Fuel_Price')))
            df['IsHighFuelPrice'] = (df['Fuel_Price'] > col_stats[('q75', 'Fuel_Price')]).astype(int)
        else:
            df['Fuel_Price_normalized'] = 0
            df['IsHighFuelPrice'] = 0

        # Store size: three equal-width, right-closed bins over the fitted range.
        # searchsorted against the two interior edges also maps sizes outside the
        # fitted range onto the first/last bin instead of producing NaN.
        inner_edges = col_stats.get(('inner_edges', 'Size'))
        if inner_edges is None:
            df['StoreSizeCategory'] = 1
        else:
            df['StoreSizeCategory'] = np.searchsorted(inner_edges, df['Size'].to_numpy(), side='left')
        df['Size_normalized'] = self._zscore(
            df['Size'], col_stats.get(('mean', 'Size')), col_stats.get(('std', 'Size')))

        # Store type dummies ('Type' always exists after the fill above)
        df['Type_A'] = (df['Type'] == 'A').astype(int)
        df['Type_B'] = (df['Type'] == 'B').astype(int)
        df['Type_C'] = (df['Type'] == 'C').astype(int)

        # Interaction effects
        df['Holiday_Promotion'] = df['IsHoliday'] * df['HasPromotion']
        df['Weekend_Holiday'] = df['IsWeekend'] * df['IsHoliday']
        df['Q4_Promotion'] = df['IsQ4'] * df['HasPromotion']
        df['Temperature_Holiday'] = df['Temperature_normalized'] * df['IsHoliday']

        print(f"Final processed shape: {df.shape}")
        print("Sample of engineered features:")
        feature_sample = ['IsHoliday', 'TotalMarkDown', 'Month_sin', 'IsQ4', 'Holiday_Promotion']
        for col in feature_sample:
            if col in df.columns:
                print(f"  {col}: mean={df[col].mean():.3f}, std={df[col].std():.3f}")

        return df

In [19]:
# =============================================================================
# Block 3: SARIMAX Model for Time Series
# =============================================================================

class SuppressOutput:
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    On entry both streams are redirected to a single ``os.devnull`` handle.
    On exit the streams that were active at entry are restored and the handle
    this object opened is closed. Only that handle is closed, so a stream that
    code inside the block installed on ``sys.stdout``/``sys.stderr`` is never
    closed by mistake. Exceptions raised inside the block propagate.
    """
    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        # One handle serves both streams, so there is exactly one file to close.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so the process never holds a closed file as a std stream.
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr
        self._devnull.close()
        return False

class WalmartARIMAModel(BaseEstimator):
    """Per (Store, Dept) ARIMA/SARIMAX forecaster with median-based fallbacks.

    One model is fitted for every store-department group that has at least
    ``min_samples`` rows. Groups without a usable model are predicted from a
    median scaled by fixed seasonal multipliers.

    Parameters
    ----------
    seasonal_order : tuple
        ``(P, D, Q, m)`` used by the fixed SARIMAX path; its last element is
        also the seasonal period handed to ``auto_arima``.
    max_p, max_q, max_P, max_Q : int
        Search limits forwarded to ``auto_arima``.
    stepwise : bool
        Forwarded to ``auto_arima``.
    suppress_warnings : bool
        Forwarded to ``auto_arima``.
    min_samples : int
        Minimum number of rows a group needs to get its own model.
    use_auto_arima : bool
        ``True`` selects orders with ``pmdarima.auto_arima``; ``False`` fits a
        fixed ``SARIMAX(1, 1, 1) x seasonal_order``.
    """

    def __init__(self,
                 seasonal_order=(1, 1, 1, 52),  # Weekly seasonality
                 max_p=3, max_q=3, max_P=2, max_Q=2,
                 stepwise=True,
                 suppress_warnings=True,
                 min_samples=30,
                 use_auto_arima=True):
        self.seasonal_order = seasonal_order
        self.max_p = max_p
        self.max_q = max_q
        self.max_P = max_P
        self.max_Q = max_Q
        self.stepwise = stepwise
        self.suppress_warnings = suppress_warnings
        self.min_samples = min_samples
        self.use_auto_arima = use_auto_arima
        self.models = {}
        self.model_orders = {}
        self.global_median = None

    def _get_exogenous_features(self, df):
        """Return the candidate exogenous columns that are present in ``df``."""
        base_features = [
            'Temperature_normalized', 'Fuel_Price_normalized', 'IsHoliday',
            'Month_sin', 'Month_cos', 'IsQ4', 'IsHolidaySeason',
            'IsBackToSchool', 'TotalMarkDown', 'HasPromotion',
            'CPI_normalized', 'Unemployment_normalized', 'IsWeekend',
            'Holiday_Promotion', 'Q4_Promotion', 'Type_A', 'Type_B',
            'StoreSizeCategory', 'IsExtremeCold', 'IsExtremeHot'
        ]

        # Filter to only existing columns
        available_features = [col for col in base_features if col in df.columns]
        print(f"Using {len(available_features)} exogenous features: {available_features[:5]}...")
        return available_features

    def _prepare_time_series_data(self, group, exog_features):
        """Sort one group by date and split it into target, regressors and dates.

        Returns ``(y, X, dates)``; ``X`` is ``None`` when ``exog_features`` is
        empty. The target is floored at 1 and NaN/inf regressor values become 0.
        """
        group_sorted = group.sort_values('Date').copy()

        # Target: 'Weekly_Sales' when present, otherwise the last column.
        y = group_sorted['Weekly_Sales'] if 'Weekly_Sales' in group_sorted.columns else group_sorted.iloc[:, -1]

        # Floor at 1 so negative/zero sales cannot break later transformations.
        y = np.maximum(y, 1)

        if exog_features:
            X = group_sorted[exog_features].fillna(0)
            X = X.replace([np.inf, -np.inf], 0)
        else:
            X = None

        return y, X, group_sorted['Date']

    def _detect_and_handle_outliers(self, y):
        """Cap values outside ``[Q1 - 3*IQR, Q3 + 3*IQR]``.

        Values are capped rather than removed so the series keeps its length.
        Returns ``(capped_series, outlier_count)``.
        """
        Q1 = y.quantile(0.25)
        Q3 = y.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 3 * IQR  # More conservative than 1.5
        upper_bound = Q3 + 3 * IQR

        outlier_count = int(((y < lower_bound) | (y > upper_bound)).sum())
        y_cleaned = y.clip(lower=lower_bound, upper=upper_bound).astype(float)
        return y_cleaned, outlier_count

    def _fit_individual_arima(self, y, X, store, dept):
        """Fit one model for a store-department series.

        Returns ``(model, median)``. On any fitting error the model is
        ``None`` and the median of the raw series is returned for fallbacks.
        """
        try:
            # Clean outliers
            y_clean, outlier_count = self._detect_and_handle_outliers(y)

            if self.use_auto_arima:
                # The seasonal period comes from seasonal_order instead of a literal.
                seasonal_period = self.seasonal_order[3]
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")

                    # pmdarima takes exogenous regressors through `X`; the old
                    # `exogenous` keyword only travels along as an extra kwarg.
                    # NOTE(review): `approximation` is not a documented
                    # auto_arima parameter - confirm it is meant to be forwarded.
                    model = pm.auto_arima(
                        y_clean,
                        X=X,
                        seasonal=seasonal_period > 1,
                        m=max(seasonal_period, 1),
                        max_p=self.max_p,
                        max_q=self.max_q,
                        max_P=self.max_P,
                        max_Q=self.max_Q,
                        stepwise=self.stepwise,
                        suppress_warnings=self.suppress_warnings,
                        error_action='ignore',
                        trace=False,
                        approximation=False,
                        maxiter=50
                    )

                # Store the order for future reference
                self.model_orders[(store, dept)] = {
                    'order': model.order,
                    'seasonal_order': model.seasonal_order,
                    'outliers_handled': outlier_count
                }

            else:
                # Use fixed SARIMAX model
                model = SARIMAX(
                    y_clean,
                    exog=X,
                    order=(1, 1, 1),  # Default ARIMA order
                    seasonal_order=self.seasonal_order,
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                model = model.fit(disp=False, maxiter=100)

                self.model_orders[(store, dept)] = {
                    'order': (1, 1, 1),
                    'seasonal_order': self.seasonal_order,
                    'outliers_handled': outlier_count
                }

            return model, y_clean.median()

        except Exception as e:
            print(f"Error fitting ARIMA for Store {store}, Dept {dept}: {str(e)}")
            return None, y.median()

    def fit(self, X, y=None):
        """Fit one model per (Store, Dept) group of ``X``.

        Any previously fitted models are discarded first. For each group,
        regressors that are constant within that group are dropped, because a
        constant column carries no information for a per-group model. The
        columns actually used are stored with the model so ``predict`` supplies
        exactly the same ones.
        """
        print(f"Training ARIMA models...")

        # Start from a clean slate so refitting never keeps stale models.
        self.models = {}
        self.model_orders = {}

        # Calculate global median for fallback
        if 'Weekly_Sales' in X.columns:
            self.global_median = X['Weekly_Sales'].median()
        else:
            self.global_median = 15000  # Reasonable default

        # Get exogenous features
        exog_features = self._get_exogenous_features(X)

        grouped = X.groupby(['Store', 'Dept'])
        total_groups = grouped.ngroups
        trained_count = 0
        failed_count = 0
        skipped_count = 0

        print(f"Training on {total_groups} store-department combinations...")

        for i, ((store, dept), group) in enumerate(grouped):
            if i % 25 == 0:
                print(f"Progress: {i}/{total_groups} ({100*i/total_groups:.1f}%) - Trained: {trained_count}, Failed: {failed_count}")

            # Skip if insufficient data
            if len(group) < self.min_samples:
                skipped_count += 1
                continue

            try:
                # Prepare time series data
                y_series, X_exog, dates = self._prepare_time_series_data(group, exog_features)

                if len(y_series) < self.min_samples:
                    skipped_count += 1
                    continue

                # Keep only regressors that vary inside this group.
                group_features = []
                if X_exog is not None:
                    group_features = [col for col in X_exog.columns if X_exog[col].nunique() > 1]
                    X_exog = X_exog[group_features] if group_features else None

                # Fit individual ARIMA model
                model, median_sales = self._fit_individual_arima(y_series, X_exog, store, dept)

                if model is not None:
                    self.models[(store, dept)] = {
                        'model': model,
                        'exog_features': group_features,
                        'median_sales': median_sales,
                        'last_date': dates.max(),
                        'n_obs': len(y_series)
                    }
                    trained_count += 1
                else:
                    failed_count += 1

            except Exception as e:
                failed_count += 1
                if failed_count <= 3:  # Show the first few errors for debugging
                    print(f"Error training model for Store {store}, Dept {dept}: {str(e)}")
                continue

        print(f"Training completed:")
        print(f"  Successfully trained: {trained_count} models")
        print(f"  Failed: {failed_count} models")
        print(f"  Skipped (insufficient data): {skipped_count} groups")
        print(f"  Total groups: {total_groups}")

        # Log model order statistics
        if self.model_orders:
            orders = [info['order'] for info in self.model_orders.values()]
            print(f"  Most common ARIMA order: {max(set(orders), key=orders.count)}")

        return self

    @staticmethod
    def _seasonal_multipliers(frame, holiday, post_holiday, summer=1.0):
        """Return one month-based multiplier per row of ``frame``.

        Months 11-12 get ``holiday``, months 1-2 ``post_holiday``, months 6-8
        ``summer``; every other row (or a frame without 'Month') gets 1.0.
        """
        multipliers = np.ones(len(frame), dtype=float)
        if 'Month' in frame.columns:
            month = frame['Month'].to_numpy()
            multipliers[np.isin(month, [11, 12])] = holiday
            multipliers[np.isin(month, [1, 2])] = post_holiday
            multipliers[np.isin(month, [6, 7, 8])] = summer
        return multipliers

    def predict(self, X):
        """Return one prediction per row of ``X``, aligned with ``X``'s row order.

        Rows are forecast group by group in date order, and each value is
        written back to the position of the row it belongs to, so the result
        can be assigned straight onto ``X`` whatever its sort order. Rows whose
        Store/Dept key is missing stay NaN.

        NOTE(review): each group is forecast ``len(group)`` steps ahead of the
        end of its training series, which presumes the rows are the consecutive
        periods directly after training - confirm for the caller's data.
        """
        predictions = np.full(len(X), np.nan, dtype=float)
        fallback_groups = 0

        for (store, dept), positions in X.groupby(['Store', 'Dept']).indices.items():
            positions = np.asarray(positions)
            group = X.iloc[positions]
            date_order = np.argsort(group['Date'].to_numpy(), kind='stable')
            positions = positions[date_order]
            group_sorted = group.iloc[date_order]

            model_info = self.models.get((store, dept))

            if model_info is None:
                # Unseen store-dept combination: global median, scaled by season
                # and by store type when available.
                multipliers = self._seasonal_multipliers(group_sorted, holiday=1.3, post_holiday=0.9)
                if 'Type' in group_sorted.columns:
                    store_type = group_sorted['Type'].to_numpy()
                    multipliers = multipliers * np.where(
                        store_type == 'A', 1.2, np.where(store_type == 'C', 0.8, 1.0))
                predictions[positions] = self.global_median * multipliers
                continue

            median_sales = model_info['median_sales']
            try:
                # Supply exactly the regressors this model was trained on.
                features = model_info['exog_features']
                if features:
                    X_pred = group_sorted[features].fillna(0)
                    X_pred = X_pred.replace([np.inf, -np.inf], 0)
                else:
                    X_pred = None

                n_periods = len(group_sorted)
                if self.use_auto_arima:
                    forecast = model_info['model'].predict(n_periods=n_periods, X=X_pred)
                else:
                    forecast = model_info['model'].forecast(steps=n_periods, exog=X_pred)

                forecast = np.asarray(forecast, dtype=float)
                # Non-finite forecasts fall back to the group's median.
                forecast = np.where(np.isfinite(forecast), forecast, median_sales)
                # Floor at 10% of the group's median to keep predictions positive.
                predictions[positions] = np.maximum(forecast, median_sales * 0.1)

            except Exception:
                # Forecasting failed: the group's median with a seasonal adjustment.
                fallback_groups += 1
                multipliers = self._seasonal_multipliers(
                    group_sorted, holiday=1.4, post_holiday=0.8, summer=1.1)
                predictions[positions] = median_sales * multipliers

        if fallback_groups:
            print(f"Forecasting failed for {fallback_groups} groups; used seasonal median fallback")

        return predictions

In [23]:
# =============================================================================
# Block 4: ADVANCED Pipeline Training and Evaluation
# =============================================================================

print("=== ADVANCED WALMART FORECASTING - TARGETING SUB-4K ===")

# Enhanced feature engineering
print("Applying advanced feature engineering...")
start_time = time.time()  # `time` is provided by the notebook's import cell

advanced_engineer = ARIMATimeSeriesFeatureEngineer()
advanced_engineer.fit(train_df)

processed_train = advanced_engineer.transform(train_df)
processed_test = advanced_engineer.transform(test_df)

feature_time = time.time() - start_time
print(f"Feature engineering completed in {feature_time:.2f} seconds")
print(f"Final training shape: {processed_train.shape}")

# Advanced validation strategy
print("Creating advanced validation split...")

# Hold out the last 10 weeks of the training period for validation
max_date = processed_train['Date'].max()
val_split_date = max_date - timedelta(weeks=10)

train_data = processed_train[processed_train['Date'] <= val_split_date].copy()
val_data = processed_train[processed_train['Date'] > val_split_date].copy()

print(f"Advanced split - Train: {len(train_data)}, Val: {len(val_data)}")

# Drop the first 12 weeks of history (reserved for lag features)
min_date = train_data['Date'].min() + timedelta(weeks=12)
train_data = train_data[train_data['Date'] >= min_date]

print(f"After lag feature filtering - Train: {len(train_data)}")

# Train ensemble model
# NOTE(review): HighAccuracyWalmartEnsemble is not defined in the cells above;
# this cell only runs if a definition is already present in the kernel - confirm.
print("Training high-accuracy ensemble model...")
start_time = time.time()

ensemble_model = HighAccuracyWalmartEnsemble()
ensemble_model.fit(train_data)

training_time = time.time() - start_time
print(f"Ensemble training completed in {training_time:.2f} seconds")

# Advanced validation
print("Performing advanced validation...")
actual_sales = val_data['Weekly_Sales'].to_numpy(dtype=float)
val_predictions = np.asarray(ensemble_model.predict(val_data), dtype=float)
assert len(val_predictions) == len(actual_sales), "predict() must return one value per validation row"

# Multiple metrics
abs_errors = np.abs(actual_sales - val_predictions)
val_mae = mean_absolute_error(actual_sales, val_predictions)
val_rmse = np.sqrt(np.mean(abs_errors ** 2))
# MAPE is undefined for zero sales; averaging over non-zero rows keeps it finite.
nonzero_sales = actual_sales != 0
if nonzero_sales.any():
    val_mape = np.mean(abs_errors[nonzero_sales] / np.abs(actual_sales[nonzero_sales])) * 100
else:
    val_mape = float('nan')

print(f"=== VALIDATION RESULTS ===")
print(f"MAE: {val_mae:.2f}")
print(f"RMSE: {val_rmse:.2f}")
print(f"MAPE: {val_mape:.2f}%")

# Analyze by segments for debugging
print("\n=== VALIDATION BY SEGMENTS ===")
val_with_pred = val_data.copy()
val_with_pred['Predictions'] = val_predictions
val_with_pred['Error'] = abs_errors

# By store type
if 'Type' in val_with_pred.columns:
    type_results = val_with_pred.groupby('Type')['Error'].mean()
    print("MAE by Store Type:")
    for store_type, mae in type_results.items():
        print(f"  Type {store_type}: {mae:.2f}")

# By holiday vs non-holiday
holiday_mae = val_with_pred[val_with_pred['IsHoliday'] == 1]['Error'].mean()
non_holiday_mae = val_with_pred[val_with_pred['IsHoliday'] == 0]['Error'].mean()
print(f"Holiday MAE: {holiday_mae:.2f}")
print(f"Non-Holiday MAE: {non_holiday_mae:.2f}")

# By Q4 vs other quarters
q4_mae = val_with_pred[val_with_pred['IsQ4'] == 1]['Error'].mean()
non_q4_mae = val_with_pred[val_with_pred['IsQ4'] == 0]['Error'].mean()
print(f"Q4 MAE: {q4_mae:.2f}")
print(f"Non-Q4 MAE: {non_q4_mae:.2f}")

# Show feature importance (only when the model exposes it)
feature_importance = getattr(ensemble_model, 'feature_importance', None)
if feature_importance is not None:
    print("\n=== TOP 15 FEATURE IMPORTANCES ===")
    top_features = feature_importance.head(15)
    for _, row in top_features.iterrows():
        print(f"{row['feature']}: {row['importance']:.4f}")

# Error analysis for improvement insights
print("\n=== ERROR ANALYSIS ===")
error_percentiles = np.percentile(val_with_pred['Error'], [50, 75, 90, 95, 99])
print(f"Error Percentiles - 50th: {error_percentiles[0]:.0f}, 75th: {error_percentiles[1]:.0f}, 90th: {error_percentiles[2]:.0f}, 95th: {error_percentiles[3]:.0f}, 99th: {error_percentiles[4]:.0f}")

# Worst predictions analysis
worst_predictions = val_with_pred.nlargest(10, 'Error')[['Store', 'Dept', 'Date', 'Weekly_Sales', 'Predictions', 'Error', 'IsHoliday', 'IsQ4']]
print("\nWorst 10 Predictions:")
print(worst_predictions.to_string(index=False))

# Log detailed results
wandb.log({
    'advanced_validation_mae': val_mae,
    'advanced_validation_rmse': val_rmse,
    'advanced_validation_mape': val_mape,
    'holiday_mae': holiday_mae,
    'non_holiday_mae': non_holiday_mae,
    'q4_mae': q4_mae,
    'non_q4_mae': non_q4_mae,
    'feature_engineering_time': feature_time,
    'ensemble_training_time': training_time,
    'total_features': processed_train.shape[1],
    'ensemble_models': len(ensemble_model.models),
    'error_50th_percentile': error_percentiles[0],
    'error_95th_percentile': error_percentiles[3]
})

print(f"\n🎯 TARGET: Sub-4K MAE")
print(f"🔥 CURRENT: {val_mae:.2f} MAE")
if val_mae < 4000:
    print("✅ TARGET ACHIEVED!")
else:
    print(f"📈 Need {val_mae - 4000:.0f} point improvement")

=== ADVANCED WALMART FORECASTING - TARGETING SUB-4K ===
Applying advanced feature engineering...
Input shape: (421570, 5)
Input columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Merging with features...
After features merge: (421570, 15)
Merging with stores...
After stores merge: (421570, 17)
Filled Temperature: 0 missing values
Filled Fuel_Price: 0 missing values
Filled CPI: 0 missing values
Filled Unemployment: 0 missing values
Final processed shape: (421570, 69)
Sample of engineered features:
  IsHoliday: mean=0.070, std=0.256
  TotalMarkDown: mean=6684.041, std=14750.942
  Month_sin: mean=-0.006, std=0.727
  IsQ4: mean=0.218, std=0.413
  Holiday_Promotion: mean=0.028, std=0.166
Input shape: (115064, 4)
Input columns: ['Store', 'Dept', 'Date', 'IsHoliday']
Merging with features...
After features merge: (115064, 14)
Merging with stores...
After stores merge: (115064, 16)
Filled Temperature: 0 missing values
Filled Fuel_Price: 0 missing values
Filled CPI: 0 missing value

XGBoostError: [21:45:55] /workspace/src/data/gradient_index.h:100: Check failed: valid: Input data contains `inf` or a value too large, while `missing` is not set to `inf`
Stack trace:
  [bt] (0) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x2a6ecc) [0x78afaf4a6ecc]
  [bt] (1) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x5d7680) [0x78afaf7d7680]
  [bt] (2) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x5eb966) [0x78afaf7eb966]
  [bt] (3) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x5e971b) [0x78afaf7e971b]
  [bt] (4) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x5ea949) [0x78afaf7ea949]
  [bt] (5) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x57b541) [0x78afaf77b541]
  [bt] (6) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(XGQuantileDMatrixCreateFromCallback+0x178) [0x78afaf3b93c8]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x78b05c2fde2e]
  [bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x78b05c2fa493]



In [None]:

# =============================================================================
# Block 5: ADVANCED Final Training and Prediction
# =============================================================================
# Selects the model used for the test forecast: either a new ensemble fitted on
# the (trimmed) full training frame, or the ensemble fitted for validation.

print("\n" + "=" * 60)
print("BLOCK 5: ADVANCED FINAL TRAINING AND PREDICTION")
print("=" * 60)

# Toggle: True refits on the full training data, False reuses `ensemble_model`.
RETRAIN_ON_FULL_DATA = True  # Set to True for maximum accuracy

if not RETRAIN_ON_FULL_DATA:
    print("⚡ Using pre-trained validation model for speed...")
    final_ensemble = ensemble_model
    final_training_time = 0

else:
    print("🔄 Retraining ensemble on FULL dataset for maximum accuracy...")
    start_time = time.time()

    # Keep only rows dated at least 12 weeks after the earliest training date
    # (presumably so lag features are populated -- confirm against the
    # feature-engineering block).
    min_date_full = processed_train['Date'].min() + timedelta(weeks=12)
    has_enough_history = processed_train['Date'] >= min_date_full
    full_train_data = processed_train.loc[has_enough_history].copy()

    print(f"Full training data shape: {full_train_data.shape}")

    # Fit a fresh ensemble on the trimmed training frame
    final_ensemble = HighAccuracyWalmartEnsemble()
    final_ensemble.fit(full_train_data)

    final_training_time = time.time() - start_time
    print(f"Final ensemble training completed in {final_training_time:.2f} seconds")

# Advanced test prediction with post-processing
print("🔮 Generating advanced test predictions...")
start_time = time.time()  # start of the prediction/post-processing timer

# Generate base predictions
test_predictions = final_ensemble.predict(processed_test)

# Post-processing for better accuracy
print("📊 Applying post-processing improvements...")

# 1. Handle extreme predictions
# Untouched copy of the raw model output, kept for later inspection.
original_predictions = test_predictions.copy()

# Cap based on training data distribution.
# describe() only reports the 25%/50%/75% percentiles by default, so the 99th
# percentile has to be requested explicitly; otherwise train_stats['99%']
# raises KeyError.
train_stats = processed_train['Weekly_Sales'].describe(percentiles=[0.25, 0.5, 0.75, 0.99])
lower_bound = max(0, train_stats['25%'] * 0.1)  # Allow some flexibility below Q1
upper_bound = train_stats['99%'] * 1.2  # Cap at 120% of 99th percentile

# Count how many predictions fall outside the bounds before clipping them.
extreme_low = np.sum(test_predictions < lower_bound)
extreme_high = np.sum(test_predictions > upper_bound)

test_predictions = np.clip(test_predictions, lower_bound, upper_bound)

print(f"Capped {extreme_low} extremely low and {extreme_high} extremely high predictions")

# 2. Seasonal / business-rule adjustments (applied to every test row)
print("🎯 Applying intelligent seasonal adjustments...")

# Create test dataframe with predictions for analysis
# ('Predictions' holds the values *before* the adjustments below).
test_with_pred = processed_test.copy()
test_with_pred['Predictions'] = test_predictions

# Work on a positional float array. The previous row loop indexed the
# prediction array with DataFrame index *labels* from iterrows(), which is
# only correct when processed_test carries a 0..n-1 index.
base_predictions = np.asarray(test_predictions, dtype=float)


def _rows_where(column, value=1):
    """Return a positional boolean mask of rows where `column == value` (missing -> False)."""
    return (test_with_pred[column] == value).to_numpy(dtype=bool, na_value=False)


is_holiday = _rows_where('IsHoliday')
is_non_holiday = _rows_where('IsHoliday', 0)

# Factors are applied one after another, in the same order as the original
# per-row rules, so each row sees the same sequence of multiplications.
adjusted_predictions = base_predictions.copy()

# Holiday boost (especially important for Walmart): first matching rule wins,
# any other holiday week gets the general boost.
holiday_boost = np.select(
    [
        _rows_where('IsBlackFriday'),     # Black Friday boost
        _rows_where('IsChristmasWeek'),   # Christmas week boost
        _rows_where('IsThanksgiving'),    # Thanksgiving boost
    ],
    [1.25, 1.15, 1.20],
    default=1.10,                         # General holiday boost
)
adjusted_predictions *= np.where(is_holiday, holiday_boost, 1.0)

# Q4 seasonal patterns: general Q4 boost for non-holidays
adjusted_predictions *= np.where(_rows_where('IsQ4') & is_non_holiday, 1.05, 1.0)

# January post-holiday dip
adjusted_predictions *= np.where(_rows_where('IsJanuary') & is_non_holiday, 0.92, 1.0)

# Back to school boost
adjusted_predictions *= np.where(_rows_where('IsBackToSchool'), 1.08, 1.0)

# Summer patterns
adjusted_predictions *= np.where(_rows_where('IsSummer') & is_non_holiday, 1.03, 1.0)

# Promotion interaction: extra boost on holidays, modest boost otherwise
promotion_boost = np.where(is_holiday, 1.05, 1.02)
adjusted_predictions *= np.where(_rows_where('HasAnyPromotion'), promotion_boost, 1.0)

# Store type adjustments (the one-hot columns are optional; Type_A takes
# precedence over Type_C, mirroring the original if/elif).
no_rows = np.zeros(len(test_with_pred), dtype=bool)
is_type_a = _rows_where('Type_A') if 'Type_A' in test_with_pred.columns else no_rows
is_type_c = _rows_where('Type_C') if 'Type_C' in test_with_pred.columns else no_rows
adjusted_predictions *= np.where(is_type_a, 1.02, np.where(is_type_c, 0.98, 1.0))

# Apply adjustment only if significant (>1% relative change). The change is
# measured against |prediction| and rows whose ratio is undefined (prediction
# of 0, NaN or inf) are left untouched instead of dividing by zero.
with np.errstate(divide='ignore', invalid='ignore'):
    relative_change = np.abs(adjusted_predictions - base_predictions) / np.abs(base_predictions)
is_significant = np.isfinite(relative_change) & (relative_change > 0.01)

test_predictions = np.where(is_significant, adjusted_predictions, base_predictions)
adjustments_made = int(is_significant.sum())

print(f"Applied intelligent adjustments to {adjustments_made} predictions")

# 3. Final quality checks and fixes
print("🔍 Final quality assurance...")

# Negative predictions: if at least one is present, the whole array is
# floored at 100 (values between 0 and 100 are raised as well in that case).
negative_count = (test_predictions < 0).sum()
if negative_count:
    print(f"Fixed {negative_count} negative predictions")
    test_predictions = np.maximum(test_predictions, 100)  # Minimum $100 sales

# Count NaN and infinite values up front; both counts are taken before
# either replacement below runs.
nan_count = np.isnan(test_predictions).sum()
inf_count = np.isinf(test_predictions).sum()

# NaN entries are replaced by the NaN-ignoring median of the predictions.
if nan_count:
    print(f"Fixed {nan_count} NaN predictions")
    median_pred = np.nanmedian(test_predictions)
    test_predictions = np.where(np.isnan(test_predictions), median_pred, test_predictions)

# Infinite entries are replaced by the median of the finite predictions.
if inf_count:
    print(f"Fixed {inf_count} infinite predictions")
    finite_predictions = test_predictions[np.isfinite(test_predictions)]
    median_pred = np.median(finite_predictions)
    test_predictions = np.where(np.isinf(test_predictions), median_pred, test_predictions)

# Stop the timer started before the base predictions were generated.
prediction_time = time.time() - start_time
print(f"Advanced predictions completed in {prediction_time:.2f} seconds")

# Comprehensive prediction analysis: summary statistics of the final predictions
print("\n=== FINAL PREDICTION ANALYSIS ===")
print(f"Total predictions: {len(test_predictions):,}")
print(f"Mean: ${np.mean(test_predictions):,.2f}")
print(f"Median: ${np.median(test_predictions):,.2f}")
print(f"Std: ${np.std(test_predictions):,.2f}")
print(f"Min: ${np.min(test_predictions):,.2f}")
print(f"Max: ${np.max(test_predictions):,.2f}")

# Compare with training distribution (statistics of the training target)
train_sales = processed_train['Weekly_Sales']
train_mean = train_sales.mean()
train_median = train_sales.median()
train_std = train_sales.std()

print("\n📊 Comparison with Training Data:")
print(f"Mean - Train: ${train_mean:,.2f}, Test: ${np.mean(test_predictions):,.2f}, Ratio: {np.mean(test_predictions)/train_mean:.3f}")
print(f"Median - Train: ${train_median:,.2f}, Test: ${np.median(test_predictions):,.2f}, Ratio: {np.median(test_predictions)/train_median:.3f}")
print(f"Std - Train: ${train_std:,.2f}, Test: ${np.std(test_predictions):,.2f}, Ratio: {np.std(test_predictions)/train_std:.3f}")

# Distribution analysis: same percentile grid for predictions and training target
percentiles = [10, 25, 50, 75, 90, 95, 99]
pred_percentiles = np.percentile(test_predictions, percentiles)
train_percentiles = np.percentile(train_sales, percentiles)

print("\n📈 Percentile Comparison:")
for position, p in enumerate(percentiles):
    pred_p = pred_percentiles[position]
    train_p = train_percentiles[position]
    print(f"{p}th - Train: ${train_p:,.0f}, Test: ${pred_p:,.0f}, Ratio: {pred_p/train_p:.3f}")

# Create submission with validation
print("📄 Creating final submission...")
start_time = time.time()  # restart the timer for the submission step

# Ensure prediction count matches submission template.
# shortfall > 0: too few predictions (pad); shortfall < 0: too many (truncate).
shortfall = len(sample_submission) - len(test_predictions)
if shortfall:
    print(f"⚠️ Warning: Prediction count ({len(test_predictions)}) != submission count ({len(sample_submission)})")
    if shortfall < 0:
        test_predictions = test_predictions[:len(sample_submission)]
        print("Truncated excess predictions")
    else:
        # This shouldn't happen with proper processing, but handle gracefully:
        # missing rows are filled with the median of the existing predictions.
        median_pred = np.median(test_predictions)
        padding = [median_pred] * shortfall
        test_predictions = np.concatenate([test_predictions, padding])
        print("Padded missing predictions with median")

# Create submission: the template frame with its Weekly_Sales column replaced
submission = sample_submission.assign(Weekly_Sales=test_predictions)

# Final validation of submission format
assert len(submission) == len(sample_submission), "Submission length mismatch"
assert not submission['Weekly_Sales'].isna().any(), "NaN values in submission"
assert (submission['Weekly_Sales'] >= 0).all(), "Negative values in submission"

# Save with advanced metadata (timestamped file name)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'advanced_ensemble_submission_{timestamp}.csv'

try:
    submission.to_csv(submission_filename, index=False)
    print(f"✅ Submission saved: {submission_filename}")

    # Verify file integrity by reading the file back and comparing row counts
    verification = pd.read_csv(submission_filename)
    assert len(verification) == len(submission), "File verification failed"
    print("✅ File integrity verified")

except Exception as exc:
    # Any failure above (write, read-back or the row-count assertion) falls
    # back to writing the same frame under a backup file name.
    print(f"❌ Error saving submission: {exc}")
    alt_filename = f'backup_advanced_submission_{timestamp}.csv'
    submission.to_csv(alt_filename, index=False)
    submission_filename = alt_filename
    print(f"✅ Saved to backup: {alt_filename}")

submission_time = time.time() - start_time
print(f"Submission created in {submission_time:.2f} seconds")

# Comprehensive logging
# Block total = prediction + submission time, plus the training time when the
# ensemble was refitted in this block.
total_time = prediction_time + submission_time + (final_training_time if RETRAIN_ON_FULL_DATA else 0)

# Timing, prediction statistics and file size, logged to the active WandB run.
block5_metrics = {
    'final_training_time': final_training_time,
    'prediction_time': prediction_time,
    'submission_time': submission_time,
    'total_block5_time': total_time,
    'final_predictions_count': len(test_predictions),
    'final_predictions_mean': float(np.mean(test_predictions)),
    'final_predictions_median': float(np.median(test_predictions)),
    'final_predictions_std': float(np.std(test_predictions)),
    'seasonal_adjustments_made': adjustments_made,
    'extreme_predictions_capped': extreme_low + extreme_high,
    'prediction_train_mean_ratio': np.mean(test_predictions) / train_mean,
    'prediction_train_median_ratio': np.median(test_predictions) / train_median,
    'file_size_mb': os.path.getsize(submission_filename) / (1024*1024),
}
wandb.log(block5_metrics)

# Closing summary (val_mae comes from the validation step in an earlier cell)
print("\n" + "=" * 60)
print("🚀 ADVANCED ENSEMBLE FORECASTING COMPLETE!")
print("=" * 60)
print(f"📈 Validation MAE: {val_mae:.2f}")
print("🎯 Target: <4,000 MAE")
print(f"⏱️ Total Time: {total_time:.1f}s")
print(f"📁 Submission: {submission_filename}")
print("🔥 Ready for Kaggle submission!")

target_met = val_mae < 4000
if target_met:
    print("🏆 CONGRATULATIONS - TARGET ACHIEVED!")
else:
    print(f"📊 Current gap to target: {val_mae - 4000:.0f} points")
    print("💡 Consider: More lag features, hyperparameter tuning, or additional external data")

print("=" * 60)

In [None]:
# =============================================================================
# Block 6: Pipeline Saving and Artifact Creation
# =============================================================================

class WalmartARIMAPipeline(BaseEstimator):
    """Feature engineering followed by the ARIMA model, wrapped as one estimator.

    Attributes:
        feature_engineer: ARIMATimeSeriesFeatureEngineer instance applied to raw input.
        model: WalmartARIMAModel instance fitted on the engineered data.
        fitted: False until fit() has completed.
    """

    def __init__(self):
        """Create the unfitted feature engineer and model."""
        self.feature_engineer = ARIMATimeSeriesFeatureEngineer()
        self.model = WalmartARIMAModel()
        self.fitted = False

    def fit(self, X, y=None):
        """Fit the feature engineer and the model on X; y is accepted but not used.

        Returns:
            self, so calls can be chained.
        """
        print("Fitting complete ARIMA pipeline...")
        engineered = self.feature_engineer.fit_transform(X)
        self.model.fit(engineered)
        self.fitted = True
        return self

    def predict(self, X):
        """Transform X with the fitted feature engineer and return the model's predictions.

        Raises:
            ValueError: if fit() has not been called first.
        """
        if self.fitted:
            engineered = self.feature_engineer.transform(X)
            return self.model.predict(engineered)
        raise ValueError("Pipeline must be fitted before prediction")

# Create and save pipeline
# NOTE(review): final_model, val_rmse and val_mape are not defined in this cell
# or in Block 5 -- confirm they are still set by an earlier cell on a fresh run.
print("Creating complete ARIMA pipeline...")
pipeline = WalmartARIMAPipeline()
pipeline.fit(train_df)

# Save pipeline: serialized with dill, file name reuses the submission timestamp
pipeline_filename = f'walmart_arima_pipeline_{timestamp}.pkl'
with open(pipeline_filename, 'wb') as pipeline_file:
    dill.dump(pipeline, pipeline_file)

print(f"Pipeline saved: {pipeline_filename}")

# Create WandB artifacts
# 1) the pickled pipeline, described by validation metrics and model settings
pipeline_metadata = {
    "model_type": "SARIMAX",
    "validation_mae": val_mae,
    "validation_rmse": val_rmse,
    "validation_mape": val_mape,
    "models_trained": len(final_model.models),
    "timestamp": timestamp,
    "seasonal_order": str(final_model.seasonal_order),
    "use_auto_arima": final_model.use_auto_arima,
}
pipeline_artifact = wandb.Artifact(
    name="walmart_arima_pipeline",
    type="model",
    description="Complete ARIMA pipeline for Walmart sales forecasting",
    metadata=pipeline_metadata,
)
pipeline_artifact.add_file(pipeline_filename)
wandb.log_artifact(pipeline_artifact)

# 2) the submission CSV
submission_artifact = wandb.Artifact(
    name="walmart_arima_submission",
    type="dataset",
    description=f"ARIMA submission for Walmart sales - {timestamp}",
)
submission_artifact.add_file(submission_filename)
wandb.log_artifact(submission_artifact)

# Final comprehensive logging
distinct_orders = {str(info['order']) for info in final_model.model_orders.values()}
final_summary = {
    'pipeline_saved': True,
    'submission_created': True,
    'test_predictions_mean': np.mean(test_predictions),
    'test_predictions_median': np.median(test_predictions),
    'test_predictions_std': np.std(test_predictions),
    'final_models_count': len(final_model.models),
    'model_orders_variety': len(distinct_orders),
    # True when no prediction is below zero
    'negative_predictions_handled': np.sum(test_predictions < 0) == 0,
}
wandb.log(final_summary)

print("Walmart ARIMA forecasting completed successfully!")
print(f"Validation MAE: {val_mae:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Validation MAPE: {val_mape:.2f}%")
print(f"Models trained: {len(final_model.models)}")
print("Pipeline and submission saved to WandB!")

# Close the WandB run opened at the top of the notebook
wandb.finish()