In [1]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [3]:
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

In [4]:
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [5]:
# !pip install wandb -qU
# !pip uninstall -y pmdarima numpy scipy statsmodels
# !pip install numpy==1.24.4 scipy==1.10.1 statsmodels==0.13.5 pmdarima==2.0.3

In [6]:
import wandb
import random
import math
import pandas as pd
import numpy as np
import warnings
from datetime import datetime

In [7]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdshan21[0m ([33mdshan21-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
import pandas as pd
import numpy as np
import wandb
import pickle
import joblib
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
warnings.filterwarnings('ignore')

# Initialize WandB
wandb.init(project="walmart-sales-forecasting", name="Prophet_Experiment")

# Load data
print("Loading data...")
train_df = pd.read_csv("/content/train.csv")
features_df = pd.read_csv("/content/features.csv")
stores_df = pd.read_csv("/content/stores.csv")
test_df = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sampleSubmission.csv")

print("Data loaded successfully!")
print(f"Train shape: {train_df.shape}")
print(f"Features shape: {features_df.shape}")
print(f"Stores shape: {stores_df.shape}")
print(f"Test shape: {test_df.shape}")

# Log dataset info to WandB
wandb.log({
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "n_stores": train_df['Store'].nunique(),
    "n_departments": train_df['Dept'].nunique()
})

Loading data...
Data loaded successfully!
Train shape: (421570, 5)
Features shape: (8190, 12)
Stores shape: (45, 3)
Test shape: (115064, 4)


In [9]:
class EnhancedTimeSeriesPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.store_encodings = {}
        self.dept_encodings = {}
        self.seasonal_components = {}

    def fit(self, X, y=None):
        return self

    def _add_lag_features(self, df, group_cols=['Store', 'Dept']):
        """Add lag features for time series analysis"""
        print("Adding lag features...")

        # Sort by date to ensure proper lag calculation
        df = df.sort_values(group_cols + ['Date'])

        # Add lag features for each store-dept combination
        for lag in [1, 2, 3, 7, 14, 30]:
            df[f'lag_{lag}'] = df.groupby(group_cols)['Weekly_Sales'].shift(lag)

        return df

    def _add_rolling_features(self, df, group_cols=['Store', 'Dept']):
        """Add rolling statistics features"""
        print("Adding rolling features...")

        # Add rolling means and stds
        for window in [7, 14, 30]:
            df[f'rolling_mean_{window}'] = df.groupby(group_cols)['Weekly_Sales'].transform(
                lambda x: x.rolling(window=window, min_periods=1).mean()
            )
            df[f'rolling_std_{window}'] = df.groupby(group_cols)['Weekly_Sales'].transform(
                lambda x: x.rolling(window=window, min_periods=1).std()
            )

        return df

    def _add_fourier_features(self, df):
        """Add Fourier features for cyclical patterns"""
        print("Adding Fourier features...")

        # Add day of year for annual patterns
        df['dayofyear'] = df['Date'].dt.dayofyear

        # Add Fourier components for annual seasonality
        for i in range(1, 6):
            df[f'sin_annual_{i}'] = np.sin(2 * np.pi * i * df['dayofyear'] / 365.25)
            df[f'cos_annual_{i}'] = np.cos(2 * np.pi * i * df['dayofyear'] / 365.25)

        # Add Fourier components for weekly seasonality
        df['dayofweek'] = df['Date'].dt.dayofweek
        for i in range(1, 4):
            df[f'sin_weekly_{i}'] = np.sin(2 * np.pi * i * df['dayofweek'] / 7)
            df[f'cos_weekly_{i}'] = np.cos(2 * np.pi * i * df['dayofweek'] / 7)

        return df

    def _add_seasonal_decomposition(self, df, group_cols=['Store', 'Dept']):
        """Add seasonal decomposition components where possible"""
        print("Adding seasonal decomposition features...")

        df['trend_component'] = np.nan
        df['seasonal_component'] = np.nan
        df['residual_component'] = np.nan

        # Apply decomposition for groups with sufficient data
        for name, group in df.groupby(group_cols):
            if len(group) >= 52:  # At least one year of data
                try:
                    # Create weekly aggregation for decomposition
                    weekly_data = group.set_index('Date')['Weekly_Sales'].resample('W').mean()
                    if len(weekly_data) >= 52:
                        decomposition = seasonal_decompose(weekly_data, model='additive', period=52)

                        # Map back to original data
                        for idx in group.index:
                            date = group.loc[idx, 'Date']
                            week_start = date - timedelta(days=date.weekday())
                            closest_week = weekly_data.index[weekly_data.index.get_loc(week_start, method='nearest')]

                            df.loc[idx, 'trend_component'] = decomposition.trend.loc[closest_week]
                            df.loc[idx, 'seasonal_component'] = decomposition.seasonal.loc[closest_week]
                            df.loc[idx, 'residual_component'] = decomposition.resid.loc[closest_week]
                except:
                    continue

        # Fill missing values with median
        df['trend_component'] = df['trend_component'].fillna(df['trend_component'].median())
        df['seasonal_component'] = df['seasonal_component'].fillna(0)
        df['residual_component'] = df['residual_component'].fillna(0)

        return df

    def transform(self, X, is_train=True):
        # Make a copy to avoid modifying original data
        df = X.copy()

        # Convert Date to datetime first
        df['Date'] = pd.to_datetime(df['Date'])

        # Also convert Date in features_df and stores_df to datetime
        features_df_copy = features_df.copy()
        features_df_copy['Date'] = pd.to_datetime(features_df_copy['Date'])

        # Merge with features data (matching on Store and Date)
        df = df.merge(features_df_copy, on=['Store', 'Date'], how='left', suffixes=('', '_feat'))

        # Merge with stores data (only matching on Store, no Date column in stores)
        df = df.merge(stores_df, on='Store', how='left')

        # Handle missing values in numeric columns
        numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())

        # Fill markdown columns with 0 (these are promotional markdowns)
        markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
        for col in markdown_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0)

        # Handle IsHoliday column - take the one from main data, fill missing with features data
        if 'IsHoliday_feat' in df.columns:
            df['IsHoliday'] = df['IsHoliday'].fillna(df['IsHoliday_feat'])
            df = df.drop('IsHoliday_feat', axis=1)

        # Enhanced time-based features
        print("Creating enhanced time-based features...")
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Week'] = df['Date'].dt.isocalendar().week
        df['DayOfYear'] = df['Date'].dt.dayofyear
        df['Quarter'] = df['Date'].dt.quarter
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['WeekOfYear'] = df['Date'].dt.isocalendar().week

        # Season indicators based on findings (fall to early spring sales increase)
        df['IsFallWinterSeason'] = ((df['Month'] >= 9) | (df['Month'] <= 3)).astype(int)
        df['IsHolidaySeason'] = ((df['Month'] == 11) | (df['Month'] == 12) | (df['Month'] == 1)).astype(int)

        # Create holiday features
        df['IsHoliday'] = df['IsHoliday'].astype(int)

        # Create store type features
        if 'Type' in df.columns:
            df['Type_A'] = (df['Type'] == 'A').astype(int)
            df['Type_B'] = (df['Type'] == 'B').astype(int)
            df['Type_C'] = (df['Type'] == 'C').astype(int)

        # Handle Size column
        if 'Size' in df.columns:
            df['Size'] = df['Size'].fillna(df['Size'].median())

        # Add enhanced time series features only for training data
        if is_train and 'Weekly_Sales' in df.columns:
            # Add lag features
            df = self._add_lag_features(df)

            # Add rolling features
            df = self._add_rolling_features(df)

            # Add seasonal decomposition
            df = self._add_seasonal_decomposition(df)

        # Add Fourier features for both train and test
        df = self._add_fourier_features(df)

        return df

# Initialize enhanced preprocessor
enhanced_preprocessor = EnhancedTimeSeriesPreprocessor()

# Preprocess data
print("Preprocessing data with enhanced time series features...")
processed_train = enhanced_preprocessor.transform(train_df, is_train=True)
processed_test = enhanced_preprocessor.transform(test_df, is_train=False)

print("Enhanced data preprocessing completed!")
print(f"Processed train shape: {processed_train.shape}")
print(f"Processed test shape: {processed_test.shape}")

# Check for any remaining missing values
print(f"Missing values in train: {processed_train.isnull().sum().sum()}")
print(f"Missing values in test: {processed_test.isnull().sum().sum()}")

# Display new feature columns
new_features = [col for col in processed_train.columns if any(x in col.lower() for x in ['lag_', 'rolling_', 'sin_', 'cos_', 'trend_', 'seasonal_', 'residual_'])]
print(f"New time series features added: {len(new_features)}")
print("Sample features:", new_features[:10])

# Log preprocessing info to WandB
wandb.log({
    "processed_train_shape": processed_train.shape,
    "processed_test_shape": processed_test.shape,
    "missing_values_train": processed_train.isnull().sum().sum(),
    "missing_values_test": processed_test.isnull().sum().sum(),
    "train_date_range": f"{processed_train['Date'].min()} to {processed_train['Date'].max()}",
    "test_date_range": f"{processed_test['Date'].min()} to {processed_test['Date'].max()}",
    "new_features_count": len(new_features)
})

Preprocessing data with enhanced time series features...
Creating enhanced time-based features...
Adding lag features...
Adding rolling features...
Adding seasonal decomposition features...
Adding Fourier features...
Creating enhanced time-based features...
Adding Fourier features...
Enhanced data preprocessing completed!
Processed train shape: (421570, 61)
Processed test shape: (115064, 45)
Missing values in train: 613195
Missing values in test: 0
New time series features added: 31
Sample features: ['lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14', 'lag_30', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14']


In [10]:
print("Creating time-based train/validation split...")

# Sort by date to ensure proper time series split
processed_train = processed_train.sort_values('Date')

# Use time-based split instead of random split (maintaining temporal order)
max_date = processed_train['Date'].max()
split_date = max_date - timedelta(weeks=12)  # Last 12 weeks for validation

train_split = processed_train[processed_train['Date'] <= split_date].copy()
val_split = processed_train[processed_train['Date'] > split_date].copy()

print(f"Time-based split created:")
print(f"Train split: {len(train_split)} samples (up to {split_date.date()})")
print(f"Validation split: {len(val_split)} samples (after {split_date.date()})")
print(f"Train date range: {train_split['Date'].min().date()} to {train_split['Date'].max().date()}")
print(f"Val date range: {val_split['Date'].min().date()} to {val_split['Date'].max().date()}")

# Log split information
wandb.log({
    "train_split_samples": len(train_split),
    "val_split_samples": len(val_split),
    "split_date": split_date.strftime('%Y-%m-%d'),
    "time_series_split": True
})

Creating time-based train/validation split...
Time-based split created:
Train split: 386007 samples (up to 2012-08-03)
Validation split: 35563 samples (after 2012-08-03)
Train date range: 2010-02-05 to 2012-08-03
Val date range: 2012-08-10 to 2012-10-26


In [11]:
print("Creating time-based train/validation split...")

# Sort by date to ensure proper time series split
processed_train = processed_train.sort_values('Date')

# Use time-based split instead of random split (maintaining temporal order)
max_date = processed_train['Date'].max()
split_date = max_date - timedelta(weeks=12)  # Last 12 weeks for validation

train_split = processed_train[processed_train['Date'] <= split_date].copy()
val_split = processed_train[processed_train['Date'] > split_date].copy()

print(f"Time-based split created:")
print(f"Train split: {len(train_split)} samples (up to {split_date.date()})")
print(f"Validation split: {len(val_split)} samples (after {split_date.date()})")
print(f"Train date range: {train_split['Date'].min().date()} to {train_split['Date'].max().date()}")
print(f"Val date range: {val_split['Date'].min().date()} to {val_split['Date'].max().date()}")

# Log split information
wandb.log({
    "train_split_samples": len(train_split),
    "val_split_samples": len(val_split),
    "split_date": split_date.strftime('%Y-%m-%d'),
    "time_series_split": True
})

Creating time-based train/validation split...
Time-based split created:
Train split: 386007 samples (up to 2012-08-03)
Validation split: 35563 samples (after 2012-08-03)
Train date range: 2010-02-05 to 2012-08-03
Val date range: 2012-08-10 to 2012-10-26


In [12]:
class EnhancedWalmartProphetPipeline(BaseEstimator):
    def __init__(self, prophet_params=None):
        self.preprocessor = EnhancedTimeSeriesPreprocessor()
        self.prophet_params = prophet_params or best_params
        self.model = None

    def fit(self, X, y=None):
        print("Fitting enhanced pipeline...")
        # Preprocess data
        processed_data = self.preprocessor.transform(X, is_train=True)

        # Train Enhanced Prophet model
        self.model = EnhancedProphetModel(**self.prophet_params)
        self.model.fit(processed_data)

        return self

    def predict(self, X):
        if self.model is None:
            raise ValueError("Pipeline must be fitted before making predictions")

        # Preprocess data
        processed_data = self.preprocessor.transform(X, is_train=False)

        # Make predictions
        predictions = self.model.predict(processed_data)

        return predictions

# Create and fit the enhanced pipeline
print("Creating Enhanced Prophet pipeline...")
enhanced_prophet_pipeline = EnhancedWalmartProphetPipeline(prophet_params=best_params)
enhanced_prophet_pipeline.fit(train_df)

# Test pipeline predictions
print("Testing enhanced pipeline predictions...")
test_predictions = enhanced_prophet_pipeline.predict(test_df)

# Create submission
submission_df = sample_submission.copy()
submission_df['Weekly_Sales'] = test_predictions

# Save submission
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'enhanced_prophet_submission_{timestamp}.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission saved as: {submission_filename}")

# Save the enhanced pipeline with dill
import dill
pipeline_filename = f'enhanced_prophet_pipeline_{timestamp}.pkl'
with open(pipeline_filename, 'wb') as f:
    dill.dump(enhanced_prophet_pipeline, f)

wandb.init(project="walmart-sales-forecasting", name="Prophet_save_model")

print(f"Enhanced pipeline saved locally as: {pipeline_filename}")

# Create WandB Artifact BEFORE logging
print("Creating WandB artifact for the enhanced pipeline...")
pipeline_artifact = wandb.Artifact(
    name="prophet_pipeline",
    type="model",
    description="Enhanced Prophet model pipeline with time series features for Walmart sales forecasting",
    metadata={
        "model_type": "Enhanced_Prophet",
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "n_models": len(final_model.models),
        "best_params": best_params,
        "timestamp": timestamp,
        "enhanced_features": True,
        "time_series_split": True
    }
)

# Add the pipeline file to the artifact BEFORE logging
pipeline_artifact.add_file(pipeline_filename)

# NOW log the artifact (this finalizes it)
wandb.log_artifact(pipeline_artifact)

# Continue with submission artifact...
submission_artifact = wandb.Artifact(
    name="prophet_submission",
    type="dataset",
    description=f"Enhanced Prophet model submission for Kaggle - {timestamp}"
)
submission_artifact.add_file(submission_filename)
wandb.log_artifact(submission_artifact)

# Log final summary
wandb.log({
    'pipeline_created': True,
    'pipeline_artifact_name': "prophet_pipeline",
    'submission_artifact_name': "prophet_submission",
    'test_predictions_mean': np.mean(test_predictions),
    'test_predictions_std': np.std(test_predictions),
    'model_registry_success': True
})

print("Enhanced Prophet experiment completed successfully!")
print("Enhanced pipeline and submission saved to WandB artifacts!")
wandb.finish()

Creating Enhanced Prophet pipeline...


NameError: name 'best_params' is not defined

In [None]:
import dill
pipeline_filename = f'enhanced_prophet_pipeline_{timestamp}.pkl'
with open(pipeline_filename, 'wb') as f:
    dill.dump(enhanced_prophet_pipeline, f)

wandb.init(project="walmart-sales-forecasting", name="Prophet_save_model")

print(f"Enhanced pipeline saved locally as: {pipeline_filename}")

# Create WandB Artifact BEFORE logging
print("Creating WandB artifact for the enhanced pipeline...")
pipeline_artifact = wandb.Artifact(
    name="prophet_pipeline",
    type="model",
    description="Enhanced Prophet model pipeline with time series features for Walmart sales forecasting",
    metadata={
        "model_type": "Enhanced_Prophet",
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "n_models": len(final_model.models),
        "best_params": best_params,
        "timestamp": timestamp,
        "enhanced_features": True,
        "time_series_split": True
    }
)

# Add the pipeline file to the artifact BEFORE logging
pipeline_artifact.add_file(pipeline_filename)

# NOW log the artifact (this finalizes it)
wandb.log_artifact(pipeline_artifact)

# Continue with submission artifact...
submission_artifact = wandb.Artifact(
    name="prophet_submission",
    type="dataset",
    description=f"Enhanced Prophet model submission for Kaggle - {timestamp}"
)
submission_artifact.add_file(submission_filename)
wandb.log_artifact(submission_artifact)

# Log final summary
wandb.log({
    'pipeline_created': True,
    'pipeline_artifact_name': "prophet_pipeline",
    'submission_artifact_name': "prophet_submission",
    'test_predictions_mean': np.mean(test_predictions),
    'test_predictions_std': np.std(test_predictions),
    'model_registry_success': True
})

print("Enhanced Prophet experiment completed successfully!")
print("Enhanced pipeline and submission saved to WandB artifacts!")
wandb.finish()