In [12]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [14]:
# Copy the Kaggle API token from the mounted Drive folder into ~/.kaggle.
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json

In [15]:
# Restrict the credentials file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [16]:
!pip install wandb -qU

In [17]:
# One-time data download, left disabled. Uncomment on a fresh runtime where the
# competition archive has not been downloaded yet.
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

In [18]:
# One-time extraction, left disabled. Unpacks the competition archive and then
# the per-file zips; presumably this yields the CSVs read from /content by the
# data-loading cell (verify the extraction directory before relying on it).
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [19]:
import pandas as pd
import numpy as np
import wandb
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("=== WALMART SALES FORECASTING - MODEL INFERENCE ===\n")

=== WALMART SALES FORECASTING - MODEL INFERENCE ===



In [20]:
# Authenticate with Weights & Biases and open the run that tracks inference.
wandb.login()
wandb.init(
    project="walmart-sales-forecasting",
    name="XGBoost_Inference",
    tags=["inference", "xgboost", "submission"],
)

print("Loading test datasets...")

# Read the four CSV inputs used by the rest of the notebook.
test = pd.read_csv("/content/test.csv")
features = pd.read_csv("/content/features.csv")
stores = pd.read_csv("/content/stores.csv")
sample_submission = pd.read_csv("/content/sampleSubmission.csv")

# Report the (rows, columns) shape of each frame.
for _label, _frame in (
    ("Test data", test),
    ("Features data", features),
    ("Stores data", stores),
    ("Sample submission", sample_submission),
):
    print(f"{_label} shape: {_frame.shape}")

# Record the row counts on the active run.
wandb.log({
    "test_samples": len(test),
    "submission_samples": len(sample_submission),
})



0,1
merged_test_shape,▁
submission_samples,▁
test_missing_values,▁
test_samples,▁

0,1
merged_test_shape,115064
submission_samples,115064
test_date_range_end,2013-07-26 00:00:00
test_date_range_start,2012-11-02 00:00:00
test_missing_values,127817
test_samples,115064


Loading test datasets...
Test data shape: (115064, 4)
Features data shape: (8190, 12)
Stores data shape: (45, 3)
Sample submission shape: (115064, 2)


In [21]:
# Build the inference frame: test rows enriched with features and store info.
print("\nMerging test datasets...")

# Left joins keep every row of `test`, even when no feature/store row matches.
test_data = (
    test
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
    .merge(stores, how='left', on='Store')
)

_missing_total = test_data.isnull().sum().sum()
print(f"Merged test data shape: {test_data.shape}")
print(f"Missing values in test data:\n{_missing_total} total missing values")

# Parse the Date column so the covered period can be reported.
test_data['Date'] = pd.to_datetime(test_data['Date'])
_date_start = test_data['Date'].min()
_date_end = test_data['Date'].max()
print(f"Test date range: {_date_start} to {_date_end}")

# Log the merge summary to the active wandb run.
wandb.log({
    "merged_test_shape": test_data.shape[0],
    "test_missing_values": _missing_total,
    "test_date_range_start": str(_date_start),
    "test_date_range_end": str(_date_end),
})


Merging test datasets...
Merged test data shape: (115064, 15)
Missing values in test data:
127817 total missing values
Test date range: 2012-11-02 00:00:00 to 2013-07-26 00:00:00


In [22]:
# Fetch the model artifact through the wandb public API and show its metadata.
print("\n=== DOWNLOADING MODEL FROM WANDB ===")

try:
    # Resolve the ":latest" alias of the model artifact and download its files.
    api = wandb.Api()
    artifact = api.artifact('walmart-sales-forecasting/xgboost_pipeline:latest', type='model')
    artifact_dir = artifact.download()
    print(f"✓ Model artifact downloaded to: {artifact_dir}")

    # Metadata dictionary stored alongside the artifact.
    model_metadata = artifact.metadata

    print("\nModel Information:")
    # (label, metadata key, suffix) for each line of the report;
    # missing keys are shown as 'N/A'.
    for _title, _key, _suffix in (
        ("Training MAE", 'train_mae', ""),
        ("Training RMSE", 'train_rmse', ""),
        ("Training MAPE", 'train_mape', "%"),
        ("Training R²", 'train_r2', ""),
        ("Features count", 'features_count', ""),
        ("Training samples", 'training_samples', ""),
    ):
        print(f"  {_title}: {model_metadata.get(_key, 'N/A')}{_suffix}")

    # Mirror the key training metrics onto the active run (0 when absent).
    wandb.log({
        "loaded_model_mae": model_metadata.get('train_mae', 0),
        "loaded_model_rmse": model_metadata.get('train_rmse', 0),
        "loaded_model_r2": model_metadata.get('train_r2', 0),
        "model_features": model_metadata.get('features_count', 0),
    })

except Exception as e:
    # Report the failure with a hint, then let the error propagate.
    print(f"❌ Error downloading model: {e}")
    print("Please check your wandb project and model artifact name")
    raise


=== DOWNLOADING MODEL FROM WANDB ===


[34m[1mwandb[0m:   1 of 1 files downloaded.  


✓ Model artifact downloaded to: /content/artifacts/xgboost_pipeline:v0

Model Information:
  Training MAE: 2829.0733395522316
  Training RMSE: 4950.429855810212
  Training MAPE: 4941.608704966984%
  Training R²: 0.9524875616202894
  Features count: 34
  Training samples: 421570


In [23]:
# Load the trained pipeline from the wandb model artifact.
print("\n=== LOADING MODEL FROM WANDB ===")

import os
import subprocess
import sys


def _load_pickled_pipeline(pipeline_path):
    """Deserialize the pipeline stored at ``pipeline_path``.

    Picks a pickling backend first (cloudpickle, then dill, installing
    cloudpickle as a last resort) and only then reads the file, so that an
    ImportError raised *while unpickling* (e.g. a module the pipeline needs is
    missing) is reported as-is instead of silently triggering another backend.

    Parameters:
        pipeline_path: path of the ``.pkl`` file to load.

    Returns:
        The unpickled pipeline object.

    Raises:
        AttributeError: when the pickle refers to a class that is not defined
            in this session; a hint is printed before re-raising.
    """
    try:
        import cloudpickle as backend
        backend_label = "cloudpickle"
    except ImportError:
        try:
            import dill as backend
            backend_label = "dill"
        except ImportError:
            # Install into the interpreter that runs this kernel, then retry.
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'cloudpickle'])
            import cloudpickle as backend
            backend_label = "cloudpickle (after install)"

    # SECURITY: unpickling executes arbitrary code; only load artifacts that
    # come from a wandb project you trust.
    try:
        with open(pipeline_path, 'rb') as f:
            pipeline = backend.load(f)
    except AttributeError:
        print(
            "Hint: the pickled pipeline refers to a class that is not defined in this "
            "session. Define or import the custom classes used when the pipeline was "
            "saved before running this cell."
        )
        raise

    print(f"✓ Model loaded with {backend_label}")
    return pipeline


# Reuse the run that is already active instead of opening a new one inside a
# `with` block: leaving the `with` block finishes the run, after which every
# later wandb.log / wandb.log_artifact call in this notebook has no run to
# write to. A run is only created here when none is active.
run = wandb.run
if run is None:
    run = wandb.init(project="walmart-sales-forecasting", name="Load_Best_Model", tags=["inference", "model_loading"])

model_artifact_name = "xgboost_pipeline:latest"
print(f"Loading model: {model_artifact_name}")

try:
    # Declare the artifact as an input of the run and download its files.
    artifact = run.use_artifact(model_artifact_name, type='model')
    artifact_dir = artifact.download()

    # Sort so the chosen file does not depend on os.listdir's arbitrary order.
    pipeline_files = sorted(f for f in os.listdir(artifact_dir) if f.endswith('.pkl'))
    if not pipeline_files:
        raise FileNotFoundError("No pipeline file found in artifact")

    pipeline_path = os.path.join(artifact_dir, pipeline_files[0])

    model_pipeline = _load_pickled_pipeline(pipeline_path)

    # Assumes the loaded object exposes `named_steps` (as the original cell did).
    pipeline_components = list(model_pipeline.named_steps.keys())

    # Log model metadata
    run.log({
        "model_name": model_artifact_name,
        "model_loaded": True,
        "pipeline_components": pipeline_components
    })

    print(f"✓ Model loaded successfully from wandb!")
    print(f"Pipeline components: {pipeline_components}")

except Exception as e:
    print(f"❌ Error loading from wandb: {e}")
    raise


=== LOADING MODEL PIPELINE ===
Loading pipeline from: /content/artifacts/xgboost_pipeline:v0/xgboost_pipeline_20250704_162410.pkl
❌ Error loading pipeline: Can't get attribute 'WalmartDataPreprocessor' on <module '__main__'>


AttributeError: Can't get attribute 'WalmartDataPreprocessor' on <module '__main__'>

In [None]:
# Run the loaded pipeline on the merged test frame.
print("\n=== MAKING PREDICTIONS ===")

# Work on a copy so `test_data` itself is left untouched.
print("Preparing test features...")
X_test = test_data.copy()

print(f"Test features shape: {X_test.shape}")
print(f"Test feature columns: {X_test.columns.tolist()}")

# The pipeline receives the raw merged frame.
print("Making predictions...")
test_predictions = model_pipeline.predict(X_test)

print("✓ Predictions completed!")
print(f"Predictions shape: {test_predictions.shape}")
print("Predictions summary:")

# Compute each summary statistic once; reused for both printing and logging.
_pred_stats = {
    "min": test_predictions.min(),
    "max": test_predictions.max(),
    "mean": test_predictions.mean(),
    "std": test_predictions.std(),
}
for _stat_name, _stat_value in _pred_stats.items():
    print(f"  {_stat_name.capitalize()}: {_stat_value:.2f}")

# Log prediction statistics as plain Python floats.
wandb.log({
    "predictions_count": len(test_predictions),
    **{f"predictions_{_stat_name}": float(_stat_value)
       for _stat_name, _stat_value in _pred_stats.items()},
})

In [None]:
# Build the submission frame from the sample-submission template.
print("\n=== CREATING SUBMISSION FILE ===")

# Validate the prediction count BEFORE attaching it. The previous check
# compared `submission` with the frame it was copied from, so it could never
# fail; the meaningful comparison is predictions vs. template rows.
_expected_rows = len(sample_submission)
_predicted_rows = len(test_predictions)
if _predicted_rows != _expected_rows:
    print(f"⚠️  Warning: Submission length mismatch!")
    print(f"Expected: {_expected_rows}, Got: {_predicted_rows}")
    # Same exception type pandas would raise on the mismatched assignment.
    raise ValueError(
        f"Got {_predicted_rows} predictions for {_expected_rows} submission rows"
    )

# NOTE(review): predictions are attached positionally; this assumes the rows of
# sampleSubmission.csv are in the same order as test.csv — TODO confirm.
submission = sample_submission.copy()
submission['Weekly_Sales'] = test_predictions

# Verify submission format
print(f"Submission shape: {submission.shape}")
print(f"Submission columns: {list(submission.columns)}")
print("\nFirst 10 predictions:")
print(submission.head(10))

print("\nLast 10 predictions:")
print(submission.tail(10))

# Check for missing values anywhere in the submission.
_missing_count = submission.isnull().sum().sum()
if _missing_count > 0:
    print(f"⚠️  Warning: {_missing_count} missing values in submission")
else:
    print("✓ No missing values in submission")

# Reaching this point means the length check above passed.
print("✓ Submission length matches expected format")

In [None]:
# Write the submission CSV and produce a quick visual check of the predictions.
print("\n=== SAVING SUBMISSION ===")

# A timestamp in the filename keeps successive runs from overwriting each other.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f"xgboost_submission_{timestamp}.csv"

submission.to_csv(submission_filename, index=False)
print(f"✓ Submission saved as: {submission_filename}")

# Headline numbers for the saved file.
_weekly_sales = submission['Weekly_Sales']
print("\nSubmission Statistics:")
print(f"  Total predictions: {len(submission):,}")
print(f"  Average prediction: ${_weekly_sales.mean():,.2f}")
print(f"  Prediction range: ${_weekly_sales.min():,.2f} - ${_weekly_sales.max():,.2f}")

# Two-panel figure: histogram of all predictions, line plot of the first 1000.
import matplotlib.pyplot as plt

fig, (ax_hist, ax_head) = plt.subplots(1, 2, figsize=(12, 4))

ax_hist.hist(test_predictions, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax_hist.set_xlabel('Predicted Weekly Sales')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Predictions')

ax_head.plot(test_predictions[:1000])
ax_head.set_xlabel('Sample Index')
ax_head.set_ylabel('Predicted Weekly Sales')
ax_head.set_title('First 1000 Predictions')

fig.tight_layout()
fig.savefig('prediction_analysis.png', dpi=300, bbox_inches='tight')

# Attach the saved image to the wandb run, then render it inline.
wandb.log({"prediction_analysis": wandb.Image('prediction_analysis.png')})
plt.show()

In [None]:
# Upload the submission CSV to wandb as a versioned artifact.
print("\n=== LOGGING SUBMISSION TO WANDB ===")

try:
    _weekly_sales = submission['Weekly_Sales']

    # Summary statistics stored with the artifact.
    _artifact_metadata = {
        "submission_count": len(submission),
        "predictions_mean": float(_weekly_sales.mean()),
        "predictions_std": float(_weekly_sales.std()),
        "predictions_min": float(_weekly_sales.min()),
        "predictions_max": float(_weekly_sales.max()),
        "model_used": "XGBoost Pipeline",
        "timestamp": timestamp,
    }

    submission_artifact = wandb.Artifact(
        name=f"submission_{timestamp}",
        type="submission",
        description=f"XGBoost submission for Walmart sales forecasting - {timestamp}",
        metadata=_artifact_metadata,
    )

    # Attach the CSV written earlier and upload the artifact.
    submission_artifact.add_file(submission_filename)
    wandb.log_artifact(submission_artifact)
    print("✓ Submission logged to wandb successfully!")

except Exception as e:
    # Best effort: the local CSV already exists, so report and carry on.
    print(f"⚠️  Error logging submission to wandb: {e}")
    print("Submission file saved locally")

# Closing record for the run.
wandb.log({
    "submission_filename": submission_filename,
    "inference_completed": True,
    "final_submission_count": len(submission),
})

In [None]:
# Print a closing report with next steps, then close the wandb run.
_rule = "=" * 70
_weekly_sales = submission['Weekly_Sales']

_summary_lines = [
    "\n" + _rule,
    "INFERENCE COMPLETED SUCCESSFULLY!",
    _rule,
    f"📁 Submission file: {submission_filename}",
    f"📊 Total predictions: {len(submission):,}",
    f"💰 Average prediction: ${_weekly_sales.mean():,.2f}",
    f"📈 Prediction range: ${_weekly_sales.min():,.2f} - ${_weekly_sales.max():,.2f}",
    "🤖 Model used: XGBoost Pipeline",
    f"⏰ Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    _rule,
    "📤 NEXT STEPS:",
    f"1. Upload '{submission_filename}' to Kaggle competition",
    "2. Check your score on the leaderboard",
    "3. Compare with other models",
    _rule,
]
print("\n".join(_summary_lines))

wandb.finish()