In [14]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [16]:
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json

In [17]:
! chmod 600 ~/.kaggle/kaggle.json

In [18]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)


In [19]:
# !pip install wandb -qU

In [20]:
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
import wandb
import joblib
import pickle
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("Starting Walmart Sales Forecasting Inference...")
print("=" * 50)

Starting Walmart Sales Forecasting Inference...


In [22]:
# Load datasets
print("Loading datasets...")
test = pd.read_csv('/content/test.csv')
stores = pd.read_csv('/content/stores.csv')
features = pd.read_csv('/content/features.csv')
sample_submission = pd.read_csv('/content/sampleSubmission.csv')

# Convert dates
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

print("Data loading completed!")

Loading datasets...
Data loading completed!


In [24]:
# YOU STILL NEED THESE CLASS DEFINITIONS (KEEP THEM)
class WalmartFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}

    def fit(self, X, y=None):
        # Your exact fit method
        X_with_stores = X.merge(stores, on='Store', how='left')

        if 'Type' in X_with_stores.columns:
            le = LabelEncoder()
            le.fit(X_with_stores['Type'].astype(str))
            self.label_encoders['Type'] = le
        return self

    def transform(self, X):
        # Your exact transform method
        X = X.copy()

        # Merge with additional data
        X = X.merge(stores, on='Store', how='left')
        X = X.merge(features, on=['Store', 'Date'], how='left')

        # Handle IsHoliday conflict
        if 'IsHoliday_x' in X.columns and 'IsHoliday_y' in X.columns:
            X['IsHoliday'] = X['IsHoliday_y'].fillna(X['IsHoliday_x'])
            X = X.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)

        # Basic date features
        X['Year'] = X['Date'].dt.year
        X['Month'] = X['Date'].dt.month
        X['Week'] = X['Date'].dt.isocalendar().week
        X['Weekday'] = X['Date'].dt.dayofweek
        X['Quarter'] = X['Date'].dt.quarter
        X['Is_Weekend'] = (X['Weekday'] >= 5).astype(int)

        # Simple holiday feature
        X['IsHoliday'] = X['IsHoliday'].fillna(False).astype(int)

        # Markdown features
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        for col in markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0)

        # Create total markdown
        if any(col in X.columns for col in markdown_cols):
            existing_cols = [col for col in markdown_cols if col in X.columns]
            X['Total_MarkDown'] = X[existing_cols].sum(axis=1)
            X['Num_MarkDowns'] = (X[existing_cols] > 0).sum(axis=1)

        # Economic features
        if 'Fuel_Price' in X.columns and 'CPI' in X.columns:
            X['Fuel_Price_to_CPI'] = X['Fuel_Price'] / (X['CPI'] + 1e-8)

        if 'Temperature' in X.columns:
            X['Temperature_squared'] = X['Temperature'] ** 2

        # Encode categorical variables
        for col, le in self.label_encoders.items():
            if col in X.columns:
                X[col] = le.transform(X[col].astype(str))

        # Fill missing values
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        X[numeric_cols] = X[numeric_cols].fillna(0)

        # Select features
        cols_to_drop = ['Date']
        if 'Weekly_Sales' in X.columns:
            cols_to_drop.append('Weekly_Sales')
        feature_cols = [col for col in X.columns if col not in cols_to_drop]

        return X[feature_cols]

class XGBoostPipeline(BaseEstimator):
    def __init__(self, params, feature_engineer, selected_features):
        self.params = params
        self.feature_engineer = feature_engineer
        self.selected_features = selected_features
        self.model = None

    def fit(self, X, y):
        # Transform features
        X_transformed = self.feature_engineer.fit_transform(X)
        X_selected = X_transformed[self.selected_features]

        # Train XGBoost
        dtrain = xgb.DMatrix(X_selected, label=y)
        self.model = xgb.train(
            params=self.params,
            dtrain=dtrain,
            num_boost_round=1000,
            verbose_eval=False
        )
        return self

    def predict(self, X):
        # Transform features
        X_transformed = self.feature_engineer.transform(X)
        X_selected = X_transformed[self.selected_features]

        # Make predictions
        dtest = xgb.DMatrix(X_selected)
        return self.model.predict(dtest)

print("Pipeline classes defined!")

NameError: name 'BaseEstimator' is not defined

In [25]:
# Initialize wandb
wandb.login()

run = wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Inference_Clean",
    group="Inference",
    tags=["inference", "xgboost", "components"]
)

print("Wandb initialized!")

Wandb initialized!


In [26]:
# Loading section in inference - replace the model loading part with:

print("Loading trained model from wandb...")

try:
    # Load artifact (standard approach)
    artifact = run.use_artifact('walmart_xgboost_pipeline:latest', type='model')
    artifact_dir = artifact.download()

    print(f"✅ Model downloaded to: {artifact_dir}")

    # Load the trained pipeline with joblib
    model_path = f"{artifact_dir}/model.joblib"
    trained_pipeline = joblib.load(model_path)

    print("✅ Trained XGBoost pipeline loaded successfully!")
    print(f"Pipeline type: {type(trained_pipeline).__name__}")
    print(f"Selected features: {len(trained_pipeline.selected_features)}")

    # Log model loading info
    wandb.config.update({
        "model_loaded_from": "wandb_artifact",
        "model_type": "XGBoostPipeline",
        "n_features": len(trained_pipeline.selected_features),
        "artifact_version": artifact.version
    })

    model_loaded = True

except Exception as e:
    print(f"❌ Error loading model from wandb: {e}")
    print("Full error:", str(e))
    model_loaded = False
    trained_pipeline = None

Loading trained model from wandb...


[34m[1mwandb[0m:   1 of 1 files downloaded.  


✅ Model downloaded to: /content/artifacts/walmart_xgboost_pipeline:v1
❌ Error loading model from wandb: Can't get attribute 'XGBoostPipeline' on <module '__main__'>
Full error: Can't get attribute 'XGBoostPipeline' on <module '__main__'>


In [27]:
# Simple prediction function
def make_predictions(test_data):
  """Make predictions using loaded components"""
  if not model_loaded:
      print("⚠️  Model loading failed. Creating baseline model...")

      class BaselineModel:
          def predict(self, X):
              print("Using baseline predictions...")
              np.random.seed(42)

              predictions = []
              store_size_map = stores.set_index('Store')['Size'].to_dict()

              for idx, row in X.iterrows():
                  store_size = store_size_map.get(row['Store'], 150000)
                  month = pd.to_datetime(row['Date']).month

                  if month in [11, 12]:  # Holiday season
                      seasonal_mult = 1.3
                  elif month in [1, 2]:  # Post-holiday
                      seasonal_mult = 0.8
                  else:
                      seasonal_mult = 1.0

                  base_pred = (store_size / 1000) * seasonal_mult
                  noise = np.random.normal(0, base_pred * 0.1)
                  prediction = max(base_pred + noise, 0)
                  predictions.append(prediction)

              return np.array(predictions)

      trained_pipeline = BaselineModel()
      wandb.config.update({"model_loaded_from": "baseline_fallback"})
      print("⚠️  Using baseline model - results may not be optimal!")

  else:
        # Use trained components
        print("Using trained model components...")

        # Transform features using loaded feature engineer
        test_transformed = feature_engineer.transform(test_data)
        test_selected = test_transformed[selected_features]

        # Make predictions using loaded XGBoost model
        dtest = xgb.DMatrix(test_selected)
        predictions = xgb_model.predict(dtest)

        return predictions

# Generate predictions
print("\nGenerating predictions...")
predictions = make_predictions(test)

print("✅ Predictions generated!")
print(f"Shape: {predictions.shape}")
print(f"Range: ${predictions.min():,.2f} to ${predictions.max():,.2f}")
print(f"Mean: ${predictions.mean():,.2f}")


Generating predictions...
⚠️  Model loading failed. Creating baseline model...
⚠️  Using baseline model - results may not be optimal!
✅ Predictions generated!


AttributeError: 'NoneType' object has no attribute 'shape'

In [28]:
# Create and validate submission
print("\nCreating submission...")

submission = sample_submission.copy()
submission['Weekly_Sales'] = predictions

# Validation
validation_passed = True
issues = []

if submission.shape != sample_submission.shape:
    issues.append("Shape mismatch")
    validation_passed = False

if submission.isnull().sum().sum() > 0:
    issues.append("Missing values found")
    validation_passed = False

negative_count = (predictions < 0).sum()
if negative_count > 0:
    issues.append(f"{negative_count} negative predictions")

print(f"Validation: {'✅ PASSED' if validation_passed else '⚠️ ISSUES'}")
if issues:
    for issue in issues:
        print(f"  - {issue}")

# Save submission
submission_filename = 'walmart_submission_clean.csv'
submission.to_csv(submission_filename, index=False)

print(f"\n💾 Submission saved: {submission_filename}")

# Log results
wandb.log({
    "predictions_mean": float(predictions.mean()),
    "predictions_std": float(predictions.std()),
    "validation_passed": validation_passed,
    "negative_predictions": int(negative_count),
    "submission_ready": validation_passed
})

print(f"\n🎯 Ready for Kaggle submission!")
print(f"📊 Wandb: {run.url}")

wandb.finish()


Creating submission...


TypeError: '<' not supported between instances of 'NoneType' and 'int'