In [1]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [2]:
! mkdir ~/.kaggle

In [3]:
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 621MB/s]


In [6]:
! unzip /content/walmart-recruiting-store-sales-forecasting.zip
! unzip /content/train.csv.zip
! unzip /content/test.csv.zip
! unzip /content/features.csv.zip
! unzip /content/sampleSubmission.csv.zip

Archive:  /content/walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           
Archive:  /content/train.csv.zip
  inflating: train.csv               
Archive:  /content/test.csv.zip
  inflating: test.csv                
Archive:  /content/features.csv.zip
  inflating: features.csv            
Archive:  /content/sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    


In [7]:
!pip install wandb -qU

In [8]:
import wandb
import random
import math

In [9]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdshan21[0m ([33mdshan21-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
# model_experiment_XGBoost.ipynb (Updated with proper week calculation fix)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import xgboost as xgb
import wandb
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

# Fixed Custom transformers for our pipeline
class DataMerger(BaseEstimator, TransformerMixin):
    """Merge main data with features and stores data"""

    def __init__(self, features_df, stores_df):
        self.features_df = features_df
        self.stores_df = stores_df

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Merge with features
        X_merged = X.merge(self.features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
        # Merge with stores
        X_merged = X_merged.merge(self.stores_df, on='Store', how='left')
        return X_merged

class DateFeatureCreator(BaseEstimator, TransformerMixin):
    """Create date-based features"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        X['Year'] = X['Date'].dt.year
        X['Month'] = X['Date'].dt.month
        X['Day'] = X['Date'].dt.day
        X['DayOfWeek'] = X['Date'].dt.dayofweek
        X['Quarter'] = X['Date'].dt.quarter

        # Fix for the week calculation - use a more robust approach
        try:
            # Try the modern approach first
            X['Week'] = X['Date'].dt.isocalendar().week.values
        except:
            # Fallback to alternative method
            X['Week'] = X['Date'].dt.strftime('%U').astype(int)

        X['WeekOfMonth'] = X['Day'].apply(lambda x: (x-1) // 7 + 1)

        # Additional time features
        X['IsWeekend'] = (X['DayOfWeek'] >= 5).astype(int)
        X['MonthStart'] = (X['Day'] <= 7).astype(int)
        X['MonthEnd'] = (X['Day'] >= 25).astype(int)  # Simplified to avoid days_in_month issues

        # Seasonal features
        X['IsSummer'] = X['Month'].isin([6, 7, 8]).astype(int)
        X['IsWinter'] = X['Month'].isin([12, 1, 2]).astype(int)
        X['IsSpring'] = X['Month'].isin([3, 4, 5]).astype(int)
        X['IsFall'] = X['Month'].isin([9, 10, 11]).astype(int)

        return X

class LagFeatureCreator(BaseEstimator, TransformerMixin):
    """Create lag features for time series"""

    def __init__(self, target_col='Weekly_Sales', lags=[1, 2, 4, 8, 12]):
        self.target_col = target_col
        self.lags = lags
        self.lag_medians = {}

    def fit(self, X, y=None):
        if self.target_col in X.columns:
            X_sorted = X.sort_values(['Store', 'Dept', 'Date']).copy()
            X_sorted = X_sorted.reset_index(drop=True)

            for lag in self.lags:
                lag_col = f'{self.target_col}_lag_{lag}'
                try:
                    lag_values = X_sorted.groupby(['Store', 'Dept'])[self.target_col].shift(lag)
                    self.lag_medians[lag_col] = lag_values.median()
                except Exception as e:
                    print(f"Warning: Could not create lag {lag}, using 0 as default")
                    self.lag_medians[lag_col] = 0
        return self

    def transform(self, X):
        X_sorted = X.sort_values(['Store', 'Dept', 'Date']).copy()
        X_sorted = X_sorted.reset_index(drop=True)

        for lag in self.lags:
            lag_col = f'{self.target_col}_lag_{lag}'
            if self.target_col in X.columns:
                # Training data - compute actual lags
                try:
                    X_sorted[lag_col] = X_sorted.groupby(['Store', 'Dept'])[self.target_col].shift(lag)
                except Exception as e:
                    print(f"Warning: Could not create lag {lag} in transform, using median")
                    X_sorted[lag_col] = self.lag_medians.get(lag_col, 0)
            else:
                # Test data - use median from training
                X_sorted[lag_col] = self.lag_medians.get(lag_col, 0)

        return X_sorted

class RollingFeatureCreator(BaseEstimator, TransformerMixin):
    """Create rolling statistics features"""

    def __init__(self, target_col='Weekly_Sales', windows=[4, 8, 12]):
        self.target_col = target_col
        self.windows = windows
        self.rolling_medians = {}

    def fit(self, X, y=None):
        if self.target_col in X.columns:
            X_sorted = X.sort_values(['Store', 'Dept', 'Date']).copy()
            X_sorted = X_sorted.reset_index(drop=True)

            for window in self.windows:
                try:
                    grouped = X_sorted.groupby(['Store', 'Dept'])[self.target_col]

                    rolling_mean = grouped.rolling(window=window, min_periods=1).mean()
                    rolling_std = grouped.rolling(window=window, min_periods=1).std()
                    rolling_max = grouped.rolling(window=window, min_periods=1).max()
                    rolling_min = grouped.rolling(window=window, min_periods=1).min()

                    # Store medians
                    self.rolling_medians[f'{self.target_col}_rolling_mean_{window}'] = rolling_mean.median()
                    self.rolling_medians[f'{self.target_col}_rolling_std_{window}'] = rolling_std.median()
                    self.rolling_medians[f'{self.target_col}_rolling_max_{window}'] = rolling_max.median()
                    self.rolling_medians[f'{self.target_col}_rolling_min_{window}'] = rolling_min.median()
                except Exception as e:
                    print(f"Warning: Could not create rolling window {window}, using 0 as default")
                    self.rolling_medians[f'{self.target_col}_rolling_mean_{window}'] = 0
                    self.rolling_medians[f'{self.target_col}_rolling_std_{window}'] = 0
                    self.rolling_medians[f'{self.target_col}_rolling_max_{window}'] = 0
                    self.rolling_medians[f'{self.target_col}_rolling_min_{window}'] = 0
        return self

    def transform(self, X):
        X_sorted = X.sort_values(['Store', 'Dept', 'Date']).copy()
        X_sorted = X_sorted.reset_index(drop=True)

        for window in self.windows:
            if self.target_col in X.columns:
                # Training data - compute actual rolling features
                try:
                    grouped = X_sorted.groupby(['Store', 'Dept'])[self.target_col]

                    rolling_mean = grouped.rolling(window=window, min_periods=1).mean()
                    rolling_std = grouped.rolling(window=window, min_periods=1).std()
                    rolling_max = grouped.rolling(window=window, min_periods=1).max()
                    rolling_min = grouped.rolling(window=window, min_periods=1).min()

                    X_sorted[f'{self.target_col}_rolling_mean_{window}'] = rolling_mean.reset_index(0, drop=True)
                    X_sorted[f'{self.target_col}_rolling_std_{window}'] = rolling_std.reset_index(0, drop=True)
                    X_sorted[f'{self.target_col}_rolling_max_{window}'] = rolling_max.reset_index(0, drop=True)
                    X_sorted[f'{self.target_col}_rolling_min_{window}'] = rolling_min.reset_index(0, drop=True)
                except Exception as e:
                    print(f"Warning: Could not create rolling features for window {window}")
                    X_sorted[f'{self.target_col}_rolling_mean_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_mean_{window}', 0)
                    X_sorted[f'{self.target_col}_rolling_std_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_std_{window}', 0)
                    X_sorted[f'{self.target_col}_rolling_max_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_max_{window}', 0)
                    X_sorted[f'{self.target_col}_rolling_min_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_min_{window}', 0)
            else:
                # Test data - use medians from training
                X_sorted[f'{self.target_col}_rolling_mean_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_mean_{window}', 0)
                X_sorted[f'{self.target_col}_rolling_std_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_std_{window}', 0)
                X_sorted[f'{self.target_col}_rolling_max_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_max_{window}', 0)
                X_sorted[f'{self.target_col}_rolling_min_{window}'] = self.rolling_medians.get(f'{self.target_col}_rolling_min_{window}', 0)

        return X_sorted

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical variables"""

    def __init__(self):
        self.label_encoders = {}

    def fit(self, X, y=None):
        if 'Type' in X.columns:
            self.label_encoders['Type'] = LabelEncoder()
            self.label_encoders['Type'].fit(X['Type'])
        return self

    def transform(self, X):
        X = X.copy()
        if 'Type' in X.columns and 'Type' in self.label_encoders:
            X['Type_encoded'] = self.label_encoders['Type'].transform(X['Type'])
        return X

class MissingValueImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with median"""

    def __init__(self):
        self.medians = {}

    def fit(self, X, y=None):
        numeric_columns = X.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            self.medians[col] = X[col].median()
        return self

    def transform(self, X):
        X = X.copy()
        for col, median_val in self.medians.items():
            if col in X.columns:
                X[col] = X[col].fillna(median_val)
        return X

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select final features for modeling"""

    def __init__(self):
        self.feature_columns = None

    def fit(self, X, y=None):
        self.feature_columns = [col for col in X.columns
                               if col not in ['Date', 'Weekly_Sales', 'Type', 'Id']]
        return self

    def transform(self, X):
        missing_cols = set(self.feature_columns) - set(X.columns)
        if missing_cols:
            for col in missing_cols:
                X[col] = 0
        return X[self.feature_columns]

def load_and_explore_data():
    """Load all datasets and perform initial exploration"""

    train_df = pd.read_csv("/content/train.csv")
    features_df = pd.read_csv("/content/features.csv")
    stores_df = pd.read_csv("/content/stores.csv")
    test_df = pd.read_csv("/content/test.csv")

    print("Dataset shapes:")
    print(f"Train: {train_df.shape}")
    print(f"Features: {features_df.shape}")
    print(f"Stores: {stores_df.shape}")
    print(f"Test: {test_df.shape}")

    return train_df, features_df, stores_df, test_df

def create_preprocessing_pipeline(features_df, stores_df):
    """Create the complete preprocessing pipeline"""

    pipeline = Pipeline([
        ('merger', DataMerger(features_df, stores_df)),
        ('date_features', DateFeatureCreator()),
        ('lag_features', LagFeatureCreator()),
        ('rolling_features', RollingFeatureCreator()),
        ('categorical_encoder', CategoricalEncoder()),
        ('imputer', MissingValueImputer()),
        ('feature_selector', FeatureSelector())
    ])

    return pipeline

def hyperparameter_tuning(train_df, features_df, stores_df):
    """Perform hyperparameter tuning with pipeline"""

    run = wandb.init(project="walmart-forecasting",
                     job_type="hyperparameter_tuning",
                     name="XGBoost_hyperparameter_tuning")

    try:
        # Create and fit preprocessing pipeline
        print("Creating preprocessing pipeline...")
        preprocessing_pipeline = create_preprocessing_pipeline(features_df, stores_df)

        print("Fitting preprocessing pipeline...")
        train_processed = preprocessing_pipeline.fit_transform(train_df)

        print(f"Processed data shape: {train_processed.shape}")

        y = train_df['Weekly_Sales']
        mask = ~(train_processed.isnull().any(axis=1) | y.isnull())
        X_clean = train_processed[mask]
        y_clean = y[mask]

        print(f"Clean data shape: {X_clean.shape}")

        # Parameter combinations
        param_combinations = [
            {'max_depth': 4, 'learning_rate': 0.05, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0.8},
            {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'colsample_bytree': 0.8},
            {'max_depth': 8, 'learning_rate': 0.15, 'n_estimators': 200, 'subsample': 0.9, 'colsample_bytree': 0.9},
        ]

        best_score = float('inf')
        best_params = None

        tscv = TimeSeriesSplit(n_splits=3)

        for i, params in enumerate(param_combinations):
            print(f"\nTesting parameter combination {i+1}: {params}")

            scores = []
            for fold, (train_idx, val_idx) in enumerate(tscv.split(X_clean)):
                X_train_fold, X_val_fold = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
                y_train_fold, y_val_fold = y_clean.iloc[train_idx], y_clean.iloc[val_idx]

                model = xgb.XGBRegressor(
                    objective='reg:squarederror',
                    random_state=42,
                    **params
                )

                model.fit(X_train_fold, y_train_fold)
                val_pred = model.predict(X_val_fold)
                rmse = np.sqrt(mean_squared_error(y_val_fold, val_pred))
                scores.append(rmse)

            avg_score = np.mean(scores)
            print(f"Average RMSE: {avg_score:.4f}")

            wandb.log({
                f"param_combo_{i+1}_rmse": avg_score,
                **{f"param_{k}": v for k, v in params.items()}
            })

            if avg_score < best_score:
                best_score = avg_score
                best_params = params

        wandb.log({
            "best_rmse": best_score,
            "best_params": best_params
        })

    except Exception as e:
        print(f"Error in hyperparameter tuning: {str(e)}")
        import traceback
        traceback.print_exc()
        best_params = {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'colsample_bytree': 0.8}
        best_score = 0

    wandb.finish()

    return best_params, best_score

def train_model_with_pipeline(train_df, features_df, stores_df, best_params=None):
    """Train model with complete pipeline"""

    run = wandb.init(project="walmart-forecasting",
                     job_type="training",
                     name="XGBoost_pipeline_training")

    try:
        # Create preprocessing pipeline
        preprocessing_pipeline = create_preprocessing_pipeline(features_df, stores_df)

        print("Fitting preprocessing pipeline...")
        train_processed = preprocessing_pipeline.fit_transform(train_df)

        print(f"Processed training data shape: {train_processed.shape}")

        # Prepare target
        y = train_df['Weekly_Sales']

        # Remove rows with NaN
        mask = ~(train_processed.isnull().any(axis=1) | y.isnull())
        X_clean = train_processed[mask]
        y_clean = y[mask]

        print(f"Clean data shape: {X_clean.shape}")

        # Use best params if available
        if best_params is None:
            best_params = {
                'max_depth': 6,
                'learning_rate': 0.1,
                'n_estimators': 500,
                'subsample': 0.8,
                'colsample_bytree': 0.8
            }

        # Train final model
        print("Training final model...")
        final_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=42,
            **best_params
        )

        final_model.fit(X_clean, y_clean)

        # Create complete pipeline
        complete_pipeline = Pipeline([
            ('preprocessing', preprocessing_pipeline),
            ('model', final_model)
        ])

        # Save pipeline
        joblib.dump(complete_pipeline, '/content/walmart_pipeline.pkl')

        # Save as wandb artifact
        artifact = wandb.Artifact('walmart-xgboost-pipeline', type='model')
        artifact.add_file('/content/walmart_pipeline.pkl')

        artifact.metadata = {
            "model_type": "XGBoost",
            "n_features": int(X_clean.shape[1]),
            "n_samples": int(X_clean.shape[0]),
            "best_params": best_params
        }

        wandb.log_artifact(artifact)

        print("Pipeline saved successfully!")

    except Exception as e:
        print(f"Error in training: {str(e)}")
        import traceback
        traceback.print_exc()
        complete_pipeline = None

    wandb.finish()

    return complete_pipeline

# Main execution
if __name__ == "__main__":
    print("Starting Walmart Sales Forecasting with XGBoost...")

    # Load data
    train_df, features_df, stores_df, test_df = load_and_explore_data()

    # Hyperparameter tuning
    print("\nPerforming hyperparameter tuning...")
    best_params, best_score = hyperparameter_tuning(train_df, features_df, stores_df)
    print(f"Best parameters: {best_params}")

    # Train final model
    print("\nTraining final model with pipeline...")
    complete_pipeline = train_model_with_pipeline(train_df, features_df, stores_df, best_params)

    if complete_pipeline is not None:
        print("\n✅ Training completed successfully!")
        print("Pipeline saved to wandb artifacts!")
    else:
        print("\n❌ Training failed!")

Starting Walmart Sales Forecasting with XGBoost...
Dataset shapes:
Train: (421570, 5)
Features: (8190, 12)
Stores: (45, 3)
Test: (115064, 4)

Performing hyperparameter tuning...


Creating preprocessing pipeline...
Fitting preprocessing pipeline...
Processed data shape: (421570, 45)
Clean data shape: (421570, 45)

Testing parameter combination 1: {'max_depth': 4, 'learning_rate': 0.05, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0.8}
Average RMSE: 4883.2234

Testing parameter combination 2: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'colsample_bytree': 0.8}
Average RMSE: 4235.8996

Testing parameter combination 3: {'max_depth': 8, 'learning_rate': 0.15, 'n_estimators': 200, 'subsample': 0.9, 'colsample_bytree': 0.9}
Average RMSE: 4314.1897


0,1
best_rmse,▁
param_colsample_bytree,▁▁█
param_combo_1_rmse,▁
param_combo_2_rmse,▁
param_combo_3_rmse,▁
param_learning_rate,▁▄█
param_max_depth,▁▅█
param_n_estimators,█▃▁
param_subsample,▁▁█

0,1
best_rmse,4235.89962
param_colsample_bytree,0.9
param_combo_1_rmse,4883.22341
param_combo_2_rmse,4235.89962
param_combo_3_rmse,4314.18965
param_learning_rate,0.15
param_max_depth,8.0
param_n_estimators,200.0
param_subsample,0.9


Best parameters: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'colsample_bytree': 0.8}

Training final model with pipeline...


Fitting preprocessing pipeline...
Processed training data shape: (421570, 45)
Clean data shape: (421570, 45)
Training final model...
Pipeline saved successfully!



✅ Training completed successfully!
Pipeline saved to wandb artifacts!
