In [27]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [29]:
!cp /content/drive/MyDrive/Kaggle_credentials/kaggle.json ~/.kaggle/kaggle.json

In [30]:
! chmod 600 ~/.kaggle/kaggle.json

In [31]:
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

In [32]:
# ! unzip /content/walmart-recruiting-store-sales-forecasting.zip
# ! unzip /content/train.csv.zip
# ! unzip /content/test.csv.zip
# ! unzip /content/features.csv.zip
# ! unzip /content/sampleSubmission.csv.zip

In [33]:
!pip install wandb -qU

In [34]:
import wandb
import random
import math

In [35]:
wandb.login()



True

In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import wandb
import pickle
import warnings
warnings.filterwarnings('ignore')

# DATA EXPLORATION

In [37]:
# Load data
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
stores = pd.read_csv('/content/stores.csv')
features = pd.read_csv('/content/features.csv')
sample_submission = pd.read_csv('/content/sampleSubmission.csv')

print("Data shapes:")
print(f"Train: {train.shape}")
print(f"Test: {test.shape}")
print(f"Stores: {stores.shape}")
print(f"Features: {features.shape}")
print(f"Sample Submission: {sample_submission.shape}")

# Display column information
print("\nColumn details:")
print("Train columns:", train.columns.tolist())
print("Features columns:", features.columns.tolist())
print("Stores columns:", stores.columns.tolist())
print("Sample submission columns:", sample_submission.columns.tolist())

# Basic info about the data
print("\nData info:")
print("Date range:", train['Date'].min(), "to", train['Date'].max())
print("Unique stores:", train['Store'].nunique())
print("Unique departments:", train['Dept'].nunique())
print("Store types:", stores['Type'].unique())

# Check for missing values
print("\nMissing values:")
print("Train missing:", train.isnull().sum().sum())
print("Features missing:", features.isnull().sum())

# Fix the date conversion issue BEFORE creating feature engineer
print("Converting all Date columns to datetime...")

# Convert dates in all dataframes
train['Date'] = pd.to_datetime(train['Date'])
features['Date'] = pd.to_datetime(features['Date'])
stores_copy = stores.copy()  # stores doesn't have Date column
test['Date'] = pd.to_datetime(test['Date'])

print("Date conversion completed!")
print(f"Train Date dtype: {train['Date'].dtype}")
print(f"Features Date dtype: {features['Date'].dtype}")
print(f"Test Date dtype: {test['Date'].dtype}")

Data shapes:
Train: (421570, 5)
Test: (115064, 4)
Stores: (45, 3)
Features: (8190, 12)
Sample Submission: (115064, 2)

Column details:
Train columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Features columns: ['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday']
Stores columns: ['Store', 'Type', 'Size']
Sample submission columns: ['Id', 'Weekly_Sales']

Data info:
Date range: 2010-02-05 to 2012-10-26
Unique stores: 45
Unique departments: 81
Store types: ['A' 'B' 'C']

Missing values:
Train missing: 0
Features missing: Store              0
Date               0
Temperature        0
Fuel_Price         0
MarkDown1       4158
MarkDown2       5269
MarkDown3       4577
MarkDown4       4726
MarkDown5       4140
CPI              585
Unemployment     585
IsHoliday          0
dtype: int64
Converting all Date columns to datetime...
Date conversion completed!
Train Date dtype: datetime64[ns]


# FEATURE ENGINEERING

In [38]:
# Updated Feature Engineer that uses the pre-converted dataframes
class WalmartFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}

    def fit(self, X, y=None):
        # Fit label encoders
        X_with_stores = X.merge(stores_copy, on='Store', how='left')

        if 'Type' in X_with_stores.columns:
            le = LabelEncoder()
            le.fit(X_with_stores['Type'].astype(str))
            self.label_encoders['Type'] = le
        return self

    def transform(self, X):
        X = X.copy()

        # Dates should already be datetime from preprocessing
        # No need to convert again - just verify
        if not pd.api.types.is_datetime64_any_dtype(X['Date']):
            X['Date'] = pd.to_datetime(X['Date'])

        # Merge with additional data (all Date columns should now be datetime)
        X = X.merge(stores_copy, on='Store', how='left')
        X = X.merge(features, on=['Store', 'Date'], how='left')

        # Handle IsHoliday conflict
        if 'IsHoliday_x' in X.columns and 'IsHoliday_y' in X.columns:
            X['IsHoliday'] = X['IsHoliday_y'].fillna(X['IsHoliday_x'])
            X = X.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)

        # Basic date features
        X['Year'] = X['Date'].dt.year
        X['Month'] = X['Date'].dt.month
        X['Week'] = X['Date'].dt.isocalendar().week
        X['Weekday'] = X['Date'].dt.dayofweek
        X['Quarter'] = X['Date'].dt.quarter
        X['Is_Weekend'] = (X['Weekday'] >= 5).astype(int)

        # Simple holiday feature
        X['IsHoliday'] = X['IsHoliday'].fillna(False).astype(int)

        # Markdown features
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        for col in markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0)

        # Create total markdown
        if any(col in X.columns for col in markdown_cols):
            existing_cols = [col for col in markdown_cols if col in X.columns]
            X['Total_MarkDown'] = X[existing_cols].sum(axis=1)
            X['Num_MarkDowns'] = (X[existing_cols] > 0).sum(axis=1)

        # Economic features
        if 'Fuel_Price' in X.columns and 'CPI' in X.columns:
            X['Fuel_Price_to_CPI'] = X['Fuel_Price'] / (X['CPI'] + 1e-8)

        if 'Temperature' in X.columns:
            X['Temperature_squared'] = X['Temperature'] ** 2

        # Encode categorical variables
        for col, le in self.label_encoders.items():
            if col in X.columns:
                X[col] = le.transform(X[col].astype(str))

        # Fill missing values
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        X[numeric_cols] = X[numeric_cols].fillna(0)

        # Select features (exclude Date and Weekly_Sales if present)
        cols_to_drop = ['Date']
        if 'Weekly_Sales' in X.columns:
            cols_to_drop.append('Weekly_Sales')
        feature_cols = [col for col in X.columns if col not in cols_to_drop]

        return X[feature_cols]

# Initialize feature engineer
feature_engineer = WalmartFeatureEngineer()
print("Updated feature engineer created!")

Updated feature engineer created!


# PROCESSING

In [39]:
# Initialize wandb run for data cleaning
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Data_Cleaning",
    group="XGBoost_Training",
    tags=["preprocessing", "xgboost", "data_cleaning"]
)

# Log initial parameters
wandb.config.update({
    "original_train_shape": train.shape,
    "date_conversion_completed": True
})

# Fit and transform training data
print("Fitting and transforming training data...")
train_processed = feature_engineer.fit_transform(train)

# Prepare target variable
y_train = train['Weekly_Sales'].values

# Log preprocessing results
wandb.config.update({
    "final_train_shape": train_processed.shape,
    "n_features": train_processed.shape[1],
    "missing_values_after_preprocessing": int(train_processed.isnull().sum().sum())
})

# Log summary metrics
wandb.log({
    "train_samples": len(train_processed),
    "n_features": train_processed.shape[1],
    "target_mean": float(y_train.mean()),
    "target_std": float(y_train.std()),
    "missing_values": int(train_processed.isnull().sum().sum())
})

print("Preprocessing completed!")
print(f"Final training shape: {train_processed.shape}")
print(f"Target shape: {y_train.shape}")
print(f"Features: {list(train_processed.columns)}")

# Check for any remaining issues
print(f"Missing values: {train_processed.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(train_processed.select_dtypes(include=[np.number])).sum().sum()}")

# Save feature names for later use
wandb.log({"feature_names": list(train_processed.columns)})

wandb.finish()

Fitting and transforming training data...
Preprocessing completed!
Final training shape: (421570, 24)
Target shape: (421570,)
Features: ['Store', 'Dept', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday', 'Year', 'Month', 'Week', 'Weekday', 'Quarter', 'Is_Weekend', 'Total_MarkDown', 'Num_MarkDowns', 'Fuel_Price_to_CPI', 'Temperature_squared']
Missing values: 0
Infinite values: 0


0,1
missing_values,▁
n_features,▁
target_mean,▁
target_std,▁
train_samples,▁

0,1
missing_values,0.0
n_features,24.0
target_mean,15981.25812
target_std,22711.15658
train_samples,421570.0


# SELECTION

In [40]:
# Initialize wandb run for feature selection
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Feature_Selection",
    group="XGBoost_Training",
    tags=["preprocessing", "xgboost", "feature_selection"]
)

# Remove features with too many missing values
missing_threshold = 0.8
missing_pct = train_processed.isnull().sum() / len(train_processed)
features_to_keep = missing_pct[missing_pct < missing_threshold].index.tolist()

X_train_selected = train_processed[features_to_keep]

# Remove constant features
constant_features = [col for col in X_train_selected.columns
                    if X_train_selected[col].nunique() <= 1]
X_train_selected = X_train_selected.drop(columns=constant_features)

# Remove highly correlated features
correlation_threshold = 0.95
corr_matrix = X_train_selected.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > correlation_threshold)]
X_train_selected = X_train_selected.drop(columns=high_corr_features)

# Log feature selection results
wandb.config.update({
    "missing_threshold": missing_threshold,
    "correlation_threshold": correlation_threshold,
    "features_after_selection": X_train_selected.shape[1],
    "removed_missing_features": len(train_processed.columns) - len(features_to_keep),
    "removed_constant_features": len(constant_features),
    "removed_correlated_features": len(high_corr_features)
})

wandb.log({
    "n_features_final": X_train_selected.shape[1],
    "n_features_removed_missing": len(train_processed.columns) - len(features_to_keep),
    "n_features_removed_constant": len(constant_features),
    "n_features_removed_correlated": len(high_corr_features)
})

print(f"Features after selection: {X_train_selected.shape[1]}")
print(f"Removed {len(constant_features)} constant features")
print(f"Removed {len(high_corr_features)} highly correlated features")

# Save selected features list
selected_features = list(X_train_selected.columns)
wandb.log({"selected_features": selected_features})

wandb.finish()

Features after selection: 19
Removed 2 constant features
Removed 3 highly correlated features


0,1
n_features_final,▁
n_features_removed_constant,▁
n_features_removed_correlated,▁
n_features_removed_missing,▁

0,1
n_features_final,19
n_features_removed_constant,2
n_features_removed_correlated,3
n_features_removed_missing,0


# CROSS VALIDATION

In [41]:
def weighted_mean_absolute_error(y_true, y_pred, weights=None):
    """Calculate Weighted Mean Absolute Error (Kaggle metric)"""
    if weights is None:
        weights = np.ones(len(y_true))
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

def time_series_cv_xgboost(X, y, params, n_splits=5, run_name="CV"):
    """Time series cross-validation for XGBoost"""
    tscv = TimeSeriesSplit(n_splits=n_splits)
    cv_scores = []
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Create XGBoost datasets
        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
        dval = xgb.DMatrix(X_val_fold, label=y_val_fold)

        # Train model
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dval, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        # Predict and evaluate
        y_pred = model.predict(dval)

        # Calculate MAE
        mae = mean_absolute_error(y_val_fold, y_pred)
        cv_scores.append(mae)
        fold_results.append({
            f"fold_{fold+1}_mae": mae,
            f"fold_{fold+1}_train_size": len(train_idx),
            f"fold_{fold+1}_val_size": len(val_idx)
        })

        print(f"Fold {fold + 1} MAE: {mae:.4f}")

    # Log fold results to wandb
    for fold_result in fold_results:
        wandb.log(fold_result)

    return cv_scores

# TUNING

In [42]:
# Initialize wandb run for hyperparameter tuning
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Hyperparameter_Tuning",
    group="XGBoost_Training",
    tags=["hyperparameter_tuning", "xgboost", "cross_validation"]
)

# Define parameter grid for tuning
param_grid = [
    {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 1,
        'reg_alpha': 0,
        'reg_lambda': 1,
        'random_state': 42,
        'tree_method': 'hist'
    },
    {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 8,
        'learning_rate': 0.05,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'min_child_weight': 3,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'random_state': 42,
        'tree_method': 'hist'
    },
    {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 10,
        'learning_rate': 0.03,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'min_child_weight': 5,
        'reg_alpha': 0.05,
        'reg_lambda': 1.5,
        'random_state': 42,
        'tree_method': 'hist'
    }
]

best_score = float('inf')
best_params = None
param_results = []

for i, params in enumerate(param_grid):
    print(f"\nTesting parameter set {i + 1}:")
    print(f"Parameters: {params}")

    # Log current parameters being tested
    wandb.log({f"testing_param_set": i + 1})

    cv_scores = time_series_cv_xgboost(X_train_selected, y_train, params, run_name=f"ParamSet_{i+1}")
    avg_score = np.mean(cv_scores)
    std_score = np.std(cv_scores)

    # Store results
    param_results.append({
        'param_set': i + 1,
        'params': params,
        'cv_mean': avg_score,
        'cv_std': std_score,
        'cv_scores': cv_scores
    })

    # Log to wandb
    wandb.log({
        f"param_set_{i+1}_cv_mean": avg_score,
        f"param_set_{i+1}_cv_std": std_score,
        f"param_set_{i+1}_max_depth": params['max_depth'],
        f"param_set_{i+1}_learning_rate": params['learning_rate'],
        f"param_set_{i+1}_subsample": params['subsample'],
        f"param_set_{i+1}_colsample_bytree": params['colsample_bytree'],
        f"param_set_{i+1}_min_child_weight": params['min_child_weight'],
        f"param_set_{i+1}_reg_alpha": params['reg_alpha'],
        f"param_set_{i+1}_reg_lambda": params['reg_lambda']
    })

    print(f"Average CV MAE: {avg_score:.4f} (+/- {std_score*2:.4f})")

    if avg_score < best_score:
        best_score = avg_score
        best_params = params

# Log best results
wandb.config.update({
    "best_params": best_params,
    "best_cv_score": best_score,
    "n_param_sets_tested": len(param_grid)
})

wandb.log({
    "best_cv_score": best_score,
    "best_param_set": param_results[np.argmin([r['cv_mean'] for r in param_results])]['param_set']
})

print(f"\nBest parameters: {best_params}")
print(f"Best CV score: {best_score:.4f}")

wandb.finish()


Testing parameter set 1:
Parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_alpha': 0, 'reg_lambda': 1, 'random_state': 42, 'tree_method': 'hist'}
Fold 1 MAE: 8263.5537
Fold 2 MAE: 6089.4391
Fold 3 MAE: 6889.4377
Fold 4 MAE: 7887.1705
Fold 5 MAE: 5900.5510
Average CV MAE: 7006.0304 (+/- 1883.3143)

Testing parameter set 2:
Parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'max_depth': 8, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9, 'min_child_weight': 3, 'reg_alpha': 0.1, 'reg_lambda': 1, 'random_state': 42, 'tree_method': 'hist'}
Fold 1 MAE: 8315.4580
Fold 2 MAE: 6093.1814
Fold 3 MAE: 7617.5479
Fold 4 MAE: 8219.2369
Fold 5 MAE: 5789.3405
Average CV MAE: 7206.9530 (+/- 2130.2023)

Testing parameter set 3:
Parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'max_depth': 10, 'learning_rate': 0.03, 'subsample'

0,1
best_cv_score,▁
best_param_set,▁
fold_1_mae,▁▃█
fold_1_train_size,▁▁▁
fold_1_val_size,▁▁▁
fold_2_mae,██▁
fold_2_train_size,▁▁▁
fold_2_val_size,▁▁▁
fold_3_mae,▁█▃
fold_3_train_size,▁▁▁

0,1
best_cv_score,7006.03041
best_param_set,1.0
fold_1_mae,8470.03571
fold_1_train_size,70265.0
fold_1_val_size,70261.0
fold_2_mae,5962.85514
fold_2_train_size,140526.0
fold_2_val_size,70261.0
fold_3_mae,7094.67006
fold_3_train_size,210787.0


# TRAINING

In [43]:
# Initialize wandb run for final training
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Final_Training",
    group="XGBoost_Training",
    tags=["final_model", "xgboost", "pipeline"]
)

# Create full pipeline
class XGBoostPipeline(BaseEstimator):
    def __init__(self, params, feature_engineer, selected_features):
        self.params = params
        self.feature_engineer = feature_engineer
        self.selected_features = selected_features
        self.model = None

    def fit(self, X, y):
        # Transform features
        X_transformed = self.feature_engineer.fit_transform(X)
        X_selected = X_transformed[self.selected_features]

        # Train XGBoost
        dtrain = xgb.DMatrix(X_selected, label=y)
        self.model = xgb.train(
            params=self.params,
            dtrain=dtrain,
            num_boost_round=1000,
            verbose_eval=False
        )
        return self

    def predict(self, X):
        # Transform features
        X_transformed = self.feature_engineer.transform(X)
        X_selected = X_transformed[self.selected_features]

        # Make predictions
        dtest = xgb.DMatrix(X_selected)
        return self.model.predict(dtest)

# Create and train final pipeline
final_pipeline = XGBoostPipeline(best_params, feature_engineer, selected_features)
final_pipeline.fit(train, y_train)

# Log model parameters
wandb.config.update(best_params)
wandb.config.update({
    "n_selected_features": len(selected_features),
    "pipeline_components": ["feature_engineer", "xgboost_model"]
})

# Save model artifacts
model_path = "xgboost_final_model.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(final_pipeline, f)

# Log model to wandb
artifact = wandb.Artifact("xgboost_pipeline", type="model")
artifact.add_file(model_path)
wandb.log_artifact(artifact)

# Feature importance
if hasattr(final_pipeline.model, 'get_score'):
    importance = final_pipeline.model.get_score(importance_type='weight')

    # Log top 20 features
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:20]

    # Create feature importance table for wandb
    importance_data = []
    for i, (feature, score) in enumerate(sorted_importance):
        importance_data.append([i+1, feature, score])
        wandb.log({f"feature_importance_rank_{i+1:02d}": score})

    # Log as wandb table
    importance_table = wandb.Table(
        data=importance_data,
        columns=["Rank", "Feature", "Importance"]
    )
    wandb.log({"feature_importance_table": importance_table})

    # Print top features
    print("Top 10 most important features:")
    for i, (feature, score) in enumerate(sorted_importance[:10]):
        print(f"{i+1:2d}. {feature}: {score}")

print("Final model training completed and logged to wandb!")

wandb.finish()

Top 10 most important features:
 1. Dept: 19834.0
 2. Store: 6200.0
 3. Size: 5620.0
 4. Temperature: 4255.0
 5. CPI: 3874.0
 6. Fuel_Price: 3169.0
 7. Month: 2909.0
 8. Unemployment: 2859.0
 9. Fuel_Price_to_CPI: 2062.0
10. MarkDown3: 1414.0
Final model training completed and logged to wandb!


0,1
feature_importance_rank_01,▁
feature_importance_rank_02,▁
feature_importance_rank_03,▁
feature_importance_rank_04,▁
feature_importance_rank_05,▁
feature_importance_rank_06,▁
feature_importance_rank_07,▁
feature_importance_rank_08,▁
feature_importance_rank_09,▁
feature_importance_rank_10,▁

0,1
feature_importance_rank_01,19834
feature_importance_rank_02,6200
feature_importance_rank_03,5620
feature_importance_rank_04,4255
feature_importance_rank_05,3874
feature_importance_rank_06,3169
feature_importance_rank_07,2909
feature_importance_rank_08,2859
feature_importance_rank_09,2062
feature_importance_rank_10,1414


# EVALUATION

In [44]:
# Initialize wandb run for evaluation
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Evaluation",
    group="XGBoost_Training",
    tags=["evaluation", "xgboost", "final_cv"]
)

# Final cross-validation with best parameters
final_cv_scores = time_series_cv_xgboost(X_train_selected, y_train, best_params, n_splits=5)

# Log evaluation metrics
wandb.config.update({
    "final_cv_folds": 5,
    "evaluation_metric": "MAE"
})

wandb.log({
    "final_cv_mean": np.mean(final_cv_scores),
    "final_cv_std": np.std(final_cv_scores),
    "final_cv_min": np.min(final_cv_scores),
    "final_cv_max": np.max(final_cv_scores)
})

print("Model evaluation completed!")
print(f"Final CV MAE: {np.mean(final_cv_scores):.4f} (+/- {np.std(final_cv_scores)*2:.4f})")

# Log individual fold scores
for i, score in enumerate(final_cv_scores):
    wandb.log({f"final_fold_{i+1}_mae": score})

wandb.finish()

Fold 1 MAE: 8263.5537
Fold 2 MAE: 6089.4391
Fold 3 MAE: 6889.4377
Fold 4 MAE: 7887.1705
Fold 5 MAE: 5900.5510
Model evaluation completed!
Final CV MAE: 7006.0304 (+/- 1883.3143)


0,1
final_cv_max,▁
final_cv_mean,▁
final_cv_min,▁
final_cv_std,▁
final_fold_1_mae,▁
final_fold_2_mae,▁
final_fold_3_mae,▁
final_fold_4_mae,▁
final_fold_5_mae,▁
fold_1_mae,▁

0,1
final_cv_max,8263.5537
final_cv_mean,7006.03041
final_cv_min,5900.55104
final_cv_std,941.65714
final_fold_1_mae,8263.5537
final_fold_2_mae,6089.43909
final_fold_3_mae,6889.43771
final_fold_4_mae,7887.17049
final_fold_5_mae,5900.55104
fold_1_mae,8263.5537


# SUMMARY

In [45]:
# Create a summary run to consolidate all results
wandb.init(
    project="Walmart_Forecasting",
    name="XGBoost_Summary",
    group="XGBoost_Training",
    tags=["summary", "xgboost", "results"]
)

# Log summary of all experiments
wandb.log({
    "model_architecture": "XGBoost",
    "total_features_engineered": train_processed.shape[1],
    "selected_features": len(selected_features),
    "best_cv_score": best_score,
    "final_cv_mean": np.mean(final_cv_scores),
    "final_cv_std": np.std(final_cv_scores),
    "experiment_completed": True
})

wandb.config.update({
    "experiment_summary": {
        "data_preprocessing": "completed",
        "feature_selection": "completed",
        "hyperparameter_tuning": "completed",
        "final_training": "completed",
        "evaluation": "completed"
    }
})

print("XGBoost experiment completed successfully!")
print(f"Check your wandb dashboard at: https://wandb.ai/{wandb.run.entity}/{wandb.run.project}")

wandb.finish()

XGBoost experiment completed successfully!
Check your wandb dashboard at: https://wandb.ai/dshan21-free-university-of-tbilisi-/Walmart_Forecasting


0,1
best_cv_score,▁
final_cv_mean,▁
final_cv_std,▁
selected_features,▁
total_features_engineered,▁

0,1
best_cv_score,7006.03041
experiment_completed,True
final_cv_mean,7006.03041
final_cv_std,941.65714
model_architecture,XGBoost
selected_features,19
total_features_engineered,24
