In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import logging
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
import wandb

warnings.filterwarnings('ignore')


In [None]:
# Warnings are already silenced in the imports cell, so the filter is not repeated here.

# Send log records both to a file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('walmart_sales_prediction.log'),
        logging.StreamHandler()
    ],
    # basicConfig is silently ignored when the root logger already has handlers
    # (typical in notebook runtimes and when this cell is re-run). force=True
    # removes the existing handlers first so this configuration always applies.
    force=True,
)
logger = logging.getLogger(__name__)

## Loading and preprocessing the data

In [None]:
class DataLoader:
    """Loads the Walmart CSV files and logs an exploratory overview of them.

    The four frames are kept on the instance once ``load_datasets`` has run, and
    ``data_info`` accumulates every shape / summary statistic that gets logged.
    """

    def __init__(self):
        self.train_df = None
        self.test_df = None
        self.features_df = None
        self.stores_df = None
        self.data_info = {}

    def load_datasets(self, train_path, test_path, features_path, stores_path):
        """Read the four CSV files, log their shapes and run the exploration step.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test CSV.
            features_path: Path of the features CSV.
            stores_path: Path of the stores CSV.

        Returns:
            Tuple ``(train_df, test_df, features_df, stores_df)``.
        """
        logger.info("Loading datasets...")

        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)
        self.features_df = pd.read_csv(features_path)
        self.stores_df = pd.read_csv(stores_path)

        shapes = {
            'train_shape': self.train_df.shape,
            'test_shape': self.test_df.shape,
            'features_shape': self.features_df.shape,
            'stores_shape': self.stores_df.shape
        }

        # NOTE(review): wandb.log is called unconditionally here - presumably the
        # caller must have started a wandb run beforehand; confirm.
        wandb.log(shapes)
        self.data_info.update(shapes)
        self._log_dataset_info()
        return self.train_df, self.test_df, self.features_df, self.stores_df

    @staticmethod
    def _count_missing(frames):
        """Count missing values per column, separately for each frame.

        Args:
            frames: Mapping of a short frame name to a DataFrame.

        Returns:
            Series of missing-value counts labelled ``"<frame>.<column>"``,
            restricted to columns with at least one missing value and sorted in
            descending order.
        """
        # Counting per frame avoids aligning unrelated frames row-by-row, which
        # would invent missing values wherever one frame is shorter than another.
        per_frame = [
            df.isnull().sum().add_prefix(f"{name}.")
            for name, df in frames.items()
        ]
        counts = pd.concat(per_frame)
        return counts[counts > 0].sort_values(ascending=False)

    def _log_dataset_info(self):
        """Draw a 3x3 exploration figure and log it with summary statistics.

        Side effects: sets the global matplotlib style, converts
        ``self.train_df['Date']`` to datetime in place, writes
        ``data_exploration.png`` and logs the image and statistics to wandb.
        """
        logger.info("Creating data exploration visualizations")
        plt.style.use('seaborn-v0_8')
        colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#5D737E']
        fig, axes = plt.subplots(3, 3, figsize=(20, 18))
        fig.suptitle('Walmart Sales Data Exploration', fontsize=20, fontweight='bold')

        # Row 1: target distribution, per-store averages, holiday share.
        axes[0, 0].hist(self.train_df['Weekly_Sales'], bins=50, alpha=0.8,
                       color=colors[0], edgecolor='black', linewidth=0.5)
        axes[0, 0].set_title('Weekly Sales Distribution', fontsize=14, fontweight='bold')
        axes[0, 0].set_xlabel('Weekly Sales ($)')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].grid(True, alpha=0.3)

        store_sales = self.train_df.groupby('Store')['Weekly_Sales'].mean().sort_values(ascending=False)
        axes[0, 1].bar(range(len(store_sales)), store_sales.values,
                      color=colors[1], alpha=0.8)
        axes[0, 1].set_title('Average Sales by Store', fontsize=14, fontweight='bold')
        axes[0, 1].set_xlabel('Store Rank')
        axes[0, 1].set_ylabel('Average Weekly Sales ($)')
        axes[0, 1].grid(True, alpha=0.3)

        # Fix the order to [non-holiday, holiday] so the hard-coded labels always
        # match the values, whichever class happens to be more frequent.
        is_holiday = self.train_df['IsHoliday'].astype(bool)
        holiday_counts = is_holiday.value_counts().reindex([False, True], fill_value=0)
        colors_pie = [colors[2], colors[3]]
        axes[0, 2].pie(holiday_counts.values, labels=['Non-Holiday', 'Holiday'],
                      autopct='%1.1f%%', colors=colors_pie, startangle=90)
        axes[0, 2].set_title('Holiday vs Non-Holiday Distribution', fontsize=14, fontweight='bold')

        # Row 2: missing values, monthly trend, top departments.
        missing_data = self._count_missing({'train': self.train_df, 'features': self.features_df})
        if len(missing_data) > 0:
            axes[1, 0].bar(range(len(missing_data)), missing_data.values,
                          color=colors[4], alpha=0.8)
            axes[1, 0].set_title('Missing Values by Column', fontsize=14, fontweight='bold')
            axes[1, 0].set_xlabel('Columns')
            axes[1, 0].set_ylabel('Missing Count')
            axes[1, 0].set_xticks(range(len(missing_data)))
            axes[1, 0].set_xticklabels(missing_data.index, rotation=45)
        else:
            axes[1, 0].text(0.5, 0.5, 'No Missing Values', ha='center', va='center',
                           transform=axes[1, 0].transAxes, fontsize=12)
            axes[1, 0].set_title('Missing Values Status', fontsize=14, fontweight='bold')

        self.train_df['Date'] = pd.to_datetime(self.train_df['Date'])
        monthly_sales = self.train_df.groupby(self.train_df['Date'].dt.to_period('M'))['Weekly_Sales'].mean()
        axes[1, 1].plot(monthly_sales.index.astype(str), monthly_sales.values,
                       color=colors[0], linewidth=2, marker='o', markersize=4)
        axes[1, 1].set_title('Monthly Sales Trend', fontsize=14, fontweight='bold')
        axes[1, 1].set_xlabel('Month')
        axes[1, 1].set_ylabel('Average Weekly Sales ($)')
        axes[1, 1].tick_params(axis='x', rotation=45)
        axes[1, 1].grid(True, alpha=0.3)

        dept_sales = self.train_df.groupby('Dept')['Weekly_Sales'].mean().sort_values(ascending=False).head(10)
        axes[1, 2].bar(range(len(dept_sales)), dept_sales.values,
                      color=colors[1], alpha=0.8)
        axes[1, 2].set_title('Top 10 Departments by Avg Sales', fontsize=14, fontweight='bold')
        axes[1, 2].set_xlabel('Department Rank')
        axes[1, 2].set_ylabel('Average Weekly Sales ($)')
        axes[1, 2].grid(True, alpha=0.3)

        # Row 3: store types, store sizes, holiday vs non-holiday sales.
        store_type_counts = self.stores_df['Type'].value_counts()
        axes[2, 0].bar(store_type_counts.index, store_type_counts.values,
                      color=colors[2], alpha=0.8)
        axes[2, 0].set_title('Store Types Distribution', fontsize=14, fontweight='bold')
        axes[2, 0].set_xlabel('Store Type')
        axes[2, 0].set_ylabel('Count')
        axes[2, 0].grid(True, alpha=0.3)

        axes[2, 1].hist(self.stores_df['Size'], bins=30, alpha=0.8,
                       color=colors[3], edgecolor='black', linewidth=0.5)
        axes[2, 1].set_title('Store Size Distribution', fontsize=14, fontweight='bold')
        axes[2, 1].set_xlabel('Store Size')
        axes[2, 1].set_ylabel('Frequency')
        axes[2, 1].grid(True, alpha=0.3)

        # Same fixed [non-holiday, holiday] order as the pie chart above.
        holiday_sales = (
            self.train_df.groupby(is_holiday)['Weekly_Sales'].mean().reindex([False, True])
        )
        axes[2, 2].bar(['Non-Holiday', 'Holiday'], holiday_sales.values,
                      color=[colors[4], colors[0]], alpha=0.8)
        axes[2, 2].set_title('Average Sales: Holiday vs Non-Holiday', fontsize=14, fontweight='bold')
        axes[2, 2].set_ylabel('Average Weekly Sales ($)')
        axes[2, 2].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('data_exploration.png', dpi=300, bbox_inches='tight')
        wandb.log({"data_exploration": wandb.Image('data_exploration.png')})
        plt.close()

        basic_stats = {
            'total_records_train': len(self.train_df),
            'total_records_test': len(self.test_df),
            'unique_stores': self.train_df['Store'].nunique(),
            'unique_departments': self.train_df['Dept'].nunique(),
            'date_range_days': (self.train_df['Date'].max() - self.train_df['Date'].min()).days,
            'avg_weekly_sales': self.train_df['Weekly_Sales'].mean(),
            'median_weekly_sales': self.train_df['Weekly_Sales'].median(),
            'std_weekly_sales': self.train_df['Weekly_Sales'].std(),
            'min_weekly_sales': self.train_df['Weekly_Sales'].min(),
            'max_weekly_sales': self.train_df['Weekly_Sales'].max(),
            'holiday_percentage': (self.train_df['IsHoliday'].sum() / len(self.train_df)) * 100,
            'negative_sales_count': (self.train_df['Weekly_Sales'] < 0).sum(),
            'zero_sales_count': (self.train_df['Weekly_Sales'] == 0).sum()
        }

        wandb.log(basic_stats)
        self.data_info.update(basic_stats)
        logger.info("Data exploration completed and logged")
        logger.info(f"Key insights:")
        logger.info(f"Total training records: {basic_stats['total_records_train']:,}")
        logger.info(f"Unique stores: {basic_stats['unique_stores']}")
        logger.info(f"Unique departments: {basic_stats['unique_departments']}")
        logger.info(f"Average weekly sales: ${basic_stats['avg_weekly_sales']:,.2f}")
        logger.info(f"Holiday percentage: {basic_stats['holiday_percentage']:.1f}%")


class BasicPreprocessor:
    """Merges the raw frames, imputes missing values and encodes basic columns.

    ``label_encoders`` keeps the fitted encoders (keyed by column name) and
    ``preprocessing_metrics`` accumulates every statistic that gets logged.
    """

    def __init__(self):
        self.label_encoders = {}
        self.preprocessing_metrics = {}

    def merge_datasets(self, train_df, test_df, features_df, stores_df):
        """Left-join the features and stores tables onto train and test.

        Note: the ``Date`` column of ``train_df``, ``test_df`` and ``features_df``
        is converted to datetime in place on the frames passed in.

        Returns:
            Tuple ``(train_merged, test_merged)``.
        """
        logger.info("Merging datasets...")

        for df in [train_df, test_df, features_df]:
            df['Date'] = pd.to_datetime(df['Date'])

        initial_train_shape = train_df.shape
        initial_test_shape = test_df.shape

        train_merged = train_df.merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
        test_merged = test_df.merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')

        train_merged = train_merged.merge(stores_df, on='Store', how='left')
        test_merged = test_merged.merge(stores_df, on='Store', how='left')

        merge_stats = {
            'train_shape_before_merge': initial_train_shape,
            'test_shape_before_merge': initial_test_shape,
            'train_shape_after_merge': train_merged.shape,
            'test_shape_after_merge': test_merged.shape,
            'columns_added': train_merged.shape[1] - initial_train_shape[1]
        }

        wandb.log(merge_stats)
        self.preprocessing_metrics.update(merge_stats)

        logger.info(f"Datasets merged successfully")
        logger.info(f"Train shape: {initial_train_shape} → {train_merged.shape}")
        logger.info(f"Test shape: {initial_test_shape} → {test_merged.shape}")

        return train_merged, test_merged

    @staticmethod
    def _fill_within_store(df, col):
        """Forward- then backward-fill ``col`` separately inside each store.

        Both fills are grouped by ``Store`` so that a value is never carried
        across a store boundary; a store with no observed value stays missing.
        The frame is expected to be ordered by date within each store.

        Returns:
            Series aligned with ``df.index``.
        """
        forward_filled = df.groupby('Store')[col].ffill()
        return forward_filled.groupby(df['Store']).bfill()

    def handle_missing_values(self, train_df, test_df):
        """Impute missing values and encode ``IsHoliday`` / ``Type``.

        MarkDown columns are filled with 0. CPI and Unemployment are filled
        per store (forward then backward in date order), falling back to the
        training median for anything still missing. ``IsHoliday`` is cast to
        int and ``Type`` is label-encoded with an encoder fitted on train.

        Returns:
            Tuple ``(train_clean, test_clean)``, both sorted by Store and Date.
        """
        logger.info("Handling missing values...")

        train_clean = train_df.copy()
        test_clean = test_df.copy()

        missing_before = {}
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        economic_cols = ['CPI', 'Unemployment']

        for col in markdown_cols:
            missing_before[f'{col}_missing_before'] = train_clean[col].isnull().sum()
            train_clean[col] = train_clean[col].fillna(0)
            test_clean[col] = test_clean[col].fillna(0)

        # The time-ordered fill below needs rows in date order within each store;
        # sorting once is enough for every economic column.
        train_clean = train_clean.sort_values(['Store', 'Date'])
        test_clean = test_clean.sort_values(['Store', 'Date'])

        for col in economic_cols:
            missing_before[f'{col}_missing_before'] = train_clean[col].isnull().sum()

            train_clean[col] = self._fill_within_store(train_clean, col)
            test_clean[col] = self._fill_within_store(test_clean, col)

            # Stores without a single observed value fall back to the training median.
            if train_clean[col].isna().any() or test_clean[col].isna().any():
                overall_median = train_clean[col].median()
                train_clean[col] = train_clean[col].fillna(overall_median)
                test_clean[col] = test_clean[col].fillna(overall_median)

            missing_after = train_clean[col].isnull().sum()
            logger.info(f"   • {col}: {missing_before[f'{col}_missing_before']} → {missing_after} missing values")

        train_clean['IsHoliday'] = train_clean['IsHoliday'].astype(int)
        test_clean['IsHoliday'] = test_clean['IsHoliday'].astype(int)

        # The encoder is fitted on train only and reused for test.
        self.label_encoders['Type'] = LabelEncoder()
        train_clean['Type'] = self.label_encoders['Type'].fit_transform(train_clean['Type'])
        test_clean['Type'] = self.label_encoders['Type'].transform(test_clean['Type'])

        missing_stats = {
            'markdown_columns_filled': len(markdown_cols),
            'economic_columns_filled': len(economic_cols),
            'total_missing_values_handled': sum(missing_before.values())
        }

        wandb.log(missing_stats)
        self.preprocessing_metrics.update(missing_stats)

        logger.info("Missing values handled successfully")
        logger.info(f"Markdown columns filled with 0: {len(markdown_cols)}")
        logger.info(f"Economic indicators interpolated: CPI, Unemployment")
        logger.info(f"Store type encoded: {len(self.label_encoders['Type'].classes_)} classes")

        return train_clean, test_clean

    def create_preprocessing_summary(self, train_df, test_df):
        """Plot a 2x2 summary of the preprocessed data and log it to wandb.

        Writes ``preprocessing_summary.png``. The frames are returned unchanged.

        Returns:
            Tuple ``(train_df, test_df)``.
        """
        logger.info("Creating preprocessing summary...")

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Basic Preprocessing Results', fontsize=16, fontweight='bold')

        colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

        # Data completeness
        completeness = (1 - train_df.isnull().sum() / len(train_df)) * 100
        axes[0, 0].bar(range(len(completeness)), completeness.values,
                      color=colors[0], alpha=0.8)
        axes[0, 0].set_title('Data Completeness by Column', fontweight='bold')
        axes[0, 0].set_ylabel('Completeness (%)')
        axes[0, 0].set_xlabel('Column Index')
        axes[0, 0].grid(True, alpha=0.3)

        # Store type distribution after encoding
        type_dist = train_df['Type'].value_counts()
        axes[0, 1].bar(type_dist.index, type_dist.values,
                      color=colors[1], alpha=0.8)
        axes[0, 1].set_title('Store Type Distribution (Encoded)', fontweight='bold')
        axes[0, 1].set_ylabel('Count')
        axes[0, 1].set_xlabel('Store Type (Encoded)')
        axes[0, 1].grid(True, alpha=0.3)

        # Holiday distribution, pinned to [non-holiday, holiday] so the fixed
        # labels always match the bars regardless of which class is larger.
        holiday_dist = (
            train_df['IsHoliday'].astype(bool).value_counts()
            .reindex([False, True], fill_value=0)
        )
        axes[1, 0].bar(['Non-Holiday', 'Holiday'], holiday_dist.values,
                      color=[colors[2], colors[3]], alpha=0.8)
        axes[1, 0].set_title('Holiday Distribution', fontweight='bold')
        axes[1, 0].set_ylabel('Count')
        axes[1, 0].grid(True, alpha=0.3)

        # Dataset size comparison
        sizes = [train_df.shape[0], test_df.shape[0]]
        axes[1, 1].bar(['Training', 'Test'], sizes,
                      color=[colors[0], colors[1]], alpha=0.8)
        axes[1, 1].set_title('Dataset Sizes', fontweight='bold')
        axes[1, 1].set_ylabel('Number of Records')
        axes[1, 1].grid(True, alpha=0.3)
        for i, v in enumerate(sizes):
            # Value label slightly above each bar.
            axes[1, 1].text(i, v + max(sizes) * 0.01, f'{v:,}',
                           ha='center', va='bottom', fontweight='bold')
        plt.tight_layout()
        plt.savefig('preprocessing_summary.png', dpi=300, bbox_inches='tight')
        wandb.log({"preprocessing_summary": wandb.Image('preprocessing_summary.png')})
        plt.close()

        logger.info("Preprocessing summary created and logged")

        return train_df, test_df

def run_data_loading_preprocessing(train_path, test_path, features_path, stores_path):
    """Run loading, merging, imputation and the summary plot in sequence.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        features_path: Path of the features CSV.
        stores_path: Path of the stores CSV.

    Returns:
        Tuple ``(train_final, test_final, preprocessor)`` where ``preprocessor``
        is the ``BasicPreprocessor`` instance used for the run.
    """
    logger.info("Starting Data Loading and Preprocessing Pipeline")

    data_loader = DataLoader()
    preprocessor = BasicPreprocessor()

    # Each stage consumes the tuple produced by the previous one.
    raw_frames = data_loader.load_datasets(train_path, test_path, features_path, stores_path)
    merged_frames = preprocessor.merge_datasets(*raw_frames)
    cleaned_frames = preprocessor.handle_missing_values(*merged_frames)
    train_final, test_final = preprocessor.create_preprocessing_summary(*cleaned_frames)

    logger.info("Data Loading and Preprocessing completed successfully")
    return train_final, test_final, preprocessor

### Engineering Time-based and Holiday-related Features
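Create new features that capture how holidays and temporal patterns affect weekly sales, together with store/department sales statistics, markdown aggregates and economic ratios. The same class also performs feature selection and scaling.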


In [None]:
class XGBoostEngineering:
    """Feature engineering, feature selection and scaling for the sales model.

    ``run_feature_engineering`` stores working copies of the train/test frames
    on the instance; the ``add_*`` methods extend those copies. Metrics and
    figures are logged to a wandb run that is started on demand.
    """

    def __init__(self, project_name="walmart-sales-prediction"):
        self.project_name = project_name
        self.train_df = None
        self.test_df = None
        self.wandb_run = None

    def initialize_wandb(self):
        """Start the wandb run used by this instance and keep a handle to it."""
        self.wandb_run = wandb.init(
            project=self.project_name,
            name="engineering",
            tags=["feature-engineering", "feature-selection", "preprocessing", "xgboost"],
            reinit=True
        )

    def run_feature_engineering(self, train_df, test_df):
        """Build all engineered features on copies of the given frames.

        Also logs summary metrics and writes/logs
        ``feature_engineering_analysis.png``.

        Returns:
            Tuple ``(train_df, test_df)`` with the engineered columns added.
        """
        if self.wandb_run is None:
            self.initialize_wandb()

        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        initial_features = len(self.train_df.columns)

        self.add_time_features()
        self.add_holiday_features()
        self.add_statistical_features()
        self.add_markdown_features()
        self.add_economic_features()

        final_features = len(self.train_df.columns)

        feature_engineering_metrics = {
            "initial_features": initial_features,
            "final_features": final_features,
            "new_features_created": final_features - initial_features,
            "feature_density": final_features / len(self.train_df),
            "avg_total_markdown": self.train_df['Total_MarkDown'].mean(),
            "avg_economic_health": self.train_df['Economic_Health'].mean(),
            "holiday_records_ratio": self.train_df['IsHoliday'].mean()
        }

        wandb.log(feature_engineering_metrics)

        plt.figure(figsize=(12, 8))

        plt.subplot(2, 2, 1)
        plt.hist(self.train_df['Total_MarkDown'], bins=30, alpha=0.7)
        plt.title('Total Markdown Distribution')
        plt.xlabel('Total Markdown')
        plt.ylabel('Frequency')

        plt.subplot(2, 2, 2)
        plt.hist(self.train_df['Economic_Health'], bins=30, alpha=0.7)
        plt.title('Economic Health Distribution')
        plt.xlabel('Economic Health')
        plt.ylabel('Frequency')

        plt.subplot(2, 2, 3)
        monthly_sales = self.train_df.groupby('Month')['Weekly_Sales'].mean()
        plt.plot(monthly_sales.index, monthly_sales.values, marker='o')
        plt.title('Average Sales by Month')
        plt.xlabel('Month')
        plt.ylabel('Average Sales')

        plt.subplot(2, 2, 4)
        # Pin the order to [non-holiday, holiday] so the fixed labels always match.
        holiday_sales = (
            self.train_df.groupby(self.train_df['IsHoliday'].astype(bool))['Weekly_Sales']
            .mean()
            .reindex([False, True])
        )
        plt.bar(['Non-Holiday', 'Holiday'], holiday_sales.values)
        plt.title('Average Sales by Holiday Status')
        plt.ylabel('Average Sales')

        plt.tight_layout()
        plt.savefig('feature_engineering_analysis.png', dpi=200, bbox_inches='tight')
        wandb.log({"feature_engineering_analysis": wandb.Image('feature_engineering_analysis.png')})
        plt.close()

        return self.train_df, self.test_df

    def add_time_features(self):
        """Add calendar components and their sine/cosine encodings in place."""
        for df in [self.train_df, self.test_df]:
            df['Year'] = df['Date'].dt.year
            df['Month'] = df['Date'].dt.month
            # isocalendar() yields the nullable UInt32 dtype; cast to a plain int
            # so the derived sin/cos columns are ordinary float64 rather than
            # pandas extension dtypes.
            df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
            df['DayOfWeek'] = df['Date'].dt.dayofweek
            df['Quarter'] = df['Date'].dt.quarter
            df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
            df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
            df['DayOfMonth'] = df['Date'].dt.day
            df['DayOfYear'] = df['Date'].dt.dayofyear

            # Cyclical encodings so that the end of a period is close to its start.
            df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
            df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
            df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
            df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)
            df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
            df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)

    def add_holiday_features(self):
        """Flag the four named holiday dates and add ``HolidayProximity``."""
        super_bowl_dates = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        labor_day_dates = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        thanksgiving_dates = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        christmas_dates = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

        for df in [self.train_df, self.test_df]:
            df['IsSuperBowl'] = df['Date'].isin(super_bowl_dates).astype(int)
            df['IsLaborDay'] = df['Date'].isin(labor_day_dates).astype(int)
            df['IsThanksgiving'] = df['Date'].isin(thanksgiving_dates).astype(int)
            df['IsChristmas'] = df['Date'].isin(christmas_dates).astype(int)

        self.train_df = self.add_holiday_proximity(self.train_df)
        self.test_df = self.add_holiday_proximity(self.test_df)

    def add_holiday_proximity(self, df):
        """Return a copy of ``df`` with a ``HolidayProximity`` column.

        For every date flagged ``IsHoliday == 1`` in ``df`` itself, rows within
        14 days get -2 / -1 (two / one week before), 0 (the holiday) or 1 / 2
        (one / two weeks after). Rows outside every window keep the initial 0,
        so 0 means either "holiday" or "not near a holiday".
        """
        df = df.copy()
        df['HolidayProximity'] = 0

        holidays = df.loc[df['IsHoliday'] == 1, 'Date'].unique()

        for holiday in holidays:
            days_diff = (df['Date'] - holiday).dt.days

            mask_2weeks_before = (days_diff >= -14) & (days_diff < -7)
            mask_1week_before = (days_diff >= -7) & (days_diff < 0)
            mask_holiday = (days_diff == 0)
            mask_1week_after = (days_diff > 0) & (days_diff <= 7)
            mask_2weeks_after = (days_diff > 7) & (days_diff <= 14)

            df.loc[mask_2weeks_before, 'HolidayProximity'] = -2
            df.loc[mask_1week_before, 'HolidayProximity'] = -1
            df.loc[mask_holiday, 'HolidayProximity'] = 0
            df.loc[mask_1week_after, 'HolidayProximity'] = 1
            df.loc[mask_2weeks_after, 'HolidayProximity'] = 2

        return df

    def add_statistical_features(self):
        """Merge mean/std/median of training sales per Store, Dept and Store+Dept.

        The statistics come from ``self.train_df`` only and are joined onto both
        frames; values still missing after the join are filled with the training
        mean of the respective statistic.
        """
        store_stats = self.train_df.groupby('Store')['Weekly_Sales'].agg(['mean', 'std', 'median']).reset_index()
        store_stats.columns = ['Store', 'Store_Sales_Mean', 'Store_Sales_Std', 'Store_Sales_Median']

        dept_stats = self.train_df.groupby('Dept')['Weekly_Sales'].agg(['mean', 'std', 'median']).reset_index()
        dept_stats.columns = ['Dept', 'Dept_Sales_Mean', 'Dept_Sales_Std', 'Dept_Sales_Median']

        store_dept_stats = self.train_df.groupby(['Store', 'Dept'])['Weekly_Sales'].agg(['mean', 'std', 'median']).reset_index()
        store_dept_stats.columns = ['Store', 'Dept', 'StoreDept_Sales_Mean', 'StoreDept_Sales_Std', 'StoreDept_Sales_Median']

        self.train_df = self.train_df.merge(store_stats, on='Store', how='left')
        self.test_df = self.test_df.merge(store_stats, on='Store', how='left')

        self.train_df = self.train_df.merge(dept_stats, on='Dept', how='left')
        self.test_df = self.test_df.merge(dept_stats, on='Dept', how='left')

        self.train_df = self.train_df.merge(store_dept_stats, on=['Store', 'Dept'], how='left')
        self.test_df = self.test_df.merge(store_dept_stats, on=['Store', 'Dept'], how='left')

        stat_cols = ['Store_Sales_Mean', 'Store_Sales_Std', 'Store_Sales_Median',
                    'Dept_Sales_Mean', 'Dept_Sales_Std', 'Dept_Sales_Median',
                    'StoreDept_Sales_Mean', 'StoreDept_Sales_Std', 'StoreDept_Sales_Median']

        for col in stat_cols:
            if col in self.train_df.columns:
                mean_val = self.train_df[col].mean()
                self.train_df[col] = self.train_df[col].fillna(mean_val)
                self.test_df[col] = self.test_df[col].fillna(mean_val)

    def add_markdown_features(self):
        """Add total, active count, maximum and size-relative markdown features."""
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']

        for df in [self.train_df, self.test_df]:
            df['Total_MarkDown'] = df[markdown_cols].sum(axis=1)
            df['Active_MarkDowns'] = (df[markdown_cols] > 0).sum(axis=1)
            df['Max_MarkDown'] = df[markdown_cols].max(axis=1)
            # +1 keeps the denominator non-zero for a store of size 0.
            df['MarkDown_Intensity'] = df['Total_MarkDown'] / (df['Size'] + 1)

    def add_economic_features(self):
        """Add economic ratios and mean-normalised economic indicators.

        The ``*_Normalized`` columns divide by the *training* mean for both
        frames, so a given raw value maps to the same feature value in train
        and test.
        """
        reference_means = {
            col: self.train_df[col].mean()
            for col in ('CPI', 'Unemployment', 'Fuel_Price')
        }

        for df in [self.train_df, self.test_df]:
            df['Economic_Health'] = df['CPI'] / df['Unemployment']
            df['Fuel_Impact'] = df['Fuel_Price'] / df['CPI']
            for col, train_mean in reference_means.items():
                df[f'{col}_Normalized'] = df[col] / train_mean

    def run_feature_selection(self, train_df, test_df, n_features=50):
        """Select the top-scoring features with a univariate F-test.

        Args:
            train_df: Training frame containing ``Weekly_Sales``.
            test_df: Test frame (not used for the selection itself).
            n_features: Number of features to keep; values larger than the
                number of candidate columns are reduced to that number.

        Returns:
            Tuple ``(selected_features, feature_importance_df)`` where
            ``selected_features`` follows the column order of ``train_df`` and
            ``feature_importance_df`` is sorted by descending score.
        """
        if self.wandb_run is None:
            self.initialize_wandb()

        exclude_cols = ['Weekly_Sales', 'Date']
        feature_cols = [col for col in train_df.columns if col not in exclude_cols]

        X = train_df[feature_cols].fillna(0)
        y = train_df['Weekly_Sales']

        # Never ask for more features than exist; string values such as 'all'
        # are passed through unchanged.
        k = n_features if isinstance(n_features, str) else min(n_features, len(feature_cols))
        selector = SelectKBest(score_func=f_regression, k=k)
        selector.fit(X, y)

        selected_features = [feature_cols[i] for i in selector.get_support(indices=True)]
        feature_scores = selector.scores_

        feature_importance_df = pd.DataFrame({
            'feature': feature_cols,
            'score': feature_scores,
            'selected': selector.get_support()
        }).sort_values('score', ascending=False)

        selected_rows = feature_importance_df[feature_importance_df['selected']]

        selection_metrics = {
            "total_features": len(feature_cols),
            "selected_features": len(selected_features),
            "selection_ratio": len(selected_features) / len(feature_cols),
            "avg_feature_score": feature_scores.mean(),
            "max_feature_score": feature_scores.max(),
            "min_selected_score": selected_rows['score'].min()
        }

        wandb.log(selection_metrics)

        plt.figure(figsize=(10, 6))
        top_features = feature_importance_df.head(20)
        colors = ['red' if selected else 'blue' for selected in top_features['selected']]
        plt.barh(range(len(top_features)), top_features['score'], color=colors)
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Feature Score')
        plt.title('Top 20 Features (Red = Selected)')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig('feature_selection_analysis.png', dpi=200, bbox_inches='tight')
        wandb.log({"feature_selection_analysis": wandb.Image('feature_selection_analysis.png')})
        plt.close()

        # Take name and score from the same rows so each feature is paired with
        # its own score (the frame is sorted by score, the name list is not).
        wandb.log({
            "selected_features_table": wandb.Table(
                data=selected_rows[['feature', 'score']].values.tolist(),
                columns=["Feature", "Score"]
            )
        })

        return selected_features, feature_importance_df

    def run_data_preprocessing(self, train_df, test_df, selected_features):
        """Standardise the selected features and build holiday sample weights.

        The scaler is fitted on the training features and applied to both
        frames. Rows with ``IsHoliday == 1`` get weight 5, all others weight 1.
        Also writes/logs ``preprocessing_analysis.png``.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, weights, scaler)``.
        """
        if self.wandb_run is None:
            self.initialize_wandb()

        X_train = train_df[selected_features].fillna(0)
        y_train = train_df['Weekly_Sales']
        X_test = test_df[selected_features].fillna(0)

        weights = np.where(train_df['IsHoliday'] == 1, 5, 1)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Back to DataFrames so column names and the original index are kept.
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=selected_features, index=X_train.index)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=selected_features, index=X_test.index)

        preprocessing_metrics = {
            "train_samples": len(X_train_scaled),
            "test_samples": len(X_test_scaled),
            "features_count": len(selected_features),
            "holiday_samples": np.sum(weights == 5),
            "non_holiday_samples": np.sum(weights == 1),
            "feature_mean_after_scaling": X_train_scaled.mean().mean(),
            "feature_std_after_scaling": X_train_scaled.std().mean(),
            "target_mean": y_train.mean(),
            "target_std": y_train.std()
        }

        wandb.log(preprocessing_metrics)

        plt.figure(figsize=(12, 4))

        plt.subplot(1, 3, 1)
        plt.hist(y_train, bins=30, alpha=0.7)
        plt.title('Target Distribution')
        plt.xlabel('Weekly Sales')
        plt.ylabel('Frequency')

        plt.subplot(1, 3, 2)
        plt.hist(weights, bins=10, alpha=0.7)
        plt.title('Sample Weights Distribution')
        plt.xlabel('Weight')
        plt.ylabel('Frequency')

        plt.subplot(1, 3, 3)
        correlation_matrix = X_train_scaled.corr()
        plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
        plt.title('Feature Correlation Matrix')
        plt.colorbar()

        plt.tight_layout()
        plt.savefig('preprocessing_analysis.png', dpi=200, bbox_inches='tight')
        wandb.log({"preprocessing_analysis": wandb.Image('preprocessing_analysis.png')})
        plt.close()

        return X_train_scaled, X_test_scaled, y_train, weights, scaler

    def finish_wandb(self):
        """Finish the wandb run, if one was started by this instance."""
        if self.wandb_run is not None:
            wandb.finish()
            # Drop the stale handle so a later run_* call starts a fresh run
            # instead of logging without an active one.
            self.wandb_run = None

def get_feature_columns(df):
    """Return the column names of ``df`` except the target and the date.

    ``Weekly_Sales`` and ``Date`` are left out; every other column is kept in
    its original order.
    """
    non_feature_columns = ('Weekly_Sales', 'Date')
    feature_columns = []
    for column_name in df.columns:
        if column_name in non_feature_columns:
            continue
        feature_columns.append(column_name)
    return feature_columns

# Key steps for training:

* Prepare training and validation data (80/20 split stratified on `IsHoliday`), giving holiday weeks a weight of 5 in the error metric.

* Train an XGBoost regression model with custom hyperparameters.

* Evaluate performance using MAE, RMSE, R², and a custom Weighted MAE.

* Visualize results: learning curves, residuals, feature importances.

* Generate and analyze predictions on test data.

* Log everything to wandb for reproducibility and monitoring.

In [None]:
# Logging setup for the training cell. An earlier cell of this notebook already
# calls logging.basicConfig with a file and a stream handler.
# NOTE(review): basicConfig normally does nothing once the root logger has
# handlers, so this call is probably redundant - confirm before removing.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class XGBoostTrainer:
    """Train, evaluate and visualise an XGBoost regressor for weekly sales.

    The trainer splits the engineered training frame into train/validation
    parts, fits an ``xgb.XGBRegressor`` with holiday rows up-weighted,
    computes MAE / MSE / RMSE / R2 and a weighted MAE, produces diagnostic
    figures, and predicts on the test frame. Metrics and figures are sent to
    wandb only when a run has been started through ``initialize_wandb``.
    """

    # Sample weight given to rows where IsHoliday == 1; all other rows weigh 1.
    HOLIDAY_WEIGHT = 5

    def __init__(self, project_name="walmart-sales-prediction"):
        """Create an untrained trainer.

        Args:
            project_name: wandb project that ``initialize_wandb`` logs to.
        """
        self.project_name = project_name
        self.model = None               # fitted xgb.XGBRegressor, set by train_model
        self.feature_importance = None  # DataFrame set by analyze_feature_importance
        self.training_metrics = {}      # accumulates everything logged during training
        self.scaler = None              # not assigned anywhere in this class
        self.selected_features = None   # feature list used by run_complete_training
        self.wandb_run = None           # handle returned by wandb.init, or None

    def initialize_wandb(self, run_name="training"):
        """Start a wandb run for this trainer unless one is already tracked.

        Args:
            run_name: Display name of the wandb run.
        """
        if self.wandb_run is None:
            self.wandb_run = wandb.init(
                project=self.project_name,
                name=run_name,
                tags=["training", "xgboost", "model"],
                reinit=True
            )

    def weighted_mean_absolute_error(self, y_true, y_pred, weights):
        """Compute ``sum(w * |y_true - y_pred|) / sum(w)``.

        All three inputs are converted to plain NumPy arrays first so the
        arithmetic is strictly positional. Without that, pandas objects with
        different indexes would be aligned label-by-label, yielding NaNs that
        are then silently skipped by the sum.

        Args:
            y_true: Actual values (array-like).
            y_pred: Predicted values (array-like, same length).
            weights: Per-sample weights (array-like, same length).

        Returns:
            The weighted mean absolute error as a NumPy float.
        """
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        sample_weights = np.asarray(weights, dtype=float)
        return np.sum(sample_weights * np.abs(actual - predicted)) / np.sum(sample_weights)

    def prepare_training_data(self, train_df, selected_features):
        """Build the train/validation split and the per-row sample weights.

        Missing feature values are replaced by 0. Rows with ``IsHoliday == 1``
        receive weight ``HOLIDAY_WEIGHT``, all others weight 1. The split keeps
        20% of the rows for validation and is stratified on ``IsHoliday``.

        Args:
            train_df: Frame containing ``selected_features``, ``Weekly_Sales``
                and ``IsHoliday``.
            selected_features: Names of the feature columns to use.

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val, w_train, w_val)``.
        """
        logger.info("Preparing training data...")

        X = train_df[selected_features].fillna(0)
        y = train_df['Weekly_Sales']

        is_holiday = (train_df['IsHoliday'] == 1).to_numpy()
        weights = np.where(is_holiday, self.HOLIDAY_WEIGHT, 1)

        # The holiday mask is split together with the data so holiday counts
        # per split do not have to be inferred from the weight values.
        (X_train, X_val, y_train, y_val,
         w_train, w_val, holiday_train, holiday_val) = train_test_split(
            X, y, weights, is_holiday, test_size=0.2, random_state=42,
            stratify=train_df['IsHoliday']
        )

        prep_metrics = {
            'total_training_samples': len(X),
            'train_split_samples': len(X_train),
            'val_split_samples': len(X_val),
            'total_features': len(selected_features),
            'holiday_samples': int(is_holiday.sum()),
            'non_holiday_samples': int((~is_holiday).sum()),
            'holiday_weight_ratio': float(self.HOLIDAY_WEIGHT)
        }

        if self.wandb_run is not None:
            wandb.log(prep_metrics)
        self.training_metrics.update(prep_metrics)

        logger.info(f"Training data prepared: {len(X_train)} train, {len(X_val)} validation samples")
        logger.info(f"Holiday samples: {int(holiday_train.sum())} train, {int(holiday_val.sum())} validation")

        return X_train, X_val, y_train, y_val, w_train, w_val

    def train_model(self, X_train, y_train, X_val, y_val, w_train, w_val):
        """Fit the XGBoost regressor and store it in ``self.model``.

        Both the training and the validation split are passed as evaluation
        sets (with their sample weights), MAE is the evaluation metric and
        early stopping is configured with a patience of 50 rounds.

        Args:
            X_train, y_train, w_train: Training features, target and weights.
            X_val, y_val, w_val: Validation features, target and weights.

        Returns:
            float: Wall-clock training time in seconds.
        """
        logger.info("Starting XGBoost model training...")

        training_start_time = datetime.now()

        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
            'max_depth': 8,
            'learning_rate': 0.05,
            'n_estimators': 2700,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.8,
            'reg_alpha': 1,
            'reg_lambda': 1,
            'random_state': 42,
            'n_jobs': -1,
            'tree_method': 'hist',
            'early_stopping_rounds': 50
        }

        if self.wandb_run is not None:
            wandb.log({"model_params": params})

        self.model = xgb.XGBRegressor(**params)

        logger.info("Training XGBoost model...")
        self.model.fit(
            X_train, y_train,
            sample_weight=w_train,
            # Order matters: create_training_plots reads 'validation_0' as the
            # training curve and 'validation_1' as the validation curve.
            eval_set=[(X_train, y_train), (X_val, y_val)],
            sample_weight_eval_set=[w_train, w_val],
            verbose=100
        )

        training_duration = (datetime.now() - training_start_time).total_seconds()

        logger.info(f"Training completed in {training_duration:.2f} seconds")

        return training_duration

    def _split_metrics(self, prefix, y_true, y_pred, weights):
        """Return WMAE, MAE, MSE, RMSE and R2 for one split, keyed ``<prefix>_<metric>``."""
        mse = mean_squared_error(y_true, y_pred)
        return {
            f'{prefix}_wmae': self.weighted_mean_absolute_error(y_true, y_pred, weights),
            f'{prefix}_mae': mean_absolute_error(y_true, y_pred),
            f'{prefix}_mse': mse,
            f'{prefix}_rmse': np.sqrt(mse),
            f'{prefix}_r2': r2_score(y_true, y_pred),
        }

    def evaluate_model(self, X_train, y_train, X_val, y_val, w_train, w_val, training_duration):
        """Score the fitted model on the training and validation splits.

        Args:
            X_train, y_train, w_train: Training features, target and weights.
            X_val, y_val, w_val: Validation features, target and weights.
            training_duration: Seconds spent in ``train_model``; stored with
                the metrics.

        Returns:
            Tuple ``(train_pred, val_pred, metrics)`` where ``metrics`` holds
            ``train_*`` / ``val_*`` entries for wmae, mae, mse, rmse and r2,
            plus ``training_duration_seconds`` and ``overfitting_ratio``
            (validation WMAE divided by training WMAE).
        """
        logger.info("Evaluating model performance...")

        train_pred = self.model.predict(X_train)
        val_pred = self.model.predict(X_val)

        train_scores = self._split_metrics('train', y_train, train_pred, w_train)
        val_scores = self._split_metrics('val', y_val, val_pred, w_val)

        # Both WMAE values are NumPy floats, so a zero training error yields
        # inf/nan here rather than raising ZeroDivisionError.
        overfitting_ratio = val_scores['val_wmae'] / train_scores['train_wmae']

        metrics = {
            **train_scores,
            **val_scores,
            'training_duration_seconds': training_duration,
            'overfitting_ratio': overfitting_ratio
        }

        if self.wandb_run is not None:
            wandb.log(metrics)
        self.training_metrics.update(metrics)

        logger.info(f"Training WMAE: {metrics['train_wmae']:.2f}")
        logger.info(f"Validation WMAE: {metrics['val_wmae']:.2f}")
        logger.info(f"Training MAE: {metrics['train_mae']:.2f}")
        logger.info(f"Validation MAE: {metrics['val_mae']:.2f}")
        logger.info(f"Training R²: {metrics['train_r2']:.4f}")
        logger.info(f"Validation R²: {metrics['val_r2']:.4f}")
        logger.info(f"Overfitting ratio: {overfitting_ratio:.3f}")

        return train_pred, val_pred, metrics

    def analyze_feature_importance(self, feature_names):
        """Rank features by the fitted model's ``feature_importances_``.

        The ranking is stored in ``self.feature_importance``; the top 15 are
        written to the log and the top 20 are sent to wandb as a table when a
        run is active.

        Args:
            feature_names: Feature names in the column order used for fitting.

        Returns:
            DataFrame with columns ``feature`` and ``importance``, sorted by
            descending importance.
        """
        logger.info("Analyzing feature importance...")

        self.feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        logger.info("Top 15 Most Important Features:")
        for row in self.feature_importance.head(15).itertuples(index=False):
            logger.info(f"{row.feature}: {row.importance:.4f}")

        if self.wandb_run is not None:
            wandb.log({
                "feature_importance_table": wandb.Table(
                    data=self.feature_importance.head(20).values.tolist(),
                    columns=["Feature", "Importance"]
                )
            })

        return self.feature_importance

    @staticmethod
    def _plot_actual_vs_predicted(actual, predicted, title):
        """Scatter actual against predicted values on the current axes, with the y = x line."""
        plt.scatter(actual, predicted, alpha=0.5, s=1)
        plt.plot([actual.min(), actual.max()], [actual.min(), actual.max()], 'r--', lw=2)
        plt.xlabel('Actual Sales')
        plt.ylabel('Predicted Sales')
        plt.title(title)

    @staticmethod
    def _plot_residuals(predicted, residuals, title):
        """Scatter residuals against predictions on the current axes, with a zero line."""
        plt.scatter(predicted, residuals, alpha=0.5, s=1)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Sales')
        plt.ylabel('Residuals')
        plt.title(title)

    @staticmethod
    def _plot_train_val_hist(train_values, val_values, xlabel, title):
        """Overlay density histograms of a training and a validation quantity on the current axes."""
        plt.hist(train_values, bins=50, alpha=0.7, label='Training', density=True)
        plt.hist(val_values, bins=50, alpha=0.7, label='Validation', density=True)
        plt.xlabel(xlabel)
        plt.ylabel('Density')
        plt.title(title)
        plt.legend()

    def create_training_plots(self, X_train, y_train, X_val, y_val, train_pred, val_pred):
        """Save a 2x4 grid of training diagnostics to ``training_analysis.png``.

        Panels: actual vs predicted (train, validation), residuals (train,
        validation), top-15 feature importances, residual distribution,
        absolute-error distribution and the MAE learning curve. The image is
        also logged to wandb when a run is active.

        Args:
            X_train: Training features; only used to recover feature names when
                ``analyze_feature_importance`` has not been called yet.
            y_train, y_val: Actual targets of the two splits.
            X_val: Unused; kept for interface compatibility.
            train_pred, val_pred: Model predictions for the two splits.
        """
        logger.info("Creating training visualization plots...")

        # The importance panel needs the ranking; build it on demand instead of
        # failing on None when this method is called first.
        if self.feature_importance is None:
            self.analyze_feature_importance(list(X_train.columns))

        residuals_train = y_train - train_pred
        residuals_val = y_val - val_pred

        fig = plt.figure(figsize=(20, 15))
        try:
            plt.subplot(2, 4, 1)
            self._plot_actual_vs_predicted(y_train, train_pred, 'Training: Actual vs Predicted')

            plt.subplot(2, 4, 2)
            self._plot_actual_vs_predicted(y_val, val_pred, 'Validation: Actual vs Predicted')

            plt.subplot(2, 4, 3)
            self._plot_residuals(train_pred, residuals_train, 'Training Residuals')

            plt.subplot(2, 4, 4)
            self._plot_residuals(val_pred, residuals_val, 'Validation Residuals')

            plt.subplot(2, 4, 5)
            top_features = self.feature_importance.head(15)
            bar_positions = range(len(top_features))
            plt.barh(bar_positions, top_features['importance'])
            plt.yticks(bar_positions, top_features['feature'])
            plt.xlabel('Importance')
            plt.title('Top 15 Feature Importances')
            plt.gca().invert_yaxis()  # most important feature at the top

            plt.subplot(2, 4, 6)
            self._plot_train_val_hist(residuals_train, residuals_val,
                                      'Residuals', 'Residuals Distribution')

            plt.subplot(2, 4, 7)
            self._plot_train_val_hist(np.abs(residuals_train), np.abs(residuals_val),
                                      'Absolute Error', 'Absolute Error Distribution')

            plt.subplot(2, 4, 8)
            learning_curve = self.model.evals_result()
            if learning_curve:
                # eval_set order in train_model: 0 = training, 1 = validation.
                plt.plot(learning_curve['validation_0']['mae'], label='Training MAE')
                plt.plot(learning_curve['validation_1']['mae'], label='Validation MAE')
                plt.xlabel('Iterations')
                plt.ylabel('MAE')
                plt.title('Learning Curve')
                plt.legend()

            plt.tight_layout()
            plt.savefig('training_analysis.png', dpi=300, bbox_inches='tight')
            if self.wandb_run is not None:
                wandb.log({"training_analysis": wandb.Image('training_analysis.png')})
        finally:
            # Release the figure even if a panel or the upload fails.
            plt.close(fig)

        logger.info("Training plots created and saved")

    def generate_predictions(self, test_df, selected_features):
        """Predict weekly sales for the test frame.

        Args:
            test_df: Frame containing every column in ``selected_features``.
            selected_features: Feature columns, in the order used for fitting.

        Returns:
            Array of predictions, one per row of ``test_df``. Missing feature
            values are replaced by 0 before predicting, as for training.
        """
        logger.info("Generating test predictions...")

        X_test = test_df[selected_features].fillna(0)
        return self.model.predict(X_test)

    def create_prediction_analysis(self, test_df, predictions):
        """Summarise the test predictions with plots and statistics.

        Saves a 2x3 figure to ``prediction_analysis.png`` (overall
        distribution, monthly trend, per-store mean, holiday vs non-holiday
        mean, top-10 departments, box plot by holiday status) and logs both
        the figure and the summary statistics to wandb when a run is active.

        Args:
            test_df: Frame with ``Date``, ``Store``, ``Dept`` and ``IsHoliday``
                columns; it is copied, not modified.
            predictions: Predicted sales, one value per row of ``test_df``.

        Returns:
            dict: Summary statistics of the predictions.
        """
        logger.info("Creating prediction analysis plots...")

        # Positional array: keeps row order regardless of any index on the input.
        predictions = np.asarray(predictions)

        test_df_copy = test_df.copy()
        test_df_copy['Predictions'] = predictions
        # Accept a Date column that is still stored as text; a no-op for datetimes.
        dates = pd.to_datetime(test_df_copy['Date'])

        holiday_preds = test_df_copy.loc[test_df_copy['IsHoliday'] == 1, 'Predictions']
        non_holiday_preds = test_df_copy.loc[test_df_copy['IsHoliday'] == 0, 'Predictions']

        fig = plt.figure(figsize=(15, 10))
        try:
            plt.subplot(2, 3, 1)
            plt.hist(predictions, bins=50, alpha=0.7, edgecolor='black')
            plt.title('Prediction Distribution')
            plt.xlabel('Predicted Sales')
            plt.ylabel('Frequency')

            plt.subplot(2, 3, 2)
            monthly_pred = test_df_copy.groupby(dates.dt.to_period('M'))['Predictions'].mean()
            monthly_pred.plot()
            plt.title('Monthly Prediction Trend')
            plt.xlabel('Month')
            plt.ylabel('Average Predicted Sales')
            plt.xticks(rotation=45)

            plt.subplot(2, 3, 3)
            store_pred = test_df_copy.groupby('Store')['Predictions'].mean()
            store_pred.plot(kind='bar')
            plt.title('Average Predictions by Store')
            plt.xlabel('Store')
            plt.ylabel('Average Predicted Sales')
            plt.xticks(rotation=45)

            plt.subplot(2, 3, 4)
            holiday_pred = test_df_copy.groupby('IsHoliday')['Predictions'].mean()
            holiday_pred.plot(kind='bar')
            plt.title('Holiday vs Non-Holiday Predictions')
            plt.xlabel('IsHoliday')
            plt.ylabel('Average Predicted Sales')

            plt.subplot(2, 3, 5)
            dept_pred = (test_df_copy.groupby('Dept')['Predictions'].mean()
                         .sort_values(ascending=False).head(10))
            dept_pred.plot(kind='bar')
            plt.title('Top 10 Departments by Predicted Sales')
            plt.xlabel('Department')
            plt.ylabel('Average Predicted Sales')
            plt.xticks(rotation=45)

            plt.subplot(2, 3, 6)
            plt.boxplot([non_holiday_preds, holiday_preds])
            # Label the two boxes (drawn at positions 1 and 2) through xticks;
            # boxplot's `labels` keyword is deprecated in recent Matplotlib.
            plt.xticks([1, 2], ['Non-Holiday', 'Holiday'])
            plt.title('Prediction Distribution by Holiday Status')
            plt.ylabel('Predicted Sales')

            plt.tight_layout()
            plt.savefig('prediction_analysis.png', dpi=300, bbox_inches='tight')
            if self.wandb_run is not None:
                wandb.log({"prediction_analysis": wandb.Image('prediction_analysis.png')})
        finally:
            # Release the figure even if a panel or the upload fails.
            plt.close(fig)

        pred_stats = {
            'prediction_mean': predictions.mean(),
            'prediction_median': np.median(predictions),
            'prediction_std': predictions.std(),
            'prediction_min': predictions.min(),
            'prediction_max': predictions.max(),
            'negative_predictions': np.sum(predictions < 0),
            'zero_predictions': np.sum(predictions == 0),
            'holiday_avg_prediction': holiday_preds.mean(),
            'non_holiday_avg_prediction': non_holiday_preds.mean()
        }

        if self.wandb_run is not None:
            wandb.log(pred_stats)
        logger.info(f"Prediction statistics: {pred_stats}")

        return pred_stats

    def run_complete_training(self, train_df, test_df, selected_features):
        """Run the whole training workflow end to end.

        Starts a wandb run named "training", splits the data, fits and
        evaluates the model, ranks the features, creates the diagnostic
        figures and predicts on the test frame. The wandb run is left open;
        call ``finish_wandb`` (or ``wandb.finish``) to close it.

        Args:
            train_df: Engineered training frame.
            test_df: Engineered test frame.
            selected_features: Feature columns to train and predict on.

        Returns:
            Tuple ``(model, test_predictions, metrics, pred_stats)``.
        """
        self.initialize_wandb("training")
        self.selected_features = selected_features

        X_train, X_val, y_train, y_val, w_train, w_val = self.prepare_training_data(
            train_df, selected_features
        )

        training_duration = self.train_model(X_train, y_train, X_val, y_val, w_train, w_val)

        train_pred, val_pred, metrics = self.evaluate_model(
            X_train, y_train, X_val, y_val, w_train, w_val, training_duration
        )

        self.analyze_feature_importance(selected_features)

        self.create_training_plots(X_train, y_train, X_val, y_val, train_pred, val_pred)

        test_predictions = self.generate_predictions(test_df, selected_features)

        pred_stats = self.create_prediction_analysis(test_df, test_predictions)

        return self.model, test_predictions, metrics, pred_stats

    def finish_wandb(self):
        """Close the wandb run started by this trainer, if any, and forget it."""
        if self.wandb_run is not None:
            wandb.finish()
            self.wandb_run = None


def main(data_dir='/content/drive/MyDrive/ML/final'):
    """Run the full pipeline: load, engineer, select, train, predict, export.

    Reads ``train.csv``, ``test.csv``, ``features.csv`` and ``stores.csv``
    from ``data_dir``, trains the XGBoost model and writes the predictions to
    ``walmart_sales_predictions.csv`` in the working directory. Progress and
    results are logged to wandb; the active wandb run is always finished
    before this function returns or raises.

    Args:
        data_dir: Directory holding the four input CSV files.

    Returns:
        Tuple ``(model, submission, training_metrics)``.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
            with its original traceback.
    """
    # NOTE(review): XGBoostTrainer.initialize_wandb calls wandb.init with
    # reinit=True, so the metrics logged at the end of this function appear to
    # land in the trainer's run rather than in this one - confirm intended.
    wandb.init(
        project="walmart-sales-prediction",
        name=f"walmart_sales_complete_pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
        config={
            "model_type": "XGBoost",
            "objective": "sales_forecasting",
            "dataset": "walmart_sales",
            "pipeline_version": "v1.0"
        }
    )

    logger.info(" Starting Walmart Sales Prediction Pipeline")
    pipeline_start_time = datetime.now()

    train_path = f'{data_dir}/train.csv'
    test_path = f'{data_dir}/test.csv'
    features_path = f'{data_dir}/features.csv'
    stores_path = f'{data_dir}/stores.csv'

    try:
        logger.info(" Step 1: Data Loading and Preprocessing")
        train_df, test_df, preprocessor = run_data_loading_preprocessing(
            train_path, test_path, features_path, stores_path
        )

        logger.info(" Step 2: Feature Engineering")
        feature_engineer = XGBoostEngineering()
        train_engineered, test_engineered = feature_engineer.run_feature_engineering(train_df, test_df)

        logger.info(" Step 3: Feature Selection")
        selected_features, feature_importance_df = feature_engineer.run_feature_selection(
            train_engineered, test_engineered, n_features=50
        )

        logger.info(" Step 4: Data Preprocessing for Training")
        # The scaled matrices returned here are not passed on: the trainer
        # below receives the engineered frames directly.
        X_train_scaled, X_test_scaled, y_train, weights, scaler = feature_engineer.run_data_preprocessing(
            train_engineered, test_engineered, selected_features
        )

        logger.info("Step 5: XGBoost Model Training")
        trainer = XGBoostTrainer()
        model, test_predictions, training_metrics, pred_stats = trainer.run_complete_training(
            train_engineered, test_engineered, selected_features
        )

        logger.info("Step 6: Creating Submission File")
        # Id format: <Store>_<Dept>_<YYYY-MM-DD>
        submission = pd.DataFrame({
            'Id': test_engineered['Store'].astype(str) + '_' +
                  test_engineered['Dept'].astype(str) + '_' +
                  test_engineered['Date'].dt.strftime('%Y-%m-%d'),
            'Weekly_Sales': test_predictions
        })

        total_pipeline_duration = (datetime.now() - pipeline_start_time).total_seconds()

        final_metrics = {
            'submission_records': len(submission),
            # Counts distinct Ids, i.e. distinct Store/Dept/Date triples.
            'unique_store_dept_combinations': len(submission['Id'].unique()),
            'total_pipeline_duration_seconds': total_pipeline_duration,
            'total_pipeline_duration_minutes': total_pipeline_duration / 60,
            'pipeline_success': True
        }

        wandb.log(final_metrics)

        submission.to_csv('walmart_sales_predictions.csv', index=False)
        logger.info("Predictions saved to 'walmart_sales_predictions.csv'")

        wandb.log({
            "submission_sample": wandb.Table(
                data=submission.head(10).values.tolist(),
                columns=["Id", "Weekly_Sales"]
            )
        })

        wandb.save('walmart_sales_predictions.csv')

        logger.info("Pipeline completed successfully!")
        logger.info(f"Total pipeline duration: {total_pipeline_duration:.2f} seconds ({total_pipeline_duration/60:.2f} minutes)")
        logger.info(f"Submission shape: {submission.shape}")
        logger.info(f"Best validation WMAE: {training_metrics.get('val_wmae', 'N/A')}")
        logger.info(f"Model R² score: {training_metrics.get('val_r2', 'N/A')}")

        logger.info("Submission preview:")
        logger.info(submission.head().to_string())

        return model, submission, training_metrics

    except Exception as e:
        logger.error(f"Pipeline failed with error: {str(e)}")
        # Only report to wandb while a run is active; logging without one
        # would raise and hide the error that actually broke the pipeline.
        if wandb.run is not None:
            wandb.log({"pipeline_success": False, "error": str(e)})
        raise

    finally:
        # Single exit point: close whichever run is active, on success or failure.
        wandb.finish()


# Entry point: run the whole pipeline when this code is executed directly
# (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    # main() returns (model, submission, training_metrics); bind them at module level.
    model, submission, metrics = main()

0,1
avg_economic_health,▁
avg_feature_score,▁
avg_total_markdown,▁
feature_density,▁
final_features,▁
holiday_records_ratio,▁
initial_features,▁
max_feature_score,▁
min_selected_score,▁
new_features_created,▁

0,1
avg_economic_health,22.83277
avg_feature_score,179647.37661
avg_total_markdown,6684.04143
feature_density,0.00013
final_features,54.0
holiday_records_ratio,0.07036
initial_features,16.0
max_feature_score,3935213.15216
min_selected_score,0.0
new_features_created,38.0


0,1
avg_weekly_sales,▁
columns_added,▁
date_range_days,▁
economic_columns_filled,▁
holiday_percentage,▁
markdown_columns_filled,▁
max_weekly_sales,▁
median_weekly_sales,▁
min_weekly_sales,▁
negative_sales_count,▁

0,1
avg_weekly_sales,15981.25812
columns_added,11.0
date_range_days,994.0
economic_columns_filled,2.0
holiday_percentage,7.03584
markdown_columns_filled,5.0
max_weekly_sales,693099.36
median_weekly_sales,7612.03
min_weekly_sales,-4988.94
negative_sales_count,1285.0


0,1
avg_economic_health,▁
avg_feature_score,▁
avg_total_markdown,▁
feature_density,▁
feature_mean_after_scaling,▁
feature_std_after_scaling,▁
features_count,▁
final_features,▁
holiday_records_ratio,▁
holiday_samples,▁

0,1
avg_economic_health,22.83277
avg_feature_score,179647.37661
avg_total_markdown,6684.04143
feature_density,0.00013
feature_mean_after_scaling,-0.0
feature_std_after_scaling,0.98
features_count,50.0
final_features,54.0
holiday_records_ratio,0.07036
holiday_samples,29661.0


[0]	validation_0-mae:14651.87986	validation_1-mae:14751.21257
[100]	validation_0-mae:1816.70698	validation_1-mae:1977.11188
[200]	validation_0-mae:1514.50806	validation_1-mae:1714.66426
[300]	validation_0-mae:1357.38421	validation_1-mae:1587.32525
[400]	validation_0-mae:1250.19743	validation_1-mae:1508.59793
[500]	validation_0-mae:1165.05481	validation_1-mae:1447.37914
[600]	validation_0-mae:1099.58853	validation_1-mae:1404.92889
[700]	validation_0-mae:1046.96577	validation_1-mae:1371.55526
[800]	validation_0-mae:1006.80519	validation_1-mae:1348.66384
[900]	validation_0-mae:966.38259	validation_1-mae:1325.13685
[1000]	validation_0-mae:933.31555	validation_1-mae:1307.21165
[1100]	validation_0-mae:901.73826	validation_1-mae:1290.35726
[1200]	validation_0-mae:873.27153	validation_1-mae:1275.68019
[1300]	validation_0-mae:845.87437	validation_1-mae:1261.97867
[1400]	validation_0-mae:821.91526	validation_1-mae:1250.91354
[1500]	validation_0-mae:798.44644	validation_1-mae:1239.27698
[1600]	va

0,1
holiday_avg_prediction,▁
holiday_samples,▁
holiday_weight_ratio,▁
negative_predictions,▁
non_holiday_avg_prediction,▁
non_holiday_samples,▁
overfitting_ratio,▁
prediction_max,▁
prediction_mean,▁
prediction_median,▁

0,1
holiday_avg_prediction,18752.87109
holiday_samples,29661
holiday_weight_ratio,5
negative_predictions,2671
non_holiday_avg_prediction,16568.99023
non_holiday_samples,391909
overfitting_ratio,1.89662
pipeline_success,True
prediction_max,616618.375
prediction_mean,16738.44141
