# 🔬 BigMart Sales - Proper Train/Validation Split Approach

This notebook implements a robust validation methodology by:

1. **Loading original training data**
2. **Creating proper train/validation splits** 
3. **Saving splits separately** (train_splitted.csv, validation_splitted.csv)
4. **Training only on train_splitted.csv**
5. **Validating on validation_splitted.csv** (unseen data)

This approach ensures true validation performance by eliminating any potential data leakage between training and validation phases.

In [1]:
# 1. Import Libraries and Setup
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, GroupKFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("🔬 BigMart Sales - Proper Train/Validation Split Approach")
print("=" * 60)
print("✅ Libraries imported successfully")
print(f"🎲 Random state set to: {RANDOM_STATE}")

🔬 BigMart Sales - Proper Train/Validation Split Approach
✅ Libraries imported successfully
🎲 Random state set to: 42


In [2]:
# 2. Load Original Data, Create ONE Split, and Save to CSV Files
print("📊 Loading Original Data, Creating Split, and Saving...")
print("-" * 60)

# Load original training data
train_data_raw = pd.read_csv('code/train_data.csv')
print(f"✅ Original training data loaded: {train_data_raw.shape}")

# Create ONE train/validation split using GroupKFold
cv_strategy = GroupKFold(n_splits=5)
groups = train_data_raw['Item_Identifier']

# Get the first split (80/20 split approximately)
train_idx, val_idx = next(cv_strategy.split(train_data_raw, train_data_raw['Item_Outlet_Sales'], groups))

# Create train and validation datasets
train_data_split = train_data_raw.iloc[train_idx].copy()
validation_data_split = train_data_raw.iloc[val_idx].copy()

print(f"✅ Data split created:")
print(f"   • Training split: {train_data_split.shape} ({len(train_idx)/len(train_data_raw)*100:.1f}%)")
print(f"   • Validation split: {validation_data_split.shape} ({len(val_idx)/len(train_data_raw)*100:.1f}%)")

# Verify no item overlap
train_items = set(train_data_split['Item_Identifier'])
val_items = set(validation_data_split['Item_Identifier'])
overlap = train_items.intersection(val_items)
print(f"   • Overlap: {len(overlap)} (should be 0) - {'✅ GOOD' if len(overlap) == 0 else '❌ BAD'}")

# Save splits to CSV files
output_dir = Path("data_splits")
output_dir.mkdir(exist_ok=True)

train_split_path = output_dir / "train_data_splitted.csv"
validation_split_path = output_dir / "validation_data_splitted.csv"

train_data_split.to_csv(train_split_path, index=False)
validation_data_split.to_csv(validation_split_path, index=False)

print(f"\n💾 Files Saved:")
print(f"   • {train_split_path.name}")
print(f"   • {validation_split_path.name}")

# Clear all data from memory
del train_data_raw, train_data_split, validation_data_split, train_idx, val_idx, groups, train_items, val_items
print(f"\n🧹 Memory cleared - ready for clean training process")

📊 Loading Original Data, Creating Split, and Saving...
------------------------------------------------------------
✅ Original training data loaded: (8523, 12)
✅ Data split created:
   • Training split: (6818, 12) (80.0%)
   • Validation split: (1705, 12) (20.0%)
   • Overlap: 0 (should be 0) - ✅ GOOD

💾 Files Saved:
   • train_data_splitted.csv
   • validation_data_splitted.csv

🧹 Memory cleared - ready for clean training process


In [3]:
# 3. Define BigMartPreprocessor Class
class BigMartPreprocessor(BaseEstimator, TransformerMixin):
    """Complete preprocessing pipeline for BigMart sales data"""
    
    def __init__(self):
        self.item_stats = None
        self.outlet_stats = None
        self.item_target_mean = None
        self.overall_mean = None
        self.outlet_size_mode = {}
        self.is_fitted = False
        
    def fit(self, X, y=None):
        if y is not None:
            self.overall_mean = y.mean()
            X_temp = X.copy()
            X_temp['target'] = y
            
            # Item statistics
            self.item_stats = X_temp.groupby('Item_Identifier')['target'].agg([
                'mean', 'median', 'std', 'count'
            ]).add_prefix('Item_')
            
            # Outlet statistics  
            self.outlet_stats = X_temp.groupby('Outlet_Identifier')['target'].agg([
                'mean', 'median', 'std', 'count'
            ]).add_prefix('Outlet_')
            
            # Target encoding
            self.item_target_mean = X_temp.groupby('Item_Identifier')['target'].mean().to_dict()
        
        # Outlet size mode by outlet type
        if 'Outlet_Size' in X.columns and X['Outlet_Size'].isnull().any():
            self.outlet_size_mode = X.groupby('Outlet_Type')['Outlet_Size'].apply(
                lambda x: x.mode().iloc[0] if not x.mode().empty else 'Medium'
            ).to_dict()
        
        self.is_fitted = True
        return self
    
    def transform(self, X):
        X_processed = X.copy()
        
        # 1. Handle missing values
        if 'Item_Weight' in X_processed.columns and X_processed['Item_Weight'].isnull().any():
            weight_median_by_type = X_processed.groupby('Item_Type')['Item_Weight'].median()
            for item_type in X_processed['Item_Type'].unique():
                mask = (X_processed['Item_Type'] == item_type) & X_processed['Item_Weight'].isnull()
                if mask.any():
                    median_val = weight_median_by_type.get(item_type, X_processed['Item_Weight'].median())
                    X_processed.loc[mask, 'Item_Weight'] = median_val
        
        if 'Outlet_Size' in X_processed.columns and X_processed['Outlet_Size'].isnull().any():
            for outlet_type, mode_size in self.outlet_size_mode.items():
                mask = (X_processed['Outlet_Type'] == outlet_type) & X_processed['Outlet_Size'].isnull()
                if mask.any():
                    X_processed.loc[mask, 'Outlet_Size'] = mode_size
        
        # 2. Enhanced Item_Identifier features
        X_processed['Item_Category'] = X_processed['Item_Identifier'].str[:2]
        item_numeric = X_processed['Item_Identifier'].str[2:]
        X_processed['Item_Number'] = pd.to_numeric(item_numeric, errors='coerce').fillna(0).astype(int)
        
        category_mapping = {'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'}
        X_processed['Item_Category_Group'] = X_processed['Item_Category'].map(category_mapping)
        
        # Target encoding
        if self.item_target_mean is not None:
            X_processed['Item_Target_Encoded'] = X_processed['Item_Identifier'].map(self.item_target_mean)
            X_processed['Item_Target_Encoded'].fillna(self.overall_mean, inplace=True)
        
        # 3. Add statistics
        if self.item_stats is not None:
            X_processed = X_processed.merge(self.item_stats, left_on='Item_Identifier', right_index=True, how='left')
        if self.outlet_stats is not None:
            X_processed = X_processed.merge(self.outlet_stats, left_on='Outlet_Identifier', right_index=True, how='left')
        
        # 4. Feature engineering
        X_processed['Item_MRP_Bin'] = pd.cut(X_processed['Item_MRP'], bins=4, labels=['Low', 'Medium', 'High', 'Premium'])
        X_processed['Outlet_Age'] = 2013 - X_processed['Outlet_Establishment_Year']
        X_processed['Outlet_Age_Group'] = pd.cut(X_processed['Outlet_Age'], bins=[0, 10, 20, 30], labels=['New', 'Medium', 'Old'])
        X_processed['Item_Visibility_Binned'] = pd.cut(X_processed['Item_Visibility'], bins=5, labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High'])
        
        food_categories = ['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables', 
                          'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
                          'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
                          'Breads', 'Starchy Foods', 'Others', 'Seafood']
        X_processed['Item_Type_Category'] = X_processed['Item_Type'].apply(
            lambda x: 'Food' if x in food_categories else 'Non-Food'
        )
        
        # 5. Encode categorical variables
        categorical_cols = X_processed.select_dtypes(include=['object', 'category']).columns.tolist()
        id_cols = ['Item_Identifier', 'Outlet_Identifier']
        categorical_cols = [col for col in categorical_cols if col not in id_cols]
        
        if categorical_cols:
            X_encoded = pd.get_dummies(X_processed, columns=categorical_cols, drop_first=True)
        else:
            X_encoded = X_processed
        
        # Keep Item_Identifier for GroupKFold, remove Outlet_Identifier
        feature_cols = [col for col in X_encoded.columns if col != 'Outlet_Identifier']
        X_final = X_encoded[feature_cols]
        
        return X_final

print("✅ BigMartPreprocessor class defined")

✅ BigMartPreprocessor class defined


In [4]:
# 4. Load ONLY Training Split and Perform Complete Model Development
print("🚀 Loading Training Split and Starting Model Development...")
print("=" * 60)

# Load ONLY the training split
train_data = pd.read_csv("data_splits/train_data_splitted.csv")
print(f"✅ Training split loaded: {train_data.shape}")
print(f"   • Unique items: {train_data['Item_Identifier'].nunique()}")
print(f"   • Target mean: ${train_data['Item_Outlet_Sales'].mean():.2f}")

# Prepare features and target
X_train_raw = train_data.drop('Item_Outlet_Sales', axis=1)
y_train = train_data['Item_Outlet_Sales']

print(f"📊 Training Data:")
print(f"   • Features: {X_train_raw.shape}")
print(f"   • Target: {y_train.shape}")

# Fit preprocessing pipeline on training split only
preprocessor = BigMartPreprocessor()
preprocessor.fit(X_train_raw, y_train)
print("✅ Preprocessing pipeline fitted")

# Transform training data
X_train_processed = preprocessor.transform(X_train_raw)
print(f"✅ Training data transformed: {X_train_raw.shape} → {X_train_processed.shape}")

# Prepare data for modeling (remove Item_Identifier)
X_model = X_train_processed.drop('Item_Identifier', axis=1) if 'Item_Identifier' in X_train_processed.columns else X_train_processed
print(f"📊 Model data ready: {X_model.shape}")

# Setup GroupKFold cross-validation on training split
cv_strategy = GroupKFold(n_splits=5)
cv_groups = X_train_processed['Item_Identifier'] if 'Item_Identifier' in X_train_processed.columns else X_train_raw['Item_Identifier']

print(f"\n🔀 Cross-Validation Setup (Training Split Only):")
print(f"   • Strategy: GroupKFold (5 splits)")
print(f"   • Groups: {cv_groups.nunique()} unique items")
print(f"   • Total records: {len(X_model)}")

# Train Random Forest model with cross-validation
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print(f"\n🤖 Performing Cross-Validation...")
cv_scores_r2 = cross_val_score(rf_model, X_model, y_train, cv=cv_strategy, 
                               groups=cv_groups, scoring='r2', n_jobs=-1)
cv_scores_neg_rmse = cross_val_score(rf_model, X_model, y_train, cv=cv_strategy, 
                                     groups=cv_groups, scoring='neg_root_mean_squared_error', n_jobs=-1)
cv_scores_rmse = -cv_scores_neg_rmse

print(f"\n📊 Cross-Validation Results:")
print(f"   • R² Mean: {cv_scores_r2.mean():.4f} ± {cv_scores_r2.std():.4f}")
print(f"   • RMSE Mean: ${cv_scores_rmse.mean():.2f} ± ${cv_scores_rmse.std():.2f}")

# Train final model on full training split
print(f"\n🎯 Training Final Model on Complete Training Split...")
rf_model.fit(X_model, y_train)
print("✅ Final model trained")

# Save the model and pipeline
models_dir = Path("trained_models")
models_dir.mkdir(exist_ok=True)

timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
model_path = models_dir / f"best_model_{timestamp}.pkl"
pipeline_path = models_dir / f"preprocessor_{timestamp}.pkl"

joblib.dump(rf_model, model_path)
joblib.dump(preprocessor, pipeline_path)

print(f"💾 Model saved: {model_path.name}")
print(f"💾 Pipeline saved: {pipeline_path.name}")

print(f"\n🎯 Training Complete - Ready for Validation Test!")
print(f"   • Cross-validation R²: {cv_scores_r2.mean():.4f}")
print(f"   • Cross-validation RMSE: ${cv_scores_rmse.mean():.2f}")
print(f"   • Next: Load validation split and test final performance")

🚀 Loading Training Split and Starting Model Development...
✅ Training split loaded: (6818, 12)
   • Unique items: 1247
   • Target mean: $2166.09
📊 Training Data:
   • Features: (6818, 11)
   • Target: (6818,)
✅ Preprocessing pipeline fitted
✅ Training data transformed: (6818, 11) → (6818, 55)
📊 Model data ready: (6818, 54)

🔀 Cross-Validation Setup (Training Split Only):
   • Strategy: GroupKFold (5 splits)
   • Groups: 1247 unique items
   • Total records: 6818

🤖 Performing Cross-Validation...

📊 Cross-Validation Results:
   • R² Mean: 0.6962 ± 0.0133
   • RMSE Mean: $935.43 ± $27.64

🎯 Training Final Model on Complete Training Split...
✅ Final model trained
💾 Model saved: best_model_20250906_160015.pkl
💾 Pipeline saved: preprocessor_20250906_160015.pkl

🎯 Training Complete - Ready for Validation Test!
   • Cross-validation R²: 0.6962
   • Cross-validation RMSE: $935.43
   • Next: Load validation split and test final performance


In [5]:
# 5. Load Validation Split and Test Final Model Performance
print("🧪 Loading Validation Split and Testing Final Model...")
print("=" * 60)

# Load validation split (completely unseen data)
validation_data = pd.read_csv("data_splits/validation_data_splitted.csv")
print(f"✅ Validation split loaded: {validation_data.shape}")
print(f"   • Unique items: {validation_data['Item_Identifier'].nunique()}")

# Prepare validation features and target
X_val_raw = validation_data.drop('Item_Outlet_Sales', axis=1)
y_val = validation_data['Item_Outlet_Sales']

print(f"📊 Validation Data:")
print(f"   • Features: {X_val_raw.shape}")
print(f"   • Target: {y_val.shape}")
print(f"   • Target mean: ${y_val.mean():.2f}")

# Apply preprocessing pipeline (fitted on training data only)
print(f"\n🔧 Applying Preprocessing Pipeline...")
X_val_processed = preprocessor.transform(X_val_raw)
print(f"✅ Validation data transformed: {X_val_raw.shape} → {X_val_processed.shape}")

# Prepare for model prediction (remove Item_Identifier)
X_val_model = X_val_processed.drop('Item_Identifier', axis=1) if 'Item_Identifier' in X_val_processed.columns else X_val_processed
print(f"📊 Validation model data: {X_val_model.shape}")

# Make predictions
print(f"\n🔮 Making Predictions...")
y_pred = rf_model.predict(X_val_model)

# Calculate metrics
r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)

print(f"\n📈 FINAL VALIDATION RESULTS:")
print("=" * 50)
print(f"📊 R² Score: {r2:.4f}")
print(f"💰 RMSE: ${rmse:.2f}")
print(f"📊 MAE: ${mae:.2f}")

print(f"\n📊 Cross-Validation vs Validation Comparison:")
print(f"   • CV R²: {cv_scores_r2.mean():.4f} → Validation R²: {r2:.4f}")
print(f"   • CV RMSE: ${cv_scores_rmse.mean():.2f} → Validation RMSE: ${rmse:.2f}")

# Performance assessment
print(f"\n🎯 Performance Assessment:")
if abs(r2 - cv_scores_r2.mean()) <= 0.05 and abs(rmse - cv_scores_rmse.mean()) <= 100:
    print("✅ EXCELLENT! Validation performance matches cross-validation")
    print("   • No overfitting detected")
    print("   • Model generalizes well to unseen data")
elif r2 >= 0.65 and rmse <= 1000:
    print("✅ GOOD! Validation performance is acceptable")
    print("   • Meets performance thresholds")
else:
    print("⚠️ CONCERNS: Validation performance below expectations")
    print("   • May need model improvement")

print(f"\n🎉 VALIDATION COMPLETE!")
print(f"   • True validation on completely unseen data")
print(f"   • No data leakage between training and validation")
print(f"   • Robust performance assessment")

🧪 Loading Validation Split and Testing Final Model...
✅ Validation split loaded: (1705, 12)
   • Unique items: 312
📊 Validation Data:
   • Features: (1705, 11)
   • Target: (1705,)
   • Target mean: $2242.07

🔧 Applying Preprocessing Pipeline...
✅ Validation data transformed: (1705, 11) → (1705, 55)
📊 Validation model data: (1705, 54)

🔮 Making Predictions...

📈 FINAL VALIDATION RESULTS:
📊 R² Score: 0.2088
💰 RMSE: $1535.87
📊 MAE: $1064.87

📊 Cross-Validation vs Validation Comparison:
   • CV R²: 0.6962 → Validation R²: 0.2088
   • CV RMSE: $935.43 → Validation RMSE: $1535.87

🎯 Performance Assessment:
⚠️ CONCERNS: Validation performance below expectations
   • May need model improvement

🎉 VALIDATION COMPLETE!
   • True validation on completely unseen data
   • No data leakage between training and validation
   • Robust performance assessment


In [8]:
# 6. Save Baseline Models and Results for Future Improvement
print("💾 Saving Baseline Models and Results...")
print("=" * 60)

# Create baseline directory
baseline_dir = Path("baseline_models")
baseline_dir.mkdir(exist_ok=True)

# Generate timestamp for baseline
baseline_timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")

# Save models with baseline naming
baseline_model_path = baseline_dir / f"baseline_model_{baseline_timestamp}.pkl"
baseline_pipeline_path = baseline_dir / f"baseline_preprocessor_{baseline_timestamp}.pkl"

joblib.dump(rf_model, baseline_model_path)
joblib.dump(preprocessor, baseline_pipeline_path)

print(f"✅ Baseline model saved: {baseline_model_path.name}")
print(f"✅ Baseline pipeline saved: {baseline_pipeline_path.name}")

# Save baseline performance results
baseline_results = {
    "timestamp": baseline_timestamp,
    "model_type": "RandomForestRegressor",
    "random_state": RANDOM_STATE,
    "data_split": {
        "method": "GroupKFold",
        "train_records": len(train_data),
        "validation_records": len(validation_data),
        "train_items": train_data['Item_Identifier'].nunique(),
        "validation_items": validation_data['Item_Identifier'].nunique()
    },
    "cross_validation": {
        "cv_strategy": "GroupKFold_5_splits",
        "r2_mean": float(cv_scores_r2.mean()),
        "r2_std": float(cv_scores_r2.std()),
        "rmse_mean": float(cv_scores_rmse.mean()),
        "rmse_std": float(cv_scores_rmse.std()),
        "individual_folds": {
            "r2_scores": cv_scores_r2.tolist(),
            "rmse_scores": cv_scores_rmse.tolist()
        }
    },
    "validation_performance": {
        "r2_score": float(r2),
        "rmse": float(rmse),
        "mae": float(mae),
        "target_mean_train": float(train_data['Item_Outlet_Sales'].mean()),
        "target_mean_validation": float(validation_data['Item_Outlet_Sales'].mean())
    },
    "performance_gap": {
        "r2_gap": float(cv_scores_r2.mean() - r2),
        "rmse_gap": float(rmse - cv_scores_rmse.mean()),
        "overfitting_detected": bool(abs(cv_scores_r2.mean() - r2) > 0.05)
    },
    "model_parameters": {
        "n_estimators": rf_model.n_estimators,
        "max_depth": rf_model.max_depth,
        "min_samples_split": rf_model.min_samples_split,
        "min_samples_leaf": rf_model.min_samples_leaf
    },
    "feature_info": {
        "original_features": X_train_raw.shape[1],
        "engineered_features": X_model.shape[1],
        "feature_names": X_model.columns.tolist()
    },
    "data_sources": {
        "train_data": "data_splits/train_data_splitted.csv",
        "validation_data": "data_splits/validation_data_splitted.csv",
        "original_data": "code/train_data.csv"
    }
}

# Save results to JSON
baseline_results_path = baseline_dir / f"baseline_results_{baseline_timestamp}.json"
with open(baseline_results_path, 'w') as f:
    json.dump(baseline_results, f, indent=2)

print(f"✅ Baseline results saved: {baseline_results_path.name}")

# Create baseline summary file
baseline_summary = f"""
# BigMart Sales - Baseline Model Summary
Generated: {baseline_timestamp}

## 📊 Baseline Performance
- **Cross-Validation R²**: {cv_scores_r2.mean():.4f} ± {cv_scores_r2.std():.4f}
- **Cross-Validation RMSE**: ${cv_scores_rmse.mean():.2f} ± ${cv_scores_rmse.std():.2f}
- **Validation R²**: {r2:.4f}
- **Validation RMSE**: ${rmse:.2f}

## 🚨 Key Issues Identified
- **Overfitting**: Validation R² is {abs(cv_scores_r2.mean() - r2):.4f} lower than CV
- **Poor Generalization**: Model doesn't generalize to unseen items
- **Performance Gap**: {((cv_scores_r2.mean() - r2) / cv_scores_r2.mean() * 100):.1f}% performance loss

## 📁 Data Setup
- **Training Data**: train_data_splitted.csv ({len(train_data)} records, {train_data['Item_Identifier'].nunique()} items)
- **Validation Data**: validation_data_splitted.csv ({len(validation_data)} records, {validation_data['Item_Identifier'].nunique()} items)
- **No Data Leakage**: ✅ Confirmed

## 🎯 Next Steps for Improvement
1. **Reduce Item-Specific Overfitting**: Remove/modify item statistics and target encoding
2. **Focus on Generalizable Features**: Emphasize features that work for unseen items
3. **Regularization**: Add model regularization to prevent overfitting
4. **Feature Selection**: Remove features that don't generalize well
5. **Alternative Models**: Try models less prone to overfitting

## 📦 Saved Files
- Model: {baseline_model_path.name}
- Pipeline: {baseline_pipeline_path.name}
- Results: {baseline_results_path.name}
"""

baseline_summary_path = baseline_dir / f"baseline_summary_{baseline_timestamp}.md"
with open(baseline_summary_path, 'w', encoding='utf-8') as f:
    f.write(baseline_summary)

print(f"✅ Baseline summary saved: {baseline_summary_path.name}")

print(f"\n🎯 BASELINE ESTABLISHED!")
print("=" * 50)
print(f"📊 Cross-Validation Performance: R² = {cv_scores_r2.mean():.4f}, RMSE = ${cv_scores_rmse.mean():.2f}")
print(f"📊 Validation Performance: R² = {r2:.4f}, RMSE = ${rmse:.2f}")
print(f"🚨 Performance Gap: R² drops by {abs(cv_scores_r2.mean() - r2):.4f}")
print(f"\n📁 Standard Data Files:")
print(f"   • Training: data_splits/train_data_splitted.csv")
print(f"   • Validation: data_splits/validation_data_splitted.csv")
print(f"\n🚀 Ready for model improvement iterations!")
print(f"   • Use baseline as comparison point")
print(f"   • Focus on reducing overfitting")
print(f"   • Improve generalization to unseen items")

💾 Saving Baseline Models and Results...
✅ Baseline model saved: baseline_model_20250906_160544.pkl
✅ Baseline pipeline saved: baseline_preprocessor_20250906_160544.pkl
✅ Baseline results saved: baseline_results_20250906_160544.json
✅ Baseline summary saved: baseline_summary_20250906_160544.md

🎯 BASELINE ESTABLISHED!
📊 Cross-Validation Performance: R² = 0.6962, RMSE = $935.43
📊 Validation Performance: R² = 0.2088, RMSE = $1535.87
🚨 Performance Gap: R² drops by 0.4874

📁 Standard Data Files:
   • Training: data_splits/train_data_splitted.csv
   • Validation: data_splits/validation_data_splitted.csv

🚀 Ready for model improvement iterations!
   • Use baseline as comparison point
   • Focus on reducing overfitting
   • Improve generalization to unseen items
