In [None]:
# BigMart Sales Prediction - Comprehensive Modeling Pipeline
# Colab-Ready Version with Progressive Complexity

In [None]:
# =============================================================================
# INSTALLATION AND IMPORTS
# =============================================================================

# Install required packages
!pip install lightgbm xgboost catboost optuna scikit-learn --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Linear Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Tree-based Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Advanced Models
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Optimization
import optuna
from optuna.samplers import TPESampler

import warnings
import json
import os
# NOTE(review): this silences every Python warning for the rest of the session,
# so deprecation/convergence warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')
# Only WARNING-and-above messages from Optuna are logged (per-trial INFO lines are hidden).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Set random seed for reproducibility
# (this seeds NumPy's global RNG only; estimators below are given their own random_state)
np.random.seed(42)
plt.style.use('default')

print("✓ All packages imported successfully!")

In [None]:
# =============================================================================
# 1. DATA LOADING AND PREPARATION - FIXED FOR COLAB
# =============================================================================

def load_processed_data(path='train_processed.csv'):
    """Load the preprocessed training table written by the EDA notebook.

    Args:
        path: Location of the processed CSV file. Defaults to
            ``'train_processed.csv'`` in the current working directory, which
            is what the original zero-argument call read.

    Returns:
        pandas.DataFrame with the file contents, or ``None`` when the file
        does not exist (a hint is printed so the notebook can carry on and
        skip the modeling cells).
    """
    print("Loading processed data...")

    try:
        # First try to load from previous notebook output
        train_df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"❌ {path} not found!")
        print("Please run the first notebook (EDA & Feature Engineering) first")
        print("Or upload your processed data file")
        return None

    # Kept outside the try block so only the read itself is guarded.
    print(f"✓ Loaded processed training data: {train_df.shape}")
    return train_df

def prepare_modeling_data(df, target_col='Item_Outlet_Sales'):
    """Split a processed frame into a numeric feature matrix and the target.

    The identifier columns (``Item_Identifier``, ``Outlet_Identifier``) and the
    target are excluded from the features. Object, string and categorical
    columns are replaced by integer category codes; any column that is still
    not numeric afterwards is dropped.

    Args:
        df: Processed training DataFrame, or ``None``.
        target_col: Name of the target column in ``df``.

    Returns:
        Tuple ``(X, y, feature_cols)`` where ``X`` is an all-numeric DataFrame,
        ``y`` the target Series and ``feature_cols`` the list of column names
        in ``X``. Returns ``(None, None, None)`` when ``df`` is ``None``.

    Raises:
        KeyError: if ``target_col`` is not a column of ``df``.
    """
    if df is None:
        return None, None, None

    print("Preparing data for modeling...")

    # Separate features and target
    exclude_cols = ['Item_Identifier', 'Outlet_Identifier', target_col]
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    X = df[feature_cols].copy()
    y = df[target_col].copy()

    print("Converting categorical columns to numeric...")
    categorical_converted = 0
    for col in feature_cols:
        col_dtype = X[col].dtype
        needs_encoding = (
            isinstance(col_dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(col_dtype)
            or pd.api.types.is_string_dtype(col_dtype)
        )
        if not needs_encoding:
            continue
        try:
            # Categorical codes come back as int8; widen them so they are an
            # ordinary integer column. Missing values are encoded as -1.
            X[col] = pd.Categorical(X[col]).codes.astype('int64')
            categorical_converted += 1
            print(f"   - Converted {col} to numeric codes")
        except Exception as e:
            print(f"   - Warning: Could not convert {col}: {e}")
            # If conversion fails, drop the column
            X = X.drop(columns=col)

    print(f"   ✓ Converted {categorical_converted} categorical columns to numeric")

    # Verify all columns are numeric. A dtype-kind check is used instead of a
    # whitelist of dtype names so int8/uint8/bool columns (category codes,
    # one-hot dummies) are not discarded by mistake.
    non_numeric_cols = [
        col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])
    ]

    if non_numeric_cols:
        print(f"   - Dropping non-numeric columns: {non_numeric_cols}")
        X = X.drop(columns=non_numeric_cols)

    # Report exactly the columns that survived encoding and filtering.
    feature_cols = X.columns.tolist()

    print(f"✓ Features: {len(feature_cols)}")
    print(f"✓ Target variable: {target_col}")
    print(f"✓ Sample size: {len(X)}")
    print("✓ All features are now numeric")

    return X, y, feature_cols

# Read the processed table; X / y / feature_cols are only bound when that succeeds,
# which is what the later cells test for before doing any modeling.
train_df = load_processed_data()

if train_df is None:
    print("Cannot proceed without processed data. Please run EDA notebook first.")
else:
    X, y, feature_cols = prepare_modeling_data(train_df)
    print("\nTarget variable statistics:")
    print(y.describe())

In [None]:
# =============================================================================
# 2. FEATURE SELECTION 
# =============================================================================

def perform_feature_selection(X, y, k=15):
    """Keep the ``k`` features with the highest univariate F-score.

    Fits ``SelectKBest(f_regression)`` on the full ``X``/``y``, prints the
    retained features ranked by score and shows them as a horizontal bar chart.

    Args:
        X: Feature DataFrame (column names are used for reporting), or ``None``.
        y: Target values, or ``None``.
        k: Number of features to keep; capped at the number of columns in ``X``.

    Returns:
        Tuple ``(X_selected, selector, selected_features)``: the reduced feature
        array returned by the selector, the fitted selector and the list of
        retained column names. ``(None, None, None)`` if ``X`` or ``y`` is ``None``.
    """
    if X is None or y is None:
        return None, None, None

    rule = '=' * 50
    print(f"\n{rule}")
    print(f"FEATURE SELECTION (SelectKBest, k={k})")
    print(f"{rule}")

    # Never request more features than the frame actually has.
    n_keep = min(k, X.shape[1])
    selector = SelectKBest(score_func=f_regression, k=n_keep)
    X_selected = selector.fit_transform(X, y)

    # Boolean mask over the original columns marking the retained ones.
    keep_mask = selector.get_support()
    selected_features = X.columns[keep_mask].tolist()

    ranking = pd.DataFrame({
        'Feature': selected_features,
        'F_Score': selector.scores_[keep_mask],
    })
    feature_importance_df = ranking.sort_values('F_Score', ascending=False)

    print("Selected Features (ranked by F-score):")
    print(feature_importance_df.to_string(index=False, float_format='%.2f'))

    # Horizontal bars, best feature on top.
    bar_positions = range(len(feature_importance_df))
    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, feature_importance_df['F_Score'])
    plt.yticks(bar_positions, feature_importance_df['Feature'])
    plt.xlabel('F-Score')
    plt.title(f'Top {len(selected_features)} Features by F-Score')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    return X_selected, selector, selected_features

# Perform feature selection
# The locals() lookup guards against X never having been bound: the data-loading
# cell only assigns X when the processed file was found.
if 'X' in locals() and X is not None:
    X_selected, feature_selector, selected_features = perform_feature_selection(X, y, k=15)
    print(f"\n✓ Selected features shape: {X_selected.shape if X_selected is not None else 'N/A'}")

In [None]:
# =============================================================================
# 3. MODEL EVALUATION FRAMEWORK 
# =============================================================================

class ModelEvaluator:
    """Cross-validated evaluation of regressors with a shared results table.

    Each call to :meth:`evaluate_model` runs a shuffled K-fold evaluation and
    stores RMSE / MAE / R² means and standard deviations under the model's
    name in ``self.results``. :meth:`get_results_summary` and
    :meth:`plot_results` present everything collected so far.
    """

    # Result fields that hold numbers. ``DataFrame(results).T`` yields object
    # dtype (the records also contain strings and bools), so these columns are
    # converted explicitly before sorting, rounding or plotting.
    _METRIC_COLS = ('rmse_mean', 'rmse_std', 'mae_mean', 'mae_std', 'r2_mean', 'r2_std')

    def __init__(self, cv_folds=5, random_state=42):
        """Store the CV configuration.

        Args:
            cv_folds: Number of K-fold splits used by ``evaluate_model``.
            random_state: Seed for the shuffled K-fold split.
        """
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.results = {}

    def evaluate_model(self, model, X, y, model_name, use_scaling=False):
        """Evaluate ``model`` with shuffled K-fold cross-validation.

        Args:
            model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
                refit in place on every fold.
            X: Feature matrix (NumPy array or DataFrame); rows are samples.
            y: Target values (Series or array) aligned with ``X``.
            model_name: Key under which the results are stored and reported.
            use_scaling: When true, features are standardized. The scaler is
                fit on each training fold only and then applied to the matching
                validation fold, so validation rows never influence the
                scaling statistics.

        Returns:
            dict with ``model_name``, mean/std of RMSE, MAE and R² across
            folds, and ``use_scaling``. The same dict is stored in
            ``self.results[model_name]``.
        """
        print(f"\nEvaluating {model_name}...")

        # Work on a plain array so fold indices always select rows
        # (``DataFrame[idx]`` would select columns instead).
        X_eval = np.asarray(X)

        # Cross-validation setup
        kfold = KFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)

        # Store fold results
        rmse_scores = []
        mae_scores = []
        r2_scores = []

        # Perform cross-validation
        for fold, (train_idx, val_idx) in enumerate(kfold.split(X_eval), 1):
            X_train_fold = X_eval[train_idx]
            X_val_fold = X_eval[val_idx]
            y_train_fold = y.iloc[train_idx] if hasattr(y, 'iloc') else y[train_idx]
            y_val_fold = y.iloc[val_idx] if hasattr(y, 'iloc') else y[val_idx]

            if use_scaling:
                # Fit on the training fold only to avoid leaking validation data.
                scaler = StandardScaler()
                X_train_fold = scaler.fit_transform(X_train_fold)
                X_val_fold = scaler.transform(X_val_fold)

            # Train model
            model.fit(X_train_fold, y_train_fold)

            # Make predictions
            y_pred = model.predict(X_val_fold)

            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
            mae = mean_absolute_error(y_val_fold, y_pred)
            r2 = r2_score(y_val_fold, y_pred)

            rmse_scores.append(rmse)
            mae_scores.append(mae)
            r2_scores.append(r2)

            print(f"  Fold {fold}: RMSE={rmse:.2f}, MAE={mae:.2f}, R²={r2:.4f}")

        # Calculate summary statistics
        results = {
            'model_name': model_name,
            'rmse_mean': np.mean(rmse_scores),
            'rmse_std': np.std(rmse_scores),
            'mae_mean': np.mean(mae_scores),
            'mae_std': np.std(mae_scores),
            'r2_mean': np.mean(r2_scores),
            'r2_std': np.std(r2_scores),
            'use_scaling': use_scaling
        }

        self.results[model_name] = results

        print(f"  Summary: RMSE={results['rmse_mean']:.2f}±{results['rmse_std']:.2f}, "
              f"R²={results['r2_mean']:.4f}±{results['r2_std']:.4f}")

        return results

    def _results_frame(self):
        """Return the stored results as a frame sorted by mean RMSE, metrics as floats."""
        results_df = pd.DataFrame(self.results).T
        for col in self._METRIC_COLS:
            if col in results_df.columns:
                results_df[col] = pd.to_numeric(results_df[col], errors='coerce')
        return results_df.sort_values('rmse_mean')

    def get_results_summary(self):
        """Get summary of all model results.

        Returns:
            DataFrame indexed by model name, sorted by mean RMSE, with the
            display columns ``CV_RMSE`` and ``CV_R2`` (``"mean ± std"`` strings)
            and ``use_scaling``. An empty DataFrame if nothing was evaluated.
        """
        if not self.results:
            return pd.DataFrame()

        results_df = self._results_frame()

        # Format results for display
        results_df['CV_RMSE'] = (results_df['rmse_mean'].round(2).astype(str) +
                                ' ± ' + results_df['rmse_std'].round(2).astype(str))
        results_df['CV_R2'] = (results_df['r2_mean'].round(4).astype(str) +
                              ' ± ' + results_df['r2_std'].round(4).astype(str))

        return results_df[['CV_RMSE', 'CV_R2', 'use_scaling']]

    def plot_results(self):
        """Plot side-by-side bar charts of mean RMSE and mean R² per model."""
        if not self.results:
            print("No results to plot!")
            return

        results_df = self._results_frame()
        bar_positions = range(len(results_df))

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # RMSE comparison
        axes[0].barh(bar_positions, results_df['rmse_mean'])
        axes[0].set_yticks(bar_positions)
        axes[0].set_yticklabels(results_df.index)
        axes[0].set_xlabel('RMSE')
        axes[0].set_title('Model Comparison - RMSE (Lower is Better)')
        axes[0].grid(True, alpha=0.3)

        # R² comparison
        axes[1].barh(bar_positions, results_df['r2_mean'])
        axes[1].set_yticks(bar_positions)
        axes[1].set_yticklabels(results_df.index)
        axes[1].set_xlabel('R² Score')
        axes[1].set_title('Model Comparison - R² (Higher is Better)')
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Initialize evaluator
# One shared instance: every modeling cell below records its scores in evaluator.results.
evaluator = ModelEvaluator(cv_folds=5)
print("✓ Model evaluator initialized")

In [None]:
# =============================================================================
# 4. PHASE 1: BASELINE LINEAR MODELS
# =============================================================================

if 'X_selected' not in locals() or X_selected is None:
    print("⚠️ Skipping linear models - no processed data available")
else:
    section_rule = '=' * 60
    print(f"\n{section_rule}")
    print("PHASE 1: BASELINE LINEAR MODELS")
    print(f"{section_rule}")

    # --- 1. Ordinary least squares: the untuned reference point -------------
    print("\n1.  Multiple Linear Regression")
    lr_model = LinearRegression()
    evaluator.evaluate_model(lr_model, X_selected, y, "Linear_Regression", use_scaling=True)

    # --- 2. Ridge: pick alpha with a 3-fold grid search, then evaluate -------
    print("\n2.  Ridge Regression (with hyperparameter tuning)")
    ridge_params = dict(alpha=[0.1, 1.0, 10.0, 50.0, 100.0, 200.0, 500.0])
    ridge_grid = GridSearchCV(Ridge(), ridge_params, cv=3, scoring='neg_mean_squared_error')

    # Standardized copy of the selected features, shared by both alpha searches
    X_scaled = StandardScaler().fit_transform(X_selected)
    ridge_grid.fit(X_scaled, y)

    best_ridge_alpha = ridge_grid.best_params_['alpha']
    print(f"   ✓ Best Ridge alpha: {best_ridge_alpha}")
    ridge_model = Ridge(alpha=best_ridge_alpha)
    evaluator.evaluate_model(ridge_model, X_selected, y, "Ridge_Regression", use_scaling=True)

    # --- 3. Lasso: same procedure with its own alpha grid --------------------
    print("\n3.  Lasso Regression (with hyperparameter tuning)")
    lasso_params = dict(alpha=[0.01, 0.1, 1.0, 10.0, 50.0, 100.0])
    lasso_grid = GridSearchCV(Lasso(), lasso_params, cv=3, scoring='neg_mean_squared_error')
    lasso_grid.fit(X_scaled, y)

    best_lasso_alpha = lasso_grid.best_params_['alpha']
    print(f"   ✓ Best Lasso alpha: {best_lasso_alpha}")
    lasso_model = Lasso(alpha=best_lasso_alpha)
    evaluator.evaluate_model(lasso_model, X_selected, y, "Lasso_Regression", use_scaling=True)

In [None]:

# =============================================================================
# 5. PHASE 2: TREE-BASED MODELS
# =============================================================================

if 'X_selected' not in locals() or X_selected is None:
    print("⚠️ Skipping tree-based models - no processed data available")
else:
    section_rule = '=' * 60
    print(f"\n{section_rule}")
    print("PHASE 2: TREE-BASED MODELS")
    print(f"{section_rule}")

    # --- 4. Decision tree: exhaustive grid over depth / split / leaf sizes ---
    print("\n4. 🌳 Decision Tree (with hyperparameter tuning)")
    dt_params = dict(
        max_depth=[5, 10, 15, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )
    dt_grid = GridSearchCV(
        DecisionTreeRegressor(random_state=42),
        dt_params,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    dt_grid.fit(X_selected, y)

    print(f"   ✓ Best Decision Tree params: {dt_grid.best_params_}")
    dt_model = DecisionTreeRegressor(**dt_grid.best_params_, random_state=42)
    evaluator.evaluate_model(dt_model, X_selected, y, "Decision_Tree")

    # --- 5. Random forest: 15 random draws from the forest search space ------
    print("\n5. 🌲 Random Forest (with hyperparameter tuning)")
    rf_params = dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 15, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2'],
    )
    rf_random = RandomizedSearchCV(
        RandomForestRegressor(random_state=42),
        rf_params,
        n_iter=15,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42,
    )
    rf_random.fit(X_selected, y)

    print(f"   ✓ Best Random Forest params: {rf_random.best_params_}")
    rf_model = RandomForestRegressor(**rf_random.best_params_, random_state=42)
    evaluator.evaluate_model(rf_model, X_selected, y, "Random_Forest")

    # --- 6. Extra trees: same search space and budget as the random forest ---
    print("\n6.  Extra Trees (with hyperparameter tuning)")
    et_params = dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 15, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2'],
    )
    et_random = RandomizedSearchCV(
        ExtraTreesRegressor(random_state=42),
        et_params,
        n_iter=15,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42,
    )
    et_random.fit(X_selected, y)

    print(f"   ✓ Best Extra Trees params: {et_random.best_params_}")
    et_model = ExtraTreesRegressor(**et_random.best_params_, random_state=42)
    evaluator.evaluate_model(et_model, X_selected, y, "Extra_Trees")

In [None]:
# =============================================================================
# 6. PHASE 3: ADVANCED ENSEMBLE MODELS WITH OPTUNA OPTIMIZATION
# =============================================================================

# Number of Optuna trials per model. Lower this for a quick run.
N_OPTUNA_TRIALS = 200

# Settings held constant while tuning. Optuna's study.best_params contains only
# the *suggested* values, so these must be re-applied when the final estimator
# is built; otherwise the evaluated model differs from the tuned one (different
# seed) and the libraries fall back to their default logging.
LGB_FIXED_PARAMS = {
    # LightGBM ignores `subsample` unless bagging is enabled via subsample_freq > 0.
    'subsample_freq': 1,
    'random_state': 42,
    'verbose': -1,
}
XGB_FIXED_PARAMS = {'random_state': 42, 'verbosity': 0}
CB_FIXED_PARAMS = {'random_state': 42, 'verbose': False}


def _optuna_cv_search(estimator_cls, suggest_params, fixed_params, X, y, n_trials):
    """Tune ``estimator_cls`` with Optuna, minimizing mean 3-fold CV RMSE.

    Args:
        estimator_cls: Regressor class; instantiated once per trial.
        suggest_params: Callable ``trial -> dict`` of suggested hyperparameters.
        fixed_params: dict of constructor arguments applied to every trial.
        X: Feature matrix; rows are samples.
        y: Target values aligned with ``X``.
        n_trials: Number of Optuna trials.

    Returns:
        ``study.best_params``: the suggested values only, without ``fixed_params``.
    """
    # Plain arrays so fold indices select rows regardless of the input container.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    def objective(trial):
        model = estimator_cls(**suggest_params(trial), **fixed_params)
        # Quick cross-validation for optimization
        kfold = KFold(n_splits=3, shuffle=True, random_state=42)
        rmse_scores = []
        for train_idx, val_idx in kfold.split(X_arr):
            model.fit(X_arr[train_idx], y_arr[train_idx])
            y_pred = model.predict(X_arr[val_idx])
            rmse_scores.append(np.sqrt(mean_squared_error(y_arr[val_idx], y_pred)))
        return np.mean(rmse_scores)

    print(f"   Starting Optuna optimization with {n_trials} trials...")
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"   ✓ Best RMSE: {study.best_value:.4f}")
    print(f"   ✓ Best params: {study.best_params}")
    return study.best_params


def optimize_lightgbm(X, y, n_trials=200):
    """Optimize LightGBM with Optuna; returns the best suggested parameters."""
    def suggest(trial):
        return {
            'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        }

    return _optuna_cv_search(lgb.LGBMRegressor, suggest, LGB_FIXED_PARAMS, X, y, n_trials)


def optimize_xgboost(X, y, n_trials=200):
    """Optimize XGBoost with Optuna; returns the best suggested parameters."""
    def suggest(trial):
        return {
            'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        }

    return _optuna_cv_search(xgb.XGBRegressor, suggest, XGB_FIXED_PARAMS, X, y, n_trials)


def optimize_catboost(X, y, n_trials=200):
    """Optimize CatBoost with Optuna; returns the best suggested parameters."""
    def suggest(trial):
        return {
            'iterations': trial.suggest_int('iterations', 500, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'depth': trial.suggest_int('depth', 3, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
            # NOTE(review): CatBoost documents min_data_in_leaf for the Lossguide /
            # Depthwise grow policies; confirm it has an effect with the default policy.
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        }

    return _optuna_cv_search(cb.CatBoostRegressor, suggest, CB_FIXED_PARAMS, X, y, n_trials)


if 'X_selected' in locals() and X_selected is not None:
    print(f"\n{'='*60}")
    print("PHASE 3: ADVANCED ENSEMBLE MODELS WITH OPTUNA")
    print(f"{'='*60}")

    # 7. LightGBM with Optuna optimization
    print("\n7. LightGBM (Optuna Optimization)")
    lgb_best_params = optimize_lightgbm(X_selected, y, n_trials=N_OPTUNA_TRIALS)
    # Rebuild with the same fixed settings that were active during tuning.
    lgb_model = lgb.LGBMRegressor(**lgb_best_params, **LGB_FIXED_PARAMS)
    evaluator.evaluate_model(lgb_model, X_selected, y, "LightGBM_Optuna")

    # 8. XGBoost with Optuna optimization
    print("\n8. XGBoost (Optuna Optimization)")
    xgb_best_params = optimize_xgboost(X_selected, y, n_trials=N_OPTUNA_TRIALS)
    xgb_model = xgb.XGBRegressor(**xgb_best_params, **XGB_FIXED_PARAMS)
    evaluator.evaluate_model(xgb_model, X_selected, y, "XGBoost_Optuna")

    # 9. CatBoost with Optuna optimization
    print("\n9.  CatBoost (Optuna Optimization)")
    cb_best_params = optimize_catboost(X_selected, y, n_trials=N_OPTUNA_TRIALS)
    cb_model = cb.CatBoostRegressor(**cb_best_params, **CB_FIXED_PARAMS)
    evaluator.evaluate_model(cb_model, X_selected, y, "CatBoost_Optuna")

else:
    print("⚠️ Skipping Optuna optimization - no processed data available")

In [None]:
# =============================================================================
# 7. MODEL COMPARISON AND RESULTS ANALYSIS 
# =============================================================================

if not evaluator.results:
    print("⚠️ No model results available")
else:
    section_rule = '=' * 60
    print(f"\n{section_rule}")
    print("MODEL COMPARISON AND RESULTS ANALYSIS")
    print(f"{section_rule}")

    # Formatted table and bar charts come straight from the evaluator.
    results_summary = evaluator.get_results_summary()
    print("\n Model Performance Summary (sorted by RMSE):")
    print(results_summary.to_string())

    evaluator.plot_results()

    # The transposed frame is object-typed (records mix strings, bools and
    # floats), so the metric columns are coerced to numbers before ranking.
    numeric_cols = ['rmse_mean', 'rmse_std', 'mae_mean', 'mae_std', 'r2_mean', 'r2_std']
    results_df = pd.DataFrame(evaluator.results).T
    results_df = results_df.assign(**{
        col: pd.to_numeric(results_df[col], errors='coerce')
        for col in numeric_cols
        if col in results_df.columns
    })

    # Models without a usable RMSE cannot be ranked.
    results_df_clean = results_df.dropna(subset=['rmse_mean'])
    best_3_models = results_df_clean.nsmallest(3, 'rmse_mean')

    def _fmt_or_na(value, spec):
        """Format a metric with ``spec``, or return "N/A" when it is missing."""
        return "N/A" if pd.isna(value) else format(value, spec)

    print("\n Top 3 Best Performing Models:")
    for rank, (model_name, row) in enumerate(best_3_models.iterrows(), 1):
        rmse_str = _fmt_or_na(row.get('rmse_std', float('nan')), '.2f')
        r2_mean_str = _fmt_or_na(row.get('r2_mean', float('nan')), '.4f')
        r2_std_str = _fmt_or_na(row.get('r2_std', float('nan')), '.4f')

        print(f"   {rank}. {model_name}: RMSE={row['rmse_mean']:.2f}±{rmse_str}, "
              f"R²={r2_mean_str}±{r2_std_str}")

    # Winner = lowest mean CV RMSE; these three names are read by the final summary cell.
    best_model_name = results_df_clean['rmse_mean'].idxmin()
    best_model_rmse = results_df_clean.loc[best_model_name, 'rmse_mean']
    best_model_r2 = results_df_clean.loc[best_model_name, 'r2_mean']
    best_rmse_std = results_df_clean.loc[best_model_name, 'rmse_std']
    best_r2_std = results_df_clean.loc[best_model_name, 'r2_std']

    print(f"\n BEST MODEL: {best_model_name}")
    print(f"   RMSE: {best_model_rmse:.2f}±{best_rmse_std:.2f}")
    print(f"   R²: {best_model_r2:.4f}±{best_r2_std:.4f}")

In [None]:
# =============================================================================
# 8. FEATURE IMPORTANCE ANALYSIS
# =============================================================================

def normalize_importances(importance_data, feature_names):
    """Combine per-model feature importances on a common scale.

    The libraries report importances in different units (for example split
    counts versus fractions that sum to 1 or to 100), so averaging the raw
    numbers lets one model dominate. Each model's column is therefore divided
    by its own total before the per-feature average is taken.

    Args:
        importance_data: dict mapping a model label to a sequence of importances,
            one value per feature, in the order of ``feature_names``.
        feature_names: Feature labels used as the index.

    Returns:
        DataFrame with one normalized column per model (each summing to 1, or
        all zeros if the model reported no importance at all) plus an
        ``'Average'`` column, sorted by ``'Average'`` descending.
    """
    importance_df = pd.DataFrame(importance_data, index=feature_names).astype(float)
    column_totals = importance_df.sum(axis=0)
    # Divide by 1 instead of 0 so an all-zero column stays zero rather than NaN.
    importance_df = importance_df.div(column_totals.where(column_totals != 0, 1.0), axis=1)
    importance_df['Average'] = importance_df.mean(axis=1)
    return importance_df.sort_values('Average', ascending=False)


if 'selected_features' in locals() and evaluator.results:
    print(f"\n{'='*60}")
    print("FEATURE IMPORTANCE ANALYSIS")
    print(f"{'='*60}")

    # Report label -> name of the notebook variable holding that model.
    # A model is included only if its cell was actually run.
    importance_sources = {
        'Random_Forest': 'rf_model',
        'Extra_Trees': 'et_model',
        'LightGBM': 'lgb_model',
        'XGBoost': 'xgb_model',
        'CatBoost': 'cb_model',
    }

    # Get feature importance from tree-based models
    importance_data = {}
    for importance_label, variable_name in importance_sources.items():
        fitted_candidate = globals().get(variable_name)
        if fitted_candidate is None:
            continue
        # Train on full data for feature importance
        fitted_candidate.fit(X_selected, y)
        importance_data[importance_label] = fitted_candidate.feature_importances_

    if importance_data:
        # Normalized per model so every model contributes equally to the average.
        importance_df = normalize_importances(importance_data, selected_features)

        print("🔍 Feature Importance Summary (Top 10):")
        print(importance_df.head(10).round(4).to_string())

        # Visualize average feature importance
        plt.figure(figsize=(10, 8))
        top_10 = importance_df.head(10)
        plt.barh(range(len(top_10)), top_10['Average'])
        plt.yticks(range(len(top_10)), top_10.index)
        plt.xlabel('Average Normalized Feature Importance')
        plt.title('Top 10 Features by Average Importance')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

In [None]:
# =============================================================================
# 9. SAVE RESULTS
# =============================================================================

if evaluator.results:
    section_rule = '=' * 60
    print(f"\n{section_rule}")
    print("SAVING RESULTS")
    print(f"{section_rule}")

    # Best-effort export: a failed write is reported but does not stop the notebook.
    try:
        # 1) Formatted CV summary table
        results_summary = evaluator.get_results_summary()
        results_summary.to_csv('model_performance_summary.csv')
        print("✓ Model performance summary saved: model_performance_summary.csv")

        # 2) Names of the features kept by the selector (only if that cell ran)
        if 'selected_features' in locals():
            selected_series = pd.Series(selected_features)
            selected_series.to_csv('selected_features.csv', index=False, header=['feature'])
            print("✓ Selected features saved: selected_features.csv")

        # 3) Per-model importance table (only if the importance cell ran)
        if 'importance_df' in locals():
            importance_df.to_csv('feature_importance_analysis.csv')
            print("✓ Feature importance saved: feature_importance_analysis.csv")

    except Exception as save_error:
        print(f"⚠️ Error saving files: {save_error}")
        print("You can still access results from the notebook variables")

In [None]:
# =============================================================================
# 10. FINAL SUMMARY
# =============================================================================

print(f"\n{'='*60}")
print("MODELING PIPELINE COMPLETED!")
print(f"{'='*60}")

if evaluator.results:
    print(f"\n FINAL SUMMARY:")
    print(f"    Models Evaluated: {len(evaluator.results)}")
    # best_model_* are only bound when the comparison cell has been run.
    if 'best_model_name' in locals():
        print(f"   🏆 Best Model: {best_model_name}")
        print(f"   📈 Best RMSE: {best_model_rmse:.2f}")
        print(f"   📊 Best R²: {best_model_r2:.4f}")
    if 'selected_features' in locals():
        print(f"   🎯 Features Used: {len(selected_features)}")
    # Report the fold count the evaluator actually used instead of a literal,
    # so the summary stays correct if cv_folds is changed.
    print(f"   🔄 Cross-Validation: {evaluator.cv_folds}-fold")

    print(f"\n🎯 RECOMMENDATIONS:")
    if 'best_model_name' in locals():
        print(f"   1. Use {best_model_name} for final predictions")
        print(f"   2. Consider ensemble if marginal improvement needed")
        print(f"   3. Validate on holdout test set before deployment")

else:
    print("⚠️ No models were successfully evaluated.")
    print("Please check the data loading and preprocessing steps.")

