In [None]:
# BigMart Sales Prediction - Comprehensive Modeling Pipeline
# Progressive complexity approach: Linear Models → Tree Models → Advanced Ensembles

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Linear Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Tree-based Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Advanced Models
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Optimization
import optuna
from optuna.samplers import TPESampler

import warnings
# NOTE(review): this silences *every* warning category for the whole notebook,
# including solver-convergence and deprecation warnings from the libraries
# imported above.
warnings.filterwarnings('ignore')
# Only Optuna messages at WARNING level or above are emitted.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Set random seed for reproducibility
# (seeds NumPy's global RNG only; estimators below that accept a random_state
# are seeded where they are constructed)
np.random.seed(42)

In [None]:
# =============================================================================
# 1. DATA LOADING AND PREPARATION
# =============================================================================

def load_processed_data(path='data/processed/train_processed.csv'):
    """Load the preprocessed training data from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'data/processed/train_processed.csv'
        Location of the processed training CSV. The default keeps the
        original hard-coded location, so existing zero-argument calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as read by ``pd.read_csv``.

    Raises
    ------
    ValueError
        If the file parses but contains no rows (nothing to train on).
    """
    print("Loading processed data...")

    # Load processed training data
    train_df = pd.read_csv(path)

    # Fail here with a clear message instead of deep inside model fitting.
    if train_df.empty:
        raise ValueError(f"No rows found in processed training data: {path}")

    print(f"Training data shape: {train_df.shape}")
    print(f"Columns: {len(train_df.columns)}")

    return train_df

def prepare_modeling_data(df, target_col='Item_Outlet_Sales'):
    """Split a processed frame into a feature matrix and a target vector.

    Every column except ``Item_Identifier``, ``Outlet_Identifier`` and
    ``target_col`` is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed data containing ``target_col``.
    target_col : str, default 'Item_Outlet_Sales'
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y, feature_cols)`` where ``X`` and ``y`` are copies of the
        relevant parts of ``df`` and ``feature_cols`` is the list of
        feature column names in their original order.
    """
    print("Preparing data for modeling...")

    # Columns that must never end up in the feature matrix
    excluded = ('Item_Identifier', 'Outlet_Identifier', target_col)

    feature_cols = []
    for column in df.columns:
        if column not in excluded:
            feature_cols.append(column)

    # Copies, so later edits to X / y cannot touch the source frame
    y = df[target_col].copy()
    X = df[feature_cols].copy()

    print(f"Features: {len(feature_cols)}")
    print(f"Target variable: {target_col}")
    print(f"Feature columns: {feature_cols}")

    return X, y, feature_cols

# Read the processed training set, then split it into features and target
train_df = load_processed_data()
X, y, feature_cols = prepare_modeling_data(train_df)

# Quick look at the distribution of the target
target_stats = y.describe()
print("\nTarget variable statistics:")
print(target_stats)


In [None]:

# =============================================================================
# 2. FEATURE SELECTION
# =============================================================================

def perform_feature_selection(X, y, k=15, method='f_regression'):
    """Select the ``k`` features with the highest univariate F-score.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; column names are used to label the result.
    y : array-like
        Target values.
    k : int or 'all', default 15
        Number of features to keep. Values larger than the number of
        columns in ``X`` are clamped to that number instead of letting
        ``SelectKBest`` raise.
    method : str, default 'f_regression'
        Scoring method. Only ``'f_regression'`` is implemented; any other
        value raises instead of being silently ignored.

    Returns
    -------
    tuple
        ``(X_selected, selector, selected_features)``: the transformed
        matrix returned by ``SelectKBest.fit_transform``, the fitted
        selector, and the names of the retained columns.

    Raises
    ------
    ValueError
        If ``method`` is not ``'f_regression'``.
    """
    if method != 'f_regression':
        raise ValueError(
            f"Unsupported feature selection method: {method!r} "
            "(only 'f_regression' is implemented)"
        )

    # SelectKBest rejects k > n_features, so cap it at what is available.
    if k != 'all':
        k = min(int(k), X.shape[1])

    print(f"\n{'='*50}")
    print(f"FEATURE SELECTION (SelectKBest, k={k})")
    print(f"{'='*50}")

    # NOTE(review): the selector is fitted on the full dataset, so any
    # cross-validation run afterwards on X_selected has already seen the
    # validation targets during selection (optimistic CV estimates).
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X, y)

    # Get selected feature names and their scores
    support_mask = selector.get_support()
    selected_features = X.columns[support_mask].tolist()
    feature_scores = selector.scores_[support_mask]

    # Create results DataFrame
    feature_importance_df = pd.DataFrame({
        'Feature': selected_features,
        'F_Score': feature_scores
    }).sort_values('F_Score', ascending=False)

    print("Selected Features (ranked by F-score):")
    print(feature_importance_df.to_string(index=False))

    # Visualize feature importance (highest score at the top)
    fig, ax = plt.subplots(figsize=(10, 8))
    positions = range(len(feature_importance_df))
    ax.barh(positions, feature_importance_df['F_Score'])
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_importance_df['Feature'])
    ax.set_xlabel('F-Score')
    ax.set_title(f'Top {k} Features by F-Score')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    return X_selected, selector, selected_features

# Perform feature selection
X_selected, feature_selector, selected_features = perform_feature_selection(X, y, k=15)

print(f"\nSelected features shape: {X_selected.shape}")


In [None]:

# =============================================================================
# 3. MODEL EVALUATION FRAMEWORK
# =============================================================================

class ModelEvaluator:
    """K-fold cross-validation harness that records metrics per model.

    Each call to :meth:`evaluate_model` stores a summary dict under the
    model's name in ``self.results``; :meth:`get_results_summary` and
    :meth:`plot_results` present everything collected so far.
    """

    def __init__(self, cv_folds=5, random_state=42):
        """Store the CV configuration and start with an empty result set."""
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.results = {}

    @staticmethod
    def _take_rows(data, idx):
        """Positionally select rows ``idx`` from a pandas object or array-like."""
        if hasattr(data, 'iloc'):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    def _results_frame(self):
        """Return ``self.results`` as a frame sorted by mean RMSE.

        The frame is built row-wise so the metric columns are numeric.
        ``pd.DataFrame(self.results).T`` would make every column ``object``
        dtype, which breaks rounding and numeric selection.
        """
        frame = pd.DataFrame.from_dict(self.results, orient='index')
        metric_cols = [col for col in frame.columns
                       if col.endswith(('_mean', '_std'))]
        frame[metric_cols] = frame[metric_cols].astype(float)
        return frame.sort_values('rmse_mean')

    def evaluate_model(self, model, X, y, model_name, use_scaling=False):
        """Cross-validate ``model`` and record RMSE / MAE / R² statistics.

        Parameters
        ----------
        model : estimator
            Object exposing ``fit(X, y)`` and ``predict(X)``. The same
            instance is refitted on every fold.
        X : array-like or pandas.DataFrame
            Feature matrix.
        y : array-like or pandas.Series
            Target values.
        model_name : str
            Key under which the results are stored in ``self.results``.
        use_scaling : bool, default False
            When True, a ``StandardScaler`` is fitted on each training fold
            and applied to the matching validation fold, so validation rows
            never influence the scaling statistics.

        Returns
        -------
        dict
            Mean and standard deviation of RMSE, MAE and R² across folds,
            plus ``model_name`` and ``use_scaling``.
        """
        print(f"\nEvaluating {model_name}...")

        # Cross-validation setup
        kfold = KFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)

        # Store fold results
        rmse_scores = []
        mae_scores = []
        r2_scores = []

        # Perform cross-validation
        for fold, (train_idx, val_idx) in enumerate(kfold.split(X), 1):
            X_train_fold = self._take_rows(X, train_idx)
            X_val_fold = self._take_rows(X, val_idx)
            y_train_fold = self._take_rows(y, train_idx)
            y_val_fold = self._take_rows(y, val_idx)

            if use_scaling:
                # Fit on the training fold only to avoid leaking validation data
                scaler = StandardScaler()
                X_train_fold = scaler.fit_transform(X_train_fold)
                X_val_fold = scaler.transform(X_val_fold)

            # Train model
            model.fit(X_train_fold, y_train_fold)

            # Make predictions
            y_pred = model.predict(X_val_fold)

            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
            mae = mean_absolute_error(y_val_fold, y_pred)
            r2 = r2_score(y_val_fold, y_pred)

            rmse_scores.append(rmse)
            mae_scores.append(mae)
            r2_scores.append(r2)

            print(f"  Fold {fold}: RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")

        # Calculate summary statistics
        results = {
            'model_name': model_name,
            'rmse_mean': np.mean(rmse_scores),
            'rmse_std': np.std(rmse_scores),
            'mae_mean': np.mean(mae_scores),
            'mae_std': np.std(mae_scores),
            'r2_mean': np.mean(r2_scores),
            'r2_std': np.std(r2_scores),
            'use_scaling': use_scaling
        }

        self.results[model_name] = results

        print(f"  Summary: RMSE={results['rmse_mean']:.4f}±{results['rmse_std']:.4f}, "
              f"R²={results['r2_mean']:.4f}±{results['r2_std']:.4f}")

        return results

    def get_results_summary(self):
        """Return a display table of all evaluated models, best RMSE first.

        Returns
        -------
        pandas.DataFrame
            Indexed by model name with columns ``CV_RMSE`` and ``CV_R2``
            (``"mean ± std"`` strings) and ``use_scaling``. Empty when no
            model has been evaluated yet.
        """
        if not self.results:
            return pd.DataFrame()

        results_df = self._results_frame()

        # Format results for display
        results_df['CV_RMSE'] = (results_df['rmse_mean'].round(2).astype(str) +
                                 ' ± ' + results_df['rmse_std'].round(2).astype(str))
        results_df['CV_R2'] = (results_df['r2_mean'].round(4).astype(str) +
                               ' ± ' + results_df['r2_std'].round(4).astype(str))

        return results_df[['CV_RMSE', 'CV_R2', 'use_scaling']]

    def plot_results(self):
        """Plot RMSE, R² and RMSE-with-spread comparisons for all models.

        Does nothing when no model has been evaluated yet.
        """
        if not self.results:
            return

        results_df = self._results_frame()
        positions = range(len(results_df))

        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        # RMSE comparison
        axes[0].barh(positions, results_df['rmse_mean'])
        axes[0].set_xlabel('RMSE')
        axes[0].set_title('Model Comparison - RMSE')

        # R² comparison
        axes[1].barh(positions, results_df['r2_mean'])
        axes[1].set_xlabel('R² Score')
        axes[1].set_title('Model Comparison - R²')

        # RMSE with error bars
        axes[2].errorbar(results_df['rmse_mean'], positions,
                         xerr=results_df['rmse_std'], fmt='o', capsize=5)
        axes[2].set_xlabel('RMSE')
        axes[2].set_title('Model Comparison - RMSE with Std Dev')

        # Shared y-axis labelling and grid for all three panels
        for ax in axes:
            ax.set_yticks(positions)
            ax.set_yticklabels(results_df.index)
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Initialize evaluator
evaluator = ModelEvaluator(cv_folds=5)

In [None]:

# =============================================================================
# 4. PHASE 1: BASELINE LINEAR MODELS
# =============================================================================

print(f"\n{'='*60}")
print("PHASE 1: BASELINE LINEAR MODELS")
print(f"{'='*60}")

# All three grid searches below tune on the same standardised copy of the
# selected features, so build it once. The evaluator applies its own scaling
# when called with use_scaling=True.
X_selected_scaled = StandardScaler().fit_transform(X_selected)

# 1. Multiple Linear Regression (no hyperparameters to tune)
print("\n1. Multiple Linear Regression")
lr_model = LinearRegression()
evaluator.evaluate_model(lr_model, X_selected, y, "Linear_Regression", use_scaling=True)

# 2. Ridge Regression: grid over the L2 penalty strength
print("\n2. Ridge Regression (with hyperparameter tuning)")
ridge_params = dict(alpha=[0.1, 1.0, 10.0, 50.0, 100.0, 200.0, 500.0])
ridge_grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=ridge_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
ridge_grid.fit(X_selected_scaled, y)

print(f"   Best Ridge alpha: {ridge_grid.best_params_['alpha']}")
ridge_model = Ridge(alpha=ridge_grid.best_params_['alpha'])
evaluator.evaluate_model(ridge_model, X_selected, y, "Ridge_Regression", use_scaling=True)

# 3. Lasso Regression: grid over the L1 penalty strength
print("\n3. Lasso Regression (with hyperparameter tuning)")
lasso_params = dict(alpha=[0.01, 0.1, 1.0, 10.0, 50.0, 100.0])
lasso_grid = GridSearchCV(
    estimator=Lasso(),
    param_grid=lasso_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
lasso_grid.fit(X_selected_scaled, y)

print(f"   Best Lasso alpha: {lasso_grid.best_params_['alpha']}")
lasso_model = Lasso(alpha=lasso_grid.best_params_['alpha'])
evaluator.evaluate_model(lasso_model, X_selected, y, "Lasso_Regression", use_scaling=True)

# 4. ElasticNet: grid over penalty strength and the L1/L2 mix
print("\n4. ElasticNet Regression (with hyperparameter tuning)")
elastic_params = dict(
    alpha=[0.1, 1.0, 10.0],
    l1_ratio=[0.1, 0.5, 0.7, 0.9],
)
elastic_grid = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=elastic_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
elastic_grid.fit(X_selected_scaled, y)

print(f"   Best ElasticNet params: {elastic_grid.best_params_}")
elastic_model = ElasticNet(**elastic_grid.best_params_)
evaluator.evaluate_model(elastic_model, X_selected, y, "ElasticNet_Regression", use_scaling=True)


In [None]:

# =============================================================================
# 5. PHASE 2: TREE-BASED MODELS
# =============================================================================

print(f"\n{'='*60}")
print("PHASE 2: TREE-BASED MODELS")
print(f"{'='*60}")

# 5. Decision Tree: exhaustive grid search (the grid is small)
print("\n5. Decision Tree (with hyperparameter tuning)")
dt_params = dict(
    max_depth=[5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
dt_grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
dt_grid.fit(X_selected, y)

print(f"   Best Decision Tree params: {dt_grid.best_params_}")
dt_model = DecisionTreeRegressor(random_state=42, **dt_grid.best_params_)
evaluator.evaluate_model(dt_model, X_selected, y, "Decision_Tree")

# 6. Random Forest: randomized search, 20 sampled configurations
print("\n6. Random Forest (with hyperparameter tuning)")
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=rf_params,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
rf_random.fit(X_selected, y)

print(f"   Best Random Forest params: {rf_random.best_params_}")
rf_model = RandomForestRegressor(random_state=42, **rf_random.best_params_)
evaluator.evaluate_model(rf_model, X_selected, y, "Random_Forest")

# 7. Extra Trees: randomized search, 20 sampled configurations
print("\n7. Extra Trees (with hyperparameter tuning)")
et_params = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)
et_random = RandomizedSearchCV(
    estimator=ExtraTreesRegressor(random_state=42),
    param_distributions=et_params,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
et_random.fit(X_selected, y)

print(f"   Best Extra Trees params: {et_random.best_params_}")
et_model = ExtraTreesRegressor(random_state=42, **et_random.best_params_)
evaluator.evaluate_model(et_model, X_selected, y, "Extra_Trees")


In [None]:

# =============================================================================
# 6. PHASE 3: ADVANCED ENSEMBLE MODELS WITH OPTUNA OPTIMIZATION
# =============================================================================

print(f"\n{'='*60}")
print("PHASE 3: ADVANCED ENSEMBLE MODELS (OPTUNA OPTIMIZATION)")
print(f"{'='*60}")

def _cv_rmse(model, X, y, n_splits=3, random_state=42):
    """Return the mean RMSE of ``model`` over shuffled K-fold CV.

    ``X`` and ``y`` are converted to NumPy arrays first so positional fold
    indices work for both pandas objects and plain arrays.
    """
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmse = []
    for train_idx, val_idx in kfold.split(X_arr):
        model.fit(X_arr[train_idx], y_arr[train_idx])
        y_pred = model.predict(X_arr[val_idx])
        fold_rmse.append(np.sqrt(mean_squared_error(y_arr[val_idx], y_pred)))

    return float(np.mean(fold_rmse))


def _run_study(label, objective, n_trials, fixed_params):
    """Minimise ``objective`` with a seeded TPE study and return full params.

    ``study.best_params`` contains only the values suggested through the
    trial. The fixed settings (seed, verbosity, ...) are merged back in so
    the returned dict rebuilds exactly the configuration that was tuned.
    """
    print(f"   Optimizing {label} hyperparameters...")
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"   {label} - Best RMSE: {study.best_value:.4f}")
    print(f"   Best params: {study.best_params}")
    return {**study.best_params, **fixed_params}


def optimize_lightgbm(X, y, n_trials=100):
    """Optimize LightGBM hyperparameters using Optuna.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    n_trials : int, default 100
        Number of Optuna trials.

    Returns
    -------
    dict
        Best searched hyperparameters plus the fixed settings used during
        tuning (``subsample_freq``, ``random_state``, ``verbose``), ready to
        be passed to ``lgb.LGBMRegressor(**params)``.
    """
    # subsample_freq=1 turns bagging on; with LightGBM's default of 0 the
    # tuned `subsample` fraction would have no effect.
    fixed_params = {'subsample_freq': 1, 'random_state': 42, 'verbose': -1}

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'num_leaves': trial.suggest_int('num_leaves', 20, 200),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
            **fixed_params,
        }
        # 3-fold CV for faster optimization
        return _cv_rmse(lgb.LGBMRegressor(**params), X, y)

    return _run_study('LightGBM', objective, n_trials, fixed_params)


def optimize_xgboost(X, y, n_trials=100):
    """Optimize XGBoost hyperparameters using Optuna.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    n_trials : int, default 100
        Number of Optuna trials.

    Returns
    -------
    dict
        Best searched hyperparameters plus the fixed ``random_state`` and
        ``verbosity`` used during tuning, ready to be passed to
        ``xgb.XGBRegressor(**params)``.
    """
    fixed_params = {'random_state': 42, 'verbosity': 0}

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
            'gamma': trial.suggest_float('gamma', 0.0, 5.0),
            **fixed_params,
        }
        return _cv_rmse(xgb.XGBRegressor(**params), X, y)

    return _run_study('XGBoost', objective, n_trials, fixed_params)


def optimize_catboost(X, y, n_trials=100):
    """Optimize CatBoost hyperparameters using Optuna.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    n_trials : int, default 100
        Number of Optuna trials.

    Returns
    -------
    dict
        Best searched hyperparameters plus the fixed ``random_state`` and
        ``verbose`` used during tuning, ready to be passed to
        ``cb.CatBoostRegressor(**params)``.
    """
    fixed_params = {'random_state': 42, 'verbose': False}

    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'depth': trial.suggest_int('depth', 3, 8),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 8.0),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 1.0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 50),
            **fixed_params,
        }
        return _cv_rmse(cb.CatBoostRegressor(**params), X, y)

    return _run_study('CatBoost', objective, n_trials, fixed_params)

# The optimizers tune with a fixed seed and silenced logging. Those settings
# are re-applied here via a dict merge so the evaluated models are seeded and
# quiet; the merge is harmless if the returned params already contain them.

# 8. LightGBM
print("\n8. LightGBM (Optuna Optimization)")
lgb_best_params = optimize_lightgbm(X_selected, y, n_trials=100)
lgb_model = lgb.LGBMRegressor(**{**lgb_best_params, 'random_state': 42, 'verbose': -1})
evaluator.evaluate_model(lgb_model, X_selected, y, "LightGBM_Optimized")

# 9. XGBoost
print("\n9. XGBoost (Optuna Optimization)")
xgb_best_params = optimize_xgboost(X_selected, y, n_trials=100)
xgb_model = xgb.XGBRegressor(**{**xgb_best_params, 'random_state': 42, 'verbosity': 0})
evaluator.evaluate_model(xgb_model, X_selected, y, "XGBoost_Optimized")

# 10. CatBoost
print("\n10. CatBoost (Optuna Optimization)")
cb_best_params = optimize_catboost(X_selected, y, n_trials=100)
cb_model = cb.CatBoostRegressor(**{**cb_best_params, 'random_state': 42, 'verbose': False})
evaluator.evaluate_model(cb_model, X_selected, y, "CatBoost_Optimized")


In [None]:

# =============================================================================
# 7. MODEL COMPARISON AND RESULTS ANALYSIS
# =============================================================================

print(f"\n{'='*60}")
print("MODEL COMPARISON AND RESULTS ANALYSIS")
print(f"{'='*60}")

# Display results summary
results_summary = evaluator.get_results_summary()
print("\nModel Performance Summary (sorted by RMSE):")
print(results_summary.to_string())

# Plot model comparison
evaluator.plot_results()

# Identify best models.
# Build the frame row-wise and cast the metric columns to float:
# pd.DataFrame(evaluator.results).T yields object-dtype columns, and
# DataFrame.nsmallest raises TypeError on non-numeric dtypes.
results_df = pd.DataFrame.from_dict(evaluator.results, orient='index')
metric_cols = [col for col in results_df.columns if col.endswith(('_mean', '_std'))]
results_df[metric_cols] = results_df[metric_cols].astype(float)
best_3_models = results_df.nsmallest(3, 'rmse_mean')

print(f"\nTop 3 Best Performing Models:")
for i, (model_name, row) in enumerate(best_3_models.iterrows(), 1):
    print(f"{i}. {model_name}: RMSE={row['rmse_mean']:.4f}±{row['rmse_std']:.4f}, "
          f"R²={row['r2_mean']:.4f}±{row['r2_std']:.4f}")


In [None]:

# =============================================================================
# 8. FEATURE IMPORTANCE ANALYSIS
# =============================================================================

def analyze_feature_importance_advanced(models_dict, X, feature_names, target=None, top_n=10):
    """Fit tree-based models on all rows and compare their feature importances.

    Parameters
    ----------
    models_dict : dict
        Mapping of display name to estimator. When empty, the notebook-level
        ``rf_model``, ``et_model``, ``lgb_model``, ``xgb_model`` and
        ``cb_model`` are used.
    X : array-like
        Feature matrix whose columns line up with ``feature_names``.
    feature_names : sequence of str
        Name of each column of ``X``.
    target : array-like, optional
        Target values; defaults to the notebook-level ``y``.
    top_n : int, default 10
        Number of features shown per panel, capped at the number of features.

    Returns
    -------
    pandas.DataFrame
        One row per feature with a ``<name>_Importance`` column per model
        and ``Average_Importance``, sorted by the average (descending).
        Each model's importances are rescaled to sum to 1 so that libraries
        reporting on different scales contribute equally to the average.
    """
    print(f"\n{'='*60}")
    print("FEATURE IMPORTANCE ANALYSIS")
    print(f"{'='*60}")

    # Focus on tree-based models; fall back to the notebook's fitted
    # candidates when the caller passes an empty dict.
    if models_dict:
        tree_models = dict(models_dict)
    else:
        tree_models = {
            'Random Forest': rf_model,
            'Extra Trees': et_model,
            'LightGBM': lgb_model,
            'XGBoost': xgb_model,
            'CatBoost': cb_model
        }
    y_fit = y if target is None else target
    feature_names = list(feature_names)

    # Train models on full data for feature importance
    for model in tree_models.values():
        model.fit(X, y_fit)

    # Extract feature importances, rescaled to sum to 1 per model
    importance_df = pd.DataFrame({'Feature': feature_names})
    per_model = {}
    for name, model in tree_models.items():
        if not hasattr(model, 'feature_importances_'):
            continue
        raw = np.asarray(model.feature_importances_, dtype=float)
        total = raw.sum()
        per_model[name] = raw / total if total > 0 else raw
        importance_df[f'{name}_Importance'] = per_model[name]

    # Calculate average importance
    importance_cols = [col for col in importance_df.columns if 'Importance' in col]
    importance_df['Average_Importance'] = importance_df[importance_cols].mean(axis=1)

    # Sort by average importance
    importance_df = importance_df.sort_values('Average_Importance', ascending=False)

    print("Feature Importance Summary:")
    print(importance_df.to_string(index=False, float_format='%.4f'))

    # One panel per model plus one for the average, three per row
    n_show = min(top_n, len(feature_names))
    n_panels = len(per_model) + 1
    n_cols = 3
    n_rows = -(-n_panels // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (name, scores) in zip(axes, per_model.items()):
        order = np.argsort(scores)[::-1][:n_show]
        ax.bar(range(n_show), scores[order])
        ax.set_title(f'{name} - Top {n_show} Features')
        ax.set_xticks(range(n_show))
        ax.set_xticklabels([feature_names[idx] for idx in order], rotation=45)

    # Average importance plot
    avg_ax = axes[len(per_model)]
    top_avg = importance_df.head(n_show)
    avg_ax.bar(range(n_show), top_avg['Average_Importance'])
    avg_ax.set_title(f'Average Importance - Top {n_show} Features')
    avg_ax.set_xticks(range(n_show))
    avg_ax.set_xticklabels(top_avg['Feature'], rotation=45)

    # Hide grid cells that have no panel
    for spare_ax in axes[n_panels:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    return importance_df

# Analyze feature importance
feature_importance_results = analyze_feature_importance_advanced(
    {}, X_selected, selected_features
)


In [None]:

# =============================================================================
# 9. ENSEMBLE MODEL CREATION
# =============================================================================

print(f"\n{'='*60}")
print("ENSEMBLE MODEL CREATION")
print(f"{'='*60}")

class EnsembleModel:
    """Simple ensemble model using weighted averaging.

    Parameters
    ----------
    models : sequence of estimators
        Base models exposing ``fit(X, y)`` and ``predict(X)``.
    weights : sequence of float or numpy.ndarray, optional
        One weight per model. ``None`` or an empty sequence means equal
        weighting. Weights are normalised to sum to 1.

    Raises
    ------
    ValueError
        If no models are given, if the number of weights does not match the
        number of models, or if the weights do not sum to a finite,
        non-zero value.
    """

    def __init__(self, models, weights=None):
        self.models = list(models)
        if not self.models:
            raise ValueError("EnsembleModel requires at least one base model")

        # `weights is None` rather than truthiness: a NumPy array has no
        # unambiguous truth value and would raise on `if weights`.
        if weights is None or len(weights) == 0:
            raw_weights = np.ones(len(self.models), dtype=float)
        else:
            raw_weights = np.asarray(weights, dtype=float)
            if raw_weights.shape != (len(self.models),):
                raise ValueError(
                    f"Expected {len(self.models)} weights, got shape {raw_weights.shape}"
                )

        total = raw_weights.sum()
        if not np.isfinite(total) or total == 0:
            raise ValueError("Ensemble weights must sum to a finite, non-zero value")
        self.weights = raw_weights / total  # Normalize

    def fit(self, X, y):
        """Fit all base models on the same data and return ``self``."""
        for model in self.models:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted average of the base models' predictions."""
        blended = None
        for weight, model in zip(self.weights, self.models):
            contribution = weight * np.asarray(model.predict(X), dtype=float).ravel()
            blended = contribution if blended is None else blended + contribution
        return blended

# Select top 3 models for ensemble
top_model_names = best_3_models.index.tolist()
print(f"Creating ensemble from top 3 models: {top_model_names}")

# Factories for the model families that may join the ensemble. The boosted
# models re-apply the fixed seed/verbosity used during tuning; the dict merge
# is harmless if those keys are already present in the best-params dicts.
ensemble_factories = {
    'LightGBM_Optimized': lambda: lgb.LGBMRegressor(
        **{**lgb_best_params, 'random_state': 42, 'verbose': -1}),
    'XGBoost_Optimized': lambda: xgb.XGBRegressor(
        **{**xgb_best_params, 'random_state': 42, 'verbosity': 0}),
    'CatBoost_Optimized': lambda: cb.CatBoostRegressor(
        **{**cb_best_params, 'random_state': 42, 'verbose': False}),
    'Random_Forest': lambda: RandomForestRegressor(**rf_random.best_params_, random_state=42),
    'Extra_Trees': lambda: ExtraTreesRegressor(**et_random.best_params_, random_state=42),
}

# Build models and names together, in rank order, so that position i in
# ensemble_models always corresponds to position i in ensemble_names. The
# inverse-RMSE weights below rely on that alignment.
ensemble_models = []
ensemble_names = []
for name in top_model_names:
    if name in ensemble_factories:
        ensemble_models.append(ensemble_factories[name]())
        ensemble_names.append(name)

# Create and evaluate ensemble
if len(ensemble_models) >= 2:
    ensemble = EnsembleModel(ensemble_models)
    evaluator.evaluate_model(ensemble, X_selected, y, "Ensemble_Top3")

    # Try weighted ensemble (inverse RMSE weighting), using the CV RMSE of
    # exactly the models that are in the ensemble
    rmse_values = [evaluator.results[name]['rmse_mean'] for name in ensemble_names]
    weights = [1/rmse for rmse in rmse_values]

    ensemble_weighted = EnsembleModel(ensemble_models, weights)
    evaluator.evaluate_model(ensemble_weighted, X_selected, y, "Ensemble_Weighted")
else:
    print("Fewer than two ensemble-eligible models in the top 3; skipping ensemble.")

In [None]:


# =============================================================================
# 10. FINAL RESULTS AND MODEL SELECTION
# =============================================================================

print(f"\n{'='*60}")
print("FINAL RESULTS AND MODEL SELECTION")
print(f"{'='*60}")

# Final results summary
final_results = evaluator.get_results_summary()
print("Final Model Performance Summary:")
print(final_results.to_string())

# Select best model.
# Build the frame row-wise and cast the metric columns to float:
# pd.DataFrame(evaluator.results).T yields object-dtype columns, and
# Series.idxmin is not allowed on object dtype.
all_results = pd.DataFrame.from_dict(evaluator.results, orient='index')
numeric_cols = [col for col in all_results.columns if col.endswith(('_mean', '_std'))]
all_results[numeric_cols] = all_results[numeric_cols].astype(float)

best_model_name = all_results['rmse_mean'].idxmin()
best_model_rmse = all_results.loc[best_model_name, 'rmse_mean']
best_model_r2 = all_results.loc[best_model_name, 'r2_mean']

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   RMSE: {best_model_rmse:.4f}±{all_results.loc[best_model_name, 'rmse_std']:.4f}")
print(f"   R²: {best_model_r2:.4f}±{all_results.loc[best_model_name, 'r2_std']:.4f}")

# Performance insights
print(f"\n📊 PERFORMANCE INSIGHTS:")
print(f"1. Model Complexity vs Performance:")

# Group models by complexity
linear_models = ['Linear_Regression', 'Ridge_Regression', 'Lasso_Regression', 'ElasticNet_Regression']
tree_models = ['Decision_Tree', 'Random_Forest', 'Extra_Trees']
advanced_models = ['LightGBM_Optimized', 'XGBoost_Optimized', 'CatBoost_Optimized']
ensemble_models_names = [name for name in all_results.index if 'Ensemble' in name]

for group_name, group_members in [('Linear Models', linear_models),
                                  ('Tree Models', tree_models),
                                  ('Advanced Models', advanced_models),
                                  ('Ensemble Models', ensemble_models_names)]:
    group_results = all_results[all_results.index.isin(group_members)]
    if not group_results.empty:
        best_in_group = group_results['rmse_mean'].min()
        print(f"   {group_name}: Best RMSE = {best_in_group:.4f}")

print(f"\n2. Key Observations:")
print(f"   - Linear models baseline: RMSE ~{all_results.loc[all_results.index.isin(linear_models), 'rmse_mean'].min():.0f}")
print(f"   - Tree models improvement: RMSE ~{all_results.loc[all_results.index.isin(tree_models), 'rmse_mean'].min():.0f}")
print(f"   - Advanced models best: RMSE ~{all_results.loc[all_results.index.isin(advanced_models), 'rmse_mean'].min():.0f}")

# Ensembles exist only if at least two eligible models made the top 3
if ensemble_models_names:
    ensemble_rmse = all_results.loc[all_results.index.isin(ensemble_models_names), 'rmse_mean'].min()
    print(f"   - Ensemble performance: RMSE ~{ensemble_rmse:.0f}")


In [None]:

# =============================================================================
# 11. SAVE RESULTS AND TRAINED MODELS
# =============================================================================

import json
import os

print(f"\n{'='*60}")
print("SAVING RESULTS AND MODELS")
print(f"{'='*60}")

# Create results directory
os.makedirs('results/model_artifacts', exist_ok=True)
os.makedirs('results/submissions', exist_ok=True)

# Save results summary
final_results.to_csv('results/model_performance_summary.csv')
print("✓ Model performance summary saved to: results/model_performance_summary.csv")

# Save feature importance
feature_importance_results.to_csv('results/feature_importance_analysis.csv', index=False)
print("✓ Feature importance analysis saved to: results/feature_importance_analysis.csv")

# Save selected features
pd.Series(selected_features).to_csv('results/selected_features.csv', index=False, header=['feature'])
print("✓ Selected features saved to: results/selected_features.csv")

# Save best hyperparameters
hyperparams = {
    'LightGBM': lgb_best_params,
    'XGBoost': xgb_best_params,
    'CatBoost': cb_best_params,
    'RandomForest': rf_random.best_params_,
    'ExtraTrees': et_random.best_params_,
    'Ridge': {'alpha': ridge_grid.best_params_['alpha']},
    'Lasso': {'alpha': lasso_grid.best_params_['alpha']},
    'ElasticNet': elastic_grid.best_params_
}


def _to_builtin(value):
    """JSON fallback: unwrap NumPy scalars via .item(), stringify anything else."""
    return value.item() if hasattr(value, 'item') else str(value)


with open('results/best_hyperparameters.json', 'w', encoding='utf-8') as f:
    json.dump(hyperparams, f, indent=2, default=_to_builtin, ensure_ascii=False)
print("✓ Best hyperparameters saved to: results/best_hyperparameters.json")

print(f"\n{'='*60}")
print("MODELING PIPELINE COMPLETED SUCCESSFULLY!")
print(f"{'='*60}")

print(f"\n📈 FINAL SUMMARY:")
print(f"   Models Evaluated: {len(evaluator.results)}")
print(f"   Best Model: {best_model_name}")
print(f"   Best RMSE: {best_model_rmse:.4f}")
print(f"   Best R²: {best_model_r2:.4f}")
print(f"   Features Used: {len(selected_features)}")
print(f"   Cross-Validation: {evaluator.cv_folds}-fold")

print(f"\n🎯 RECOMMENDATIONS:")
print(f"   1. Use {best_model_name} for final predictions")
print(f"   2. Consider ensemble if marginal improvement needed")
print(f"   3. Monitor for overfitting with {len(selected_features)} features")
print(f"   4. Validate on holdout test set before deployment")