## Proposed Improvements  
Current approach (potentially circular)  
snowflake_utils -> depends on -> step1  
step1 -> depends on -> snowflake_utils

# More modular approach
snowflake_utils -> depends on -> core_utilities
step1 -> depends on -> core_utilities

--------------------------------------

Programmatic switching between testing and production configuration, using one of:
Using a configuration file (YAML/JSON)  
Setting up environment variables  
Creating a config class that can be initialized with different values for testing vs. production  

---------------------------------------------------




In [None]:
# Cell: dependency_checker
"""
Dependencies: None (root cell)
Provides: Dependency checking functionality
"""
import logging
from typing import Dict, Set, Any, List
from datetime import datetime

# Configure logging
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers configured, so in a kernel where logging was set up earlier this
# call leaves the existing format/level in place.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Logger used by DependencyChecker and by the status message at the end of this cell.
logger = logging.getLogger(__name__)

class DependencyChecker:
    """Track which notebook cells have been registered and whether their prerequisites are met.

    Attributes:
        loaded_cells: Names of the cells passed to ``register_cell`` so far.
        load_times: ``datetime.now()`` captured at registration, keyed by cell name.
        dependency_map: Cell name -> set of cell names that must be registered first.
    """

    def __init__(self):
        self.loaded_cells = set()
        self.load_times = {}
        # Static dependency graph of the notebook; update it whenever a cell is
        # added, removed or renamed.
        self.dependency_map = {
            'dependency_checker': set(),  # No dependencies
            'notebook_monitor': {'dependency_checker'},
            'imports': {'dependency_checker', 'notebook_monitor'},
            'global_constants': {'imports'},
            'step1': {'global_constants', 'imports', 'notebook_monitor'},
            'snowflake_utils': {'imports', 'step1', 'notebook_monitor'},
            'data_prep_normalization': {'imports', 'snowflake_utils', 'global_constants', 'notebook_monitor'},
            'feature_engineering': {'imports', 'data_prep_normalization', 'global_constants', 'notebook_monitor'},
            'model_training': {'imports', 'feature_engineering', 'global_constants', 'notebook_monitor'},
            'storage_utils': {'imports', 'snowflake_utils', 'global_constants', 'notebook_monitor'},
            # Disabled cell:
            # 'final_evaluation_reporting': {'imports', 'model_training', 'global_constants', 'notebook_monitor', 'storage_utils'},
            'model_evaluator_class': {'imports', 'model_training', 'global_constants', 'notebook_monitor'},
            # Disabled cell:
            # 'model_evaluator_methods': {'model_evaluator_class', 'storage_utils', 'notebook_monitor'},
            # Previously depended on {'model_evaluator_methods', 'notebook_monitor'}.
            'model_evaluation_test': {'model_evaluator_class', 'notebook_monitor'},
            'monitor_output': {'notebook_monitor', 'model_evaluation_test'}
        }

    def _missing_dependencies(self, cell_name: str) -> Set[str]:
        """Return the declared dependencies of ``cell_name`` that are not registered yet.

        Unlike ``check_dependencies`` this never logs, so read-only status
        displays can use it without emitting spurious ERROR records.
        Unknown cells yield an empty set.
        """
        return self.dependency_map.get(cell_name, set()) - self.loaded_cells

    def verify_execution_order(self, order: List[str]) -> bool:
        """Check a proposed execution order against the dependency map.

        Args:
            order: Cell names in the order they would be executed.

        Returns:
            True if every cell is known and all of its dependencies appear
            earlier in ``order``; False at the first unknown cell or unmet
            dependency. Progress is printed for each cell.
        """
        executed = set()
        print("\nVerifying execution order:")
        print("=" * 50)

        for cell in order:
            # Check if cell exists in dependency map
            if cell not in self.dependency_map:
                print(f"❌ Unknown cell: {cell}")
                return False

            # Check if all dependencies are executed
            missing_deps = self.dependency_map[cell] - executed
            if missing_deps:
                print(f"❌ {cell}: Missing dependencies: {missing_deps}")
                return False

            print(f"✓ {cell}: All dependencies met")
            executed.add(cell)

        return True

    def show_cell_dependencies(self):
        """Print every cell followed by its dependencies in alphabetical order."""
        print("\nCell Dependencies:")
        print("=" * 50)
        for cell, deps in self.dependency_map.items():
            print(f"\n{cell}:")
            if deps:
                for dep in sorted(deps):
                    print(f"  ├── {dep}")
            else:
                print("  └── No dependencies")

    def register_cell(self, cell_name: str):
        """Mark ``cell_name`` as loaded and record the current time as its load time."""
        self.loaded_cells.add(cell_name)
        self.load_times[cell_name] = datetime.now()
        logger.info(f"Registered cell: {cell_name}")
        print(f"Registered cell: {cell_name}")  # Additional console output

    def check_dependencies(self, cell_name: str) -> bool:
        """Return True if every dependency of ``cell_name`` has been registered.

        Logs a warning (and returns False) for a cell that is not in the
        dependency map, and logs an error listing the missing dependencies
        when any are outstanding.
        """
        if cell_name not in self.dependency_map:
            logger.warning(f"Unknown cell: {cell_name}")
            return False

        missing = self.dependency_map[cell_name] - self.loaded_cells
        if missing:
            logger.error(f"Missing dependencies for {cell_name}: {missing}")
            return False
        return True

    def get_cell_status(self) -> Dict[str, Dict[str, Any]]:
        """Summarise the load/dependency state of every known cell.

        Returns:
            Mapping of cell name to a dict with keys ``loaded``,
            ``dependencies_met``, ``missing_dependencies`` and ``load_time``
            (None when the cell has not been registered). The same summary is
            written to the log at INFO level.
        """
        status = {}
        for cell in self.dependency_map:
            # Computed once and without logging: a cell that simply has not run
            # yet is not an error while merely reporting status.
            missing = self._missing_dependencies(cell)
            status[cell] = {
                'loaded': cell in self.loaded_cells,
                'dependencies_met': not missing,
                'missing_dependencies': sorted(missing),
                'load_time': self.load_times.get(cell, None)
            }

        # Log the overall status
        logger.info("\nCell Status Summary:")
        for cell, info in status.items():
            load_status = '✓' if info['loaded'] else '✗'
            dep_status = '✓' if info['dependencies_met'] else '✗'
            load_time = info['load_time'].strftime('%H:%M:%S') if info['load_time'] else 'Not loaded'
            logger.info(f"{cell}: {load_status} Loaded ({load_time}), Dependencies: {dep_status}")

        return status

    def show_dependency_tree(self):
        """Print each cell with its dependency status, load flag and dependencies.

        The leading mark shows whether all dependencies are registered, the
        bracketed mark whether the cell itself is registered. Dependencies are
        listed alphabetically so the output is stable between runs.
        """
        print("\nDependency Tree:")
        print("=" * 50)
        for cell, deps in self.dependency_map.items():
            # Display only: use the silent helper instead of check_dependencies
            # so that not-yet-run cells do not produce ERROR log records.
            status = "✗" if self._missing_dependencies(cell) else "✓"
            loaded = "✓" if cell in self.loaded_cells else "✗"
            print(f"{status} {cell} [{loaded}]")
            for dep in sorted(deps):
                dep_loaded = "✓" if dep in self.loaded_cells else "✗"
                print(f"  ├── {dep_loaded} {dep}")
            print("  │")

    def validate_execution_order(self) -> bool:
        """Check that every registered cell was loaded after its dependencies.

        Prints a warning for each dependency that was registered later than
        the dependent cell, or was never registered at all, and a confirmation
        line when nothing is wrong.

        Returns:
            True if no ordering problem was found, False otherwise.
        """
        print("\nExecution Order Validation:")
        print("=" * 50)

        violations = 0
        for cell in sorted(self.loaded_cells):
            cell_time = self.load_times.get(cell)
            if cell_time is None:
                continue

            for dep in sorted(self.dependency_map.get(cell, set())):
                dep_time = self.load_times.get(dep)
                if dep_time is None:
                    print(f"Warning: {cell} was loaded but its dependency {dep} was never registered")
                    violations += 1
                elif dep_time > cell_time:
                    print(f"Warning: {cell} was loaded before its dependency {dep}")
                    violations += 1

        if not violations:
            print("✓ No ordering violations detected")
        return violations == 0
# Create the notebook-wide checker and register this cell, which has no
# dependencies of its own.
dep_checker = DependencyChecker()
dep_checker.register_cell('dependency_checker')


# Intended top-to-bottom execution order of the notebook cells. It is checked
# against the dependency map below so a mis-ordered map or list is reported as
# soon as this cell runs.
execution_order = [
    'dependency_checker',
    'notebook_monitor',
    'imports',
    'global_constants',
    'step1',
    'snowflake_utils',
    'data_prep_normalization',
    'feature_engineering',
    'model_training',
    'storage_utils',
    'model_evaluator_class',
#    'model_evaluator_methods',  (disabled cell)
    'model_evaluation_test',
    'monitor_output'
]

# The boolean result of verify_execution_order is not used here; the outcome is
# only visible in the printed report.
print("\nVerifying proposed execution order...")
dep_checker.verify_execution_order(execution_order)
print("\nShowing all cell dependencies...")
dep_checker.show_cell_dependencies()


# Show initial status (only 'dependency_checker' is registered at this point)
logger.info("Dependency checker initialized")
dep_checker.show_dependency_tree()



In [None]:
# Cell: notebook_monitor
"""
Dependencies: dependency_checker
Provides: Enhanced monitoring with memory usage and operation counts
"""
import time
import pandas as pd
import psutil
import os
from datetime import datetime
from typing import Optional, Dict, List, Any

class CellMonitor:
    """Record wall-clock duration and process memory for named notebook operations.

    Typical use is ``monitor.start(name)`` followed by ``monitor.end()`` in a
    ``finally`` block; each finished operation is appended to
    ``execution_logs`` and can be tabulated with ``show_summary``.

    NOTE: only one operation is tracked at a time. Calling ``start`` while
    another operation is still active replaces it, so the outer operation is
    never written to ``execution_logs``.
    """

    def __init__(self):
        if not dep_checker.check_dependencies('notebook_monitor'):
            print("Warning: Dependencies not met for notebook monitor")

        self.execution_logs = []      # finished operation records (dicts)
        self.current_cell = None      # record of the operation in progress, if any
        self.start_time = None        # time.time() at the last start()
        self.end_time = None          # time.time() at the last end()
        self.operation_counts = {}    # reset to {} per operation name in start()
        self.memory_snapshots = {}
        self.context = []  # suffixes appended to operation names, innermost last
        # Global pandas display settings: show complete frames in the notebook.
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)

    def add_context(self, context: str) -> None:
        """Push a context label; it is appended to the names of operations started afterwards."""
        self.context.append(context)

    def remove_context(self) -> None:
        """Pop the most recently added context label; no-op when none is set."""
        if self.context:
            self.context.pop()

    def _get_memory_usage(self) -> float:
        """Return the resident set size of the current process in MB."""
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024

    def start(self, cell_name: str) -> None:
        """Begin monitoring an operation.

        Args:
            cell_name: Name of the operation. When context labels are set the
                recorded name is ``<cell_name>_<label1>_<label2>...``.
        """
        if self.context:
            cell_name = f"{cell_name}_{'_'.join(self.context)}"

        # 'peak_memory_mb', 'operation_count' and 'operations' are initialised
        # here but not updated anywhere in this class.
        self.current_cell = {
            'cell_name': cell_name,
            'start_time': datetime.now(),
            'status': 'running',
            'duration_seconds': 0,
            'start_memory_mb': self._get_memory_usage(),
            'peak_memory_mb': 0,
            'memory_change_mb': 0,
            'operation_count': 0,
            'operations': {},
            'error': None
        }
        self.start_time = time.time()
        self.operation_counts[cell_name] = {}

    def end(self) -> None:
        """Finish the operation in progress and append its record to ``execution_logs``.

        Does nothing when no operation is active. The status becomes
        'completed' unless ``log_error`` already marked the operation as
        'error', in which case that status is preserved.
        """
        if not self.current_cell:
            return

        self.end_time = time.time()
        duration = self.end_time - self.start_time
        end_memory = self._get_memory_usage()

        # Keep an 'error' status set by log_error; overwriting it with
        # 'completed' would hide failures in the summary.
        final_status = 'error' if self.current_cell.get('status') == 'error' else 'completed'

        self.current_cell.update({
            'end_time': datetime.now(),
            'status': final_status,
            'duration_seconds': round(duration, 2),
            'end_memory_mb': end_memory,
            'memory_change_mb': round(end_memory - self.current_cell['start_memory_mb'], 2)
        })

        self.execution_logs.append(self.current_cell.copy())
        self.current_cell = None

    def show_summary(self) -> pd.DataFrame:
        """Return the finished operation records as a DataFrame.

        ``start_time`` and ``end_time`` are formatted as
        ``YYYY-MM-DD HH:MM:SS`` strings. An empty DataFrame is returned when
        nothing has been logged or when building the frame fails (the error
        and traceback are printed in that case).
        """
        # Imported locally: this cell does not import traceback at the top, so
        # the error path must not rely on another cell having done so.
        import traceback

        print(f"DEBUG: Number of execution logs: {len(self.execution_logs)}")

        if not self.execution_logs:
            return pd.DataFrame()

        try:
            df = pd.DataFrame(self.execution_logs)

            # Format timestamps
            for column in ('start_time', 'end_time'):
                if column in df.columns:
                    df[column] = pd.to_datetime(df[column]).dt.strftime('%Y-%m-%d %H:%M:%S')

            return df

        except Exception as e:
            print(f"Error generating summary: {type(e).__name__} - {str(e)}")
            print(traceback.format_exc())
            return pd.DataFrame()

    def log_error(self, error: Exception) -> None:
        """Mark the operation in progress as failed and close it.

        The record gets status 'error' and ``error`` set to
        ``"<ExceptionType>: <message>"``. No-op when no operation is active.
        """
        if self.current_cell:
            self.current_cell.update({
                'status': 'error',
                'error': f"{type(error).__name__}: {str(error)}",
                'end_memory_mb': self._get_memory_usage()
            })
            self.end()

# Create the notebook-wide monitor instance that later cells drive through
# monitor.start(...) / monitor.end(), then mark this cell as loaded so that
# dependent cells pass their dependency checks.
monitor = CellMonitor()
dep_checker.register_cell('notebook_monitor')

In [None]:
# Cell: imports
monitor.start('imports')
try:
    """
    Dependencies: dependency_checker, notebook_monitor
    Provides: All required package imports for the notebook
    """
    # Standard libraries
    import logging
    from typing import Dict, Set, Any, List, Optional, Tuple
    from datetime import datetime
    import time
    import traceback
    import json
    import gc
    import pickle
    import base64   # Add this import
    import joblib
    import io
    
    # Snowpark
    from snowflake.snowpark.functions import col, when
    import snowflake.snowpark.functions as F
    
    # Scientific computing
    import pandas as pd
    import numpy as np
    
    # Scikit-learn
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_selection import SelectFromModel
    from sklearn.feature_selection import SelectKBest, f_classif  # Add this line with the other sklearn imports
    from sklearn.model_selection import train_test_split, StratifiedKFold
    from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                               f1_score, roc_auc_score, balanced_accuracy_score,matthews_corrcoef, 
                                precision_recall_curve, auc)  
    
    # Models
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.neural_network import MLPClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import validation_curve, cross_val_score
    from sklearn.metrics import classification_report


    
    # Visualization
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

finally:
    monitor.end()
dep_checker.register_cell('imports')

In [None]:
# Cell: global_constants
monitor.start('global_constants')
try:
    """
    Dependencies: imports
    Provides: Global constants for the entire notebook
    """

    # Run configuration
    # NOTE(review): the value is lower-case 'test' while the listed options are
    # upper-case — confirm how downstream cells compare RUN_MODE.
    RUN_MODE = 'test'  # Options: 'TEST', 'FULL'

    # Department and model type; together they form the table-name prefix that
    # identifies which unit and what is being modeled.
    DEPARTMENT = 'UF'
    MODEL_TYPE = 'MAJOR_GIFT'


    # Database configuration
    SCHEMA = "UFF_MODEL"
    SOURCE_TABLE = "UF_MAJOR_GIFT_VIEWS_COMBO"


    # Data processing parameters
    SAMPLE_SIZE = 250000 # Number of records to process (earlier notes: 75000 47500 - ok, 125000 - maybe, 300000 - too big)
    CHUNK_SIZE =25000 # Size of processing chunks (earlier values noted: 25000, 10000)

    # Feature sampling for FULL mode.
    # These will be used to determine how many variables to include:
    # each value is the fraction of the available features to use.
    FEATURE_SAMPLES = {
        'third': 1/3,    # Use 1/3 of available features
        'half': 1/2,     # Use 1/2 of available features
        'full': 1.0      # Use all features
    }

    # Model names
    MODEL_NAMES = ['LogisticRegression', 'RandomForest', 'SVC', 'DecisionTree', 'MLPClassifier']

    # Test mode sample size (currently disabled)
#    TEST_SAMPLE_SIZE = 1000

    # Model parameters
    N_SPLITS = 5        # Number of cross-validation folds
    RANDOM_SEED = 42    # For reproducibility

    # Target columns
    TARGET_COLS = ['COMMIT_MAJOR', 'INFLATION_MAJOR_COMMIT']


    # Oversampling configuration
    ENABLE_OVERSAMPLING = False
    OVERSAMPLING_STRATEGY = 'auto'  # Options: 'auto', 'minority', specific ratio
    # Table naming templates
    def get_table_name(base_name: str, timestamp: Optional[str] = None) -> str:
        """Build a table name carrying the ``<DEPARTMENT>_<MODEL_TYPE>_`` prefix.

        Args:
            base_name: Table name with or without the prefix; an existing
                prefix is not added a second time.
            timestamp: Optional suffix appended as ``_<timestamp>``; ignored
                when empty or None.

        Returns:
            The prefixed (and optionally suffixed) table name.
        """
        prefix = f"{DEPARTMENT}_{MODEL_TYPE}_"
        prefixed = base_name if base_name.startswith(prefix) else prefix + base_name
        return f"{prefixed}_{timestamp}" if timestamp else prefixed
        # Define standard table names
#    TABLE_NAMES = {
#        'results': get_table_name('RESULTS'),
#        'features': get_table_name('FEATURES'),
#        'models': get_table_name('MODELS')
#    }

    
# Active definition of the standard table names (supersedes the commented-out
# draft above). Keys are the logical table types; every value is produced by
# get_table_name and therefore carries the DEPARTMENT/MODEL_TYPE prefix.
    TABLE_NAMES = {
        'results': get_table_name('MODEL_PERFORMANCE'),    # This becomes UF_MAJOR_GIFT_MODEL_PERFORMANCE
        'features': get_table_name('FEATURE_IMPORTANCE'),  # This becomes UF_MAJOR_GIFT_FEATURE_IMPORTANCE
        'models': get_table_name('MODELS') ,               # This becomes UF_MAJOR_GIFT_MODELS
        'imputed': get_table_name('IMPUTED')  # This becomes UF_MAJOR_GIFT_IMPUTED
    }
# Updated code - just to ensure consistency
#{
#       'model_performance': get_table_name('MODEL_PERFORMANCE'),  # This becomes UF_MAJOR_GIFT_MODEL_PERFORMANCE
#       'feature_importance': get_table_name('FEATURE_IMPORTANCE'),  # This becomes UF_MAJOR_GIFT_FEATURE_IMPORTANCE
#       'models': get_table_name('MODELS'),  # This becomes UF_MAJOR_GIFT_MODELS
#       'imputed': get_table_name('IMPUTED')  # This becomes UF_MAJOR_GIFT_IMPUTED
#    }

    
    # Function to get imputed table name with timestamp
#    def get_imputed_table_name(timestamp: Optional[str] = None) -> str:
#        """Get imputed table name with optional timestamp"""
#        return get_table_name('IMPUTED', timestamp)
#        #'imputed':  get_table_name('IMPUTED', timestamp=None)

finally:
    monitor.end()
dep_checker.register_cell('global_constants')

In [None]:
# Cell: step1
monitor.start('step1')
try:
    class EnvironmentConfig:
        """Snowflake environment settings plus helpers to apply them to a session.

        Class attributes:
            SCHEMA: Schema name, copied from the notebook-level constant.
            TABLES: Logical table key -> physical table name.
            ENVIRONMENTS: Environment name -> database / schema / warehouse.

        Instance attributes:
            current_env: Key into ``ENVIRONMENTS``; starts as 'test'.
            session: Snowflake session, or None until ``set_session`` is called.
        """

        # Settings shared by every environment.
        SCHEMA = SCHEMA  # notebook-level constant
        TABLES = {
            'major_gift': SOURCE_TABLE,  # notebook-level constant
            # Earlier revisions hard-coded 'UF_MAJOR_GIFT_VIEWS_IMPUTED' or read
            # TABLE_NAMES['imputed']; the name is now derived via get_table_name.
            'imputed': get_table_name('IMPUTED')
        }

        # Per-environment connection targets.
        ENVIRONMENTS = {
            'test': {
                'database': 'PRE_PRODUCTION',
                'schema': SCHEMA,
                'warehouse': 'ANALYSIS'
            },
            'prod': {
                'database': 'PRODUCTION',
                'schema': SCHEMA,
                'warehouse': 'ANALYSIS'
            }
        }

        def __init__(self):
            self.session = None
            self.current_env = 'test'  # test environment unless switched

        def set_session(self, session):
            """Store ``session`` and immediately point it at the current environment."""
            self.session = session
            self._apply_environment()

        def switch_environment(self, env_name):
            """Make ``env_name`` the current environment and apply it to the session.

            Raises:
                ValueError: If no session has been set, or ``env_name`` is not
                    a key of ``ENVIRONMENTS``.
            """
            if not self.session:
                raise ValueError("Session not initialized. Call set_session first.")

            known_environments = self.ENVIRONMENTS
            if env_name not in known_environments:
                raise ValueError(f"Invalid environment. Use one of: {list(self.ENVIRONMENTS.keys())}")

            self.current_env = env_name
            self._apply_environment()
            self.show_environment_status()

        def _apply_environment(self):
            """Issue USE DATABASE / SCHEMA / WAREHOUSE for the current environment.

            Raises:
                ValueError: If no session has been set.
                Exception: Any error from the session is printed and re-raised.
            """
            if not self.session:
                raise ValueError("Session not initialized")

            env = self.ENVIRONMENTS[self.current_env]
            try:
                # Same three statements, executed in the same order.
                run = self.session.sql
                run(f"USE DATABASE {env['database']}").collect()
                run(f"USE SCHEMA {env['schema']}").collect()
                run(f"USE WAREHOUSE {env['warehouse']}").collect()
            except Exception as e:
                print(f"Error setting environment: {str(e)}")
                raise

        def get_full_table_name(self, table_key: str) -> str:
            """Return ``database.schema.table`` for a key of ``TABLES``.

            Raises:
                ValueError: If ``table_key`` is unknown (or maps to an empty name).
            """
            env = self.ENVIRONMENTS[self.current_env]
            table_name = self.TABLES.get(table_key)
            if table_name:
                return f"{env['database']}.{env['schema']}.{table_name}"
            raise ValueError(f"Invalid table key. Use one of: {list(self.TABLES.keys())}")

        def get_schema_db_name(self) -> str:
            """Return ``database.schema`` for the current environment."""
            env = self.ENVIRONMENTS[self.current_env]
            return f"{env['database']}.{env['schema']}"

        def get_temp_table_name(self, base_name=None, timestamp=None):
            """Build a timestamp-suffixed table name.

            Args:
                base_name: Name to suffix; defaults to the 'imputed' table name.
                timestamp: Suffix to use; defaults to the current local time
                    formatted as ``YYYYmmdd_HHMMSS``.

            Returns:
                Dict with ``table_name``, ``full_name`` (database.schema.table)
                and the ``timestamp`` that was used.
            """
            from datetime import datetime

            stem = self.TABLES['imputed'] if base_name is None else base_name
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S') if timestamp is None else timestamp

            temp_name = f"{stem}_{stamp}"
            qualified = f"{self.get_schema_db_name()}.{temp_name}"
            return {
                'table_name': temp_name,
                'full_name': qualified,
                'timestamp': stamp
            }

        def show_environment_status(self):
            """Print the configured environment and what the session actually reports.

            Raises:
                ValueError: If no session has been set.
                Exception: Any error from the status query is printed and re-raised.
            """
            if not self.session:
                raise ValueError("Session not initialized")

            border = "=" * 60
            for banner_line in (border, f"{'ENVIRONMENT STATUS':^60}", border):
                print(banner_line)
            print(f"ENVIRONMENT: {self.current_env.upper()}")

            try:
                # Ask the session itself rather than trusting the configuration.
                current_settings = self.session.sql("""
                SELECT 
                    CURRENT_DATABASE() as database,
                    CURRENT_SCHEMA() as schema,
                    CURRENT_WAREHOUSE() as warehouse
                """).collect()

                print(f"""
                DATABASE:  {current_settings[0]['DATABASE']}
                SCHEMA:    {current_settings[0]['SCHEMA']}
                WAREHOUSE: {current_settings[0]['WAREHOUSE']}
                """)
                print(border)
            except Exception as e:
                print(f"Error getting current settings: {str(e)}")
                raise

finally:
    monitor.end()
dep_checker.register_cell('step1')

In [None]:
# Cell: snowflake_utils
monitor.start('snowflake_utils')
try:
    class SnowflakeManager:
        def __init__(self, config: EnvironmentConfig):
            self.config = config
            self.session = config.session
            self.max_retries = 3
            self.retry_delay = 5  # seconds
            self._error_counts = {}

        def execute_with_retry(self, operation_name: str, operation, *args, **kwargs):
            monitor.start(f'execute_{operation_name}')
            try:
                for attempt in range(self.max_retries):
                    try:
                        result = operation(*args, **kwargs)
                        return result
                    except Exception as e:
                        self._track_error(operation_name, e)
                        if attempt < self.max_retries - 1:
                            print(f"Error in {operation_name}, retrying... (Attempt {attempt + 1}/{self.max_retries})")
                            time.sleep(self.retry_delay * (attempt + 1))
                            self.session = get_active_session()
                            self.config._apply_environment()
                        else:
                            raise
            except Exception as e:
                print(f"Error in {operation_name}: {type(e).__name__} - {str(e)}")
                print(traceback.format_exc())
                raise
            finally:
                monitor.end()

        def ensure_table_exists(self, table_type: str) -> None:
            """Ensure required tables exist with correct schema"""
            monitor.start('ensure_table_exists')
            try:
                schemas = {
                    'results': """  
                        CREATE TABLE IF NOT EXISTS {table_name} (
                            TARGET VARCHAR,
                            MODEL VARCHAR,
                            FEATURE_SAMPLE VARCHAR,
                            N_FEATURES NUMBER,
                            FEATURE_RATIO FLOAT,
                            SAMPLE_SIZE NUMBER,
                            ACCURACY_MEAN FLOAT,
                            ACCURACY_STD FLOAT,
                            PRECISION_MEAN FLOAT,
                            PRECISION_STD FLOAT,
                            RECALL_MEAN FLOAT,
                            RECALL_STD FLOAT,
                            F1_MEAN FLOAT,
                            F1_STD FLOAT,
                            ROC_AUC_MEAN FLOAT,
                            ROC_AUC_STD FLOAT,
                            TIMESTAMP TIMESTAMP_NTZ
                        )
                    """,
                    'models': """ 
                        CREATE TABLE IF NOT EXISTS {table_name} (
                            MODEL VARCHAR,
                            TARGET VARCHAR,
                            FEATURE_SAMPLE VARCHAR,
                            N_FEATURES NUMBER,
                            MODEL_OBJECT VARCHAR,  -- Serialized model
                            SELECTED_FEATURES VARCHAR,  -- Serialized feature list
                            METRICS VARCHAR,  -- Serialized metrics
                            SCALER VARCHAR,  -- Serialized scaler
                            IMPUTERS VARCHAR,  -- Serialized imputers
                            CREATED_AT TIMESTAMP_NTZ
                        )
                    """,
                    'features': """ 
                        CREATE TABLE IF NOT EXISTS {table_name} (
--                           MODEL VARCHAR, 
                            FEATURE_NAME VARCHAR,
                            IMPORTANCE FLOAT,
                            TARGET VARCHAR,
                            SAMPLE_TYPE VARCHAR,
                            CREATED_AT TIMESTAMP_NTZ
                        )
                    """
                }
                
                if table_type in schemas:
                    table_name = TABLE_NAMES[table_type]
                    full_name = f"{self.config.get_schema_db_name()}.{table_name}"
                    
                    # Check if table exists
                    exists_query = f"""
                    SELECT 1 
                    FROM INFORMATION_SCHEMA.TABLES 
                    WHERE TABLE_SCHEMA = '{self.config.SCHEMA}'
                    AND TABLE_NAME = '{table_name}'
                    """
                    
                    result = self.execute_with_retry(
                        'check_table_exists',
                        lambda: self.session.sql(exists_query).collect()
                    )
                    
                    if not result:
                        create_query = schemas[table_type].format(table_name=full_name)
                        self.execute_with_retry(
                            'create_table',
                            lambda: self.session.sql(create_query).collect()
                        )
                        print(f"Created table: {full_name}")
                
            finally:
                monitor.end()

        def load_table(self, table_key: str, sample_percent: Optional[float] = None,
                      columns: Optional[list] = None, where_clause: Optional[str] = None) -> pd.DataFrame:
            monitor.start('load_table')
            try:
                full_table_name = self.config.get_full_table_name(table_key)
                cols_str = '*' if not columns else ', '.join(columns)
                query = f'SELECT {cols_str} FROM {full_table_name}'
                
                if where_clause:
                    query += f' WHERE {where_clause}'
                if sample_percent is not None:
                    query += f' SAMPLE({sample_percent})'

                snowpark_df = self.execute_with_retry('load_table', 
                                                    lambda: self.session.sql(query))
                return snowpark_df.to_pandas()
            except Exception as e:
                print(f"Error loading table {table_key}: {type(e).__name__} - {str(e)}")
                print(traceback.format_exc())
                raise
            finally:
                monitor.end()

        def save_results(self, df: pd.DataFrame, base_name: str, 
                        mode: str = 'overwrite',
                        timestamp: Optional[str] = None,
                        partition_by: Optional[List[str]] = None, 
                        cluster_by: Optional[List[str]] = None) -> Dict[str, str]:
            """Save results to Snowflake"""
            monitor.start('save_results')
            try:
                # Determine table type and ensure it exists
                if base_name == 'MODEL_RESULTS':
                    table_type = 'results'
                    self.ensure_table_exists(table_type)
                    table_name = TABLE_NAMES[table_type]
                    mode = 'append'
                elif base_name == 'IMPUTED':
                    table_type = 'imputed'
#                    self.ensure_table_exists(table_type)
#                    if timestamp table_name = get_table_name('IMPUTED', timestamp)
#                    else table_name = get_table_name('IMPUTED')
#                    table_name = get_imputed_table_name(timestamp)
                    table_name = get_table_name('IMPUTED', timestamp)
                    mode = 'overwrite'
                elif base_name == 'MODELS':
                    table_type = 'models'
                    self.ensure_table_exists(table_type)
                    table_name = TABLE_NAMES[table_type]
                    mode = 'append'
                elif base_name == 'FEATURES':
                    table_type = 'features'
                    self.ensure_table_exists(table_type)
                    table_name = TABLE_NAMES[table_type]
                    mode = 'append'
                elif base_name == 'results' or base_name == 'features' or base_name == 'models':
                    # Handle direct table_type references
                    table_type = base_name
                    self.ensure_table_exists(table_type)
                    table_name = TABLE_NAMES[table_type]
                    mode = 'append'
                    
                else:
                                        # If not a predefined table, use the naming function
                    table_name = get_table_name(base_name)
                
                full_name = f"{self.config.get_schema_db_name()}.{table_name}"
                print(f"Saving to {full_name} with mode: {mode}")
                
                snowpark_df = self.session.create_dataframe(df)
                writer = snowpark_df.write

                if partition_by:
                    writer = writer.partition_by(partition_by)
                if cluster_by:
                    writer = writer.cluster_by(cluster_by)

                writer.save_as_table(full_name, mode=mode)
                
                return {
                    'table_name': table_name,
                    'full_name': full_name,
                    'timestamp': timestamp
                }
            
            except Exception as e:
                print(f"Error saving results: {type(e).__name__} - {str(e)}")
                print(traceback.format_exc())
                raise
            finally:
                monitor.end()

        def verify_tables(self) -> Dict[str, pd.DataFrame]:
            """
            Look up this project's tables and their columns in INFORMATION_SCHEMA.

            Two queries are issued through ``execute_with_retry``. Both are limited
            to schema ``self.config.SCHEMA`` and to table names matching
            ``{DEPARTMENT}_{MODEL_TYPE}_%``.

            Returns:
                Dict with two pandas DataFrames:
                - 'existence': TABLE_NAME and TABLE_SCHEMA of each matching table
                - 'structure': TABLE_NAME, COLUMN_NAME, DATA_TYPE and IS_NULLABLE
                  of each column, ordered by table name and ordinal position

            Raises:
                Whatever ``execute_with_retry`` or ``to_pandas`` raises; nothing is
                caught here.
            """
            monitor.start('verify_tables')
            try:
                # NOTE(review): '_' is a single-character wildcard in SQL LIKE, so
                # this pattern can match more than the literal prefix - confirm
                # that is acceptable for this schema.
                queries = {
                    'existence': f"""
                        SELECT TABLE_NAME, TABLE_SCHEMA
                        FROM INFORMATION_SCHEMA.TABLES
                        WHERE TABLE_SCHEMA = '{self.config.SCHEMA}'
                        AND TABLE_NAME LIKE '{DEPARTMENT}_{MODEL_TYPE}_%'
                    """,
                    'structure': f"""
                        SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, IS_NULLABLE
                        FROM INFORMATION_SCHEMA.COLUMNS
                        WHERE TABLE_SCHEMA = '{self.config.SCHEMA}'
                        AND TABLE_NAME LIKE '{DEPARTMENT}_{MODEL_TYPE}_%'
                        ORDER BY TABLE_NAME, ORDINAL_POSITION
                    """
                }

                results: Dict[str, pd.DataFrame] = {}
                for query_name, query in queries.items():
                    # Run the query with retries, then materialise it as pandas
                    snowpark_df = self.execute_with_retry(
                        f'verify_{query_name}',
                        lambda: self.session.sql(query)
                    )
                    results[query_name] = snowpark_df.to_pandas()

                return results

            finally:
                monitor.end()

        def _track_error(self, operation: str, error: Exception) -> None:
            error_key = f"{operation}:{type(error).__name__}"
            self._error_counts[error_key] = self._error_counts.get(error_key, 0) + 1

finally:
    monitor.end()
dep_checker.register_cell('snowflake_utils')

In [None]:
# Cell: data_prep_normalization
monitor.start('data_prep_normalization')
try:
    def load_and_prepare_data(config: EnvironmentConfig, 
                             sample_size: int = SAMPLE_SIZE,
                             random_seed: int = RANDOM_SEED) -> pd.DataFrame:
        """
        Load a random sample from Snowflake and turn yes/no text columns into 1/0.

        The 'major_gift' table is read with ``ORDER BY RANDOM(seed) LIMIT n``,
        converted to pandas, and every object/category column whose non-blank
        values are all one of Y/N/YES/NO/TRUE/FALSE (case-insensitive) is
        replaced by 1/0. Nulls and blank strings in those columns become missing.

        Args:
            config: Environment configuration handed to ``SnowflakeManager``.
            sample_size: Number of rows to load; coerced with ``int()``.
            random_seed: Seed passed to Snowflake's ``RANDOM``; coerced with ``int()``.

        Returns:
            The sampled DataFrame with yes/no columns converted.

        Raises:
            Any exception from the query or conversion; it is printed with its
            traceback and re-raised.
        """
        monitor.start('load_and_prepare_data')
        try:
            # Coerce up front so only integers are ever interpolated into the SQL text
            sample_size = int(sample_size)
            random_seed = int(random_seed)

            sf_manager = SnowflakeManager(config)

            # Load data from Snowflake
            query = f"""
            SELECT *
            FROM {sf_manager.config.get_full_table_name('major_gift')}
            ORDER BY RANDOM({random_seed})
            LIMIT {sample_size}
            """

            print(f"Loading {sample_size} records...")
            snow_df = sf_manager.execute_with_retry(
                'load_data',
                lambda: sf_manager.session.sql(query)
            )

            print("Converting to pandas...")
            df = snow_df.to_pandas()

            # Identify column types using pandas
            print("\nAnalyzing column data types:")
            numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns

            print(f"Found {len(numeric_columns)} numeric columns")
            print(f"Found {len(categorical_columns)} categorical columns")

            # Lookups are always done on the upper-cased text, so only
            # upper-case keys are needed.
            yes_no_flags = {'Y': 1, 'YES': 1, 'TRUE': 1, 'N': 0, 'NO': 0, 'FALSE': 0}

            def to_flag(value):
                """Map one cell to 1/0; nulls and blank strings become None."""
                if pd.isna(value):
                    return None
                text = str(value)
                return yes_no_flags.get(text.upper()) if text.strip() else None

            # Process potential Y/N columns in categorical columns
            for col in categorical_columns:
                # Distinct non-blank values, upper-cased
                observed = {str(v).upper() for v in df[col].dropna().unique() if str(v).strip()}

                # A column qualifies only if it has values and all are yes/no tokens
                if observed and observed.issubset(yes_no_flags):
                    print(f"Converting Y/N column: {col}")
                    df[col] = df[col].map(to_flag)

            # After conversion, recheck column types
            print("\nUpdated column data types:")
            numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns

            print(f"Now have {len(numeric_columns)} numeric columns")
            print(f"Now have {len(categorical_columns)} categorical columns")

            return df

        except Exception as e:
            print(f"Error in data loading: {str(e)}")
            print(traceback.format_exc())
            raise
        finally:
            monitor.end()

    def determine_imputation_strategy(df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Assign every column of ``df`` to an imputation strategy.

        Rules, applied per column in this order:
            1. Entirely null, or no non-null values at all  -> 'constant'
            2. At most two distinct non-null values         -> 'constant'
            3. More than 90% null, or fewer than 1% of the
               non-null values are distinct                 -> 'constant'
            4. Absolute skewness greater than 2             -> 'median'
            5. Anything else                                -> 'mean'

        Skewness falls back to 0 when ``Series.skew`` raises (for example on
        non-numeric data), which sends such a column to 'mean' unless an earlier
        rule applies.

        Args:
            df: Data whose columns should be classified.

        Returns:
            Dict with keys 'mean', 'median' and 'constant'; each value lists the
            column labels assigned to that strategy, in ``df.columns`` order.
        """
        column_groups: Dict[str, List[str]] = {
            'mean': [],      # For financial and continuous data
            'median': [],    # For skewed distributions
            'constant': [],  # For sparse or categorical-derived numerics
        }

        # DEBUG_START - Column Analysis
        print("\nAnalyzing column characteristics:")
        print("=" * 50)
        # DEBUG_END

        for col in df.columns:
            non_null = df[col].dropna()
            null_pct = df[col].isnull().mean()

            # Nothing to learn from: completely null, or the frame has no rows
            if null_pct == 1.0 or non_null.empty:
                column_groups['constant'].append(col)
                continue

            unique_count = non_null.nunique()
            unique_pct = unique_count / len(non_null)
            try:
                skewness = non_null.skew()
            except Exception:
                # Best effort: skew is undefined for this column, treat as symmetric
                skewness = 0

            # DEBUG_START - Individual Column Stats
            print(f"\nColumn: {col}")
            print(f"Null %: {null_pct:.2%}")
            print(f"Unique %: {unique_pct:.2%}")
            print(f"Skewness: {skewness:.2f}")
            # DEBUG_END

            if unique_count <= 2:
                # Binary/Boolean derived columns
                strategy = 'constant'
            elif null_pct > 0.9 or unique_pct < 0.01:
                # Mostly missing, or very few distinct values
                strategy = 'constant'
            elif abs(skewness) > 2:
                # Highly skewed data
                strategy = 'median'
            else:
                strategy = 'mean'
            column_groups[strategy].append(col)

        # VALIDATION_START - Column Assignment
        print("\nValidating column assignments:")
        print("=" * 50)
        assigned = {col for cols in column_groups.values() for col in cols}
        # Safety net; iterating df.columns keeps the result order deterministic
        unassigned = [col for col in df.columns if col not in assigned]
        if unassigned:
            print(f"Warning: Adding unassigned columns to constant imputation: {unassigned}")
            column_groups['constant'].extend(unassigned)
        # VALIDATION_END

        return column_groups

    def impute_and_normalize(df: pd.DataFrame, 
                            config: EnvironmentConfig, 
                            chunk_size: int = CHUNK_SIZE,
                            save_imputed: bool = True,
                            timestamp: Optional[str] = None) -> Tuple[pd.DataFrame, StandardScaler, 
                                                             Dict[str, SimpleImputer],
                                                             Dict[str, List[str]]]:
        """
        Impute missing values, optionally persist the result, then standardize.

        Imputers and the scaler are fitted on the whole frame; only the
        ``transform`` steps run chunk by chunk. The returned frame has a fresh
        0..n-1 row index because chunks are concatenated with
        ``ignore_index=True``.

        Args:
            df: Data to process. Every column is imputed and scaled, so callers
                are expected to pass numeric columns only.
            config: Environment configuration; used to build a
                ``SnowflakeManager`` when ``save_imputed`` is true.
            chunk_size: Number of rows transformed per step; must be positive.
            save_imputed: When true, the imputed (not yet scaled) data is saved
                through ``save_results`` with base name 'IMPUTED' in overwrite mode.
            timestamp: Forwarded unchanged to ``save_results``.

        Returns:
            - Normalized DataFrame
            - Fitted StandardScaler
            - Dictionary of imputers keyed by strategy (only strategies that
              received columns are fitted)
            - Dictionary mapping each strategy to its column names

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``df`` is empty.
            Any other exception is printed with its traceback and re-raised.
        """
        monitor.start('impute_and_normalize')
        try:
            # Fail early with a clear message instead of an obscure range()/concat error
            if chunk_size <= 0:
                raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
            if df.empty:
                raise ValueError("Cannot impute and normalize an empty DataFrame")

            # Work on a copy so the caller's frame is left untouched
            numeric_df = df.copy()

            # DEBUG_START - Initial Data Stats
            print(f"\nInitial shape: {numeric_df.shape}")
            print("\nMissing value summary:")
            print(numeric_df.isnull().sum().sort_values(ascending=False))
            # DEBUG_END

            # Determine imputation strategy
            column_groups = determine_imputation_strategy(numeric_df)

            # Define imputation strategies
            imputers = {
                'mean': SimpleImputer(strategy='mean'),
                'median': SimpleImputer(strategy='median'),
                'constant': SimpleImputer(strategy='constant',  fill_value=0) #, keep_empty_feature=True) # Add when sklearn >= 1.8
            }

            # Initialize scaler
            scaler = StandardScaler()

            # Fit imputers on the full dataset first so every chunk uses the same statistics
            for strategy, cols in column_groups.items():
                if cols:  # Only if we have columns for this strategy
                    imputers[strategy].fit(numeric_df[cols])

            # Impute in chunks
            results = []
            for start_idx in range(0, len(numeric_df), chunk_size):
                chunk = numeric_df.iloc[start_idx:start_idx + chunk_size].copy()

                for strategy, cols in column_groups.items():
                    if cols:
                        chunk[cols] = imputers[strategy].transform(chunk[cols])

                results.append(chunk)
                print(f"Processed chunk {len(results)}: rows {start_idx} to {start_idx + len(chunk)}")

                gc.collect()

            # Combine all imputed results
            imputed_df = pd.concat(results, ignore_index=True)

            # Save imputed data if requested
            if save_imputed:
                sf_manager = SnowflakeManager(config)
                save_info = sf_manager.save_results(
                    imputed_df,
                    base_name='IMPUTED',
                    mode='overwrite',
                    timestamp=timestamp
                )
                print(f"\nSaved imputed data to: {save_info['full_name']}")

            # Fit the scaler on the complete imputed data, then transform in chunks
            scaler.fit(imputed_df)

            normalized_results = []
            for start_idx in range(0, len(imputed_df), chunk_size):
                chunk = imputed_df.iloc[start_idx:start_idx + chunk_size].copy()
                # scaler.transform returns a bare array; restore labels and index
                chunk_normalized = pd.DataFrame(
                    scaler.transform(chunk),
                    columns=chunk.columns,
                    index=chunk.index
                )
                normalized_results.append(chunk_normalized)

                gc.collect()

            final_df = pd.concat(normalized_results, ignore_index=True)

            # VALIDATION_START - Final Validation
            print("\nFinal validation:")
            print(f"Final shape: {final_df.shape}")
            final_nulls = final_df.isnull().sum().sum()
            print(f"Total null values remaining: {final_nulls}")
            # VALIDATION_END

            return final_df, scaler, imputers, column_groups

        except Exception as e:
            print(f"Error in imputation and normalization: {str(e)}")
            print(traceback.format_exc())
            raise
        finally:
            monitor.end()

    # Smoke test for the data preparation functions; runs only when
    # __name__ == "__main__". The names assigned here are module-level.
    if __name__ == "__main__":
        # Build the config and attach the active session to it
        config = EnvironmentConfig()
        session = get_active_session()
        config.set_session(session)
        
        # Load a sample of SAMPLE_SIZE rows
        print("Testing data preparation...")
        df = load_and_prepare_data(config, sample_size=SAMPLE_SIZE)
        print(f"Initial shape: {df.shape}")
        
        # Keep only the int64/float64 columns for imputation and scaling
        numeric_df = df.select_dtypes(include=['int64', 'float64'])
        print(f"Numeric shape: {numeric_df.shape}")
        
        # Test imputation and normalization; save_imputed=False skips the
        # save_results call
        normalized_df, scaler, imputers, strategies = impute_and_normalize(
            numeric_df, 
            config, 
            chunk_size=CHUNK_SIZE,
            save_imputed=False
        )
        print(f"Normalized shape: {normalized_df.shape}")
        
        # Check normalization results
        print("\nNormalization check (mean should be ~0, std should be ~1):")
        print(normalized_df.describe())

finally:
    monitor.end()
dep_checker.register_cell('data_prep_normalization')

In [None]:
# Cell: feature_engineering
monitor.start('feature_engineering')
try:
    def prepare_numeric_data(df: pd.DataFrame,
                             target_cols: Optional[List[str]] = None
                             ) -> Tuple[pd.DataFrame, Dict[str, pd.Series]]:
        """
        Split a raw frame into a numeric feature matrix and binary targets.

        Steps:
            1. Each target column present in ``df`` becomes a 0/1 series
               (1 where the value equals 'MAJOR').
            2. int64/float64 columns are copied as features.
            3. Object/category columns with at most 20 distinct values are
               one-hot encoded; others are skipped with a message.
            4. Target columns and the leakage columns TOTAL_COMMIT_VALUE and
               INFLATION_TOT_COMMIT are dropped from the features.
            5. Boolean columns not already present are added as floats.

        The input frame is not modified.

        Args:
            df: Raw data, e.g. the output of ``load_and_prepare_data``.
            target_cols: Target column names. Defaults to
                ['COMMIT_MAJOR', 'INFLATION_MAJOR_COMMIT'].

        Returns:
            Tuple of (feature DataFrame, dict of target name -> 0/1 Series).
        """
        if target_cols is None:
            target_cols = ['COMMIT_MAJOR', 'INFLATION_MAJOR_COMMIT']

        monitor.start('prepare_numeric_data')
        try:
            # Save targets with binary conversion
            targets = {}
            for col in target_cols:
                if col in df.columns:
                    # Convert 'MAJOR'/'NOT' to 1/0
                    targets[col] = (df[col] == 'MAJOR').astype(int)
                    # DEBUG_START - Target Distribution
                    print(f"\nTarget {col} distribution:")
                    print(targets[col].value_counts(normalize=True))
                    # DEBUG_END

            # First identify numeric and categorical columns
            # NOTE(review): only int64/float64 count as numeric here; columns with
            # other numeric dtypes (e.g. int32) would be left out - confirm intended.
            numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns
            print(f"\nFound {len(numeric_columns)} numeric columns and {len(categorical_columns)} categorical columns")

            # Create a DataFrame with numeric columns
            numeric_df = df[numeric_columns].copy()

            # Handle categorical columns with one-hot encoding
            if len(categorical_columns) > 0:
                print(f"Encoding {len(categorical_columns)} categorical columns")
                for col in categorical_columns:
                    # Skip target columns
                    if col in target_cols:
                        continue

                    # Skip columns with too many unique values (threshold can be adjusted)
                    unique_count = df[col].nunique()
                    if unique_count > 20:
                        print(f"Skipping column {col} with {unique_count} unique values (too many for one-hot encoding)")
                        continue

                    # Clean the values on a local Series so the caller's frame is
                    # left untouched. The trailing Series.replace matches whole
                    # cell values: a cell that is exactly "-" becomes "".
                    cleaned = (
                        df[col]
                        .astype(str)
                        .apply(lambda x: x.replace(' ', '').replace("'", ""))
                        .replace("-", "")
                    )

                    # Apply one-hot encoding
                    try:
                        # Keep every category (no drop_first) to preserve all information
                        dummies = pd.get_dummies(cleaned, prefix=col, dummy_na=False, drop_first=False)
                        dummies.columns = [col_name.upper().replace(' ', '_').replace('-', '_').replace('.', '_')
                                           for col_name in dummies.columns]
                        # Remove columns with all zeros (useless predictors)
                        dummies = dummies.loc[:, dummies.sum() > 0]
                        print(f"  - Encoded {col}: added {dummies.shape[1]} dummy columns")
                        # Add to numeric DataFrame
                        numeric_df = pd.concat([numeric_df, dummies], axis=1)
                    except Exception as e:
                        # Best effort: report and carry on with the remaining columns
                        print(f"  - Error encoding column {col}: {str(e)}")

            # Remove target columns if they exist in numeric_df
            numeric_df = numeric_df.drop(columns=target_cols, errors='ignore')

            # Remove specific columns if they exist
            columns_to_remove = ['TOTAL_COMMIT_VALUE', 'INFLATION_TOT_COMMIT']
            existing_columns = [col for col in columns_to_remove if col in numeric_df.columns]
            if existing_columns:
                print("\nRemoving columns:")
                for col in existing_columns:
                    print(f"- {col}")
                numeric_df = numeric_df.drop(columns=existing_columns)

            # Carry over boolean columns that are not already part of the features
            bool_cols = df.select_dtypes(include=['bool']).columns
            for col in bool_cols:
                if col not in numeric_df.columns:
                    numeric_df[col] = df[col].astype(float)

            print(f"Final shape after preparation: {numeric_df.shape}")
            print(f"Sample feature names: {list(numeric_df.columns[:5])}")
            return numeric_df, targets
        finally:
            monitor.end()
            
    class FeatureImportanceAnalyzer:
        """
        Rank features with a random forest and select the most important ones.

        Importances are stored per target name in ``feature_importances`` so
        they can be plotted or reused for selection later.
        """

        def __init__(self, n_jobs: int = -1):
            """
            Args:
                n_jobs: Forwarded to ``RandomForestClassifier``.
            """
            # target name -> {feature: importance}, most important first
            self.feature_importances = {}
            # Column order of the latest frame passed to calculate_importance
            self.feature_names = None
            self.n_jobs = n_jobs
            self.importance_estimator = RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                n_jobs=n_jobs,
                class_weight='balanced'
            )

        def calculate_importance(self, X: pd.DataFrame, y: pd.Series, target_name: str) -> Dict[str, float]:
            """
            Fit the forest on (X, y) and record the importance of each feature.

            Args:
                X: Feature matrix; its columns become the dictionary keys.
                y: Target values.
                target_name: Key under which the result is stored.

            Returns:
                Dict of feature name -> importance, sorted from most to least
                important.
            """
            monitor.start(f'feature_importance_{target_name}')
            try:
                # Store feature names and their order
                self.feature_names = X.columns.tolist()
                # Refit the shared estimator for this target
                self.importance_estimator.fit(X, y)
                ranked = sorted(
                    zip(self.feature_names, self.importance_estimator.feature_importances_),
                    key=lambda item: item[1],
                    reverse=True
                )
                importance_dict = dict(ranked)
                # Store for this target
                self.feature_importances[target_name] = importance_dict
                return importance_dict
            finally:
                monitor.end()

        def select_features(self, df: pd.DataFrame, importance_dict: Dict[str, float],
                            sample_ratio: float, k: Optional[int] = None) -> pd.DataFrame:
            """
            Select top features based on importance and ratio
            Args:
                df: Input DataFrame
                importance_dict: Dictionary of feature importances
                sample_ratio: Ratio of features to select; only used when k is None
                k: Optional specific number of features to select (capped at the
                   number of available features, floored at 0)
            Returns:
                Copy of df restricted to the selected features. Selected features
                that are not columns of df are reported and left out.
            """
            # Sort features by importance
            sorted_features = sorted(importance_dict.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
            total = len(sorted_features)
            if k is None:
                # Derive the count from the ratio, keeping at least one feature
                k = max(1, min(int(total * sample_ratio), total))
            else:
                # Don't select more than available; a negative k selects nothing
                k = max(0, min(k, total))
            # Select top k features
            selected_features = [name for name, _ in sorted_features[:k]]
            print(f"Selected {len(selected_features)} features out of {total}")
            # Verify features exist in DataFrame
            missing_features = [f for f in selected_features if f not in df.columns]
            if missing_features:
                print("\nWARNING: Some selected features not in DataFrame:")
                print(missing_features)
                # Continue with the features that do exist instead of failing on lookup
                absent = set(missing_features)
                selected_features = [f for f in selected_features if f not in absent]
            # Return copy of DataFrame with selected features
            return df[selected_features].copy()

        def plot_importance(self, target_name: str, top_n: int = 20):
            """
            Draw a horizontal bar chart of the most important features.

            Args:
                target_name: Target whose stored importances are plotted.
                top_n: Maximum number of features shown.

            Returns:
                The current matplotlib figure.

            Raises:
                ValueError: If no importance was calculated for ``target_name``.
            """
            monitor.start(f'plot_importance_{target_name}')
            try:
                if target_name not in self.feature_importances:
                    raise ValueError(f"No importance calculated for {target_name}")
                importance_dict = self.feature_importances[target_name]
                # Stored dict is already sorted, so the first top_n entries are the top features
                features = list(importance_dict.keys())[:top_n]
                importance = list(importance_dict.values())[:top_n]
                # Create plot
                plt.figure(figsize=(12, 8))
                sns.barplot(x=importance, y=features)
                plt.title(f'Top {top_n} Feature Importance for {target_name}')
                plt.xlabel('Importance')
                plt.ylabel('Feature')
                plt.tight_layout()
                return plt.gcf()
            finally:
                monitor.end()
                
    # Smoke test for feature engineering; runs only when __name__ == "__main__".
    # The names assigned here are module-level.
    if __name__ == "__main__":
        # Build the config and attach the active session to it
        config = EnvironmentConfig()
        session = get_active_session()
        config.set_session(session)
        # Load sample data
        print("Testing feature engineering...")
        df = load_and_prepare_data(config, sample_size=SAMPLE_SIZE)
        # Prepare numeric data
        numeric_df, targets = prepare_numeric_data(df)
        # Impute and scale; save_imputed=False skips the save_results call
        normalized_df, scaler, imputers, strategies = impute_and_normalize(
            numeric_df,
            config,
            save_imputed=False
        )
        # Test feature importance and selection
        analyzer = FeatureImportanceAnalyzer()
        # Test for each target and feature sample size
        for target_name, target_values in targets.items():
            print(f"\n{'='*80}")
            print(f"Testing features for {target_name}")
            print(f"{'='*80}")
            # Calculate importance
            importance_dict = analyzer.calculate_importance(
                normalized_df,
                target_values,
                target_name
            )
            # Test different feature sample ratios
            for sample_name, ratio in FEATURE_SAMPLES.items():
                print(f"\n{'-'*40}")
                print(f"Testing {sample_name} feature sample")
                print(f"{'-'*40}")
                # Expected feature count for this ratio: at least 1, at most all
                total_features = normalized_df.shape[1]
                n_features = max(1, min(
                    int(total_features * ratio),
                    total_features
                ))
                # Select features; because k is passed explicitly, select_features
                # uses k and does not derive the count from ratio
                sampled_df = analyzer.select_features(
                    normalized_df,
                    importance_dict,
                    ratio,
                    k=n_features
                )
                # Verify results
                print(f"\nVerification:")
                print(f"Expected {n_features} features")
                print(f"Got {len(sampled_df.columns)} features")
                print(f"Feature ratio: {len(sampled_df.columns)/total_features:.1%}")
            # Plot importance
            analyzer.plot_importance(target_name)
            plt.show()
finally:
    monitor.end()
dep_checker.register_cell('feature_engineering')

In [None]:
class ModelTrainer:
    def __init__(self, n_splits: int = N_SPLITS, chunk_size: int = CHUNK_SIZE):
        """Set up the candidate classifiers and the cross-validation splitter.

        Parameters:
            n_splits: Number of stratified folds used for cross-validation.
            chunk_size: Rows per batch when a model is trained through
                ``partial_fit`` in ``train_and_evaluate``.
        """
        self.n_splits = n_splits
        self.chunk_size = chunk_size

        # Settings shared by every active estimator.
        shared = {'random_state': 42, 'class_weight': 'balanced'}

        # Tuning notes for logistic regression: C=1 would regularize less;
        # lbfgs often converges faster than saga; n_jobs=-1 was removed.
        logistic_entry = (
            MODEL_NAMES[0],
            LogisticRegression(solver='lbfgs', max_iter=5000,
                               penalty='l2', C=0.1, **shared),
        )
        forest_entry = (
            MODEL_NAMES[1],
            RandomForestClassifier(n_jobs=-1, n_estimators=100,
                                   max_depth=None, **shared),
        )
        # Disabled: MODEL_NAMES[2] -> SVC(random_state=42, probability=True,
        #     class_weight='balanced', kernel='rbf')
        tree_entry = (
            MODEL_NAMES[3],
            DecisionTreeClassifier(max_depth=None, **shared),
        )
        # Disabled: MODEL_NAMES[4] -> MLPClassifier(random_state=42,
        #     hidden_layer_sizes=(50,)  [alternative: (50, 25), tapering layers],
        #     max_iter=3000, early_stopping=True, validation_fraction=0.2,
        #     n_iter_no_change=50, learning_rate_init=0.01,
        #     learning_rate='adaptive', alpha=0.01  [try 0.001, 0.01, 0.1, 1.0],
        #     solver='adam', tol=1e-3)
        # NOTE: sklearn's MLPClassifier has no dropout; a TensorFlow/PyTorch
        # network would be needed if dropout is required.

        self.models = [logistic_entry, forest_entry, tree_entry]

        # Shuffled stratified folds with a fixed seed for reproducible splits.
        self.cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def train_and_evaluate(self, X: pd.DataFrame, y: pd.Series,
                           feature_sample: str = 'full') -> dict:
        """Cross-validate every configured model on ``X`` / ``y``.

        Every fold trains a fresh clone of the configured estimator. The
        estimators in ``self.models`` are therefore never fitted in place, and
        the model object returned for one call is not overwritten by a later
        call (or a later fold) that would otherwise refit the same instance.

        Parameters:
            X: Feature matrix; every column is used as a feature.
            y: Target aligned positionally with ``X``. Column 1 of
                ``predict_proba`` is used as the positive-class score, so a
                binary target is presumably expected.
            feature_sample: Label stored under 'FEATURE_SAMPLE' in the results.
                A model named 'MLPClassifier' is only trained when this is
                'third'.

        Returns:
            Dict keyed by model name; each value contains
                'CV_SCORES': one metrics dict per fold, or {'ERROR': message}
                    for a fold that raised,
                'FEATURE_IMPORTANCE': feature -> importance from the last
                    successful fold (models exposing ``feature_importances_``),
                'MODEL': estimator fitted in the last successful fold,
                'AVG_METRICS': per-metric MEAN/STD/CI_LOW/CI_HIGH across folds,
                    or None if any fold failed or none ran,
                'BEST_THRESHOLD': F1-optimal probability threshold found in
                    the last successful fold,
                'FEATURE_SAMPLE': the ``feature_sample`` argument,
                'FEATURES': list of column names of ``X``.
        """
        # Imported locally so this method does not rely on the notebook's
        # import cell exposing ``clone``.
        from sklearn.base import clone

        monitor.start('train_and_evaluate')
        try:
            # No additional feature selection happens here: all columns of X
            # are used (selection is done by the caller).
            selected_features = X.columns.tolist()
            X_selected = pd.DataFrame(X, columns=selected_features, index=X.index)

            results = {name: {
                'CV_SCORES': [],
                'FEATURE_IMPORTANCE': None,
                'MODEL': None,
                'AVG_METRICS': None,
                'BEST_THRESHOLD': None,
                'FEATURE_SAMPLE': feature_sample,
                'FEATURES': selected_features
            } for name, _ in self.models}

            # Loop invariants: full label set for partial_fit and the
            # candidate decision thresholds.
            classes = np.unique(y)
            thresholds = np.linspace(0, 1, 100)

            # Cross-validation splits
            for fold, (train_idx, val_idx) in enumerate(self.cv.split(X_selected, y)):
                X_train = X_selected.iloc[train_idx][selected_features].copy()
                X_val = X_selected.iloc[val_idx][selected_features].copy()
                y_train = y.iloc[train_idx]
                y_val = y.iloc[val_idx]

                for name, template in self.models:
                    try:
                        if name == 'MLPClassifier' and feature_sample != 'third':
                            print(f"Skipping MLPClassifier for {feature_sample}")
                            continue

                        # Fresh, unfitted copy with identical hyper-parameters.
                        model = clone(template)

                        if name != 'MLPClassifier' and hasattr(model, 'partial_fit'):
                            # Incremental learners are trained in chunks.
                            for start_idx in range(0, len(X_train), self.chunk_size):
                                end_idx = min(start_idx + self.chunk_size, len(X_train))
                                model.partial_fit(
                                    X_train.iloc[start_idx:end_idx],
                                    y_train.iloc[start_idx:end_idx],
                                    classes=classes,
                                )
                        else:
                            # MLPClassifier uses fit() so early stopping works;
                            # every other model only supports fit().
                            model.fit(X_train, y_train)

                        y_pred = model.predict(X_val)
                        y_prob = model.predict_proba(X_val)[:, 1]

                        metrics = self.calculate_metrics(y_val, y_pred, y_prob)
                        results[name]['CV_SCORES'].append(metrics)

                        # F1-optimal threshold on this fold. zero_division=0
                        # keeps thresholds that predict no positives quiet
                        # (their F1 is 0 either way).
                        f1_scores = [
                            f1_score(y_val, y_prob >= t, zero_division=0)
                            for t in thresholds
                        ]
                        results[name]['BEST_THRESHOLD'] = thresholds[np.argmax(f1_scores)]

                        # Store feature importances for tree models
                        if hasattr(model, 'feature_importances_'):
                            results[name]['FEATURE_IMPORTANCE'] = dict(
                                zip(selected_features, model.feature_importances_)
                            )

                        results[name]['MODEL'] = model

                    except Exception as e:
                        # Best effort: record the failure and keep evaluating
                        # the remaining models and folds.
                        print(f"Error training {name} in fold {fold + 1}: {e}")
                        results[name]['CV_SCORES'].append({'ERROR': str(e)})

            # Aggregate average metrics across folds (only for models whose
            # folds all succeeded).
            for name in results:
                cv_scores = results[name]['CV_SCORES']
                if cv_scores and all('ERROR' not in score for score in cv_scores):
                    avg_metrics = {}
                    for metric in cv_scores[0].keys():
                        vals = [score[metric] for score in cv_scores]
                        avg_metrics[metric] = {
                            'MEAN': np.mean(vals),
                            'STD': np.std(vals),
                            'CI_LOW': np.percentile(vals, 2.5),
                            'CI_HIGH': np.percentile(vals, 97.5)
                        }
                    results[name]['AVG_METRICS'] = avg_metrics

            return results
        finally:
            monitor.end()

    def predict_with_aligned_features(self, model, X: pd.DataFrame, selected_features: list) -> np.ndarray:
        """
        Run ``model.predict`` on the training-time columns of ``X``, in order.

        Parameters:
            model: Trained model exposing ``predict``
            X: Input DataFrame for prediction
            selected_features: Feature names, in the order used for training

        Returns:
            Whatever ``model.predict`` returns for the aligned frame

        Raises:
            ValueError: If any requested feature is not a column of ``X``
        """
        available = X.columns
        absent = [name for name in selected_features if name not in available]
        if absent:
            raise ValueError(f"Missing features for prediction: {absent}")

        # Column selection also reorders X to match the training layout.
        return model.predict(X.loc[:, selected_features])

    def predict_proba_with_aligned_features(self, model, X: pd.DataFrame, selected_features: list) -> np.ndarray:
        """
        Run ``model.predict_proba`` on the training-time columns of ``X``, in order.

        Parameters:
            model: Trained model exposing ``predict_proba``
            X: Input DataFrame for prediction
            selected_features: Feature names, in the order used for training

        Returns:
            Whatever ``model.predict_proba`` returns for the aligned frame

        Raises:
            ValueError: If any requested feature is not a column of ``X``
        """
        available = X.columns
        absent = [name for name in selected_features if name not in available]
        if absent:
            raise ValueError(f"Missing features for prediction: {absent}")

        # Column selection also reorders X to match the training layout.
        return model.predict_proba(X.loc[:, selected_features])

    def select_features(self, X: pd.DataFrame, y: pd.Series, k: int = 20) -> tuple:
        """
        Select top k features based on ANOVA F-value (f_classif).

        Parameters:
            X: Candidate feature matrix
            y: Target passed to the scoring function
            k: Number of features to keep; capped at the number of columns
               in ``X`` so narrow frames do not trip SelectKBest's k check

        Returns:
            Tuple containing:
                - DataFrame of selected features (same index as ``X``),
                - List of selected feature names
        """
        # SelectKBest rejects (older sklearn) or warns about (newer sklearn)
        # k larger than the number of features, so clamp it up front.
        if isinstance(k, int):
            k = min(k, X.shape[1])

        selector = SelectKBest(score_func=f_classif, k=k)
        X_selected = selector.fit_transform(X, y)
        selected_features = X.columns[selector.get_support()].tolist()
        X_selected_df = pd.DataFrame(X_selected, columns=selected_features, index=X.index)
        return X_selected_df, selected_features

    def calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray,
                          y_prob: np.ndarray) -> dict:
        """
        Score one set of predictions with the standard classification metrics.

        Parameters:
            y_true: Ground-truth labels
            y_pred: Predicted class labels
            y_prob: Predicted score/probability for the positive class

        Returns:
            Dictionary with keys ACCURACY, BALANCED_ACCURACY, PRECISION,
            RECALL, F1, ROC_AUC, MCC, PR_AUC and POS_CLASS_RATIO.
        """
        scores = {
            'ACCURACY': accuracy_score(y_true, y_pred),
            'BALANCED_ACCURACY': balanced_accuracy_score(y_true, y_pred),
            'PRECISION': precision_score(y_true, y_pred, zero_division=0),
            'RECALL': recall_score(y_true, y_pred, zero_division=0),
            'F1': f1_score(y_true, y_pred, zero_division=0),
            'ROC_AUC': roc_auc_score(y_true, y_prob),
            'MCC': matthews_corrcoef(y_true, y_pred),
        }

        # Area under the precision-recall curve (recall on the x axis).
        pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_prob)
        scores['PR_AUC'] = auc(pr_recall, pr_precision)

        # Share of samples labelled 1, recorded for context.
        scores['POS_CLASS_RATIO'] = np.mean(y_true == 1)

        return scores

    def plot_cv_results(self, results: dict, show_bar_plots: bool = False,
                        show_box_plots: bool = True, target_name: str = None):
        """Plot cross-validation results comparing models across feature sets.

        Parameters:
            results: Mapping of model name to either one result dict (as
                produced by ``train_and_evaluate``) or a list of such dicts,
                one per feature set.
            show_bar_plots: Accepted for interface compatibility; not used.
            show_box_plots: When False nothing is drawn and None is returned.
            target_name: Optional target label appended to subplot titles.

        Returns:
            The matplotlib figure with one box plot per metric, or None when
            there is nothing to plot or plotting failed (the error is printed).
        """
        monitor.start('plot_cv_results')
        try:
            if not results:
                print("No results to plot")
                return None
            if not show_box_plots:
                return None

            # Define metrics to plot
            metrics = ['ACCURACY', 'BALANCED_ACCURACY', 'PRECISION', 'RECALL',
                       'F1', 'ROC_AUC', 'PR_AUC', 'MCC']

            model_names = list(results.keys())
            print(f"\nProcessing plots for target: {target_name}")
            print(f"Number of models: {len(model_names)}")

            # Debug the results structure
            print("\nResults structure:")
            for model_name, model_results in results.items():
                print(f"\nModel: {model_name}")
                print(f"Type: {type(model_results)}")
                if isinstance(model_results, dict):
                    print(f"Keys: {model_results.keys()}")

            plot_data = self._collect_plot_rows(results, metrics)
            if not plot_data:
                return None

            df = pd.DataFrame(plot_data)

            # Debug the plot data
            print("\nUnique Feature Sets:", df['Feature Set'].unique())
            print("Unique Models:", df['Model'].unique())

            # Known feature sets keep their fixed order; any other label is
            # appended so its rows are not turned into NaN and dropped.
            known_order = ['third', 'half', 'full']
            extra_sets = [fs for fs in df['Feature Set'].unique()
                          if pd.notna(fs) and fs not in known_order]
            df['Feature Set'] = pd.Categorical(df['Feature Set'],
                                               categories=known_order + extra_sets,
                                               ordered=True)

            # One colour per model; cycle through the base colours so every
            # model gets an entry even when there are more than five.
            base_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
            palette = {name: base_colors[i % len(base_colors)]
                       for i, name in enumerate(model_names)}

            # Set figure size based on number of metrics
            fig = plt.figure(figsize=(15, 5*len(metrics)))

            for i, metric in enumerate(metrics, 1):
                if metric not in df.columns:
                    continue
                ax = plt.subplot(len(metrics), 1, i)
                sns.boxplot(data=df,
                            x='Feature Set',
                            y=metric,
                            hue='Model',
                            ax=ax,
                            palette=palette)
                title = f'Model Performance: {metric}'
                if target_name:
                    title += f' ({target_name})'
                ax.set_title(title)
                ax.set_xlabel('Feature Set Size')
                ax.set_ylabel(metric)
                ax.grid(True, linestyle='--', alpha=0.7)
                ax.legend(title='Model', loc='upper right')

            plt.tight_layout()
            return fig
        except Exception as e:
            # Plotting is best effort: report the problem and return None.
            print(f"Error plotting CV results: {str(e)}")
            traceback.print_exc()
            return None
        finally:
            monitor.end()

    @staticmethod
    def _collect_plot_rows(results: dict, metrics: list) -> list:
        """Flatten per-fold CV scores into one row per model, feature set and fold."""
        rows = []
        for model_name, model_results in results.items():
            print(f"\nProcessing model: {model_name}")
            # A model maps either to a single result dict or to a list of
            # result dicts; handle both through one code path.
            if isinstance(model_results, dict):
                candidates = [model_results]
            elif isinstance(model_results, list):
                candidates = model_results
            else:
                candidates = []

            for result in candidates:
                if not isinstance(result, dict) or 'CV_SCORES' not in result:
                    continue
                feature_set = result.get('FEATURE_SAMPLE', 'unknown')
                print(f"Processing feature set: {feature_set}")
                for score_dict in result['CV_SCORES']:
                    # Skip folds that failed or are malformed.
                    if not isinstance(score_dict, dict) or 'ERROR' in score_dict:
                        continue
                    rows.append({
                        'Model': model_name,
                        'Feature Set': feature_set,
                        **{k: v for k, v in score_dict.items() if k in metrics}
                    })
        return rows

In [None]:
# Cell: storage_utils
monitor.start('storage_utils')
try:
    class ModelStorageManager:
        """Serialize models and preprocessing artefacts and store them via SnowflakeManager."""

        def __init__(self, config: EnvironmentConfig):
            """Keep the configuration and create the Snowflake manager used for all I/O."""
            self.config = config
            self.sf_manager = SnowflakeManager(config)

        def serialize_object(self, obj: Any) -> str:
            """Serialize ``obj`` with joblib (compression level 3) and return it as base64 text."""
            buffer = io.BytesIO()
            joblib.dump(obj, buffer, compress=3)  # Compression level 3 (good balance)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        def deserialize_object(self, serialized_str: str) -> Any:
            """Inverse of ``serialize_object``: decode base64 text and load it with joblib.

            SECURITY NOTE: joblib/pickle loading executes code embedded in the
            payload; only use this on data written by this pipeline.
            """
            buffer = io.BytesIO(base64.b64decode(serialized_str))
            # NOTE(review): joblib.load presumably also reads the uncompressed
            # pickle payloads written by serialize_objectOLD - confirm before
            # relying on it for legacy rows.
            return joblib.load(buffer)

        def serialize_objectOLD(self, obj: Any) -> str:
            """Legacy serializer: plain pickle encoded as base64 text."""
            return base64.b64encode(pickle.dumps(obj)).decode('utf-8')

        def save_model(self, model: Any, model_name: str,
                       target: str, feature_sample: str,
                       selected_features: List[str],
                       metrics: Dict[str, Any],
                       scaler: Any = None,
                       imputers: Dict[str, Any] = None) -> None:
            """Append one trained model and its metadata to the 'models' table.

            Parameters:
                model: Fitted estimator to store.
                model_name: Value written to the MODEL column.
                target: Value written to the TARGET column.
                feature_sample: Value written to the FEATURE_SAMPLE column.
                selected_features: Exact, ordered feature list the model was
                    trained on.
                metrics: Metrics dict stored alongside the model.
                scaler: Optional scaler stored with the model.
                imputers: Optional imputers stored with the model.
            """
            monitor.start('save_model')
            try:
                # Model, feature list, metrics and preprocessing objects are
                # all stored as serialized text columns.
                model_data = pd.DataFrame([{
                    'MODEL': model_name,
                    'TARGET': target,
                    'FEATURE_SAMPLE': feature_sample,
                    'N_FEATURES': len(selected_features),
                    'MODEL_OBJECT': self.serialize_object(model),
                    'SELECTED_FEATURES': self.serialize_object(selected_features),  # Exact feature list
                    'METRICS': self.serialize_object(metrics),
                    'SCALER': self.serialize_object(scaler),
                    'IMPUTERS': self.serialize_object(imputers),
                    'CREATED_AT': datetime.now()
                }])

                self.sf_manager.save_results(
                    model_data,
                    base_name='models',
                    mode='append'
                )

            finally:
                monitor.end()

        def save_imputation_config(self,
                                   imputers: Dict[str, SimpleImputer],
                                   strategies: Dict[str, List[str]],
                                   scaler: StandardScaler,
                                   feature_stats: Dict[str, Dict[str, float]]) -> None:
            """Overwrite the 'IMPUTED' table with the current imputation setup.

            Parameters:
                imputers: Fitted imputers.
                strategies: Imputation strategies.
                scaler: Fitted scaler.
                feature_stats: Per-feature statistics (callers may pass None).
            """
            monitor.start('save_imputation_config')
            try:
                imputation_data = pd.DataFrame([{
                    'IMPUTERS': self.serialize_object(imputers),
                    'STRATEGIES': self.serialize_object(strategies),
                    'SCALER': self.serialize_object(scaler),
                    'FEATURE_STATS': self.serialize_object(feature_stats),
                    'CREATED_AT': datetime.now()
                }])

                self.sf_manager.save_results(
                    imputation_data,
                    base_name='IMPUTED',
                    mode='overwrite'
                )

            finally:
                monitor.end()

        def load_latest_model(self, model_name: str, target: str,
                              feature_sample: str) -> Tuple[Any, List[str], Dict[str, Any]]:
            """Load the most recently stored model for the given key.

            Parameters:
                model_name: MODEL column value to match.
                target: TARGET column value to match.
                feature_sample: FEATURE_SAMPLE column value to match.

            Returns:
                ``(model, selected_features, metrics)`` for the newest matching
                row, or ``(None, None, None)`` when no row matches.
            """
            monitor.start('load_latest_model')
            try:
                # NOTE(review): values are interpolated straight into the SQL
                # text; callers must pass trusted identifiers only.
                query = f"""
                SELECT MODEL_OBJECT, SELECTED_FEATURES, METRICS
                FROM {TABLE_NAMES['models']}
                WHERE MODEL = '{model_name}'
                AND TARGET = '{target}'
                AND FEATURE_SAMPLE = '{feature_sample}'
                ORDER BY CREATED_AT DESC
                LIMIT 1
                """

                result = self.sf_manager.execute_with_retry(
                    'load_model',
                    lambda: self.sf_manager.session.sql(query).collect()
                )

                if not result:
                    return None, None, None

                row = result[0]
                # Rows are written with serialize_object (joblib, compressed),
                # so they must be read back with its inverse rather than with
                # pickle.loads.
                model = self.deserialize_object(row['MODEL_OBJECT'])
                features = self.deserialize_object(row['SELECTED_FEATURES'])
                metrics = self.deserialize_object(row['METRICS'])

                return model, features, metrics

            finally:
                monitor.end()

finally:
    monitor.end()
dep_checker.register_cell('storage_utils')

In [None]:
# Cell: model_evaluator_class
monitor.start('model_evaluator_class')
try:
    print("Available classes:", [name for name in dir() if name.startswith('Model')])
    
    class ModelEvaluator:
        def __init__(self, sample_size: int = SAMPLE_SIZE,
                     n_splits: int = N_SPLITS, chunk_size: int = CHUNK_SIZE):
            """Store the run settings and build the analyzer and trainer helpers.

            Parameters:
                sample_size: Passed to ``load_and_prepare_data`` when the
                    evaluation runs.
                n_splits: Forwarded to ``ModelTrainer``.
                chunk_size: Forwarded to ``ModelTrainer``.
            """
            self.sample_size, self.n_splits, self.chunk_size = (
                sample_size, n_splits, chunk_size
            )
            self.feature_analyzer = FeatureImportanceAnalyzer()
            self.trainer = ModelTrainer(chunk_size=chunk_size, n_splits=n_splits)
            # Filled by run_full_evaluation.
            self.results = {}

        def run_full_evaluation(self, config: EnvironmentConfig) -> Dict[str, Any]:
            """Run the complete evaluation pipeline and persist its outputs.

            Steps: load and prepare the data, impute and normalize it, save the
            imputation configuration, then for every feature sample and every
            target compute feature importance, select features, cross-validate
            all models, and save feature importances, averaged metrics and the
            trained models through the storage manager.

            Parameters:
                config: Environment configuration passed to the data loading,
                    imputation and storage helpers.

            Returns:
                Nested dict ``{sample_name: {target_name: {...}}}`` whose inner
                dict holds 'FEATURE_IMPORTANCE', 'MODEL_PERFORMANCE',
                'SELECTED_FEATURES' and 'DATA_STATS'. The same object is stored
                on ``self.results``.
            """
            monitor.start('run_full_evaluation')
            try:
                # Generate timestamp once at the start
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                storage_manager = ModelStorageManager(config)

                # Load and prepare initial data
                print("Loading data...")
                df = load_and_prepare_data(config, self.sample_size)
                numeric_df, targets = prepare_numeric_data(df)

                # Impute and normalize full dataset
                print("\nImputing and normalizing...")
                normalized_df, scaler, imputers, strategies = impute_and_normalize(
                    numeric_df,
                    config,
                    save_imputed=True,
                    timestamp=timestamp
                )

                # Save imputation configuration
                storage_manager.save_imputation_config(
                    imputers=imputers,
                    strategies=strategies,
                    scaler=scaler,
                    feature_stats=None
                )

                print("\nStarting feature sample processing...")
                all_results = {}

                # Evaluate with different feature samples
                for sample_name, sample_ratio in FEATURE_SAMPLES.items():
                    monitor.add_context(sample_name)
                    try:
                        print(f"\nProcessing {sample_name} feature sample "
                              f"({sample_ratio:.1%} of features)")

                        sample_results = {}

                        for target_name, target_values in targets.items():
                            print(f"\nEvaluating {target_name}")

                            # Calculate feature importance (once per target
                            # and sample).
                            importance_dict = self.feature_analyzer.calculate_importance(
                                normalized_df.copy(),
                                target_values,
                                target_name
                            )

                            # Save feature importance data
                            feature_importance_data = [
                                {
                                    'FEATURE_NAME': feature,
                                    'IMPORTANCE': importance,
                                    'TARGET': target_name,
                                    'SAMPLE_TYPE': sample_name,
                                    'CREATED_AT': datetime.now()
                                }
                                for feature, importance in importance_dict.items()
                            ]
                            if feature_importance_data:  # Only save if we have data
                                feature_df = pd.DataFrame(feature_importance_data)
                                storage_manager.sf_manager.save_results(
                                    feature_df,
                                    base_name='features',  # This will use TABLE_NAMES['features']
                                    mode='append'
                                )
                                print(f"Saved feature importance data for {target_name} - {sample_name}")

                            # Calculate number of features to select based on
                            # ratio: at least 1, at most everything available.
                            total_features = normalized_df.shape[1]
                            n_features = max(1, min(
                                int(total_features * sample_ratio),  # Requested number
                                total_features  # Total available
                            ))
                            print(f"Available features: {total_features}")
                            print(f"Requested features based on ratio {sample_ratio}: {int(total_features * sample_ratio)}")
                            print(f"Final number of features to select: {n_features}")

                            # Select features for this sample ratio
                            sampled_df = self.feature_analyzer.select_features(
                                normalized_df.copy(),
                                importance_dict,
                                sample_ratio,
                                k=n_features  # Pass calculated number of features
                            )
                            print(f"Selected {len(sampled_df.columns)} features for {sample_name}")

                            # Train and evaluate models
                            model_results = self.trainer.train_and_evaluate(
                                X=sampled_df,
                                y=target_values,
                                feature_sample=sample_name
                            )

                            # Save results to Snowflake for each model that
                            # has averaged metrics.
                            for model_name, model_info in model_results.items():
                                metrics = model_info.get('AVG_METRICS')
                                if metrics is None:
                                    continue
                                result_row = pd.DataFrame([{
                                    'TARGET': target_name,
                                    'MODEL': model_name,
                                    'FEATURE_SAMPLE': sample_name,
                                    'N_FEATURES': len(sampled_df.columns),
                                    'FEATURE_RATIO': sample_ratio,
                                    'SAMPLE_SIZE': len(df),
                                    'ACCURACY_MEAN': metrics['ACCURACY']['MEAN'],
                                    'ACCURACY_STD': metrics['ACCURACY']['STD'],
                                    'PRECISION_MEAN': metrics['PRECISION']['MEAN'],
                                    'PRECISION_STD': metrics['PRECISION']['STD'],
                                    'RECALL_MEAN': metrics['RECALL']['MEAN'],
                                    'RECALL_STD': metrics['RECALL']['STD'],
                                    'F1_MEAN': metrics['F1']['MEAN'],
                                    'F1_STD': metrics['F1']['STD'],
                                    'ROC_AUC_MEAN': metrics['ROC_AUC']['MEAN'],
                                    'ROC_AUC_STD': metrics['ROC_AUC']['STD'],
                                    'TIMESTAMP': datetime.now()
                                }])

                                # Save to Snowflake
                                storage_manager.sf_manager.save_results(
                                    result_row,
                                    base_name='MODEL_RESULTS',
                                    mode='append'
                                )
                                print(f"Saved results for {model_name} - {target_name} - {sample_name}")

                            # Store results in nested structure
                            sample_results[target_name] = {
                                'FEATURE_IMPORTANCE': importance_dict,
                                'MODEL_PERFORMANCE': model_results,
                                'SELECTED_FEATURES': sampled_df.columns.tolist(),
                                'DATA_STATS': {
                                    'SAMPLE_SIZE': len(df),
                                    'N_FEATURES': len(sampled_df.columns),
                                    'FEATURE_RATIO': sample_ratio,
                                    'CLASS_DISTRIBUTION': target_values.value_counts(normalize=True).to_dict()
                                }
                            }

                            # Save models with components
                            for model_name, model_info in model_results.items():
                                # Training errors are recorded inside
                                # CV_SCORES, not as a top-level key, so look
                                # there to decide whether the model failed.
                                fold_failed = any(
                                    isinstance(score, dict) and 'ERROR' in score
                                    for score in model_info.get('CV_SCORES', [])
                                )
                                if ('ERROR' in model_info or fold_failed
                                        or model_info.get('MODEL') is None):
                                    continue
                                storage_manager.save_model(
                                    model=model_info['MODEL'],
                                    model_name=model_name,
                                    target=target_name,
                                    feature_sample=sample_name,
                                    # Features actually used by the model.
                                    selected_features=model_info['FEATURES'],
                                    metrics=model_info['AVG_METRICS'],
                                    scaler=scaler,
                                    imputers=imputers
                                )

                        all_results[sample_name] = sample_results
                        print(f"Added results for {sample_name}")

                    finally:
                        monitor.remove_context()

                print("\nVerifying results structure:")
                for sample_name in all_results:
                    print(f"- {sample_name}")
                    for target_name in all_results[sample_name]:
                        print(f"  - {target_name}: {len(all_results[sample_name][target_name]['MODEL_PERFORMANCE'])} models")

                self.results = all_results
                return all_results

            finally:
                monitor.end()
#        def generate_report(self, show_bar_plots: bool = False, show_box_plots: bool = True, 
        def generate_report(self, show_bar_plots: bool = False, show_box_plots: bool = True,
                           save_plots: bool = False) -> None:
            """Print and plot the evaluation results stored in ``self.results``.

            For every target, the per-model results of all feature samples are
            merged into one mapping, handed to ``self.trainer.plot_cv_results``
            and followed by the ``DATA_STATS`` of each feature sample.

            Args:
                show_bar_plots: Forwarded to ``plot_cv_results``.
                show_box_plots: Forwarded to ``plot_cv_results``.
                save_plots: When true, every figure returned by
                    ``plot_cv_results`` is also written to
                    ``cv_results_<target>.png`` in the working directory.

            Returns:
                None. All output is printed or plotted.
            """
            monitor.start('generate_report')
            try:
                if not self.results:
                    print("No results to report. Please run run_full_evaluation first.")
                    return

                print("\nResults structure at start of generate_report:")
                for sample_name in self.results:
                    print(f"Found sample: {sample_name}")

                # Collect the targets of every feature sample (first-seen order)
                # instead of trusting the first sample to list all of them.
                target_names = list(dict.fromkeys(
                    target_name
                    for sample_results in self.results.values()
                    for target_name in sample_results
                ))

                # For each target, combine results across all feature sets
                for target_name in target_names:
                    print(f"\nResults for {target_name}:")
                    print("-"*80)

                    # model name -> list of per-sample result dicts
                    combined_performance = {}
                    for sample_name, sample_results in self.results.items():
                        if target_name not in sample_results:
                            print(f"\nSkipping sample: {sample_name} (no results for {target_name})")
                            continue

                        print(f"\nProcessing sample: {sample_name}")
                        model_results = sample_results[target_name]['MODEL_PERFORMANCE']

                        for model_name, model_info in model_results.items():
                            print(f"  Processing model: {model_name}")
                            # Tag the stored result with the feature set it came from
                            model_info['FEATURE_SAMPLE'] = sample_name
                            # setdefault: a model that only exists in a later
                            # sample must not raise KeyError.
                            combined_performance.setdefault(model_name, []).append(model_info)

                    # Generate plots
                    print(f"\nGenerating plots for {target_name}")
                    fig = self.trainer.plot_cv_results(
                        combined_performance,
                        show_bar_plots=show_bar_plots,
                        show_box_plots=show_box_plots,
                        target_name=target_name
                    )

                    if fig is not None:
                        if save_plots:
                            # Save before showing: some backends release the
                            # figure once it has been shown.
                            plot_filename = f"cv_results_{target_name}.png"
                            fig.savefig(plot_filename, bbox_inches='tight')
                            print(f"Saved plot to {plot_filename}")
                        plt.figure(fig.number)
                        plt.show()
                        plt.close(fig)

                    # Print detailed statistics for each feature set
                    for sample_name, sample_results in self.results.items():
                        if target_name not in sample_results:
                            continue
                        print(f"\n{sample_name.upper()} FEATURE SET:")
                        stats = sample_results[target_name]['DATA_STATS']
                        print(f"Sample Size: {stats['SAMPLE_SIZE']:,}")
                        print(f"Number of Features: {stats['N_FEATURES']}")
                        print(f"Feature Ratio: {stats['FEATURE_RATIO']:.1%}")

                # Release any figure that is still open
                plt.close('all')

            finally:
                monitor.end()


finally:
    monitor.end()
dep_checker.register_cell('model_evaluator_class')

In [None]:
# Cell: model_evaluation_test
monitor.start('model_evaluation_test')
try:
    # End-to-end check of the evaluator; only runs when this code is executed
    # as the main module.
    if __name__ == "__main__":
        config = EnvironmentConfig()
        session = get_active_session()
        config.set_session(session)

        evaluator = ModelEvaluator(sample_size=SAMPLE_SIZE)
        results = evaluator.run_full_evaluation(config)

        # Debug aid: list the feature sets that came back before reporting
        print("\nFeature sets in results:")
        for sample_name in results.keys():
            print(f"- {sample_name}")

        evaluator.generate_report(show_bar_plots=False, show_box_plots=True)
except Exception as e:
    # Failures are reported with their traceback and not re-raised
    print(f"Error in evaluation: {e!s}")
    traceback.print_exc()
finally:
    # Always close the monitor entry opened above
    monitor.end()
dep_checker.register_cell('model_evaluation_test')

In [None]:
# Cell: monitor_output
"""
Dependencies:
- notebook_monitor
- final_evaluation_reporting
Provides: Execution summary and performance metrics
"""
def show_monitoring_results():
    """Print a categorised timing report for the monitored operations.

    Checks the 'monitor_output' dependencies, reads the summary frame from
    ``monitor.show_summary()`` (its 'cell_name', 'duration_seconds' and
    'status' columns are used) and prints a legend, per-category totals, a
    per-operation table, slow-operation alerts, the success rate and the
    total execution time.

    Returns:
        pandas.DataFrame: the display-formatted per-operation table, or an
        empty DataFrame when anything fails (the error and its traceback are
        printed instead of raised).
    """
    try:
        if not dep_checker.check_dependencies('monitor_output'):
            raise RuntimeError("Dependencies not met for monitor output")

        # Lift pandas' display limits so the printed tables are not truncated.
        # NOTE: these options are set globally and remain set afterwards.
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', None)
        summary_df = monitor.show_summary()

        # Define categories and their icons
        CATEGORY_INDICATORS = {
            'Model Training': '📊',
            'Data Preparation': '🔧',
            'Feature Analysis': '📈',
            'Visualization': '📉',
            'Data Loading': '💾',
            'Other': '📌'
        }

        # Define time thresholds (seconds)
        TIME_THRESHOLDS = {
            'critical': 180,  # 3 minutes - 🔴
            'warning': 60,    # 1 minute - 🟡
            'normal': 0       # Under 1 minute - 🟢
        }

        # Keyword -> category, checked in this order; first match wins.
        CATEGORY_KEYWORDS = (
            ('train_and_evaluate', 'Model Training'),
            ('impute', 'Data Preparation'),
            ('feature_importance', 'Feature Analysis'),
            ('plot', 'Visualization'),
            ('load', 'Data Loading'),
        )

        def categorize_operation(cell_name):
            """Map an operation name to a category, with its sample-size context if any."""
            # A name part such as 'third' or 'full' marks the sample-size context
            size_contexts = {'third', 'half', 'full', 'test'}
            parts = cell_name.split('_')
            context = next((p for p in parts if p.lower() in size_contexts), '')

            base_cat = next(
                (category for keyword, category in CATEGORY_KEYWORDS if keyword in cell_name),
                'Other'
            )

            # Add context if it exists
            if context:
                return f"{base_cat} ({context})"
            return base_cat

        def category_icon(category):
            """Return the icon of a category label, ignoring a ' (context)' suffix."""
            # Strip only the context suffix: cutting at the first space would
            # turn 'Model Training' into 'Model' and always miss the lookup.
            return CATEGORY_INDICATORS.get(category.split(' (')[0], '📌')

        def get_time_indicator(duration):
            """Return the alert symbol for a duration given in seconds."""
            if duration >= TIME_THRESHOLDS['critical']:
                return '🔴'
            elif duration >= TIME_THRESHOLDS['warning']:
                return '🟡'
            return '🟢'

        # Process data
        performance_df = summary_df.sort_values('duration_seconds', ascending=False)
        performance_df['category'] = performance_df['cell_name'].apply(categorize_operation)

        # Keep the numeric durations: every total, ordering and alert below is
        # computed from them rather than from the formatted display strings.
        seconds = performance_df['duration_seconds'].copy()
        total_time = seconds.sum()

        # Calculate percentage of total time (all zero when nothing was timed,
        # instead of dividing by zero)
        if total_time != 0:
            share = seconds / total_time * 100
        else:
            share = seconds * 0.0

        # Per-category totals, ordered numerically before they are formatted
        by_category = performance_df.groupby('category')
        category_stats = pd.DataFrame({
            'Operation Count': by_category['cell_name'].count(),
            'Total Time': by_category['duration_seconds'].sum(),
        }).sort_values('Total Time', ascending=False)
        category_stats['Total Time'] = category_stats['Total Time'].apply(
            lambda x: f"{x:>6.2f}s"
        )
        category_stats.index = [f"{category_icon(cat)} {cat}"
                                for cat in category_stats.index]

        # Format percentage and duration for display
        performance_df['percentage_of_total'] = share.round(2).apply(
            lambda x: f"{x:>5.1f}%"
        )
        performance_df['duration_seconds'] = seconds.apply(
            lambda x: f"{x:>6.2f}s"
        )

        # Print legend
        print("\nLegend:")
        print("=" * 80)
        print("Categories:", " | ".join(f"{icon} {cat}" for cat, icon in CATEGORY_INDICATORS.items()))
        print("Timing:", "🔴 >3m | 🟡 >1m | 🟢 <1m")

        # Show category summaries
        print("\nExecution Summary by Category:")
        print("-" * 80)
        print(category_stats)

        # Show detailed performance
        print("\nDetailed Operation Performance:")
        print("-" * 120)  # Wider to accommodate longer category names

        # Format DataFrame for display
        formatted_df = performance_df.copy()
        formatted_df['time_alert'] = seconds.apply(get_time_indicator)
        formatted_df['category'] = formatted_df['category'].apply(
            lambda x: f"{category_icon(x)} {x:<25}"
        )
        formatted_df['cell_name'] = formatted_df['cell_name'].apply(lambda x: f"{x:<40}")
        formatted_df['status'] = formatted_df['status'].apply(lambda x: f"{x:<10}")

        display_cols = ['time_alert', 'category', 'cell_name', 'duration_seconds',
                       'percentage_of_total', 'status']
        print(formatted_df[display_cols].to_string(index=False))

        # Performance alerts
        print("\nPerformance Alerts:")
        print("-" * 80)
        alert_sections = (
            ('🔴', "\n🔴 Critical Operations (>3m):"),
            ('🟡', "\n🟡 Warning Operations (>1m):"),
        )
        for symbol, heading in alert_sections:
            flagged = formatted_df[formatted_df['time_alert'] == symbol]
            if flagged.empty:
                continue
            print(heading)
            for _, row in flagged.iterrows():
                print(f"   • {row['category']}: {row['cell_name']} ({row['duration_seconds']})")

        # Overall success rate
        success_rate = (performance_df['status'] == 'completed').mean() * 100
        print(f"\nOverall Success Rate: {success_rate:.1f}%")

        # Total execution time, from the unrounded durations
        print(f"Total Execution Time: {total_time:.2f}s ({total_time/60:.2f}m)")

        return formatted_df

    except Exception as e:
        # Reporting must not break the notebook: print the failure and hand
        # back an empty frame.
        print(f"Error in monitoring output: {type(e).__name__} - {str(e)}")
        print(traceback.format_exc())
        return pd.DataFrame()

# Execute monitoring output. The returned table is kept in monitor_df; it is
# an empty DataFrame when show_monitoring_results hit an error, because that
# function prints the failure instead of raising.
monitor_df = show_monitoring_results()
# Register this cell under the name 'monitor_output' with dep_checker. This
# line runs whether or not the report above succeeded.
dep_checker.register_cell('monitor_output')