<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE3-MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enhanced Feature Engineering MLP - RESUMABLE VERSION
# Added comprehensive checkpoint/resume functionality

import numpy as np
import pandas as pd
import os
import pickle
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.regularizers import l1_l2
import warnings
warnings.filterwarnings('ignore')

class ResumableEnhancedFeatureEngineeringMLP:
    """
    MLP with explicit feature engineering and full resume capability.

    The pipeline runs as an ordered sequence of steps (load data -> engineer
    features -> select features -> split/scale -> train -> evaluate).  Every
    step writes its artefacts below ``<output_dir>/enhanced_mlp/`` and records
    its completion in ``pipeline_state.json`` so that a later run can skip it.

    Whenever a step is (re)computed, every step after it is marked as pending
    again, so artefacts derived from older inputs are never silently reused.

    NOTE(security): checkpoints are read back with ``pickle``; only point this
    class at checkpoint directories you trust.
    """

    # Canonical step order; "downstream" always means "later in this tuple".
    _PIPELINE_STEPS = (
        'data_loaded',
        'features_enhanced',
        'features_selected',
        'data_split',
        'data_scaled',
        'model_trained',
        'model_evaluated',
        'pipeline_complete',
    )

    def __init__(self, output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
        """Create the output/checkpoint directories and load any saved state.

        Args:
            output_dir: Directory holding the input ``.npy`` files; all
                pipeline artefacts are written to ``enhanced_mlp/`` below it.
        """
        self.output_dir = output_dir
        self.y_scaler = StandardScaler()
        self.x_scaler = StandardScaler()
        self.model = None

        # os.path.join(..., '') keeps the trailing separator that the path
        # f-strings below rely on, whether or not output_dir ends with one.
        self.enhanced_dir = os.path.join(output_dir, 'enhanced_mlp', '')
        os.makedirs(self.enhanced_dir, exist_ok=True)

        self.checkpoint_dir = os.path.join(self.enhanced_dir, 'checkpoints', '')
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        self.checkpoint_files = {
            'enhanced_features': f"{self.checkpoint_dir}enhanced_features.npy",
            'feature_names': f"{self.checkpoint_dir}feature_names.pkl",
            'selected_features': f"{self.checkpoint_dir}selected_features.npy",
            'selected_indices': f"{self.checkpoint_dir}selected_indices.npy",
            'selector': f"{self.checkpoint_dir}selector.pkl",
            'split_data': f"{self.checkpoint_dir}split_data.npz",
            'scalers': f"{self.checkpoint_dir}scalers.pkl",
            'model_weights': f"{self.checkpoint_dir}training_checkpoint.weights.h5",
            'training_state': f"{self.checkpoint_dir}training_state.json",
            'final_model': f"{self.enhanced_dir}enhanced_feature_model.keras",
            'results': f"{self.checkpoint_dir}results.pkl"
        }

        self.pipeline_state = self.load_pipeline_state()
        print(f"📁 Enhanced MLP directory: {self.enhanced_dir}")
        print(f"💾 Checkpoint directory: {self.checkpoint_dir}")

    # ------------------------------------------------------------------
    # State bookkeeping
    # ------------------------------------------------------------------
    @classmethod
    def _default_pipeline_state(cls):
        """Return a fresh state dict with every step marked as pending."""
        return {key: False for key in cls._PIPELINE_STEPS}

    def _pipeline_state_path(self):
        """Return the path of the JSON file that stores the step flags."""
        return f"{self.checkpoint_dir}pipeline_state.json"

    def _best_weights_path(self):
        """Return the path used by ModelCheckpoint for the best weights."""
        return f"{self.enhanced_dir}best_enhanced_model.weights.h5"

    def _discard_training_checkpoint(self):
        """Delete the mid-training state/weights so training starts fresh."""
        for name in ('training_state', 'model_weights'):
            path = self.checkpoint_files[name]
            if os.path.exists(path):
                os.remove(path)

    def _complete_steps(self, *step_keys):
        """Mark steps as done, reset everything downstream, and persist.

        If training is among the invalidated steps, the mid-training checkpoint
        is removed as well: its weights were fitted to inputs that no longer
        exist, so resuming from it would be wrong.
        """
        for key in step_keys:
            self.pipeline_state[key] = True

        last = max(self._PIPELINE_STEPS.index(key) for key in step_keys)
        downstream = self._PIPELINE_STEPS[last + 1:]
        for key in downstream:
            self.pipeline_state[key] = False

        if 'model_trained' in downstream:
            self._discard_training_checkpoint()

        self.save_pipeline_state()

    def load_pipeline_state(self):
        """Load or initialize pipeline state.

        Returns:
            dict mapping every step key to a bool.  Keys missing from the file
            default to ``False``; an unreadable file yields a fresh state.
        """
        state = self._default_pipeline_state()
        state_file = self._pipeline_state_path()

        if not os.path.exists(state_file):
            print("🆕 Initialized new pipeline state")
            return state

        try:
            with open(state_file, 'r') as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError("pipeline state is not a JSON object")
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError: a truncated file must not
            # make the object impossible to construct (and hence to clear).
            print(f"⚠️  Could not read pipeline state ({exc}), starting from a fresh state")
            return state

        state.update(loaded)
        print(f"📋 Loaded pipeline state: {state}")
        return state

    def save_pipeline_state(self):
        """Save current pipeline state (written atomically)."""
        state_file = self._pipeline_state_path()
        tmp_file = f"{state_file}.tmp"
        with open(tmp_file, 'w') as f:
            json.dump(self.pipeline_state, f, indent=2)
        # Replace in one step so a crash never leaves a half-written file.
        os.replace(tmp_file, state_file)
        print("💾 Pipeline state saved")

    # ------------------------------------------------------------------
    # Step 1: data loading
    # ------------------------------------------------------------------
    def load_original_data(self, data_filename, windows_filename):
        """Load original data for feature engineering.

        Args:
            data_filename: Feature array file name, relative to ``output_dir``.
            windows_filename: Target array file name, relative to ``output_dir``.

        Returns:
            ``(x, y)`` arrays, or ``(None, None)`` when the step was already
            done (later steps then read the copies stored in the checkpoint
            directory).

        Raises:
            ValueError: If ``x`` and ``y`` have a different number of rows.
        """
        if self.pipeline_state['data_loaded']:
            print("📊 Data already loaded, skipping...")
            return None, None

        print("📊 Loading original data for feature engineering...")

        x = np.load(os.path.join(self.output_dir, data_filename))
        y = np.load(os.path.join(self.output_dir, windows_filename))

        if x.shape[0] != y.shape[0]:
            raise ValueError(
                f"Row count mismatch: X has {x.shape[0]} rows, y has {y.shape[0]}"
            )

        # Keep private copies so a resumed run does not need the originals.
        np.save(f"{self.checkpoint_dir}original_x.npy", x)
        np.save(f"{self.checkpoint_dir}original_y.npy", y)

        self._complete_steps('data_loaded')

        print(f"✅ Original data loaded and saved: X={x.shape}, y={y.shape}")
        return x, y

    # ------------------------------------------------------------------
    # Step 2: feature engineering
    # ------------------------------------------------------------------
    @staticmethod
    def _build_interaction_features(x, top_features=15, max_interactions=50):
        """Build pairwise product/ratio columns over the leading columns.

        Every pair consumes two slots of ``max_interactions`` even when its
        ratio is skipped, so at most ``max_interactions // 2`` pairs are used.
        """
        columns, names = [], []
        limit = min(top_features, x.shape[1])
        budget_used = 0

        for i in range(limit):
            for j in range(i + 1, limit):
                if budget_used >= max_interactions:
                    return columns, names

                columns.append(x[:, i] * x[:, j])
                names.append(f"feat_{i}_x_feat_{j}")

                # Only divide when the denominator is bounded away from zero.
                # No epsilon is added: it would move small negative values
                # *towards* zero and blow the ratio up.
                if np.all(np.abs(x[:, j]) > 1e-8):
                    columns.append(x[:, i] / x[:, j])
                    names.append(f"feat_{i}_div_feat_{j}")

                budget_used += 2

        return columns, names

    @staticmethod
    def _build_polynomial_features(x, n_squared=10, n_cubed=5):
        """Build squares of the first columns and cubes of the very first."""
        columns, names = [], []
        for i in range(min(n_squared, x.shape[1])):
            columns.append(x[:, i] ** 2)
            names.append(f"feat_{i}_squared")
            if i < n_cubed:
                columns.append(x[:, i] ** 3)
                names.append(f"feat_{i}_cubed")
        return columns, names

    @staticmethod
    def _build_statistical_features(x):
        """Build row-wise mean/std over non-overlapping column windows."""
        columns, names = [], []
        n_cols = x.shape[1]
        if n_cols < 5:
            return columns, names

        for window in (3, 5):
            # Windows start within the first 20 columns and never run past
            # the last column.
            for start in range(0, min(20, n_cols - window + 1), window):
                end = start + window
                block = x[:, start:end]
                columns.extend([np.mean(block, axis=1), np.std(block, axis=1)])
                names.extend([f"mean_{start}_{end}", f"std_{start}_{end}"])
        return columns, names

    def create_enhanced_features(self, x=None, force_recreate=False):
        """Create enhanced features with checkpoint support.

        The "top" features used below are simply the leading columns of ``x``
        (presumably pre-ordered by importance upstream - TODO confirm).

        Args:
            x: 2-D feature array; loaded from the checkpoint copy when ``None``.
            force_recreate: Recompute even if a checkpoint exists.

        Returns:
            ``(X_enhanced, feature_names)`` with one name per output column.

        Raises:
            ValueError: If ``x`` is not a 2-D array with at least one column.
        """
        if self.pipeline_state['features_enhanced'] and not force_recreate:
            print("🔧 Enhanced features already created, loading from checkpoint...")
            X_enhanced = np.load(self.checkpoint_files['enhanced_features'])
            with open(self.checkpoint_files['feature_names'], 'rb') as f:
                feature_names = pickle.load(f)
            print(f"✅ Loaded enhanced features: {X_enhanced.shape}")
            return X_enhanced, feature_names

        # Load original data if not provided
        if x is None:
            x = np.load(f"{self.checkpoint_dir}original_x.npy")
        x = np.asarray(x)
        if x.ndim != 2 or x.shape[1] == 0:
            raise ValueError(
                f"Expected a 2-D array with at least one column, got shape {x.shape}"
            )

        print(f"\n🔧 Creating enhanced features based on correlation analysis...")
        print(f"   Starting with {x.shape[1]} original features")

        # 1. Original features (used as-is; scaling happens after the split)
        enhanced_features = [x]
        feature_names = [f"orig_{i}" for i in range(x.shape[1])]
        print(f"   ✅ Original features: {x.shape[1]}")

        # 2. Best interaction found (Feature 7 × Feature 8)
        if x.shape[1] > 8:
            enhanced_features.append((x[:, 7] * x[:, 8]).reshape(-1, 1))
            feature_names.append("feat_7_x_feat_8")
            print(f"   ✅ Best interaction (7×8): 1 feature")

        # 3. Top feature interactions (systematic)
        print("   Creating systematic feature interactions...")
        interaction_features, interaction_names = self._build_interaction_features(x)
        if interaction_features:
            enhanced_features.append(np.column_stack(interaction_features))
            feature_names.extend(interaction_names)
            print(f"   ✅ Feature interactions: {len(interaction_features)}")

        # 4. Polynomial features (degree 2, plus selective degree 3)
        print("   Creating polynomial features...")
        poly_features, poly_names = self._build_polynomial_features(x)
        if poly_features:
            enhanced_features.append(np.column_stack(poly_features))
            feature_names.extend(poly_names)
            print(f"   ✅ Polynomial features: {len(poly_features)}")

        # 5. Statistical features
        print("   Creating statistical features...")
        stat_features, stat_names = self._build_statistical_features(x)
        if stat_features:
            enhanced_features.append(np.column_stack(stat_features))
            feature_names.extend(stat_names)
            print(f"   ✅ Statistical features: {len(stat_features)}")

        # 6. Combine all features
        X_enhanced = np.hstack(enhanced_features)

        # Save enhanced features
        np.save(self.checkpoint_files['enhanced_features'], X_enhanced)
        with open(self.checkpoint_files['feature_names'], 'wb') as f:
            pickle.dump(feature_names, f)

        self._complete_steps('features_enhanced')

        print(f"\n📈 Feature Engineering Summary:")
        print(f"   Original features: {x.shape[1]}")
        print(f"   Enhanced features: {X_enhanced.shape[1]}")
        print(f"   Enhancement factor: {X_enhanced.shape[1] / x.shape[1]:.1f}x")
        print(f"💾 Enhanced features saved to checkpoint")

        return X_enhanced, feature_names

    # ------------------------------------------------------------------
    # Step 3: feature selection
    # ------------------------------------------------------------------
    def select_best_features(self, X_enhanced=None, y=None, max_features=200, force_reselect=False):
        """Select best features with checkpoint support.

        NOTE(review): selection is fitted on the full data set before the
        train/val/test split, so test rows influence which features are kept.
        Fitting on the training split only would avoid that leakage but
        requires reordering the pipeline steps.

        Args:
            X_enhanced: Engineered features; loaded from checkpoint when ``None``.
            y: Targets; loaded from the checkpoint copy when ``None``.
            max_features: Upper bound on the number of features to keep.
            force_reselect: Recompute even if a checkpoint exists.

        Returns:
            ``(X_selected, selected_indices, selector)``.
        """
        if self.pipeline_state['features_selected'] and not force_reselect:
            print("🎯 Feature selection already done, loading from checkpoint...")
            X_selected = np.load(self.checkpoint_files['selected_features'])
            selected_indices = np.load(self.checkpoint_files['selected_indices'])
            with open(self.checkpoint_files['selector'], 'rb') as f:
                selector = pickle.load(f)
            print(f"✅ Loaded selected features: {X_selected.shape}")
            return X_selected, selected_indices, selector

        # Load data if not provided
        if X_enhanced is None:
            X_enhanced = np.load(self.checkpoint_files['enhanced_features'])
        if y is None:
            y = np.load(f"{self.checkpoint_dir}original_y.npy")

        print(f"🎯 Selecting best {max_features} features from {X_enhanced.shape[1]}...")

        from functools import partial

        # Mutual information captures non-linear dependence.  Its estimator is
        # randomised, so pin the seed (same one as the split) to make a
        # recomputed selection reproducible.
        score_func = partial(mutual_info_regression, random_state=42)
        selector = SelectKBest(score_func, k=min(max_features, X_enhanced.shape[1]))
        X_selected = selector.fit_transform(X_enhanced, y)

        # Get selected feature indices
        selected_indices = selector.get_support(indices=True)
        selected_scores = selector.scores_[selected_indices]

        # Save feature selection
        np.save(self.checkpoint_files['selected_features'], X_selected)
        np.save(self.checkpoint_files['selected_indices'], selected_indices)
        with open(self.checkpoint_files['selector'], 'wb') as f:
            pickle.dump(selector, f)

        self._complete_steps('features_selected')

        print(f"   Selected {X_selected.shape[1]} features")
        print(f"   Score range: [{np.min(selected_scores):.6f}, {np.max(selected_scores):.6f}]")
        print(f"💾 Feature selection saved to checkpoint")

        return X_selected, selected_indices, selector

    # ------------------------------------------------------------------
    # Step 4: split and scale
    # ------------------------------------------------------------------
    def split_and_scale_data(self, X_selected=None, y=None, force_resplit=False):
        """Split and scale data with checkpoint support.

        The data is split 60/20/20 into train/validation/test (seed 42); both
        scalers are fitted on the training part only.

        Args:
            X_selected: Selected features; loaded from checkpoint when ``None``.
            y: Targets (assumed single-output; they are reshaped to one
                column) - loaded from the checkpoint copy when ``None``.
            force_resplit: Recompute even if a checkpoint exists.

        Returns:
            ``(x_train, x_val, x_test, y_train, y_val, y_test)``, all scaled.
        """
        if self.pipeline_state['data_scaled'] and not force_resplit:
            print("📊 Data already split and scaled, loading from checkpoint...")
            with open(self.checkpoint_files['scalers'], 'rb') as f:
                scalers = pickle.load(f)

            self.x_scaler = scalers['x_scaler']
            self.y_scaler = scalers['y_scaler']

            # The context manager closes the .npz archive once read.
            with np.load(self.checkpoint_files['split_data']) as data:
                return (data['x_train_scaled'], data['x_val_scaled'], data['x_test_scaled'],
                        data['y_train_scaled'], data['y_val_scaled'], data['y_test_scaled'])

        # Load data if not provided
        if X_selected is None:
            X_selected = np.load(self.checkpoint_files['selected_features'])
        if y is None:
            y = np.load(f"{self.checkpoint_dir}original_y.npy")

        print(f"📊 Splitting and scaling data...")

        # 20% test, then 25% of the remaining 80% (= 20% overall) validation.
        x_temp, x_test, y_temp, y_test = train_test_split(
            X_selected, y, test_size=0.2, random_state=42
        )
        x_train, x_val, y_train, y_val = train_test_split(
            x_temp, y_temp, test_size=0.25, random_state=42
        )

        # Scale features and targets
        x_train_scaled = self.x_scaler.fit_transform(x_train)
        x_val_scaled = self.x_scaler.transform(x_val)
        x_test_scaled = self.x_scaler.transform(x_test)

        y_train_scaled = self.y_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
        y_val_scaled = self.y_scaler.transform(y_val.reshape(-1, 1)).flatten()
        y_test_scaled = self.y_scaler.transform(y_test.reshape(-1, 1)).flatten()

        # Save split and scaled data
        np.savez(self.checkpoint_files['split_data'],
                 x_train_scaled=x_train_scaled, x_val_scaled=x_val_scaled, x_test_scaled=x_test_scaled,
                 y_train_scaled=y_train_scaled, y_val_scaled=y_val_scaled, y_test_scaled=y_test_scaled)

        with open(self.checkpoint_files['scalers'], 'wb') as f:
            pickle.dump({'x_scaler': self.x_scaler, 'y_scaler': self.y_scaler}, f)

        self._complete_steps('data_split', 'data_scaled')

        print(f"   Train: {x_train_scaled.shape[0]} samples, {x_train_scaled.shape[1]} features")
        print(f"   Validation: {x_val_scaled.shape[0]} samples")
        print(f"   Test: {x_test_scaled.shape[0]} samples")
        print(f"💾 Split and scaled data saved to checkpoint")

        return (x_train_scaled, x_val_scaled, x_test_scaled,
                y_train_scaled, y_val_scaled, y_test_scaled)

    # ------------------------------------------------------------------
    # Step 5: model
    # ------------------------------------------------------------------
    def build_interaction_focused_mlp(self, input_dim):
        """Build MLP optimized for learning feature interactions.

        Architecture: four regularised Dense+BatchNorm+Dropout blocks
        (4096/2048/1024/512), three Dense+Dropout blocks (256/128/64), a
        32-unit layer and a single linear output.  Compiled with Adam and an
        MAE loss.

        Args:
            input_dim: Number of input features.

        Returns:
            The compiled Keras model.
        """
        print(f"🏗️ Building interaction-focused MLP for {input_dim} features...")

        layers = []

        # Wide, regularised blocks; only the very first layer declares the
        # input size.
        for units, rate in ((4096, 0.4), (2048, 0.4), (1024, 0.3), (512, 0.3)):
            first_layer_kwargs = {} if layers else {'input_dim': input_dim}
            layers.extend([
                Dense(units, activation='relu',
                      kernel_regularizer=l1_l2(0.0001, 0.001),
                      **first_layer_kwargs),
                BatchNormalization(),
                Dropout(rate),
            ])

        # Narrower blocks without weight penalty or batch normalisation.
        for units, rate in ((256, 0.2), (128, 0.2), (64, 0.1)):
            layers.extend([Dense(units, activation='relu'), Dropout(rate)])

        layers.append(Dense(32, activation='relu'))
        layers.append(Dense(1, activation='linear'))

        model = Sequential(layers)

        optimizer = keras.optimizers.Adam(
            learning_rate=0.002,
            clipnorm=1.0,
            beta_1=0.9,
            beta_2=0.999
        )

        model.compile(
            loss='mae',
            optimizer=optimizer,
            metrics=['mse', 'mae']
        )

        print(f"   Model parameters: {model.count_params():,}")
        return model

    def train_enhanced_model(self, force_retrain=False):
        """Train model with resume capability.

        Every 50th epoch the weights and the epoch counter are checkpointed; an
        interrupted run continues from there.  Only the weights are restored
        on resume - optimizer state, learning-rate schedule and early-stopping
        counters start over.

        Args:
            force_retrain: Ignore a finished model and any mid-training
                checkpoint and train from scratch.

        Returns:
            The Keras ``History`` of this run, or ``None`` when an already
            trained model was loaded instead.
        """
        if self.pipeline_state['model_trained'] and not force_retrain:
            print("🚀 Model already trained, loading from checkpoint...")
            if os.path.exists(self.checkpoint_files['final_model']):
                self.model = load_model(self.checkpoint_files['final_model'])
                print("✅ Loaded trained model")
                return None
            else:
                print("⚠️  Final model not found, will retrain...")

        # Load scaled data (the archive is closed as soon as it is read)
        with np.load(self.checkpoint_files['split_data']) as data:
            x_train_scaled = data['x_train_scaled']
            x_val_scaled = data['x_val_scaled']
            y_train_scaled = data['y_train_scaled']
            y_val_scaled = data['y_val_scaled']

        print(f"🚀 Training enhanced feature model...")

        # Build model
        input_dim = x_train_scaled.shape[1]
        self.model = self.build_interaction_focused_mlp(input_dim)

        # Check for existing training checkpoint
        initial_epoch = 0
        if force_retrain:
            # A forced retrain must not be resumable into the old run.
            self._discard_training_checkpoint()
        elif os.path.exists(self.checkpoint_files['training_state']):
            try:
                with open(self.checkpoint_files['training_state'], 'r') as f:
                    training_state = json.load(f)
                initial_epoch = int(training_state.get('last_epoch', 0))
                if initial_epoch > 0:
                    self.model.load_weights(self.checkpoint_files['model_weights'])
                    print(f"📋 Resuming training from epoch {initial_epoch}")
            except Exception as exc:
                # Keras raises assorted exception types for unreadable or
                # mismatched weights; any failure here means "no usable
                # checkpoint", never "abort the pipeline".
                print("⚠️  Could not load training checkpoint, starting fresh")
                print(f"   Reason: {exc}")
                initial_epoch = 0
                # Rebuild so a partially applied load cannot linger.
                self.model = self.build_interaction_focused_mlp(input_dim)

        # Custom callback to save training state.  Defined here because its
        # base class comes from keras, which is only needed once training runs.
        class TrainingStateCallback(keras.callbacks.Callback):
            def __init__(self, state_file, weights_file):
                super().__init__()
                self.state_file = state_file
                self.weights_file = weights_file

            def on_epoch_end(self, epoch, logs=None):
                # Save every 50 epochs (0-based epochs 0, 50, 100, ...)
                if epoch % 50 != 0:
                    return
                logs = logs or {}

                # Weights first, state second: the state file must never
                # point at weights that were not written.
                self.model.save_weights(self.weights_file)

                state = {
                    'last_epoch': epoch + 1,
                    'val_loss': float(logs.get('val_loss', 0)),
                    'loss': float(logs.get('loss', 0))
                }
                tmp_file = f"{self.state_file}.tmp"
                with open(tmp_file, 'w') as f:
                    json.dump(state, f)
                os.replace(tmp_file, self.state_file)

        # Enhanced callbacks
        callbacks = [
            ModelCheckpoint(
                self._best_weights_path(),
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True,
                verbose=1
            ),
            EarlyStopping(
                monitor='val_loss',
                patience=150,
                restore_best_weights=True,
                verbose=1,
                min_delta=0.0001
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=50,
                min_lr=1e-7,
                verbose=1
            ),
            TrainingStateCallback(
                self.checkpoint_files['training_state'],
                self.checkpoint_files['model_weights']
            )
        ]

        # Train model
        history = self.model.fit(
            x_train_scaled, y_train_scaled,
            validation_data=(x_val_scaled, y_val_scaled),
            epochs=1500,
            batch_size=64,
            callbacks=callbacks,
            verbose=1,
            initial_epoch=initial_epoch
        )

        # Save final model
        self.model.save(self.checkpoint_files['final_model'])

        # Also invalidates any evaluation results of a previous model.
        self._complete_steps('model_trained')

        print("✅ Enhanced model training complete and saved!")
        return history

    # ------------------------------------------------------------------
    # Step 6: evaluation
    # ------------------------------------------------------------------
    def evaluate_enhanced_model(self, history=None, force_reevaluate=False):
        """Evaluate model with checkpoint support.

        Metrics are computed on the test split in the original target units.

        Args:
            history: Keras ``History`` of the training run; when given, the
                result plots are produced.
            force_reevaluate: Recompute even if saved results exist.

        Returns:
            dict with ``r2``, ``mse``, ``mae``, ``rmse``, the ``acc_*``
            within-tolerance percentages, ``y_true`` and ``y_pred``.
        """
        if self.pipeline_state['model_evaluated'] and not force_reevaluate:
            print("📊 Model already evaluated, loading results...")
            if os.path.exists(self.checkpoint_files['results']):
                with open(self.checkpoint_files['results'], 'rb') as f:
                    results = pickle.load(f)
                print(f"✅ Loaded evaluation results: R²={results['r2']:.6f}")
                return results
            print("⚠️  Saved results not found, will re-evaluate...")

        print("📊 Evaluating enhanced model...")

        # Load model if not in memory
        if self.model is None:
            self.model = load_model(self.checkpoint_files['final_model'])

        # Load scalers and test data
        with open(self.checkpoint_files['scalers'], 'rb') as f:
            scalers = pickle.load(f)
        self.x_scaler = scalers['x_scaler']
        self.y_scaler = scalers['y_scaler']

        with np.load(self.checkpoint_files['split_data']) as data:
            x_test_scaled = data['x_test_scaled']
            y_test_scaled = data['y_test_scaled']

        # Predictions, mapped back to the original target scale
        y_pred_scaled = self.model.predict(x_test_scaled, verbose=0)
        y_pred = self.y_scaler.inverse_transform(y_pred_scaled).flatten()
        y_true = self.y_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

        # Metrics
        r2 = r2_score(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)

        # Accuracy metrics: share of predictions within a tolerance, in %
        abs_error = np.abs(y_true - y_pred)
        acc_05 = np.mean(abs_error <= 0.5) * 100
        acc_1 = np.mean(abs_error <= 1) * 100
        acc_15 = np.mean(abs_error <= 1.5) * 100
        acc_2 = np.mean(abs_error <= 2) * 100

        results = {
            'r2': r2, 'mse': mse, 'mae': mae, 'rmse': rmse,
            'acc_05': acc_05, 'acc_1': acc_1, 'acc_15': acc_15, 'acc_2': acc_2,
            'y_true': y_true, 'y_pred': y_pred
        }

        # Save results
        with open(self.checkpoint_files['results'], 'wb') as f:
            pickle.dump(results, f)

        self._complete_steps('model_evaluated')

        print(f"📈 Enhanced Model Results:")
        print(f"   R²: {r2:.6f}")
        print(f"   MAE: {mae:.4f}")
        print(f"   RMSE: {rmse:.4f}")
        print(f"   Accuracy ±0.5: {acc_05:.1f}%")
        print(f"   Accuracy ±1.0: {acc_1:.1f}%")
        print(f"   Accuracy ±1.5: {acc_15:.1f}%")
        print(f"   Accuracy ±2.0: {acc_2:.1f}%")
        print(f"💾 Results saved to checkpoint")

        # Plot results if history available
        if history is not None:
            self.plot_enhanced_results(y_true, y_pred, history, r2, mae)

        return results

    def plot_enhanced_results(self, y_true, y_pred, history, r2, mae):
        """Plot comprehensive results and save them as ``enhanced_results.png``.

        Args:
            y_true: True targets (original scale).
            y_pred: Predicted targets (original scale).
            history: Keras ``History`` providing the loss/MAE curves.
            r2: R² score shown in the scatter-plot title.
            mae: Accepted for interface compatibility; currently not drawn.
        """
        fig, axes = plt.subplots(2, 4, figsize=(24, 12))

        # Training history - Loss
        ax = axes[0, 0]
        ax.plot(history.history['loss'], label='Training', linewidth=2)
        ax.plot(history.history['val_loss'], label='Validation', linewidth=2)
        ax.set_title('Enhanced Model Loss')
        ax.set_yscale('log')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Training history - MAE
        ax = axes[0, 1]
        ax.plot(history.history['mae'], label='Training', linewidth=2)
        ax.plot(history.history['val_mae'], label='Validation', linewidth=2)
        ax.set_title('Enhanced Model MAE')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Predictions scatter with the identity line as reference
        ax = axes[0, 2]
        ax.scatter(y_true, y_pred, alpha=0.6, s=3, color='darkblue')
        ax.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        ax.set_xlabel('True Values')
        ax.set_ylabel('Predictions')
        ax.set_title(f'Enhanced Predictions\n(R²={r2:.6f})')
        ax.grid(True, alpha=0.3)

        # Residuals
        residuals = y_true - y_pred
        ax = axes[1, 0]
        ax.scatter(y_pred, residuals, alpha=0.6, s=3, color='green')
        ax.axhline(y=0, color='r', linestyle='--')
        ax.set_xlabel('Predictions')
        ax.set_ylabel('Residuals')
        ax.set_title('Enhanced Model Residuals')
        ax.grid(True, alpha=0.3)

        # Hide the grid cells that carry no plot instead of showing empty frames.
        for unused in (axes[0, 3], axes[1, 1], axes[1, 2], axes[1, 3]):
            unused.axis('off')

        plt.tight_layout()
        plt.savefig(f"{self.enhanced_dir}enhanced_results.png", dpi=150, bbox_inches='tight')
        plt.show()
        # Release the figure; repeated runs in one session would otherwise
        # keep accumulating open figures.
        plt.close(fig)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run_enhanced_pipeline(self, data_filename, windows_filename, force_restart=False):
        """Run complete enhanced pipeline with resume capability.

        Steps already recorded as complete are skipped and their artefacts are
        loaded from the checkpoint directory.

        Args:
            data_filename: Feature array file name, relative to ``output_dir``.
            windows_filename: Target array file name, relative to ``output_dir``.
            force_restart: Clear every checkpoint before running.

        Returns:
            dict with the model, results, history, feature names, selected
            indices, scalers and checkpoint directory - or ``None`` if any
            step failed (the traceback is printed).
        """
        print("="*80)
        print("🚀 RESUMABLE ENHANCED FEATURE ENGINEERING MLP PIPELINE")
        print("="*80)

        if force_restart:
            print("🔄 Force restart requested, clearing all checkpoints...")
            self.clear_checkpoints()
            self.pipeline_state = self.load_pipeline_state()

        def announce(number, title, state_key):
            # Status reflects the state *before* the step runs.
            done = self.pipeline_state[state_key]
            print(f"\n📋 Step {number}/7: {title}")
            print(f"   Status: {'✅ Complete' if done else '⏳ Pending'}")

        try:
            # 1. Load original data
            announce(1, "Loading Data", 'data_loaded')
            x, y = self.load_original_data(data_filename, windows_filename)

            # 2. Create enhanced features
            announce(2, "Feature Engineering", 'features_enhanced')
            X_enhanced, feature_names = self.create_enhanced_features(x)

            # 3. Select best features
            announce(3, "Feature Selection", 'features_selected')
            X_selected, selected_indices, selector = self.select_best_features(X_enhanced, y)

            # 4. Split and scale data (training re-reads it from the checkpoint)
            announce(4, "Data Preparation", 'data_scaled')
            self.split_and_scale_data(X_selected, y)

            # 5. Train model
            announce(5, "Model Training", 'model_trained')
            history = self.train_enhanced_model()

            # 6. Evaluate model
            announce(6, "Model Evaluation", 'model_evaluated')
            results = self.evaluate_enhanced_model(history)

            # 7. Mark pipeline complete
            print(f"\n📋 Step 7/7: Pipeline Completion")
            self._complete_steps('pipeline_complete')

            print("\n" + "="*80)
            print("🎉 RESUMABLE ENHANCED FEATURE ENGINEERING COMPLETE!")
            print("="*80)
            print(f"Final R²: {results['r2']:.6f}")
            print(f"Final MAE: {results['mae']:.4f}")
            print(f"Accuracy ±2: {results['acc_2']:.1f}%")
            print(f"Model saved: {self.checkpoint_files['final_model']}")
            print(f"All checkpoints saved in: {self.checkpoint_dir}")
            print("="*80)

            return {
                'model': self.model,
                'results': results,
                'history': history,
                'feature_names': feature_names,
                'selected_indices': selected_indices,
                'x_scaler': self.x_scaler,
                'y_scaler': self.y_scaler,
                'checkpoint_dir': self.checkpoint_dir
            }

        except Exception as e:
            # Top-level boundary: report and return None so the notebook cell
            # finishes; completed steps stay checkpointed for the next run.
            print(f"❌ Enhanced pipeline failed: {e}")
            import traceback
            traceback.print_exc()
            return None

    # ------------------------------------------------------------------
    # Maintenance helpers
    # ------------------------------------------------------------------
    def clear_checkpoints(self):
        """Clear all checkpoint files and reset the in-memory state."""
        print("🗑️ Clearing all checkpoints...")
        for name, filepath in self.checkpoint_files.items():
            if os.path.exists(filepath):
                os.remove(filepath)
                print(f"   Removed: {name}")

        # State file, cached input copies and the best-weights file written
        # during training.
        leftovers = [
            self._pipeline_state_path(),
            f"{self.checkpoint_dir}original_x.npy",
            f"{self.checkpoint_dir}original_y.npy",
            self._best_weights_path(),
        ]
        for filepath in leftovers:
            if os.path.exists(filepath):
                os.remove(filepath)

        # Keep the object consistent with the (now empty) checkpoint folder.
        self.pipeline_state = self._default_pipeline_state()

        print("✅ All checkpoints cleared")

    @staticmethod
    def _format_size(size_bytes):
        """Return a human-readable size suffix such as `` (1.5MB)``."""
        if size_bytes > 1024 * 1024:
            return f" ({size_bytes/(1024*1024):.1f}MB)"
        if size_bytes > 1024:
            return f" ({size_bytes/1024:.1f}KB)"
        return f" ({size_bytes}B)"

    def get_pipeline_status(self):
        """Print the completion state of every step and checkpoint file."""
        print("\n📋 PIPELINE STATUS REPORT")
        print("="*50)

        steps = [
            ('Data Loading', 'data_loaded'),
            ('Feature Engineering', 'features_enhanced'),
            ('Feature Selection', 'features_selected'),
            ('Data Preparation', 'data_scaled'),
            ('Model Training', 'model_trained'),
            ('Model Evaluation', 'model_evaluated'),
            ('Pipeline Complete', 'pipeline_complete')
        ]

        for i, (step_name, state_key) in enumerate(steps, 1):
            status = "✅ Complete" if self.pipeline_state.get(state_key, False) else "⏳ Pending"
            print(f"{i}/7: {step_name:<20} {status}")

        print("="*50)

        # Show checkpoint file status
        print("\n💾 CHECKPOINT FILES STATUS")
        print("="*50)
        for name, filepath in self.checkpoint_files.items():
            if os.path.exists(filepath):
                exists = "✅"
                size = self._format_size(os.path.getsize(filepath))
            else:
                exists = "❌"
                size = ""
            print(f"{exists} {name:<20} {size}")
        print("="*50)

    def resume_from_step(self, step_name):
        """Resume pipeline from a specific step.

        Marks the given step and every later step as pending.  When training
        is among them, the mid-training checkpoint is deleted so the rerun
        does not continue from weights of the previous run.

        Args:
            step_name: One of ``'data'``, ``'features'``, ``'selection'``,
                ``'preparation'``, ``'training'``, ``'evaluation'``.
        """
        step_mapping = {
            'data': 'data_loaded',
            'features': 'features_enhanced',
            'selection': 'features_selected',
            # Starts at the split flag so split and scale are reset together.
            'preparation': 'data_split',
            'training': 'model_trained',
            'evaluation': 'model_evaluated'
        }

        if step_name not in step_mapping:
            print(f"❌ Unknown step: {step_name}")
            print(f"Available steps: {list(step_mapping.keys())}")
            return

        # Reset pipeline state from the specified step onwards, using the
        # canonical order rather than the dict's incidental key order.
        first = self._PIPELINE_STEPS.index(step_mapping[step_name])
        reset_keys = self._PIPELINE_STEPS[first:]
        for key in reset_keys:
            self.pipeline_state[key] = False

        if 'model_trained' in reset_keys:
            self._discard_training_checkpoint()

        self.save_pipeline_state()
        print(f"🔄 Pipeline reset from step: {step_name}")
        self.get_pipeline_status()


# Enhanced interface functions
def run_resumable_enhanced_feature_mlp(data_filename, windows_filename, force_restart=False,
                                       output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
    """
    Run resumable enhanced feature engineering MLP

    Args:
        data_filename: e.g., 'generated-data-OPTIMIZED.npy'
        windows_filename: e.g., 'generated-data-true-window-OPTIMIZED.npy'
        force_restart: If True, clear all checkpoints and start fresh
        output_dir: Directory that holds the input files and receives the
            pipeline output; same default as the other helper functions, so
            all of them can be pointed at the same non-default location.

    Returns:
        Whatever ``run_enhanced_pipeline`` returns for these arguments.
    """
    enhanced_mlp = ResumableEnhancedFeatureEngineeringMLP(output_dir)
    return enhanced_mlp.run_enhanced_pipeline(data_filename, windows_filename, force_restart)

def check_pipeline_status(output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
    """Print the pipeline status report for ``output_dir``.

    Returns:
        The pipeline object that produced the report, for further use.
    """
    pipeline = ResumableEnhancedFeatureEngineeringMLP(output_dir)
    pipeline.get_pipeline_status()
    return pipeline

def clear_all_checkpoints(output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
    """Delete every checkpoint below ``output_dir`` so the next run starts fresh.

    Returns:
        The pipeline object whose checkpoints were cleared.
    """
    pipeline = ResumableEnhancedFeatureEngineeringMLP(output_dir)
    pipeline.clear_checkpoints()
    return pipeline

def resume_from_step(step_name, output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
    """Reset the pipeline stored under ``output_dir`` back to ``step_name``.

    The step name is passed straight to the class method of the same name;
    run the pipeline afterwards to actually redo the work.

    Returns:
        The pipeline object whose state was reset.
    """
    pipeline = ResumableEnhancedFeatureEngineeringMLP(output_dir)
    pipeline.resume_from_step(step_name)
    return pipeline

# Main execution
if __name__ == "__main__":
    print("🚀 Resumable Enhanced Feature Engineering MLP")

    # Input files, resolved relative to the pipeline's output directory.
    # The global names below are kept as-is: later notebook cells may read them.
    data_file = 'generated-data-OPTIMIZED.npy'
    windows_file = 'generated-data-true-window-OPTIMIZED.npy'

    print(f"Running resumable enhanced pipeline on: {data_file}, {windows_file}")

    # Report what has already been completed before doing any work.
    mlp = check_pipeline_status()

    # Completed steps are skipped, so this resumes an interrupted run.
    results = run_resumable_enhanced_feature_mlp(data_file, windows_file)

    if not results:
        print("❌ Enhanced pipeline failed")
    else:
        metrics = results['results']
        print(f"\n🎊 Enhanced model complete!")
        print(f"R² improvement: {metrics['r2']:.6f}")
        print(f"Accuracy ±2: {metrics['acc_2']:.1f}%")
        print(f"Checkpoints saved in: {results['checkpoint_dir']}")

# Usage examples
"""
# 1. Run pipeline (will resume automatically if crashed)
results = run_resumable_enhanced_feature_mlp(
    'generated-data-OPTIMIZED.npy',
    'generated-data-true-window-OPTIMIZED.npy'
)

# 2. Check pipeline status
mlp = check_pipeline_status()

# 3. Force restart from beginning
results = run_resumable_enhanced_feature_mlp(
    'generated-data-OPTIMIZED.npy',
    'generated-data-true-window-OPTIMIZED.npy',
    force_restart=True
)

# 4. Resume from specific step (if you want to rerun from training)
mlp = resume_from_step('training')
results = run_resumable_enhanced_feature_mlp(
    'generated-data-OPTIMIZED.npy',
    'generated-data-true-window-OPTIMIZED.npy'
)

# 5. Clear all checkpoints
mlp = clear_all_checkpoints()
"""

🚀 Resumable Enhanced Feature Engineering MLP
Running resumable enhanced pipeline on: generated-data-OPTIMIZED.npy, generated-data-true-window-OPTIMIZED.npy
🆕 Initialized new pipeline state
📁 Enhanced MLP directory: /content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/enhanced_mlp/
💾 Checkpoint directory: /content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/enhanced_mlp/checkpoints/

📋 PIPELINE STATUS REPORT
1/7: Data Loading         ⏳ Pending
2/7: Feature Engineering  ⏳ Pending
3/7: Feature Selection    ⏳ Pending
4/7: Data Preparation     ⏳ Pending
5/7: Model Training       ⏳ Pending
6/7: Model Evaluation     ⏳ Pending
7/7: Pipeline Complete    ⏳ Pending

💾 CHECKPOINT FILES STATUS
❌ enhanced_features    
❌ feature_names        
❌ selected_features    
❌ selected_indices     
❌ selector             
❌ split_data           
❌ scalers              
❌ model_weights        
❌ training_state       
❌ final_model          
❌ results              
🆕 Initialized new pipeline state
📁 Enhanced MLP 

In [None]:
# Mount Google Drive at /content/drive. The pipeline's default output
# directory lives under /content/drive/MyDrive, so this cell has to be run
# before the pipeline cell in a fresh Colab session.
from google.colab import drive
drive.mount('/content/drive')