<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE3-MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Enhanced Feature Engineering MLP
# Based on correlation analysis showing Feature 7 × Feature 8 interaction

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.regularizers import l1_l2
import warnings
warnings.filterwarnings('ignore')

class EnhancedFeatureEngineeringMLP:
    """
    MLP with explicit feature engineering based on correlation analysis
    """

    def __init__(self, output_dir='/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'):
        """Set up the scalers and the output directory for the enhanced model.

        Args:
            output_dir: Base directory the input ``.npy`` files are read
                from. An ``enhanced_mlp`` sub-directory is created inside it
                (if missing) for checkpoints and the saved model.
        """
        self.output_dir = output_dir
        self.y_scaler = StandardScaler()
        self.x_scaler = StandardScaler()  # For engineered features
        self.model = None

        # Build the path with os.path.join so that an ``output_dir`` passed
        # without a trailing separator still yields a sub-directory instead
        # of a sibling such as ".../OUTPUTenhanced_mlp/". The empty final
        # component keeps the trailing separator, because other methods
        # append file names to ``enhanced_dir`` by plain string formatting.
        self.enhanced_dir = os.path.join(output_dir, "enhanced_mlp", "")
        os.makedirs(self.enhanced_dir, exist_ok=True)

        print(f"📁 Enhanced MLP directory: {self.enhanced_dir}")

    def load_original_data(self, data_filename, windows_filename):
        """Load original data for feature engineering"""
        print("📊 Loading original data for feature engineering...")

        data_path = os.path.join(self.output_dir, data_filename)
        windows_path = os.path.join(self.output_dir, windows_filename)

        x = np.load(data_path)
        y = np.load(windows_path)

        print(f"✅ Original data loaded: X={x.shape}, y={y.shape}")
        return x, y

    def create_enhanced_features(self, x):
        """
        Create enhanced features based on correlation analysis findings
        """
        print("\n🔧 Creating enhanced features based on correlation analysis...")
        print(f"   Starting with {x.shape[1]} original features")

        enhanced_features = []
        feature_names = []

        # 1. Original features (scaled)
        enhanced_features.append(x)
        feature_names.extend([f"orig_{i}" for i in range(x.shape[1])])
        print(f"   ✅ Original features: {x.shape[1]}")

        # 2. Best interaction found (Feature 7 × Feature 8)
        if x.shape[1] > 8:
            best_interaction = x[:, 7] * x[:, 8]
            enhanced_features.append(best_interaction.reshape(-1, 1))
            feature_names.append("feat_7_x_feat_8")
            print(f"   ✅ Best interaction (7×8): 1 feature")

        # 3. Top feature interactions (systematic)
        print("   Creating systematic feature interactions...")
        interaction_features = []
        interaction_names = []

        # Focus on top features that showed some correlation
        top_features = min(15, x.shape[1])  # Use top 15 features
        interaction_count = 0

        for i in range(top_features):
            for j in range(i+1, top_features):
                if interaction_count < 50:  # Limit interactions
                    # Multiplication
                    mult_feat = x[:, i] * x[:, j]
                    interaction_features.append(mult_feat)
                    interaction_names.append(f"feat_{i}_x_feat_{j}")

                    # Division (safe)
                    if np.all(np.abs(x[:, j]) > 1e-8):
                        div_feat = x[:, i] / (x[:, j] + 1e-8)
                        interaction_features.append(div_feat)
                        interaction_names.append(f"feat_{i}_div_feat_{j}")

                    interaction_count += 2

                    if interaction_count >= 50:
                        break
            if interaction_count >= 50:
                break

        if interaction_features:
            interaction_matrix = np.column_stack(interaction_features)
            enhanced_features.append(interaction_matrix)
            feature_names.extend(interaction_names)
            print(f"   ✅ Feature interactions: {len(interaction_features)}")

        # 4. Polynomial features (degree 2) for top features
        print("   Creating polynomial features...")
        poly_features = []
        poly_names = []

        top_poly_features = min(10, x.shape[1])
        for i in range(top_poly_features):
            # Quadratic terms
            quad_feat = x[:, i] ** 2
            poly_features.append(quad_feat)
            poly_names.append(f"feat_{i}_squared")

            # Cubic terms (selective)
            if i < 5:  # Only for top 5
                cube_feat = x[:, i] ** 3
                poly_features.append(cube_feat)
                poly_names.append(f"feat_{i}_cubed")

        if poly_features:
            poly_matrix = np.column_stack(poly_features)
            enhanced_features.append(poly_matrix)
            feature_names.extend(poly_names)
            print(f"   ✅ Polynomial features: {len(poly_features)}")

        # 5. Statistical features (rolling-like operations)
        print("   Creating statistical features...")
        stat_features = []
        stat_names = []

        if x.shape[1] >= 5:
            # Moving averages across features (treating features as sequence)
            for window in [3, 5]:
                if x.shape[1] >= window:
                    for start in range(0, min(20, x.shape[1] - window + 1), window):
                        end = start + window
                        mean_feat = np.mean(x[:, start:end], axis=1)
                        std_feat = np.std(x[:, start:end], axis=1)

                        stat_features.extend([mean_feat, std_feat])
                        stat_names.extend([f"mean_{start}_{end}", f"std_{start}_{end}"])

        if stat_features:
            stat_matrix = np.column_stack(stat_features)
            enhanced_features.append(stat_matrix)
            feature_names.extend(stat_names)
            print(f"   ✅ Statistical features: {len(stat_features)}")

        # 6. Combine all features
        X_enhanced = np.hstack(enhanced_features)

        print(f"\n📈 Feature Engineering Summary:")
        print(f"   Original features: {x.shape[1]}")
        print(f"   Enhanced features: {X_enhanced.shape[1]}")
        print(f"   Enhancement factor: {X_enhanced.shape[1] / x.shape[1]:.1f}x")

        return X_enhanced, feature_names

    def select_best_features(self, X_enhanced, y, max_features=200, random_state=42):
        """Keep the columns that score highest on mutual information with ``y``.

        Args:
            X_enhanced: 2-D feature matrix to select columns from.
            y: Target values aligned with the rows of ``X_enhanced``.
            max_features: Upper bound on the number of columns kept; capped
                at the number of columns available.
            random_state: Seed forwarded to ``mutual_info_regression``. The
                estimator is randomised, so without a seed the selected
                columns can differ from run to run. Pass ``None`` to get the
                unseeded behaviour.

        Returns:
            Tuple ``(X_selected, selected_indices, selector)``: the reduced
            matrix, the indices of the kept columns in ``X_enhanced``, and the
            fitted ``SelectKBest`` instance (usable to ``transform`` other
            data with the same column layout).
        """
        # Local import keeps this block self-contained; partial (unlike a
        # lambda/closure) also leaves the returned selector picklable.
        from functools import partial

        n_available = X_enhanced.shape[1]
        print(f"\n🎯 Selecting best {max_features} features from {n_available}...")

        # Mutual information captures non-linear dependence as well.
        score_func = partial(mutual_info_regression, random_state=random_state)
        selector = SelectKBest(score_func, k=min(max_features, n_available))
        X_selected = selector.fit_transform(X_enhanced, y)

        # Indices and scores of the columns that survived selection.
        selected_indices = selector.get_support(indices=True)
        selected_scores = selector.scores_[selected_indices]

        print(f"   Selected {X_selected.shape[1]} features")
        print(f"   Score range: [{np.min(selected_scores):.6f}, {np.max(selected_scores):.6f}]")

        return X_selected, selected_indices, selector

    def build_interaction_focused_mlp(self, input_dim):
        """Build and compile the wide-then-narrow MLP used for regression.

        Architecture (all hidden layers use ReLU):
        4096 -> 2048 -> 1024 -> 512, each with L1/L2 kernel regularisation,
        batch normalisation and dropout; then 256 -> 128 -> 64 with dropout
        only; then 32; then a single linear output unit.

        Args:
            input_dim: Number of input features per sample.

        Returns:
            The compiled ``Sequential`` model (MAE loss, Adam optimiser,
            ``mse`` and ``mae`` metrics).
        """
        print(f"\n🏗️ Building interaction-focused MLP for {input_dim} features...")

        # An explicit Input layer replaces the ``input_dim=`` keyword on the
        # first Dense layer, which Keras 3 flags as deprecated.
        layers = [keras.Input(shape=(input_dim,))]

        # Wide, regularised layers first (meant to capture interactions),
        # narrowing step by step.
        for units, drop_rate in ((4096, 0.4), (2048, 0.4), (1024, 0.3), (512, 0.3)):
            layers += [
                Dense(units, activation='relu',
                      kernel_regularizer=l1_l2(0.0001, 0.001)),
                BatchNormalization(),
                Dropout(drop_rate),
            ]

        # Narrower layers: dropout only, no regulariser / batch norm.
        for units, drop_rate in ((256, 0.2), (128, 0.2), (64, 0.1)):
            layers += [Dense(units, activation='relu'), Dropout(drop_rate)]

        layers += [
            Dense(32, activation='relu'),
            Dense(1, activation='linear'),  # Output
        ]
        model = Sequential(layers)

        # Optimizer tuned for weak signals
        learning_rate = 0.002  # Higher LR for weak signals
        optimizer = keras.optimizers.Adam(
            learning_rate=learning_rate,
            clipnorm=1.0,
            beta_1=0.9,
            beta_2=0.999
        )

        # MAE loss often better for weak signals. 'mae' is repeated as a
        # metric so that the history carries explicit 'mae'/'val_mae' series.
        model.compile(
            loss='mae',  # Mean Absolute Error
            optimizer=optimizer,
            metrics=['mse', 'mae']
        )

        print(f"   Model parameters: {model.count_params():,}")
        print(f"   Loss function: MAE (better for weak signals)")
        # Printed from the variable so the message cannot drift from the
        # value actually given to the optimiser.
        print(f"   Learning rate: {learning_rate} (higher for weak relationships)")

        return model

    def train_enhanced_model(self, x_train, y_train, x_val, y_val):
        """Build a fresh model, fit it, and keep it on ``self.model``.

        Training runs for up to 1500 epochs with batch size 64 and three
        callbacks, all watching ``val_loss``: best-weights checkpointing into
        ``self.enhanced_dir``, early stopping (patience 150, best weights
        restored) and learning-rate halving on plateau (patience 50).

        Args:
            x_train: Training features; its second dimension sets the model's
                input width.
            y_train: Training targets.
            x_val: Validation features.
            y_val: Validation targets.

        Returns:
            The history object returned by ``model.fit``.
        """
        print(f"\n🚀 Training enhanced feature model...")

        n_inputs = x_train.shape[1]
        self.model = self.build_interaction_focused_mlp(n_inputs)

        # Persist only the weights of the best epoch seen so far.
        checkpoint = ModelCheckpoint(
            f"{self.enhanced_dir}best_enhanced_model.weights.h5",
            monitor='val_loss',
            save_best_only=True,
            save_weights_only=True,
            verbose=1,
        )
        # Generous patience: improvements are expected to be slow.
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=150,
            restore_best_weights=True,
            verbose=1,
            min_delta=0.0001,
        )
        # Halve the learning rate whenever validation loss stalls.
        lr_on_plateau = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=50,
            min_lr=1e-7,
            verbose=1,
        )

        fit_options = dict(
            validation_data=(x_val, y_val),
            epochs=1500,
            batch_size=64,
            callbacks=[checkpoint, early_stop, lr_on_plateau],
            verbose=1,
        )
        history = self.model.fit(x_train, y_train, **fit_options)

        print("✅ Enhanced model training complete!")
        return history

    def evaluate_enhanced_model(self, x_test, y_test, history):
        """Score ``self.model`` on held-out data, in original target units.

        Predictions and targets are both mapped back through
        ``self.y_scaler.inverse_transform`` before any metric is computed.

        Args:
            x_test: Test features passed to ``self.model.predict``.
            y_test: Test targets in the scaled space of ``self.y_scaler``.
            history: Training history, forwarded to ``plot_enhanced_results``.

        Returns:
            Dict with ``r2``, ``mse``, ``mae``, ``rmse``; the percentage of
            samples whose absolute error is within 0.5 / 1 / 1.5 / 2
            (``acc_05``, ``acc_1``, ``acc_15``, ``acc_2``); and the unscaled
            ``y_true`` / ``y_pred`` arrays.
        """
        print("\n📊 Evaluating enhanced model...")

        # Undo target scaling for both predictions and ground truth.
        unscale = self.y_scaler.inverse_transform
        y_pred = unscale(self.model.predict(x_test, verbose=0)).flatten()
        y_true = unscale(y_test.reshape(-1, 1)).flatten()

        # Regression metrics.
        r2 = r2_score(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)

        # Share of samples whose absolute error stays within a tolerance.
        abs_error = np.abs(y_true - y_pred)

        def pct_within(tolerance):
            return np.mean(abs_error <= tolerance) * 100

        acc_05 = pct_within(0.5)
        acc_1 = pct_within(1)
        acc_15 = pct_within(1.5)
        acc_2 = pct_within(2)

        results = {
            'r2': r2,
            'mse': mse,
            'mae': mae,
            'rmse': rmse,
            'acc_05': acc_05,
            'acc_1': acc_1,
            'acc_15': acc_15,
            'acc_2': acc_2,
            'y_true': y_true,
            'y_pred': y_pred,
        }

        print(f"📈 Enhanced Model Results:")
        print(f"   R²: {r2:.6f}")
        print(f"   MAE: {mae:.4f}")
        print(f"   RMSE: {rmse:.4f}")
        print(f"   Accuracy ±0.5: {acc_05:.1f}%")
        print(f"   Accuracy ±1.0: {acc_1:.1f}%")
        print(f"   Accuracy ±1.5: {acc_15:.1f}%")
        print(f"   Accuracy ±2.0: {acc_2:.1f}%")

        # Diagnostic figure.
        self.plot_enhanced_results(y_true, y_pred, history, r2, mae)

        return results

    def plot_enhanced_results(self, y_true, y_pred, history, r2, mae):
        """Draw a 2x4 diagnostic figure for the trained model and show it.

        Panels: loss curves, MAE curves, predicted-vs-true scatter, learning
        rate schedule (or running-best validation loss when no learning rate
        was logged), residuals vs predictions, residual histogram, accuracy
        at several absolute-error tolerances, and a random sample of up to
        300 true/predicted values.

        Args:
            y_true: 1-D array of true targets in original units.
            y_pred: 1-D array of predictions in original units.
            history: Object whose ``history`` dict holds the per-epoch
                ``loss``, ``val_loss``, ``mae`` and ``val_mae`` series.
            r2: R² score shown in the scatter-plot title.
            mae: Accepted for interface compatibility; not used in the figure.
        """
        fig, axes = plt.subplots(2, 4, figsize=(24, 12))
        logs = history.history

        # Training history - Loss
        axes[0,0].plot(logs['loss'], label='Training', linewidth=2)
        axes[0,0].plot(logs['val_loss'], label='Validation', linewidth=2)
        axes[0,0].set_title('Enhanced Model Loss')
        axes[0,0].set_yscale('log')
        axes[0,0].legend()
        axes[0,0].grid(True, alpha=0.3)

        # Training history - MAE
        axes[0,1].plot(logs['mae'], label='Training', linewidth=2)
        axes[0,1].plot(logs['val_mae'], label='Validation', linewidth=2)
        axes[0,1].set_title('Enhanced Model MAE')
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3)

        # Predictions scatter
        axes[0,2].scatter(y_true, y_pred, alpha=0.6, s=3, color='darkblue')
        axes[0,2].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        axes[0,2].set_xlabel('True Values')
        axes[0,2].set_ylabel('Predictions')
        axes[0,2].set_title(f'Enhanced Predictions\n(R²={r2:.6f})')
        axes[0,2].grid(True, alpha=0.3)

        # Learning rate (if available). Older Keras logs it under 'lr',
        # Keras 3 under 'learning_rate'; accept either so the panel is not
        # silently replaced by the fallback on newer versions.
        lr_key = next((key for key in ('lr', 'learning_rate') if key in logs), None)
        if lr_key is not None:
            axes[0,3].plot(logs[lr_key], linewidth=2)
            axes[0,3].set_title('Learning Rate Schedule')
            axes[0,3].set_yscale('log')
        else:
            # Show improvement over epochs
            best_val_loss = np.minimum.accumulate(logs['val_loss'])
            axes[0,3].plot(best_val_loss, linewidth=2, color='green')
            axes[0,3].set_title('Best Validation Loss')
            axes[0,3].set_yscale('log')
        axes[0,3].grid(True, alpha=0.3)

        # Residuals
        residuals = y_true - y_pred
        axes[1,0].scatter(y_pred, residuals, alpha=0.6, s=3, color='green')
        axes[1,0].axhline(y=0, color='r', linestyle='--')
        axes[1,0].set_xlabel('Predictions')
        axes[1,0].set_ylabel('Residuals')
        axes[1,0].set_title('Enhanced Model Residuals')
        axes[1,0].grid(True, alpha=0.3)

        # Error distribution
        axes[1,1].hist(residuals, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        axes[1,1].set_xlabel('Residuals')
        axes[1,1].set_ylabel('Frequency')
        axes[1,1].set_title('Error Distribution')
        axes[1,1].grid(True, alpha=0.3)

        # Accuracy comparison. The tolerances are 0.5 apart, so the default
        # bar width of 0.8 would make neighbouring bars overlap; 0.4 keeps
        # them separate.
        tolerances = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
        abs_residuals = np.abs(residuals)
        accuracies = [np.mean(abs_residuals <= tol) * 100 for tol in tolerances]
        bars = axes[1,2].bar(tolerances, accuracies, width=0.4, alpha=0.7, color='lightgreen')
        axes[1,2].set_xlabel('Error Tolerance')
        axes[1,2].set_ylabel('Accuracy (%)')
        axes[1,2].set_title('Enhanced Accuracy vs Tolerance')
        axes[1,2].grid(True, alpha=0.3)

        # Add accuracy labels
        for bar, acc in zip(bars, accuracies):
            height = bar.get_height()
            axes[1,2].text(bar.get_x() + bar.get_width()/2., height + 1,
                          f'{acc:.1f}%', ha='center', va='bottom')

        # Sample predictions line chart: a sorted random subset of indices
        sample_size = min(300, len(y_true))
        indices = np.sort(np.random.choice(len(y_true), sample_size, replace=False))
        axes[1,3].plot(indices, y_true[indices], 'b-', label='True', linewidth=1.5, alpha=0.8)
        axes[1,3].plot(indices, y_pred[indices], 'r--', label='Enhanced Pred', linewidth=1.5, alpha=0.8)
        axes[1,3].set_xlabel('Sample Index')
        axes[1,3].set_ylabel('VAR Window Size')
        axes[1,3].set_title('Enhanced Sample Predictions')
        axes[1,3].legend()
        axes[1,3].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def run_enhanced_pipeline(self, data_filename, windows_filename):
        """Run the whole workflow: load, engineer, split, select, train, score.

        The data is split 60/20/20 into train/validation/test *before*
        feature selection, and the selector is fitted on the training rows
        only. Fitting it on the full dataset would let validation and test
        targets influence which columns are kept and bias the reported
        metrics.

        Args:
            data_filename: Feature ``.npy`` file name inside ``self.output_dir``.
            windows_filename: Target ``.npy`` file name inside
                ``self.output_dir``.

        Returns:
            On success, a dict with ``model``, ``results``, ``history``,
            ``feature_names``, ``selected_indices``, ``selector``,
            ``x_scaler`` and ``y_scaler``. On any exception the traceback is
            printed and ``None`` is returned.
        """
        print("="*80)
        print("🚀 ENHANCED FEATURE ENGINEERING MLP PIPELINE")
        print("="*80)

        try:
            # 1. Load original data
            x, y = self.load_original_data(data_filename, windows_filename)

            # 2. Create enhanced features (row-wise transforms only, so doing
            #    this before the split does not leak information)
            X_enhanced, feature_names = self.create_enhanced_features(x)

            # 3. Split data: 20% test, then 25% of the remainder as validation
            print(f"\n📊 Splitting enhanced data...")
            x_temp, x_test, y_temp, y_test = train_test_split(
                X_enhanced, y, test_size=0.2, random_state=42
            )
            x_train, x_val, y_train, y_val = train_test_split(
                x_temp, y_temp, test_size=0.25, random_state=42
            )

            # 4. Select best features using the training rows only, then
            #    apply the same column subset to validation and test
            x_train, selected_indices, selector = self.select_best_features(x_train, y_train)
            x_val = selector.transform(x_val)
            x_test = selector.transform(x_test)

            # 5. Scale features and targets (statistics come from train only)
            x_train_scaled = self.x_scaler.fit_transform(x_train)
            x_val_scaled = self.x_scaler.transform(x_val)
            x_test_scaled = self.x_scaler.transform(x_test)

            y_train_scaled = self.y_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
            y_val_scaled = self.y_scaler.transform(y_val.reshape(-1, 1)).flatten()
            y_test_scaled = self.y_scaler.transform(y_test.reshape(-1, 1)).flatten()

            print(f"   Train: {x_train_scaled.shape[0]} samples, {x_train_scaled.shape[1]} features")
            print(f"   Validation: {x_val_scaled.shape[0]} samples")
            print(f"   Test: {x_test_scaled.shape[0]} samples")

            # 6. Train enhanced model
            history = self.train_enhanced_model(
                x_train_scaled, y_train_scaled, x_val_scaled, y_val_scaled
            )

            # 7. Evaluate
            results = self.evaluate_enhanced_model(x_test_scaled, y_test_scaled, history)

            # 8. Save enhanced model
            model_file = f"{self.enhanced_dir}enhanced_feature_model.keras"
            self.model.save(model_file)

            print("\n" + "="*80)
            print("🎉 ENHANCED FEATURE ENGINEERING COMPLETE!")
            print("="*80)
            # NOTE: the two correlation figures below are fixed text, not
            # values computed during this run.
            print(f"Original max correlation: ~0.025")
            print(f"Enhanced interaction correlation: 0.037")
            print(f"Final R²: {results['r2']:.6f}")
            print(f"Final MAE: {results['mae']:.4f}")
            print(f"Accuracy ±2: {results['acc_2']:.1f}%")
            print(f"Model saved: {model_file}")
            print("="*80)

            return {
                'model': self.model,
                'results': results,
                'history': history,
                'feature_names': feature_names,
                'selected_indices': selected_indices,
                # Fitted selector, so new data can be reduced to the same columns.
                'selector': selector,
                'x_scaler': self.x_scaler,
                'y_scaler': self.y_scaler
            }

        except Exception as e:
            # Top-level boundary: report the failure and signal it with None,
            # which callers check for.
            print(f"❌ Enhanced pipeline failed: {e}")
            import traceback
            traceback.print_exc()
            return None

# Simple interface function
def run_enhanced_feature_mlp(data_filename, windows_filename):
    """
    Convenience entry point: build a pipeline object with its default
    output directory and run it end to end.

    Args:
        data_filename: e.g., 'generated-data-OPTIMIZED.npy'
        windows_filename: e.g., 'generated-data-true-window-OPTIMIZED.npy'

    Returns:
        Whatever ``EnhancedFeatureEngineeringMLP.run_enhanced_pipeline``
        returns for these two files.
    """
    return EnhancedFeatureEngineeringMLP().run_enhanced_pipeline(
        data_filename, windows_filename
    )

# Main execution
if __name__ == "__main__":
    print("🚀 Enhanced Feature Engineering MLP")

    # SPECIFY YOUR FILES (names are resolved inside the pipeline's output_dir)
    data_file = 'generated-data-OPTIMIZED.npy'
    windows_file = 'generated-data-true-window-OPTIMIZED.npy'

    print(f"Running enhanced pipeline on: {data_file}, {windows_file}")

    results = run_enhanced_feature_mlp(data_file, windows_file)

    # The pipeline returns None when it failed, a result dict otherwise.
    if results:
        metrics = results['results']
        print(f"\n🎊 Enhanced model complete!")
        # This is the absolute test-set R², not a difference against a
        # baseline, so it is labelled as such.
        print(f"Final R²: {metrics['r2']:.6f}")
        print(f"Accuracy ±2: {metrics['acc_2']:.1f}%")
    else:
        print("❌ Enhanced pipeline failed")

# Usage example
"""
# Run enhanced feature engineering MLP
results = run_enhanced_feature_mlp(
    'generated-data-OPTIMIZED.npy',
    'generated-data-true-window-OPTIMIZED.npy'
)

# Check improvement
if results:
    r2_enhanced = results['results']['r2']
    acc_2_enhanced = results['results']['acc_2']
    print(f"Enhanced R²: {r2_enhanced:.6f}")
    print(f"Enhanced Accuracy ±2: {acc_2_enhanced:.1f}%")
"""

NameError: name 'test_data' is not defined

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the pipeline's default output_dir lives under /content/drive,
# so this cell presumably has to run before the pipeline cell -- confirm order.
from google.colab import drive
drive.mount('/content/drive')