In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:
def generate_financial_timeseries(rows, cols, start_price=100, seed=None):
    """
    Simulate daily prices for several assets and return them as a matrix.

    Every price path is a discretised Geometric Brownian Motion (GBM) with:
    - an annual drift of 8% and a daily time step of 1/252,
    - an annualised volatility drawn uniformly from [15%, 25%]; note that it
      is redrawn for every (time step, asset) pair, not fixed per asset,
    - a 5% chance per step of an extra log-return jump in [-10%, +15%],
    - for every column except the first, a randomly scaled spill-over of the
      previous step's simple return of the asset in the column to its left.

    Parameters
    ----------
    rows : int
        Number of time steps to simulate.
    cols : int
        Number of assets (columns).
    start_price : float, optional
        Not used by the simulation: starting prices are always drawn
        uniformly from [50, 200). The parameter is kept so that existing
        calls keep working.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` drives the
        simulation, which makes the result reproducible and leaves the
        global NumPy random state untouched. When ``None`` (default) the
        global ``np.random`` functions are used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows, cols)`` with prices rounded to 2 decimals.
    """
    # Without a seed the module-level functions are used, so the sequence of
    # random draws is the same as with a plain np.random-based implementation.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Parameters for realistic financial data
    dt = 1/252  # Daily time step (252 trading days per year)
    mu = 0.08   # Annual drift (8% expected return)

    # Initialize the price matrix
    prices = np.zeros((rows, cols))
    if rows == 0:
        # No first row to seed with starting prices; nothing to simulate
        return prices

    # Set different starting prices for each asset (columns)
    prices[0, :] = rng.uniform(50, 200, cols)

    for t in range(1, rows):
        for asset in range(cols):
            # Volatility for this step and asset (15-25% annualized)
            sigma = rng.uniform(0.15, 0.25)

            # Spill-over from the left-hand neighbour's previous return.
            # The factor is drawn even at t == 1 (where no previous return
            # exists yet) so the random stream stays in a fixed order.
            if asset > 0:
                correlation_factor = 0.3 * rng.randn()
                if t > 1:
                    neighbour_return = prices[t-1, asset-1] / prices[t-2, asset-1] - 1
                    correlated_shock = correlation_factor * neighbour_return
                else:
                    correlated_shock = 0
            else:
                correlated_shock = 0

            # Standard normal shock of the GBM diffusion term
            random_shock = rng.randn()

            # Add occasional jump (5% probability)
            if rng.random_sample() < 0.05:
                jump = rng.uniform(-0.1, 0.15)  # Jump between -10% and +15%
            else:
                jump = 0

            # Price evolution: S(t+1) = S(t) * exp((mu - 0.5*sigma^2)*dt + sigma*sqrt(dt)*Z + jump)
            log_return = (mu - 0.5 * sigma**2) * dt + sigma * np.sqrt(dt) * random_shock + jump + correlated_shock

            prices[t, asset] = prices[t-1, asset] * np.exp(log_return)

    # Round to 2 decimal places for realistic price formatting
    return np.round(prices, 2)

rows = 700
col = 200

# Simulate the price matrix: one row per day, one column per asset
financial_data = generate_financial_timeseries(rows, col)

# Row labels are consecutive calendar days starting on 2024-01-01
# (weekends included); column labels are 1-based asset names
dates = [datetime(2024, 1, 1) + timedelta(days=day_offset) for day_offset in range(rows)]
asset_names = [f"ASSET_{i+1:02d}" for i in range(col)]

df = pd.DataFrame(
    data=financial_data,
    index=pd.to_datetime(dates),
    columns=asset_names,
)

# Short textual summary of the generated dataset
print("Financial Time Series Dataset")
print("=" * 50)
print(f"Shape: {df.shape}")
print(f"Date Range: {df.index[0].strftime('%Y-%m-%d')} to {df.index[-1].strftime('%Y-%m-%d')}")
print(f"Price Range: ${df.values.min():.2f} - ${df.values.max():.2f}")
print("\nDataset Preview:")
print(df.head(10))

In [None]:
# SAVE the DATASET on the server
import getpass
from urllib.parse import quote

import psycopg2
from sqlalchemy import create_engine

password = getpass.getpass("Database password: ")
# Percent-encode the password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters and corrupt the connection string.
engine = create_engine(f"postgresql://postgres:{quote(password, safe='')}@localhost:5432/fintech_db")
try:
    # Turn the date index into a regular column called 'date'. Naming the
    # index first works both for the freshly generated frame (unnamed index)
    # and for a frame that was loaded back from the server (index 'date').
    df_to_save = df.rename_axis('date').reset_index()

    # Save to PostgreSQL server
    df_to_save.to_sql('asset_prices', engine, if_exists='replace', index=False)
finally:
    # Release pooled connections even when the write fails
    engine.dispose()
print("✅ Financial time series data saved to server!")
print(f"📊 Saved: {len(df_to_save)} rows × {len(df_to_save.columns)} columns")

In [None]:
# LOAD the DATASET from the server
import getpass
from urllib.parse import quote

import psycopg2
from sqlalchemy import create_engine

password = getpass.getpass("Database password: ")
# Percent-encode the password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters and corrupt the connection string.
engine = create_engine(f"postgresql://postgres:{quote(password, safe='')}@localhost:5432/fintech_db")
try:
    # Load from PostgreSQL server
    df_loaded = pd.read_sql("SELECT * FROM asset_prices", engine)
finally:
    # Release pooled connections even when the query fails
    engine.dispose()

# Convert date column back to datetime and set as index.
# The query has no ORDER BY, so the row order is not guaranteed; sorting
# restores chronological order before the first/last dates are reported.
df_loaded['date'] = pd.to_datetime(df_loaded['date'])
df_loaded = df_loaded.set_index('date').sort_index()

df = df_loaded # rename

print("✅ Financial time series data loaded from server!")
print(f"📊 Loaded: {df.shape}")
print(f"📅 Date range: {df.index[0]} to {df.index[-1]}")
print("\nFirst 5 rows:")
print(df.head())


'''Strategies for Handling Missing Numeric Values'''


In [None]:
# MISSING VALUES FUNCTION = create them
def introduce_missing_values(data, n_missing, seed=123):
    """
    Return a copy of ``data`` in which ``n_missing`` random cells are NaN.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Two-dimensional data. The input itself is never modified.
    n_missing : int
        Number of distinct cells to blank out; must lie between 0 and
        ``rows * cols``.
    seed : int, optional
        Seed for choosing the cells (default 123). A private
        ``np.random.RandomState`` is used, so the selection is the same as
        after ``np.random.seed(seed)`` but the global NumPy random state is
        not reseeded as a side effect.

    Returns
    -------
    tuple
        ``(data_with_missing, missing_positions)``: the copy with NaNs (a
        DataFrame for DataFrame input, otherwise a float ndarray) and the
        list of ``(row, col)`` integer positions that were blanked, in the
        order they were drawn.

    Raises
    ------
    ValueError
        If ``n_missing`` is negative or larger than the number of cells.
    """
    rows, cols = data.shape
    total_positions = rows * cols
    if not 0 <= n_missing <= total_positions:
        raise ValueError(
            f"n_missing must be between 0 and {total_positions}, got {n_missing}"
        )

    # Draw distinct flat cell indices, then split them into (row, col)
    rng = np.random.RandomState(seed)
    missing_indices = rng.choice(total_positions, size=n_missing, replace=False)
    row_idx, col_idx = np.divmod(missing_indices, cols)
    missing_positions = list(zip(row_idx.tolist(), col_idx.tolist()))

    if isinstance(data, pd.DataFrame):
        # One vectorised masking step instead of one .iloc write per cell;
        # DataFrame.mask returns a new frame with NaN where the mask is True.
        blank = np.zeros((rows, cols), dtype=bool)
        blank[row_idx, col_idx] = True
        data_with_missing = data.mask(blank)
    else:
        data_with_missing = data.copy().astype(float)  # Convert to float to allow NaN
        data_with_missing[row_idx, col_idx] = np.nan

    return data_with_missing, missing_positions

# Apply missing values to both DataFrame and numpy array
print("\n" + "="*80)
print("INTRODUCING 1000 RANDOM MISSING VALUES")
print("="*80)

# For DataFrame
df_missing, missing_pos_df = introduce_missing_values(df, n_missing = 1000)

# For numpy array: pass the raw values so the result really is an ndarray.
# Both calls use the default seed, so the same cells are blanked in both.
financial_data_missing, missing_pos_array = introduce_missing_values(df.values, n_missing = 1000)

print(f"\nMissing values introduced at positions (row, col):")
# Only a preview is printed; listing every position would flood the output.
# The loop variables are called row_idx/col_idx so they do not overwrite the
# notebook-level `col` (number of assets) defined in an earlier cell.
preview_limit = 20
for i, (row_idx, col_idx) in enumerate(missing_pos_df[:preview_limit], 1):
    asset_name = df.columns[col_idx] if col_idx < len(df.columns) else f"Asset_{col_idx}"
    date_str = df.index[row_idx].strftime('%Y-%m-%d') if row_idx < len(df.index) else f"Day_{row_idx}"
    print(f"{i:2d}. Position ({row_idx:2d}, {col_idx:2d}) - {asset_name} on {date_str}")
if len(missing_pos_df) > preview_limit:
    print(f"... and {len(missing_pos_df) - preview_limit} more positions")

print(f"\nDataFrame with missing values:")
print("Missing values count:", df_missing.isnull().sum().sum())
#print(df_missing.head(20))

print(f"\nNumPy array with missing values:")
# np.isnan(...).sum() on an ndarray is already the total count (a scalar)
print("Missing values count:", int(np.isnan(financial_data_missing).sum()))
print("Shape:", financial_data_missing.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

class FinancialImputationAnalyzer:
    """
    Mean/median imputation toolkit for financial time series data.

    The analyzer holds a dataset with missing values (rows are time periods,
    columns are assets), fills the gaps with a global or rolling mean/median
    and, when the complete original data is supplied, scores each method on
    the cells that were missing.
    """

    def __init__(self, data_with_missing, original_data=None):
        """
        Initialize the imputation analyzer

        Parameters:
        - data_with_missing: DataFrame or numpy array with missing values;
          a plain list/tuple of rows is converted to a float numpy array
        - original_data: Original complete data for evaluation (optional)
        """
        # Private copies, so the caller's objects are never modified
        self.data_missing = self._copy_input(data_with_missing)
        self.original_data = self._copy_input(original_data) if original_data is not None else None
        # method key -> imputed dataset returned by that method
        self.imputation_results = {}
        # method key -> metrics dict produced by evaluate_imputation_quality
        self.performance_metrics = {}

    @staticmethod
    def _copy_input(data):
        """Return an independent copy of ``data``; lists/tuples become float arrays."""
        if isinstance(data, (list, tuple)):
            return np.array(data, dtype=float)
        return data.copy()

    @staticmethod
    def _longest_true_run(flags):
        """Return the length of the longest run of consecutive truthy values."""
        longest = 0
        current = 0
        for flag in flags:
            current = current + 1 if flag else 0
            if current > longest:
                longest = current
        return longest

    def analyze_missing_pattern(self):
        """
        Print a summary of where values are missing and return the key figures.

        Returns a dict with 'total_missing', 'missing_pct', 'max_consecutive'
        (longest run of missing values within any single column),
        'missing_by_row' and 'missing_by_col' (pandas Series of counts).
        """
        if isinstance(self.data_missing, pd.DataFrame):
            missing_info = self.data_missing.isnull()
        else:
            missing_info = pd.DataFrame(np.isnan(self.data_missing))

        print("🔍 MISSING VALUE PATTERN ANALYSIS")
        print("=" * 50)

        # Overall statistics
        total_missing = missing_info.sum().sum()
        total_cells = missing_info.shape[0] * missing_info.shape[1]
        # An empty dataset has no cells; report 0% instead of dividing by zero
        missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

        print(f"Dataset shape: {missing_info.shape}")
        print(f"Total missing values: {total_missing:,}")
        print(f"Missing percentage: {missing_pct:.2f}%")

        # Missing by time period (rows)
        missing_by_row = missing_info.sum(axis=1)
        print(f"\nMissing values per time period:")
        print(f"  Min: {missing_by_row.min()}")
        print(f"  Max: {missing_by_row.max()}")
        print(f"  Mean: {missing_by_row.mean():.1f}")

        # Missing by asset (columns)
        missing_by_col = missing_info.sum(axis=0)
        print(f"\nMissing values per asset:")
        print(f"  Min: {missing_by_col.min()}")
        print(f"  Max: {missing_by_col.max()}")
        print(f"  Mean: {missing_by_col.mean():.1f}")

        # Financial data specific checks
        print(f"\n📊 FINANCIAL DATA QUALITY CHECKS:")
        print("-" * 40)

        # Longest run of consecutive missing values per column (long gaps are
        # problematic for time series); default=0 covers data without columns
        flags = missing_info.to_numpy()
        consecutive_missing = [
            self._longest_true_run(flags[:, col]) for col in range(flags.shape[1])
        ]
        max_consecutive_overall = max(consecutive_missing, default=0)
        print(f"Maximum consecutive missing values: {max_consecutive_overall}")

        if max_consecutive_overall > 5:
            print("⚠️  WARNING: Long consecutive missing periods detected!")
            print("   Consider using interpolation instead of mean/median")
        else:
            print("✅ Missing pattern suitable for mean/median imputation")

        return {
            'total_missing': total_missing,
            'missing_pct': missing_pct,
            'max_consecutive': max_consecutive_overall,
            'missing_by_row': missing_by_row,
            'missing_by_col': missing_by_col
        }

    def _fill_with_global_statistic(self, label):
        """
        Fill every column's gaps with that column's overall statistic.

        ``label`` is 'mean' or 'median'. Returns the imputed copy; the stored
        data with missing values is left unchanged.
        """
        imputed_data = self.data_missing.copy()

        if isinstance(imputed_data, pd.DataFrame):
            for col in imputed_data.columns:
                fill_value = getattr(imputed_data[col], label)()
                # Assign the filled column back. Calling fillna(inplace=True)
                # on imputed_data[col] is chained assignment and does not
                # update the frame when pandas Copy-on-Write is active.
                imputed_data[col] = imputed_data[col].fillna(fill_value)
                print(f"  {col}: filled with {label} {fill_value:.2f}")
        else:
            nan_statistic = np.nanmean if label == 'mean' else np.nanmedian
            n_cols = imputed_data.shape[1]
            for col in range(n_cols):
                col_data = imputed_data[:, col]
                fill_value = nan_statistic(col_data)
                imputed_data[np.isnan(col_data), col] = fill_value
                if col < 5:  # Print first 5 for brevity
                    print(f"  Asset {col+1:03d}: filled with {label} {fill_value:.2f}")

            if n_cols > 5:
                print(f"  ... and {n_cols-5} more assets")

        return imputed_data

    def _fill_with_rolling_statistic(self, label, window):
        """
        Fill gaps with a centered rolling statistic over ``window`` rows.

        ``label`` is 'mean' or 'median'. Cells whose whole window holds no
        observation stay NaN after the rolling fill and are then filled with
        the statistic of the (partially filled) column. Returns the imputed
        copy; the stored data with missing values is left unchanged.
        """
        imputed_data = self.data_missing.copy()

        if isinstance(imputed_data, pd.DataFrame):
            for col in imputed_data.columns:
                series = imputed_data[col]
                filled_count = series.isnull().sum()
                # min_periods=1: a window needs only one observation
                local_stat = getattr(
                    series.rolling(window=window, center=True, min_periods=1), label
                )()
                series = series.fillna(local_stat)
                # If still missing (edge cases), use the column-wide statistic
                series = series.fillna(getattr(series, label)())
                imputed_data[col] = series
                print(f"  {col}: filled {filled_count} values with rolling {label}")
        else:
            nan_statistic = np.nanmean if label == 'mean' else np.nanmedian
            n_cols = imputed_data.shape[1]
            for col in range(n_cols):
                col_data = imputed_data[:, col]
                mask = np.isnan(col_data)

                # Rolling statistic via pandas for convenience
                local_stat = getattr(
                    pd.Series(col_data).rolling(window=window, center=True, min_periods=1),
                    label,
                )().to_numpy()
                imputed_data[mask, col] = local_stat[mask]

                # Handle remaining NaN with the column-wide statistic
                remaining_nan = np.isnan(imputed_data[:, col])
                if remaining_nan.any():
                    imputed_data[remaining_nan, col] = nan_statistic(imputed_data[:, col])

                if col < 5:
                    print(f"  Asset {col+1:03d}: filled {mask.sum()} values with rolling {label}")

            if n_cols > 5:
                print(f"  ... and {n_cols-5} more assets")

        return imputed_data

    def simple_mean_imputation(self):
        """
        Simple mean imputation - replaces missing values with the column mean.

        Caveat: this shrinks the variance of the series and ignores its
        time-series nature. The result is returned and also stored under
        imputation_results['simple_mean'].
        """
        print("\n📊 SIMPLE MEAN IMPUTATION")
        print("=" * 40)

        imputed_data = self._fill_with_global_statistic('mean')
        self.imputation_results['simple_mean'] = imputed_data
        return imputed_data

    def simple_median_imputation(self):
        """
        Simple median imputation - replaces missing values with the column median.

        More robust to outliers than the mean. The result is returned and
        also stored under imputation_results['simple_median'].
        """
        print("\n📊 SIMPLE MEDIAN IMPUTATION")
        print("=" * 40)

        imputed_data = self._fill_with_global_statistic('median')
        self.imputation_results['simple_median'] = imputed_data
        return imputed_data

    def rolling_mean_imputation(self, window=30):
        """
        Rolling mean imputation - uses the mean of a centered local time window.

        ``window`` is the window length in rows. The result is returned and
        also stored under imputation_results['rolling_mean'].
        """
        print(f"\n📊 ROLLING MEAN IMPUTATION (Window: {window})")
        print("=" * 50)

        imputed_data = self._fill_with_rolling_statistic('mean', window)
        self.imputation_results['rolling_mean'] = imputed_data
        return imputed_data

    def rolling_median_imputation(self, window=30):
        """
        Rolling median imputation - uses the median of a centered local time window.

        ``window`` is the window length in rows. The result is returned and
        also stored under imputation_results['rolling_median'].
        """
        print(f"\n📊 ROLLING MEDIAN IMPUTATION (Window: {window})")
        print("=" * 50)

        imputed_data = self._fill_with_rolling_statistic('median', window)
        self.imputation_results['rolling_median'] = imputed_data
        return imputed_data

    def evaluate_imputation_quality(self, method_name, imputed_data):
        """
        Score ``imputed_data`` against the original data on the missing cells.

        Computes MAE, RMSE, MAPE and the Pearson correlation between original
        and imputed values, using only numpy. The metrics dict is stored under
        performance_metrics[method_name] and returned.

        Returns None when no original data was supplied, when there is nothing
        to evaluate, or when the evaluation fails (the error is printed
        instead of raised so a comparison run can continue).
        """
        if self.original_data is None:
            print(f"\n⚠️  No original data available for {method_name} evaluation")
            return None

        print(f"\n📈 IMPUTATION QUALITY EVALUATION: {method_name.upper()}")
        print("=" * 60)

        try:
            # Force everything to be independent float numpy arrays
            missing_np = np.array(self.data_missing, dtype=float)
            original_np = np.array(self.original_data, dtype=float)
            imputed_np = np.array(imputed_data, dtype=float)

            print(f"  Data shapes - Missing: {missing_np.shape}, Original: {original_np.shape}, Imputed: {imputed_np.shape}")

            if not (missing_np.shape == original_np.shape == imputed_np.shape):
                raise ValueError("missing, original and imputed data must have identical shapes")

            # Cells that were missing before imputation
            mask = np.isnan(missing_np)
            total_missing = np.sum(mask)
            print(f"  Found {total_missing} missing values to evaluate")

            if total_missing == 0:
                print("  No missing values found!")
                return None

            # Boolean indexing visits the cells in row-major order, the same
            # order a nested row/column loop would produce
            orig_vals = original_np[mask]
            imp_vals = imputed_np[mask]

            print(f"  Extracted {len(orig_vals)} value pairs")

            # Simple metrics - MANUAL CALCULATION ONLY
            differences = orig_vals - imp_vals
            abs_differences = np.abs(differences)

            mae = np.mean(abs_differences)
            mse = np.mean(differences**2)
            rmse = np.sqrt(mse)

            # Simple percentage error (avoid division by zero)
            nonzero_mask = orig_vals != 0
            if np.sum(nonzero_mask) > 0:
                pct_errors = abs_differences[nonzero_mask] / np.abs(orig_vals[nonzero_mask])
                mape = np.mean(pct_errors) * 100
            else:
                mape = float('inf')

            # Simple correlation (undefined for a single pair, reported as 1.0)
            if len(orig_vals) > 1:
                corr = np.corrcoef(orig_vals, imp_vals)[0, 1]
            else:
                corr = 1.0

            print(f"  MAE: {mae:.4f}")
            print(f"  RMSE: {rmse:.4f}")
            print(f"  MAPE: {mape:.2f}%" if np.isfinite(mape) else "  MAPE: ∞")
            print(f"  Correlation: {corr:.4f}")

            # Financial interpretation
            print(f"\n💰 FINANCIAL INTERPRETATION:")
            print(f"  Average price difference: ${mae:.2f}")
            print(f"  Typical error magnitude: ${rmse:.2f}")

            if np.isfinite(mape):
                if mape < 5:
                    print("  ✅ EXCELLENT: Very accurate imputation")
                elif mape < 10:
                    print("  ✅ GOOD: Acceptable imputation quality")
                elif mape < 20:
                    print("  ⚠️  FAIR: Moderate imputation errors")
                else:
                    print("  ❌ POOR: High imputation errors")
            else:
                print("  ⚠️  Note: MAPE could not be calculated due to zero values")

            metrics = {
                'MAE': mae,
                'RMSE': rmse,
                'MAPE': mape,
                'Correlation': corr,
                'N_Values': len(orig_vals)
            }

            self.performance_metrics[method_name] = metrics
            return metrics

        except Exception as e:
            # Deliberate best effort: report the problem and keep going
            print(f"  ERROR in evaluation: {e}")
            import traceback
            traceback.print_exc()
            return None

    def compare_all_methods(self, window=30):
        """
        Run all four imputation methods, evaluate each and print a comparison.

        ``window`` is passed to the two rolling methods. Returns a dict that
        maps the display name of each method ('Simple Mean', ...) to its
        imputed dataset.
        """
        print("\n" + "="*80)
        print("🎯 COMPREHENSIVE MEAN/MEDIAN IMPUTATION ANALYSIS")
        print("="*80)

        # Analyze missing pattern first
        self.analyze_missing_pattern()

        # Run all methods
        methods = {
            'Simple Mean': self.simple_mean_imputation,
            'Simple Median': self.simple_median_imputation,
            'Rolling Mean': lambda: self.rolling_mean_imputation(window),
            'Rolling Median': lambda: self.rolling_median_imputation(window)
        }

        results = {}
        for name, method in methods.items():
            print(f"\n{'='*20} {name.upper()} {'='*20}")
            imputed = method()
            results[name] = imputed

            # Evaluate quality; metrics are recorded in performance_metrics
            self.evaluate_imputation_quality(name.lower().replace(' ', '_'), imputed)

        # Summary comparison
        if self.performance_metrics:
            self.print_method_comparison()

        return results

    def print_method_comparison(self):
        """
        Print the collected metrics side by side and name the lowest-RMSE method.

        The recommendation text that follows is fixed guidance; it is not
        derived from the metrics. Prints a notice and returns when no method
        has been evaluated yet.
        """
        print("\n" + "="*80)
        print("🏆 METHOD COMPARISON SUMMARY")
        print("="*80)

        if not self.performance_metrics:
            # Without metrics there is no 'RMSE' column to rank by
            print("No evaluation metrics available yet")
            return

        comparison_df = pd.DataFrame(self.performance_metrics).T
        print(comparison_df.round(4))

        # Find best method by lowest RMSE
        best_method = comparison_df['RMSE'].idxmin()
        print(f"\n🏆 BEST PERFORMING METHOD: {best_method.replace('_', ' ').title()}")
        print(f"   RMSE: {comparison_df.loc[best_method, 'RMSE']:.4f}")

        # Recommendations
        print(f"\n💡 RECOMMENDATIONS FOR FINANCIAL TIME SERIES:")
        print("-" * 50)

        if 'rolling_median' in self.performance_metrics:
            print("🥇 BEST PRACTICE: Rolling Median")
            print("   ✅ Adapts to local market conditions")
            print("   ✅ Robust to price spikes/crashes")
            print("   ✅ Preserves time series properties")

        print(f"\n🥈 ALTERNATIVE: Rolling Mean")
        print("   ✅ Good for stable market periods")
        print("   ⚠️  Sensitive to outliers")

        print(f"\n⚠️  AVOID: Simple Mean/Median")
        print("   ❌ Ignores time series structure")
        print("   ❌ Can create artificial patterns")
        print("   ❌ Distorts volatility")

        print(f"\n🔄 NEXT STEPS:")
        print("   1. Try interpolation methods (linear, spline)")
        print("   2. Consider forward/backward fill")
        print("   3. Test advanced methods (KNN, MICE)")


# READY TO USE
# Usage hint: a banner followed by the two calls needed to run the analyzer
print("\n".join([
    "=" * 80,
    "FINANCIAL IMPUTATION ANALYZER",
    "=" * 80,
    "analyzer = FinancialImputationAnalyzer(data_with_missing, original_data)",
    "results = analyzer.compare_all_methods(window=30)",
]))

Prompt Engineer to run the analyzer and results + show where the imputation occurred and also show some specific imputed values
USE the two datasets
financial_data_missing
&
df (original data)

In [None]:
class FinancialForwardBackwardFill:
    """
    Forward/Backward Fill imputation for financial time series data.
    LOCF = Last Observation Carried Forward
    NOCB = Next Observation Carried Backward

    The input may be a pandas DataFrame or a 2-D NumPy array; fills run down
    each column independently. Every fill method returns the same container
    type it was given and stores its result in ``imputation_results``.
    Evaluation metrics (when original data is supplied) are collected in
    ``performance_metrics``.
    """

    def __init__(self, data_with_missing, original_data=None):
        """
        Initialize the forward/backward fill analyzer

        Parameters:
        - data_with_missing: DataFrame or numpy array with missing values
        - original_data: Original complete data for evaluation (optional)

        Both inputs are copied, so the caller's objects are never modified.
        """
        self.data_missing = data_with_missing.copy()
        self.original_data = original_data.copy() if original_data is not None else None
        self.imputation_results = {}
        self.performance_metrics = {}

    @staticmethod
    def _as_float_array(data):
        """Return a DataFrame or array as a float NumPy array."""
        return np.asarray(getattr(data, 'values', data), dtype=float)

    @staticmethod
    def _longest_missing_run(flags):
        """Return the length of the longest run of truthy values in ``flags``."""
        longest = current = 0
        for is_missing in flags:
            current = current + 1 if is_missing else 0
            if current > longest:
                longest = current
        return longest

    def analyze_missing_pattern(self):
        """Analyze missing pattern - critical for forward/backward fill

        Prints a report and returns a dict with keys ``total_missing``,
        ``missing_pct``, ``first_row_missing``, ``last_row_missing`` and
        ``max_gap`` (longest run of consecutive missing values in any column).
        """
        if isinstance(self.data_missing, pd.DataFrame):
            missing_info = self.data_missing.isnull()
        else:
            missing_info = pd.DataFrame(np.isnan(self.data_missing))

        print("Forward/Backward Fill - Missing Pattern Analysis")
        print("=" * 60)

        n_rows, n_cols = missing_info.shape
        total_missing = missing_info.sum().sum()
        total_cells = n_rows * n_cols
        # Guard the percentage against an empty dataset (0 cells).
        missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

        print(f"Dataset shape: {missing_info.shape}")
        print(f"Total missing values: {total_missing:,}")
        print(f"Missing percentage: {missing_pct:.2f}%")

        # Check for edge cases (first/last row missing): a forward fill has
        # nothing to carry into the first row, a backward fill nothing to
        # carry into the last one.
        first_row_missing = missing_info.iloc[0, :].sum() if n_rows else 0
        last_row_missing = missing_info.iloc[-1, :].sum() if n_rows else 0

        print("\nEdge case analysis:")
        print(f"  First row missing values: {first_row_missing}")
        print(f"  Last row missing values: {last_row_missing}")

        if first_row_missing > 0:
            print("  WARNING: First row has missing values - forward fill will fail here")
        if last_row_missing > 0:
            print("  WARNING: Last row has missing values - backward fill will fail here")

        # Analyze consecutive gaps, column by column
        max_consecutive_gaps = [
            self._longest_missing_run(missing_info.iloc[:, col].to_numpy())
            for col in range(n_cols)
        ]

        overall_max_gap = max(max_consecutive_gaps) if max_consecutive_gaps else 0
        print(f"  Maximum consecutive gap: {overall_max_gap} periods")

        if overall_max_gap > 10:
            print("  WARNING: Long consecutive gaps detected!")
            print("  Consider interpolation for gaps > 10 periods")
        else:
            print("  GOOD: Gap length suitable for forward/backward fill")

        return {
            'total_missing': total_missing,
            'missing_pct': missing_pct,
            'first_row_missing': first_row_missing,
            'last_row_missing': last_row_missing,
            'max_gap': overall_max_gap
        }

    def _directional_fill(self, direction, label, leftover_reason):
        """Fill every column in one direction and print per-column counts.

        Parameters:
        - direction: 'ffill' (carry previous value forward) or 'bfill'
          (carry next value backward)
        - label: wording used in the per-column DataFrame report
        - leftover_reason: explanation printed when values remain missing

        Returns a new DataFrame for DataFrame input, or a new ndarray for
        array input; ``self.data_missing`` is left untouched.
        """
        source = self.data_missing
        is_frame = isinstance(source, pd.DataFrame)
        if is_frame:
            frame = source
        else:
            _, n_cols = source.shape  # column-wise fill needs 2-D input
            frame = pd.DataFrame(source)

        # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
        filled = frame.ffill() if direction == 'ffill' else frame.bfill()

        # Counting NaNs before/after also covers the edge row that a
        # one-directional fill can never reach.
        before = frame.isnull().sum().to_numpy()
        after = filled.isnull().sum().to_numpy()

        if is_frame:
            for name, n_before, n_after in zip(filled.columns, before, after):
                print(f"  {name}: filled {n_before - n_after} values via {label}")
                if n_after > 0:
                    print(f"    WARNING: {n_after} values still missing ({leftover_reason})")
            return filled

        for col in range(min(n_cols, 5)):  # Print first 5 for brevity
            print(f"  Asset {col+1:03d}: filled {before[col] - after[col]} values")
            if after[col] > 0:
                print(f"    WARNING: {after[col]} values still missing")

        if n_cols > 5:
            print(f"  ... processed {n_cols-5} more assets")

        # Explicit copy so the result never shares memory with the input array.
        return filled.to_numpy(copy=True)

    def forward_fill_locf(self):
        """
        Forward Fill (LOCF) - Last Observation Carried Forward
        Uses previous valid value to fill missing data

        Values with no earlier observation in their column stay missing.
        The result is stored under ``imputation_results['forward_fill']``.
        """
        print("\nFORWARD FILL (LOCF) - Last Observation Carried Forward")
        print("=" * 60)

        imputed_data = self._directional_fill('ffill', 'forward fill', 'no prior value available')

        self.imputation_results['forward_fill'] = imputed_data
        return imputed_data

    def backward_fill_nocb(self):
        """
        Backward Fill (NOCB) - Next Observation Carried Backward
        Uses next valid value to fill missing data

        Values with no later observation in their column stay missing.
        The result is stored under ``imputation_results['backward_fill']``.
        """
        print("\nBACKWARD FILL (NOCB) - Next Observation Carried Backward")
        print("=" * 60)

        imputed_data = self._directional_fill('bfill', 'backward fill', 'no future value available')

        self.imputation_results['backward_fill'] = imputed_data
        return imputed_data

    def combined_fill(self):
        """
        Combined Forward-Backward Fill
        1. First apply forward fill
        2. Then apply backward fill to remaining gaps

        Only columns that are entirely missing can still contain NaN
        afterwards. The result is stored under
        ``imputation_results['combined_fill']``.
        """
        print("\nCOMBINED FORWARD-BACKWARD FILL")
        print("=" * 50)

        source = self.data_missing
        if isinstance(source, pd.DataFrame):
            # Step 1: forward fill, Step 2: backward fill what is left
            imputed_data = source.ffill().bfill()

            original_missing = source.isnull().sum().sum()
            final_missing = imputed_data.isnull().sum().sum()
        else:
            _, _ = source.shape  # column-wise fill needs 2-D input
            imputed_data = pd.DataFrame(source).ffill().bfill().to_numpy(copy=True)

            original_missing = np.sum(np.isnan(source))
            final_missing = np.sum(np.isnan(imputed_data))

        filled_count = original_missing - final_missing

        print(f"  Total values filled: {filled_count}")
        print(f"  Remaining missing: {final_missing}")

        self.imputation_results['combined_fill'] = imputed_data
        return imputed_data

    def evaluate_imputation_quality(self, method_name, imputed_data):
        """
        Evaluate fill quality against the original data.

        Compares the imputed values with the original values at every
        position that was missing in the input. Positions the method could
        not fill (still NaN) are excluded from the metrics and reported
        separately, so a single unfilled edge value no longer turns every
        metric into NaN.

        Parameters:
        - method_name: key under which the metrics are stored
        - imputed_data: DataFrame or array returned by one of the fill methods

        Returns a dict with MAE, RMSE, MAPE, Correlation, Trend_Error and
        N_Values (number of values scored), or None when there is no
        original data, nothing was missing, nothing could be scored, or an
        error occurred (the error is printed).
        """
        if self.original_data is None:
            print(f"\nNo original data available for {method_name} evaluation")
            return None

        print(f"\nIMPUTATION QUALITY EVALUATION: {method_name.upper()}")
        print("=" * 50)

        try:
            missing_np = self._as_float_array(self.data_missing)
            original_np = self._as_float_array(self.original_data)
            imputed_np = self._as_float_array(imputed_data)

            # Find missing positions
            mask = np.isnan(missing_np)
            total_missing = int(np.sum(mask))

            if total_missing == 0:
                print("  No missing values found!")
                return None

            # Boolean indexing walks the array row by row, i.e. in the same
            # order as a nested row/column loop.
            orig_all = original_np[mask]
            imp_all = imputed_np[mask]

            # Score only positions where both a truth value and a fill exist.
            comparable = ~(np.isnan(orig_all) | np.isnan(imp_all))
            n_skipped = total_missing - int(np.sum(comparable))
            if n_skipped > 0:
                print(f"  NOTE: {n_skipped} missing positions could not be scored (no imputed or original value)")

            orig_vals = orig_all[comparable]
            imp_vals = imp_all[comparable]

            if len(orig_vals) == 0:
                print("  No imputed values available for evaluation")
                return None

            print(f"  Evaluated {len(orig_vals)} imputed values")

            # Calculate metrics
            differences = orig_vals - imp_vals
            abs_differences = np.abs(differences)

            mae = np.mean(abs_differences)
            rmse = np.sqrt(np.mean(differences**2))

            # MAPE calculation (skips zero originals to avoid division by zero)
            nonzero_mask = orig_vals != 0
            if np.sum(nonzero_mask) > 0:
                mape = np.mean(abs_differences[nonzero_mask] / np.abs(orig_vals[nonzero_mask])) * 100
            else:
                mape = float('inf')

            # Correlation
            if len(orig_vals) > 1:
                corr = np.corrcoef(orig_vals, imp_vals)[0, 1]
            else:
                corr = 1.0

            print(f"  MAE: ${mae:.2f}")
            print(f"  RMSE: ${rmse:.2f}")
            print(f"  MAPE: {mape:.1f}%" if np.isfinite(mape) else "  MAPE: ∞")
            print(f"  Correlation: {corr:.3f}")

            # Time series specific evaluation
            print("\n  TIME SERIES EVALUATION:")

            # Check for trend preservation: mean step between consecutive
            # scored values. Needs at least two values to form a difference.
            if len(orig_vals) > 1:
                orig_trend = np.mean(np.diff(orig_vals))
                imp_trend = np.mean(np.diff(imp_vals))
            else:
                orig_trend = 0.0
                imp_trend = 0.0
            trend_error = abs(orig_trend - imp_trend)

            print(f"  Original trend: {orig_trend:.3f}/period")
            print(f"  Imputed trend: {imp_trend:.3f}/period")
            print(f"  Trend preservation error: {trend_error:.3f}")

            # Financial interpretation
            if mape < 2:
                print("  EXCELLENT: Very accurate for financial data")
            elif mape < 5:
                print("  GOOD: Acceptable for most financial analysis")
            elif mape < 10:
                print("  FAIR: Use with caution for risk calculations")
            else:
                print("  POOR: High errors - consider other methods")

            metrics = {
                'MAE': mae,
                'RMSE': rmse,
                'MAPE': mape,
                'Correlation': corr,
                'Trend_Error': trend_error,
                'N_Values': len(orig_vals)
            }

            self.performance_metrics[method_name] = metrics
            return metrics

        except Exception as e:
            # Best-effort evaluation: report the problem and keep going so a
            # failed evaluation does not abort the whole comparison.
            print(f"  ERROR: {e}")
            return None

    def compare_all_methods(self):
        """
        Compare Forward Fill, Backward Fill, and Combined approaches

        Runs the missing-pattern analysis, applies and evaluates all three
        fills, prints the comparison and returns the imputed datasets keyed
        by 'Forward Fill', 'Backward Fill' and 'Combined Fill'.
        """
        print("\n" + "="*80)
        print("FORWARD/BACKWARD FILL COMPREHENSIVE ANALYSIS")
        print("="*80)

        # Analyze missing pattern (printed report only)
        self.analyze_missing_pattern()

        # Run all methods
        print(f"\n{'='*25} FORWARD FILL {'='*25}")
        ff_result = self.forward_fill_locf()
        self.evaluate_imputation_quality('forward_fill', ff_result)

        print(f"\n{'='*25} BACKWARD FILL {'='*24}")
        bf_result = self.backward_fill_nocb()
        self.evaluate_imputation_quality('backward_fill', bf_result)

        print(f"\n{'='*23} COMBINED FILL {'='*23}")
        combined_result = self.combined_fill()
        self.evaluate_imputation_quality('combined_fill', combined_result)

        # Summary comparison
        if self.performance_metrics:
            self.print_comparison()

        return {
            'Forward Fill': ff_result,
            'Backward Fill': bf_result,
            'Combined Fill': combined_result
        }

    def print_comparison(self):
        """Print the metrics table, the lowest-RMSE method and fixed guidance."""
        print("\n" + "="*60)
        print("METHOD COMPARISON SUMMARY")
        print("="*60)

        if self.performance_metrics:
            comparison_df = pd.DataFrame(self.performance_metrics).T
            print(comparison_df.round(3))

            # Find best method, ignoring entries without a usable RMSE
            if 'RMSE' in comparison_df.columns:
                rmse_scores = pd.to_numeric(comparison_df['RMSE'], errors='coerce').dropna()
                if not rmse_scores.empty:
                    best_method = rmse_scores.idxmin()
                    print(f"\nBEST PERFORMING: {str(best_method).replace('_', ' ').title()}")

        print("\nFINANCIAL TIME SERIES RECOMMENDATIONS:")
        print("-" * 45)
        print("BEST CHOICE: Combined Fill")
        print("  - Fills most gaps possible")
        print("  - Uses both past and future information")
        print("  - Minimal remaining missing values")

        print("\nFORWARD FILL (LOCF):")
        print("  - Conservative approach")
        print("  - Only uses past information")
        print("  - Good for real-time applications")

        print("\nBACKWARD FILL (NOCB):")
        print("  - Uses future information")
        print("  - Good for filling beginning gaps")
        print("  - Less realistic for trading strategies")


# Usage banner: shows how to construct and run the forward/backward fill analyzer.
print(
    "\n".join(
        (
            "="*60,
            "FORWARD/BACKWARD FILL ANALYZER READY!",
            "="*60,
            "Simple and effective for financial time series gaps",
            "\nUSAGE:",
            "analyzer = FinancialForwardBackwardFill(data_with_missing, original_data)",
            "results = analyzer.compare_all_methods()",
        )
    )
)

Prompt Engineer to run the analyzer and show the results, show where the imputation occurred, and also show some specific imputed values.
Use the two datasets:
financial_data_missing
&
df (original data)

In [None]:
analyzer = FinancialForwardBackwardFill(financial_data_missing, df)

# Run comprehensive analysis
results = analyzer.compare_all_methods()

# Access specific methods
forward_filled = results['Forward Fill']
combined_filled = results['Combined Fill']
backward_filled = results['Backward Fill']
print(pd.DataFrame(combined_filled).head(10))

# Show some specific imputed values
print("\nSample of imputed values (original NaN -> new value):")
# NOTE(review): `missing_mask` is not defined in this cell - it must come from
# an earlier cell and mark the positions that are NaN in financial_data_missing.
row, col = np.where(missing_mask)  # computed once; it does not change per iteration
# Positional lookup works whether the fill returned a DataFrame or an ndarray.
backward_values = np.asarray(backward_filled)
for i in range(min(10, len(row))):  # Show first 10 missing positions
    print(f"Position ({row[i]}, {col[i]}): NaN -> {backward_values[row[i], col[i]]:.2f}")


In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

class SimpleSklearnKNNImputation:
    """
    Simple K-NN imputation using sklearn KNNImputer.
    Clean, straightforward implementation for financial time series data.

    The input may be a pandas DataFrame or a NumPy array. Imputed datasets
    are returned as NumPy arrays and stored in ``imputation_results``;
    evaluation metrics (when original data is supplied) are collected in
    ``performance_metrics``; fitted scalers are kept in ``scalers``.
    """

    def __init__(self, data_with_missing, original_data=None):
        """
        Initialize sklearn KNN imputation

        Parameters:
        - data_with_missing: DataFrame or numpy array with missing values
        - original_data: Original complete data for evaluation (optional)

        Both inputs are copied, so the caller's objects are never modified.
        """
        self.data_missing = data_with_missing.copy()
        self.original_data = original_data.copy() if original_data is not None else None
        self.imputation_results = {}
        self.performance_metrics = {}
        self.scalers = {}

    @staticmethod
    def _as_array(data):
        """Return a private NumPy copy of a DataFrame or array."""
        return np.array(getattr(data, 'values', data))

    @staticmethod
    def _select_best_k(k_values, results):
        """Return the K with the lowest valid RMSE, or None if none was scored.

        ``results`` maps ``'K=<k>'`` to a dict holding an ``'rmse'`` entry.
        An RMSE of exactly 0.0 is a valid (perfect) score; only ``None`` and
        NaN are treated as "not scored". Ties go to the earliest K.
        """
        scored = []
        for k in k_values:
            rmse = results[f'K={k}']['rmse']
            if rmse is not None and not np.isnan(rmse):
                scored.append((rmse, k))
        if not scored:
            return None
        return min(scored, key=lambda pair: pair[0])[1]

    def analyze_missing_pattern(self):
        """Quick missing value analysis

        Prints the dataset shape and missing-value totals and returns a dict
        with ``total_missing`` and ``missing_pct``.
        """
        if isinstance(self.data_missing, pd.DataFrame):
            missing_info = self.data_missing.isnull()
        else:
            missing_info = pd.DataFrame(np.isnan(self.data_missing))
        data_shape = self.data_missing.shape

        total_missing = missing_info.sum().sum()
        total_cells = data_shape[0] * data_shape[1]
        # Guard the percentage against an empty dataset (0 cells).
        missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

        print("K-NN IMPUTATION - MISSING DATA ANALYSIS")
        print("=" * 50)
        print(f"Dataset shape: {data_shape}")
        print(f"Total missing values: {total_missing:,}")
        print(f"Missing percentage: {missing_pct:.2f}%")

        return {'total_missing': total_missing, 'missing_pct': missing_pct}

    def knn_imputation_basic(self, n_neighbors=5):
        """
        Basic sklearn KNN imputation

        Runs ``KNNImputer(n_neighbors=n_neighbors)`` on the raw values and
        stores the result under ``imputation_results['knn_basic']``.
        """
        print(f"\nBASIC SKLEARN K-NN IMPUTATION (K={n_neighbors})")
        print("=" * 50)

        data = self._as_array(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")

        # Apply KNN imputation
        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed_data = imputer.fit_transform(data)

        # Count imputed values
        original_missing = np.sum(np.isnan(data))
        final_missing = np.sum(np.isnan(imputed_data))
        imputed_count = original_missing - final_missing

        print(f"Successfully imputed: {imputed_count} values")
        print(f"Remaining missing: {final_missing}")

        self.imputation_results['knn_basic'] = imputed_data
        return imputed_data

    def knn_imputation_scaled(self, n_neighbors=5):
        """
        KNN imputation with feature scaling (better for financial data)

        Standardizes each column, imputes in the scaled space, then maps the
        result back to the original scale. The result is stored under
        ``imputation_results['knn_scaled']`` and the fitted scaler under
        ``scalers['knn_scaled']``.
        """
        print(f"\nSCALED SKLEARN K-NN IMPUTATION (K={n_neighbors})")
        print("=" * 50)

        data = self._as_array(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")
        print("Applying StandardScaler for better distance calculations...")

        # Scale the data first (handles NaN automatically)
        scaler = StandardScaler()  # Z-score normalization : (~N(0,1))
        data_scaled = scaler.fit_transform(data)

        # Apply KNN imputation on scaled data
        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed_scaled = imputer.fit_transform(data_scaled)

        # Transform back to original scale
        imputed_data = scaler.inverse_transform(imputed_scaled)

        # Count imputed values
        original_missing = np.sum(np.isnan(data))
        final_missing = np.sum(np.isnan(imputed_data))
        imputed_count = original_missing - final_missing

        print(f"Successfully imputed: {imputed_count} values")
        print(f"Remaining missing: {final_missing}")

        self.imputation_results['knn_scaled'] = imputed_data
        self.scalers['knn_scaled'] = scaler
        return imputed_data

    def knn_imputation_different_k(self, k_values=None):
        """
        Test different K values to find optimal

        Parameters:
        - k_values: iterable of neighbor counts to try; defaults to
          [3, 5, 7, 10]

        Returns a dict mapping ``'K=<k>'`` to ``{'data': imputed array,
        'rmse': RMSE or None}``. When original data is available and at least
        one K could be scored, the best result is also stored under
        ``imputation_results['knn_best_k']``.
        """
        if k_values is None:
            # Built per call instead of using a shared mutable default.
            k_values = [3, 5, 7, 10]

        print(f"\nTESTING DIFFERENT K VALUES: {k_values}")
        print("=" * 50)

        results = {}

        data = self._as_array(self.data_missing)

        for k in k_values:
            print(f"\nTesting K={k}...")

            # Apply KNN imputation
            imputer = KNNImputer(n_neighbors=k)
            imputed_data = imputer.fit_transform(data)

            # Quick evaluation if original data available
            if self.original_data is not None:
                metrics = self.quick_evaluate(imputed_data, f'k_{k}')
                results[f'K={k}'] = {
                    'data': imputed_data,
                    'rmse': metrics['RMSE'] if metrics else None
                }
                print(f"  RMSE: {metrics['RMSE']:.3f}" if metrics else "  No evaluation data")
            else:
                results[f'K={k}'] = {'data': imputed_data, 'rmse': None}
                print("  Imputation completed")

        # Find best K if evaluation possible
        if self.original_data is not None:
            best_k = self._select_best_k(k_values, results)
            if best_k is None:
                print("\nBest K value: could not be determined (no valid RMSE)")
            else:
                print(f"\nBest K value: {best_k}")
                self.imputation_results['knn_best_k'] = results[f'K={best_k}']['data']

        return results

    def quick_evaluate(self, imputed_data, method_name):
        """Quick evaluation for K comparison

        Returns ``{'RMSE': value}`` computed over the positions that were
        missing in the input, or None when there is no original data or the
        evaluation fails (the reason is printed). ``method_name`` is accepted
        for signature compatibility and is not used.
        """
        if self.original_data is None:
            return None

        try:
            missing_np = self._as_array(self.data_missing)
            original_np = self._as_array(self.original_data)
            imputed_np = self._as_array(imputed_data)

            # Find missing positions and extract values
            mask = np.isnan(missing_np)
            orig_vals = original_np[mask]
            imp_vals = imputed_np[mask]

            # Calculate RMSE
            rmse = np.sqrt(np.mean((orig_vals - imp_vals) ** 2))

            return {'RMSE': rmse}

        except Exception as e:
            # Best-effort: report why this K could not be scored instead of
            # hiding the failure completely.
            print(f"  Evaluation failed: {e}")
            return None

    def evaluate_imputation_quality(self, method_name, imputed_data):
        """
        Comprehensive evaluation of KNN imputation quality

        Compares imputed and original values at every position that was
        missing in the input.

        Parameters:
        - method_name: key under which the metrics are stored
        - imputed_data: DataFrame or array of imputed values

        Returns a dict with MAE, RMSE, MAPE, Correlation and N_Values, or
        None when there is no original data, nothing was missing, or an
        error occurred (the error is printed).
        """
        if self.original_data is None:
            print(f"\nNo original data available for {method_name} evaluation")
            return None

        print(f"\nSKLEARN K-NN EVALUATION: {method_name.upper()}")
        print("=" * 50)

        try:
            missing_np = self._as_array(self.data_missing)
            original_np = self._as_array(self.original_data)
            imputed_np = self._as_array(imputed_data)

            # Find missing positions
            mask = np.isnan(missing_np)

            if np.sum(mask) == 0:
                # Without this guard every metric would be the mean of an
                # empty array, i.e. NaN.
                print("No missing values found!")
                return None

            # Extract values
            orig_vals = original_np[mask]
            imp_vals = imputed_np[mask]

            print(f"Evaluated {len(orig_vals)} imputed values")

            # Calculate metrics
            abs_errors = np.abs(orig_vals - imp_vals)
            mae = np.mean(abs_errors)
            rmse = np.sqrt(np.mean((orig_vals - imp_vals) ** 2))

            # MAPE calculation (skips zero originals to avoid division by zero)
            nonzero_mask = orig_vals != 0
            if np.sum(nonzero_mask) > 0:
                mape = np.mean(abs_errors[nonzero_mask] / np.abs(orig_vals[nonzero_mask])) * 100
            else:
                mape = float('inf')

            # Correlation
            if len(orig_vals) > 1:
                corr = np.corrcoef(orig_vals, imp_vals)[0, 1]
            else:
                corr = 1.0

            print(f"MAE: ${mae:.2f}")
            print(f"RMSE: ${rmse:.2f}")
            print(f"MAPE: {mape:.1f}%" if np.isfinite(mape) else "MAPE: ∞")
            print(f"Correlation: {corr:.3f}")

            # Financial interpretation
            if mape < 3:
                print("EXCELLENT: Very accurate K-NN imputation")
            elif mape < 7:
                print("GOOD: Acceptable K-NN performance")
            elif mape < 15:
                print("FAIR: Consider different K or scaling")
            else:
                print("POOR: K-NN may not suit this data")

            metrics = {
                'MAE': mae,
                'RMSE': rmse,
                'MAPE': mape,
                'Correlation': corr,
                'N_Values': len(orig_vals)
            }

            self.performance_metrics[method_name] = metrics
            return metrics

        except Exception as e:
            # Best-effort evaluation: report the problem and keep going.
            print(f"ERROR: {e}")
            return None

    def compare_all_methods(self, n_neighbors=5):
        """
        Compare different sklearn KNN approaches

        Runs the missing-value analysis, basic and scaled K-NN imputation
        (both evaluated), and a sweep over K = 3, 5, 7, 10. Returns a dict
        with keys 'Basic KNN', 'Scaled KNN' and 'K Results'.
        """
        print("\n" + "="*60)
        print("SKLEARN K-NN IMPUTATION ANALYSIS")
        print("="*60)

        # Analyze missing pattern
        self.analyze_missing_pattern()

        # Test basic KNN
        print(f"\n{'='*15} BASIC K-NN {'='*15}")
        basic_result = self.knn_imputation_basic(n_neighbors)
        self.evaluate_imputation_quality('knn_basic', basic_result)

        # Test scaled KNN
        print(f"\n{'='*15} SCALED K-NN {'='*14}")
        scaled_result = self.knn_imputation_scaled(n_neighbors)
        self.evaluate_imputation_quality('knn_scaled', scaled_result)

        # Test different K values
        print(f"\n{'='*12} K VALUE TESTING {'='*12}")
        k_results = self.knn_imputation_different_k([3, 5, 7, 10])

        # Summary comparison
        if self.performance_metrics:
            self.print_comparison()

        return {
            'Basic KNN': basic_result,
            'Scaled KNN': scaled_result,
            'K Results': k_results
        }

    def print_comparison(self):
        """Print the metrics table, the lowest-RMSE method and fixed guidance."""
        print("\n" + "="*50)
        print("SKLEARN K-NN COMPARISON SUMMARY")
        print("="*50)

        if self.performance_metrics:
            comparison_df = pd.DataFrame(self.performance_metrics).T
            print(comparison_df.round(3))

            # Find best method, ignoring entries without a usable RMSE
            if 'RMSE' in comparison_df.columns:
                rmse_scores = pd.to_numeric(comparison_df['RMSE'], errors='coerce').dropna()
                if not rmse_scores.empty:
                    best_method = rmse_scores.idxmin()
                    print(f"\nBest performing: {str(best_method).replace('_', ' ').title()}")

        print("\nRECOMMENDATIONS:")
        print("- Scaled K-NN usually performs best for financial data")
        print("- K=5 is good default, but aslo test  other ranges")
        print("- Feature scaling important when assets have different price ranges")
        print("- sklearn KNNImputer is robust and well-tested")


# Usage banner: shows how to construct and run the K-NN imputation analyzer.
print(
    "\n".join(
        (
            "="*50,
            "SKLEARN K-NN IMPUTATION READY!",
            "="*50,
            "Simple sklearn-based KNN imputation",
            "\nUSAGE:",
            "analyzer = SimpleSklearnKNNImputation(data_with_missing, original_data)",
            "results = analyzer.compare_all_methods(n_neighbors=5)",
        )
    )
)

In [None]:
analyzer = SimpleSklearnKNNImputation(financial_data_missing, df)

# Run comprehensive analysis
results = analyzer.compare_all_methods(n_neighbors = 5)

# Access specific results
best_imputed = results['Scaled KNN']  # Usually performs best
print(pd.DataFrame(best_imputed).head(10))

print("\nSample of imputed values (original NaN -> new value):")
# NOTE(review): `missing_mask` is not defined in this cell - it must come from
# an earlier cell and mark the positions that are NaN in financial_data_missing.
row, col = np.where(missing_mask)  # computed once; it does not change per iteration
for i in range(min(10, len(row))):  # Show first 10 missing positions
    print(f"Position ({row[i]}, {col[i]}): NaN -> {best_imputed[row[i], col[i]]:.2f}")

In [None]:
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
#from sklearn.preprocessing import StandardScaler

class SimpleSklearnMICEImputation:
    """
    Simple MICE (Multiple Imputation by Chained Equations) using sklearn IterativeImputer.
    Clean, straightforward implementation for financial time series data.

    Every ``mice_imputation_*`` method runs one IterativeImputer configuration
    on a copy of the input, stores the imputed numpy array in
    ``self.imputation_results`` and returns it.  When ``original_data`` is
    supplied, the evaluation methods score the imputed cells against it.

    Attributes:
    - data_missing: private copy of the data that contains NaNs
    - original_data: private copy of the complete data, or None
    - imputation_results: dict, method key -> imputed numpy array
    - performance_metrics: dict, method name -> metrics dict
    - scalers: dict, method key -> fitted scaler
    """

    def __init__(self, data_with_missing, original_data=None):
        """
        Initialize MICE imputation

        Parameters:
        - data_with_missing: DataFrame or numpy array with missing values
          (plain nested sequences are converted to a float numpy array)
        - original_data: Original complete data for evaluation (optional)
        """
        self.data_missing = self._private_copy(data_with_missing)
        self.original_data = None if original_data is None else self._private_copy(original_data)
        self.imputation_results = {}
        self.performance_metrics = {}
        self.scalers = {}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _private_copy(data):
        """Return an independent copy; inputs without .shape become float arrays."""
        if hasattr(data, 'copy') and hasattr(data, 'shape'):
            return data.copy()
        return np.array(data, dtype=float)

    @staticmethod
    def _to_numpy(data):
        """Return a fresh numpy array with the values of a DataFrame or array."""
        if hasattr(data, 'values'):
            return data.values.copy()
        return data.copy()

    @staticmethod
    def _fit_mice(data, estimator, max_iter, random_state):
        """Fit an IterativeImputer on `data`; return (imputed array, fitted imputer)."""
        imputer = IterativeImputer(
            estimator=estimator,
            max_iter=max_iter,
            random_state=random_state,
            verbose=0
        )
        return imputer.fit_transform(data), imputer

    @staticmethod
    def _report_counts(data, imputed_data):
        """Print how many NaNs were filled and how many are left."""
        original_missing = np.sum(np.isnan(data))
        final_missing = np.sum(np.isnan(imputed_data))
        print(f"Successfully imputed: {original_missing - final_missing} values")
        print(f"Remaining missing: {final_missing}")

    @staticmethod
    def _iteration_status(imputer, max_iter):
        """Describe how many rounds the fitted imputer actually ran."""
        # n_iter_ is smaller than max_iter only when early stopping triggered
        n_iter = getattr(imputer, 'n_iter_', max_iter)
        if n_iter < max_iter:
            return f"stopped early after {n_iter} of {max_iter} iterations (tolerance reached)"
        return f"used all {max_iter} iterations (no early stop before the limit)"

    def _values_at_missing(self, imputed_data):
        """
        Return (original values, imputed values) at the cells that are NaN in
        data_missing and known in original_data.

        Raises ValueError when the three arrays do not share one shape.
        """
        missing_np = self._to_numpy(self.data_missing)
        original_np = self._to_numpy(self.original_data)
        imputed_np = np.asarray(imputed_data)

        if not (missing_np.shape == original_np.shape == imputed_np.shape):
            raise ValueError(
                f"shape mismatch: missing {missing_np.shape}, "
                f"original {original_np.shape}, imputed {imputed_np.shape}"
            )

        # Cells whose true value is itself NaN cannot be scored
        mask = np.isnan(missing_np) & ~np.isnan(original_np)
        return original_np[mask], imputed_np[mask]

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------
    def analyze_missing_pattern(self):
        """
        Quick missing value analysis for MICE

        Prints shape, count and percentage of missing cells plus a coarse
        suitability note, and returns
        {'total_missing': int, 'missing_pct': float}.
        """
        if isinstance(self.data_missing, pd.DataFrame):
            missing_mask = self.data_missing.isnull().to_numpy()
        else:
            missing_mask = np.isnan(self.data_missing)
        data_shape = self.data_missing.shape

        total_missing = int(missing_mask.sum())
        n_cells = data_shape[0] * data_shape[1]
        # An empty dataset has nothing missing; avoid dividing by zero
        missing_pct = (total_missing / n_cells) * 100 if n_cells else 0.0

        print("MICE IMPUTATION - MISSING DATA ANALYSIS")
        print("=" * 50)
        print(f"Dataset shape: {data_shape}")
        print(f"Total missing values: {total_missing:,}")
        print(f"Missing percentage: {missing_pct:.2f}%")

        if missing_pct > 50:
            print("WARNING: High missing percentage may affect MICE performance")
        elif missing_pct > 20:
            print("CAUTION: Moderate missing percentage - MICE will work but may be slow")
        else:
            print("GOOD: Missing percentage suitable for MICE imputation")

        return {'total_missing': total_missing, 'missing_pct': missing_pct}

    # ------------------------------------------------------------------
    # Imputation variants
    # ------------------------------------------------------------------
    def mice_imputation_basic(self, max_iter=10, random_state=42):
        """
        Basic MICE imputation with BayesianRidge estimator

        Parameters:
        - max_iter: maximum number of imputation rounds
        - random_state: seed passed to IterativeImputer

        Returns the imputed numpy array (also stored under 'mice_basic').
        """
        print(f"\nBASIC MICE IMPUTATION (max_iter={max_iter})")
        print("=" * 50)

        data = self._to_numpy(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")
        print("Using BayesianRidge estimator (default)")

        imputed_data, imputer = self._fit_mice(data, BayesianRidge(), max_iter, random_state)

        self._report_counts(data, imputed_data)
        # Report what actually happened instead of assuming convergence
        print(f"BayesianRidge MICE {self._iteration_status(imputer, max_iter)}")

        self.imputation_results['mice_basic'] = imputed_data
        return imputed_data

    def mice_imputation_linear(self, max_iter=10, random_state=42):
        """
        MICE imputation with LinearRegression estimator

        Parameters:
        - max_iter: maximum number of imputation rounds
        - random_state: seed passed to IterativeImputer

        Returns the imputed numpy array (also stored under 'mice_linear').
        """
        print(f"\nMICE WITH LINEAR REGRESSION (max_iter={max_iter})")
        print("=" * 50)

        data = self._to_numpy(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")
        print("Using LinearRegression estimator (faster)")

        imputed_data, imputer = self._fit_mice(data, LinearRegression(), max_iter, random_state)

        self._report_counts(data, imputed_data)
        print(f"Linear regression MICE {self._iteration_status(imputer, max_iter)}")

        self.imputation_results['mice_linear'] = imputed_data
        return imputed_data

    def mice_imputation_rf(self, max_iter=5, random_state=42):
        """
        MICE imputation with RandomForest estimator (more robust but slower)

        Parameters:
        - max_iter: maximum number of imputation rounds (default is lower
          than for the other variants because each round is slower)
        - random_state: seed for both the forest and IterativeImputer

        Returns the imputed numpy array (also stored under 'mice_rf').
        """
        print(f"\nMICE WITH RANDOM FOREST (max_iter={max_iter})")
        print("=" * 50)

        data = self._to_numpy(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")
        print("Using RandomForest estimator (robust, handles non-linearity)")

        forest = RandomForestRegressor(n_estimators=10, random_state=random_state)
        imputed_data, _ = self._fit_mice(data, forest, max_iter, random_state)

        self._report_counts(data, imputed_data)
        print("RandomForest MICE completed")

        self.imputation_results['mice_rf'] = imputed_data
        return imputed_data

    def mice_imputation_scaled(self, max_iter=10, random_state=42):
        """
        MICE imputation with feature scaling (better for financial data)

        The data is standardized column-wise, imputed with BayesianRidge, and
        transformed back to the original scale.

        Parameters:
        - max_iter: maximum number of imputation rounds
        - random_state: seed passed to IterativeImputer

        Returns the imputed numpy array (also stored under 'mice_scaled'; the
        fitted scaler is kept in self.scalers['mice_scaled']).
        """
        # Local import keeps this method usable even when StandardScaler has
        # not been imported by an earlier cell.
        from sklearn.preprocessing import StandardScaler

        print(f"\nSCALED MICE IMPUTATION (max_iter={max_iter})")
        print("=" * 50)

        data = self._to_numpy(self.data_missing)

        print(f"Processing {data.shape[0]}x{data.shape[1]} dataset...")
        print("Applying StandardScaler before MICE...")

        # Scale the data first (handles NaN automatically)
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)

        # Apply MICE imputation on scaled data
        imputed_scaled, _ = self._fit_mice(data_scaled, BayesianRidge(), max_iter, random_state)

        # Transform back to original scale
        imputed_data = scaler.inverse_transform(imputed_scaled)

        self._report_counts(data, imputed_data)
        print("Scaled MICE completed and inverse transformed")

        self.imputation_results['mice_scaled'] = imputed_data
        self.scalers['mice_scaled'] = scaler
        return imputed_data

    def mice_test_iterations(self, iter_values=None):
        """
        Test different iteration counts to find optimal convergence

        Parameters:
        - iter_values: iterable of max_iter settings to try
          (default: [5, 10, 15, 20])

        Returns a dict keyed 'iter_<n>' with entries
        {'data': imputed array, 'rmse': float or None, 'iterations': n}.
        When original_data is available, the run with the lowest finite RMSE
        is also stored in self.imputation_results['mice_best_iter'].
        """
        if iter_values is None:
            iter_values = [5, 10, 15, 20]
        iter_values = list(iter_values)

        print(f"\nTESTING DIFFERENT ITERATION COUNTS: {iter_values}")
        print("=" * 60)

        results = {}
        data = self._to_numpy(self.data_missing)

        for max_iter in iter_values:
            print(f"\nTesting max_iter={max_iter}...")

            imputed_data, imputer = self._fit_mice(data, BayesianRidge(), max_iter, 42)
            key = f'iter_{max_iter}'

            # Quick evaluation if original data available
            if self.original_data is not None:
                metrics = self.quick_evaluate(imputed_data, key)
                results[key] = {
                    'data': imputed_data,
                    'rmse': metrics['RMSE'] if metrics else None,
                    'iterations': max_iter
                }
                print(f"  RMSE: {metrics['RMSE']:.3f}" if metrics else "  No evaluation data")
            else:
                results[key] = {'data': imputed_data, 'rmse': None, 'iterations': max_iter}
                print(f"  MICE completed in {getattr(imputer, 'n_iter_', max_iter)} iterations")

        # Find best iteration count if evaluation possible
        if self.original_data is not None:
            # Compare with `is not None`: an RMSE of exactly 0.0 is the best
            # possible score and must not be mistaken for "no score".
            scored = [
                (results[f'iter_{n}']['rmse'], n)
                for n in iter_values
                if results[f'iter_{n}']['rmse'] is not None
                and np.isfinite(results[f'iter_{n}']['rmse'])
            ]
            if scored:
                best_iter = min(scored, key=lambda pair: pair[0])[1]
                print(f"\nBest iteration count: {best_iter}")
                self.imputation_results['mice_best_iter'] = results[f'iter_{best_iter}']['data']
            else:
                print("\nNo iteration count could be scored; best run not selected")

        return results

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def quick_evaluate(self, imputed_data, method_name):
        """
        Quick evaluation for iteration/method comparison

        Parameters:
        - imputed_data: numpy array with the same shape as the input data
        - method_name: label used only in the diagnostic message

        Returns {'RMSE': value} computed over the originally missing cells,
        or None when there is no original data, nothing to score, or the
        evaluation fails (the reason is printed).
        """
        if self.original_data is None:
            return None

        try:
            orig_vals, imp_vals = self._values_at_missing(imputed_data)
            if orig_vals.size == 0:
                return None

            rmse = np.sqrt(np.mean((orig_vals - imp_vals) ** 2))
            return {'RMSE': rmse}

        except Exception as exc:
            # Best-effort helper: report the reason instead of failing silently
            print(f"  Evaluation skipped for {method_name}: {exc}")
            return None

    def evaluate_imputation_quality(self, method_name, imputed_data):
        """
        Comprehensive evaluation of MICE imputation quality

        Parameters:
        - method_name: key under which the metrics are stored
        - imputed_data: numpy array with the same shape as the input data

        Prints MAE, RMSE, MAPE, correlation and variance ratio for the
        originally missing cells, stores them in
        self.performance_metrics[method_name] and returns the metrics dict.
        Returns None when no original data is available, when there is nothing
        to score, or when the computation fails.
        """
        if self.original_data is None:
            print(f"\nNo original data available for {method_name} evaluation")
            return None

        print(f"\nMICE EVALUATION: {method_name.upper()}")
        print("=" * 50)

        try:
            orig_vals, imp_vals = self._values_at_missing(imputed_data)

            print(f"Evaluated {len(orig_vals)} imputed values")
            if len(orig_vals) == 0:
                # Nothing to compare: avoid NaN metrics polluting the summary
                print("No missing positions with known original values to evaluate")
                return None

            # Calculate metrics
            abs_err = np.abs(orig_vals - imp_vals)
            mae = np.mean(abs_err)
            rmse = np.sqrt(np.mean((orig_vals - imp_vals) ** 2))

            # MAPE calculation (only defined where the true value is non-zero)
            nonzero_mask = orig_vals != 0
            if np.sum(nonzero_mask) > 0:
                mape = np.mean(abs_err[nonzero_mask] / np.abs(orig_vals[nonzero_mask])) * 100
            else:
                mape = float('inf')

            # Correlation (needs at least two points)
            if len(orig_vals) > 1:
                corr = np.corrcoef(orig_vals, imp_vals)[0, 1]
            else:
                corr = 1.0

            # MICE-specific: Variance preservation
            orig_var = np.var(orig_vals)
            imp_var = np.var(imp_vals)
            var_ratio = imp_var / orig_var if orig_var > 0 else 1.0

            print(f"MAE: ${mae:.2f}")
            print(f"RMSE: ${rmse:.2f}")
            print(f"MAPE: {mape:.1f}%" if np.isfinite(mape) else "MAPE: ∞")
            print(f"Correlation: {corr:.3f}")
            print(f"Variance ratio: {var_ratio:.3f} (1.0 = perfect)")

            # Financial interpretation
            if mape < 2:
                print("EXCELLENT: MICE captured multivariate relationships very well")
            elif mape < 5:
                print("GOOD: MICE found meaningful patterns between variables")
            elif mape < 10:
                print("FAIR: Some multivariate structure captured")
            else:
                print("POOR: Consider different estimator or more iterations")

            metrics = {
                'MAE': mae,
                'RMSE': rmse,
                'MAPE': mape,
                'Correlation': corr,
                'Variance_Ratio': var_ratio,
                'N_Values': len(orig_vals)
            }

            self.performance_metrics[method_name] = metrics
            return metrics

        except Exception as e:
            print(f"ERROR: {e}")
            return None

    # ------------------------------------------------------------------
    # Orchestration and reporting
    # ------------------------------------------------------------------
    def compare_all_mice_methods(self, max_iter=10, rf_max_iter=5, iter_values=None):
        """
        Compare different MICE approaches

        Parameters:
        - max_iter: iteration limit for the basic, linear and scaled variants
        - rf_max_iter: iteration limit for the RandomForest variant
        - iter_values: iteration counts for the sweep (default: [5, 10, 15])

        Returns a dict with the imputed arrays under 'Basic MICE',
        'Linear MICE', 'Scaled MICE', 'RandomForest MICE', and the sweep
        results under 'Iteration Results'.
        """
        if iter_values is None:
            iter_values = [5, 10, 15]

        print("\n" + "="*60)
        print("MICE IMPUTATION COMPREHENSIVE ANALYSIS")
        print("="*60)

        # Analyze missing pattern
        self.analyze_missing_pattern()

        # Test basic MICE
        print(f"\n{'='*15} BASIC MICE {'='*15}")
        basic_result = self.mice_imputation_basic(max_iter)
        self.evaluate_imputation_quality('mice_basic', basic_result)

        # Test linear MICE (faster)
        print(f"\n{'='*14} LINEAR MICE {'='*14}")
        linear_result = self.mice_imputation_linear(max_iter)
        self.evaluate_imputation_quality('mice_linear', linear_result)

        # Test scaled MICE
        print(f"\n{'='*14} SCALED MICE {'='*14}")
        scaled_result = self.mice_imputation_scaled(max_iter)
        self.evaluate_imputation_quality('mice_scaled', scaled_result)

        # Test RandomForest MICE (fewer iterations as it's slower)
        print(f"\n{'='*12} RANDOM FOREST MICE {'='*12}")
        rf_result = self.mice_imputation_rf(max_iter=rf_max_iter)
        self.evaluate_imputation_quality('mice_rf', rf_result)

        # Test different iteration counts
        print(f"\n{'='*10} ITERATION TESTING {'='*10}")
        iter_results = self.mice_test_iterations(iter_values)

        # Summary comparison
        if self.performance_metrics:
            self.print_comparison()

        return {
            'Basic MICE': basic_result,
            'Linear MICE': linear_result,
            'Scaled MICE': scaled_result,
            'RandomForest MICE': rf_result,
            'Iteration Results': iter_results
        }

    def print_comparison(self):
        """
        Print MICE method comparison summary

        Shows self.performance_metrics as a table (one row per method) and
        names the method with the lowest usable RMSE; the "Best performing"
        line is skipped when no numeric RMSE is available.
        """
        print("\n" + "="*50)
        print("MICE COMPARISON SUMMARY")
        print("="*50)

        if self.performance_metrics:
            comparison_df = pd.DataFrame(self.performance_metrics).T
            print(comparison_df.round(3))

            # Find best method; drop missing/non-numeric RMSE so idxmin cannot fail
            if 'RMSE' in comparison_df.columns:
                rmse_scores = pd.to_numeric(comparison_df['RMSE'], errors='coerce').dropna()
                if not rmse_scores.empty:
                    best_method = str(rmse_scores.idxmin())
                    print(f"\nBest performing: {best_method.replace('_', ' ').title()}")

        print("\nMICE RECOMMENDATIONS:")
        print("- Basic MICE (BayesianRidge): Good balance of speed and accuracy")
        print("- Linear MICE: Fastest for large datasets")
        print("- Scaled MICE: Best for financial data with different scales")
        print("- RandomForest MICE: Best for non-linear relationships")
        print("- 10-15 iterations usually sufficient for convergence")

        print("\nADVANTAGES OF MICE:")
        print("✅ Captures relationships between variables")
        print("✅ Preserves variance and distributions")
        print("✅ Handles different variable types")
        # NOTE(review): the imputers above return a single imputation each;
        # uncertainty estimates would need repeated runs with
        # sample_posterior=True and different seeds - confirm wording.
        print("✅ Provides uncertainty estimates")


# Usage banner printed once the MICE helper class above has been defined
print(
    "=" * 50,
    "SKLEARN MICE IMPUTATION READY!",
    "=" * 50,
    "Multiple Imputation by Chained Equations with sklearn",
    "\nUSAGE:",
    "analyzer = SimpleSklearnMICEImputation(data_with_missing, original_data)",
    "results = analyzer.compare_all_mice_methods(max_iter=10)",
    sep="\n",
)

In [None]:
analyzer = SimpleSklearnMICEImputation(financial_data_missing, df)

# Run comprehensive analysis
results = analyzer.compare_all_mice_methods(max_iter=10)

# Access specific methods
scaled_mice = results['Scaled MICE']  # Usually best for financial data
print(pd.DataFrame(scaled_mice).head(10))

print("\nSample of imputed values from Scaled MICE (original NaN -> new value):")
# Locate the missing cells once; the lookup does not change between iterations.
# NOTE(review): assumes missing_mask (defined in an earlier cell) marks the NaN
# positions of financial_data_missing - confirm.
row, col = np.where(missing_mask)
for r, c in zip(row[:10], col[:10]):  # Show first 10 missing positions
    print(f"Position ({r}, {c}): NaN -> {scaled_mice[r, c]:.2f}")