# Stage 1 : Preprocessing

In [2]:
#cek column tiap data
# ===================================================================
# 🔍 DATASET COLUMN AUDITOR - Check All 11 Files Structure
# ===================================================================

import pandas as pd
import os
from collections import Counter

# 📁 Dataset filenames (sesuai dengan yang lu punya)
DATASET_FILES = [
    'AAPL_al.csv', 'AMZN_yf.csv', 'BAC_al.csv', 'BBCA_yf.csv',
    'GOOGL_yf.csv', 'JPM_al.csv', 'META_al.csv', 'MSFT_yf.csv',
    'NFLX_al.csv', 'NVDA_al.csv', 'TSLA_yf.csv'
]

def audit_all_datasets():
    """Comprehensive audit of all 11 datasets"""
    print("🔍 STARTING DATASET COLUMN AUDIT")
    print("="*80)

    audit_results = []
    all_columns = []
    file_status = {}

    for i, file_path in enumerate(DATASET_FILES, 1):
        print(f"\n[{i}/11] 📊 Auditing: {file_path}")
        print("-" * 50)

        try:
            # Load file
            df = pd.read_csv(file_path)
            ticker = file_path.split('_')[0]

            # Basic info
            print(f"✅ File loaded successfully")
            print(f"   Shape: {df.shape}")
            print(f"   Columns: {list(df.columns)}")
            print(f"   Data types: {dict(df.dtypes)}")

            # Sample data
            print(f"   First row sample:")
            for col in df.columns:
                sample_val = df[col].iloc[0] if len(df) > 0 else "N/A"
                print(f"     {col}: {sample_val}")

            # Missing values
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print(f"   ⚠️  Missing values: {dict(missing[missing > 0])}")
            else:
                print(f"   ✅ No missing values")

            # Date column detection
            date_cols = [col for col in df.columns if any(word in col.lower() for word in ['date', 'time', 'timestamp'])]
            print(f"   📅 Potential date columns: {date_cols}")

            # Store results
            audit_results.append({
                'File': file_path,
                'Ticker': ticker,
                'Shape': f"{df.shape[0]}x{df.shape[1]}",
                'Columns': list(df.columns),
                'Column_Count': len(df.columns),
                'Missing_Values': missing.sum(),
                'Date_Columns': date_cols,
                'Status': 'SUCCESS'
            })

            # Collect all unique columns
            all_columns.extend(df.columns.tolist())
            file_status[file_path] = 'SUCCESS'

        except Exception as e:
            print(f"❌ Error loading file: {str(e)}")
            audit_results.append({
                'File': file_path,
                'Ticker': file_path.split('_')[0],
                'Shape': 'ERROR',
                'Columns': [],
                'Column_Count': 0,
                'Missing_Values': 0,
                'Date_Columns': [],
                'Status': f'ERROR: {str(e)}'
            })
            file_status[file_path] = f'ERROR: {str(e)}'

    return audit_results, all_columns, file_status

def analyze_column_patterns(audit_results, all_columns):
    """Analyze column patterns across all files"""
    print(f"\n{'='*80}")
    print("📊 COLUMN PATTERN ANALYSIS")
    print("="*80)

    # Count column occurrences
    column_counts = Counter(all_columns)
    print(f"\n🔢 Column Frequency Across All Files:")
    print("-" * 40)
    for col, count in column_counts.most_common():
        percentage = (count / len(DATASET_FILES)) * 100
        status = "✅" if count == len(DATASET_FILES) else "⚠️" if count >= len(DATASET_FILES) * 0.7 else "❌"
        print(f"{status} {col:<20} : {count:>2}/11 files ({percentage:5.1f}%)")

    # Standard columns check
    standard_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    print(f"\n🎯 Standard Columns Availability:")
    print("-" * 40)

    for std_col in standard_cols:
        # Check variations
        variations = [col for col in column_counts.keys() if std_col.lower() in col.lower()]
        if variations:
            print(f"✅ {std_col} variants found: {variations}")
        else:
            print(f"❌ {std_col} NOT FOUND in any file")

    # Unique column sets
    print(f"\n📋 Unique Column Sets:")
    print("-" * 40)
    column_sets = {}
    for result in audit_results:
        if result['Status'] == 'SUCCESS':
            col_tuple = tuple(sorted(result['Columns']))
            if col_tuple not in column_sets:
                column_sets[col_tuple] = []
            column_sets[col_tuple].append(result['Ticker'])

    for i, (cols, tickers) in enumerate(column_sets.items(), 1):
        print(f"Set {i}: {tickers}")
        print(f"  Columns: {list(cols)}")
        print()

def generate_column_mapping_suggestions(audit_results):
    """Generate smart column mapping suggestions"""
    print(f"\n{'='*80}")
    print("🧠 SMART COLUMN MAPPING SUGGESTIONS")
    print("="*80)

    # Collect all unique columns
    all_unique_cols = set()
    for result in audit_results:
        if result['Status'] == 'SUCCESS':
            all_unique_cols.update(result['Columns'])

    # Smart mapping
    mapping_suggestions = {}

    for col in all_unique_cols:
        col_lower = col.lower().strip()

        # Date mapping
        if any(word in col_lower for word in ['date', 'time', 'timestamp', 'datetime']):
            mapping_suggestions[col] = 'Date'

        # Price mappings
        elif 'open' in col_lower:
            mapping_suggestions[col] = 'Open'
        elif 'high' in col_lower:
            mapping_suggestions[col] = 'High'
        elif 'low' in col_lower:
            mapping_suggestions[col] = 'Low'
        elif any(word in col_lower for word in ['close', 'adj close', 'adjusted', 'price']) and 'open' not in col_lower:
            mapping_suggestions[col] = 'Close'
        elif any(word in col_lower for word in ['volume', 'vol']):
            mapping_suggestions[col] = 'Volume'
        else:
            mapping_suggestions[col] = f'UNKNOWN_{col}'

    print("📝 Suggested Column Mappings:")
    print("-" * 50)
    for original, suggested in mapping_suggestions.items():
        status = "✅" if suggested != f'UNKNOWN_{original}' else "❓"
        print(f"{status} '{original}' → '{suggested}'")

    return mapping_suggestions

def create_summary_table(audit_results):
    """Create summary table"""
    print(f"\n{'='*80}")
    print("📋 DATASET SUMMARY TABLE")
    print("="*80)

    # Create DataFrame for better formatting
    summary_data = []
    for result in audit_results:
        summary_data.append({
            'Ticker': result['Ticker'],
            'Shape': result['Shape'],
            'Columns': result['Column_Count'],
            'Missing': result['Missing_Values'],
            'Status': result['Status'][:20] + '...' if len(result['Status']) > 20 else result['Status']
        })

    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))

    # Statistics
    success_count = len([r for r in audit_results if r['Status'] == 'SUCCESS'])
    print(f"\n📊 Summary Statistics:")
    print(f"   ✅ Successful files: {success_count}/11")
    print(f"   ❌ Failed files: {11 - success_count}/11")

    return summary_df

def generate_preprocessing_code(mapping_suggestions):
    """Generate custom preprocessing code based on audit"""
    print(f"\n{'='*80}")
    print("🛠️  CUSTOM PREPROCESSING CODE GENERATOR")
    print("="*80)

    print("Based on audit, here's your custom column mapping:")
    print()
    print("```python")
    print("def smart_column_mapping(df):")
    print('    """Custom column mapping based on audit results"""')
    print("    column_mapping = {}")
    print()

    for original, suggested in mapping_suggestions.items():
        if suggested.startswith('UNKNOWN_'):
            print(f"    # TODO: Handle '{original}' - couldn't auto-map")
        else:
            print(f"    if '{original}' in df.columns:")
            print(f"        column_mapping['{original}'] = '{suggested}'")

    print()
    print("    return column_mapping")
    print("```")

# 🚀 MAIN EXECUTION
def main_audit():
    """Main audit execution"""
    print("🔍 Starting comprehensive dataset audit...")

    # Step 1: Audit all files
    audit_results, all_columns, file_status = audit_all_datasets()

    # Step 2: Analyze patterns
    analyze_column_patterns(audit_results, all_columns)

    # Step 3: Generate mapping suggestions
    mapping_suggestions = generate_column_mapping_suggestions(audit_results)

    # Step 4: Create summary
    summary_df = create_summary_table(audit_results)

    # Step 5: Generate custom code
    generate_preprocessing_code(mapping_suggestions)

    print(f"\n🎉 AUDIT COMPLETED!")
    print(f"💡 Now you can run preprocessing with confidence!")

    return audit_results, mapping_suggestions, summary_df

# 🚀 RUN THE AUDIT
if __name__ == "__main__":
    audit_results, mapping_suggestions, summary = main_audit()

🔍 Starting comprehensive dataset audit...
🔍 STARTING DATASET COLUMN AUDIT

[1/11] 📊 Auditing: AAPL_al.csv
--------------------------------------------------
✅ File loaded successfully
   Shape: (6471, 6)
   Columns: ['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Volume']
   Data types: {'Unnamed: 0': dtype('O'), 'Open': dtype('float64'), 'High': dtype('float64'), 'Low': dtype('float64'), 'Close': dtype('float64'), 'Volume': dtype('float64')}
   First row sample:
     Unnamed: 0: 1999-11-01
     Open: 80.0
     High: 80.69
     Low: 77.37
     Close: 77.62
     Volume: 2487300.0
   ✅ No missing values
   📅 Potential date columns: []

[2/11] 📊 Auditing: AMZN_yf.csv
--------------------------------------------------
✅ File loaded successfully
   Shape: (2515, 7)
   Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']
   Data types: {'Date': dtype('O'), 'Open': dtype('O'), 'High': dtype('O'), 'Low': dtype('O'), 'Close': dtype('O'), 'Volume': dtype('O'), 'Ticker': dtype('O

In [3]:
# ===================================================================
# 🛠️ STAGE 1: CUSTOM PREPROCESSING - Based on Audit Results
# ===================================================================

import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# 📁 Dataset classification based on audit
ALPHA_VANTAGE_FILES = [
    'AAPL_al.csv', 'BAC_al.csv', 'JPM_al.csv',
    'META_al.csv', 'NFLX_al.csv', 'NVDA_al.csv'
]

YFINANCE_CORRUPTED_FILES = [
    'AMZN_yf.csv', 'GOOGL_yf.csv', 'MSFT_yf.csv', 'TSLA_yf.csv'
]

BBCA_FILES = ['BBCA_yf.csv']

def extract_ticker(filename):
    """Extract ticker from filename"""
    return filename.split('_')[0]

def process_alpha_vantage_file(file_path):
    """Process Alpha Vantage format files"""
    print(f"🔧 Processing Alpha Vantage: {file_path}")

    df = pd.read_csv(file_path)
    ticker = extract_ticker(file_path)

    # Rename 'Unnamed: 0' to 'Date'
    df = df.rename(columns={'Unnamed: 0': 'Date'})

    # Add ticker column
    df['Ticker'] = ticker

    # Standardize column order
    df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]

    # Convert date
    df['Date'] = pd.to_datetime(df['Date'])

    # Sort by date
    df = df.sort_values('Date').reset_index(drop=True)

    print(f"   ✅ Success: {len(df)} records, Date range: {df['Date'].min()} to {df['Date'].max()}")
    return df

def process_yfinance_corrupted_file(file_path):
    """Process corrupted YFinance files"""
    print(f"🔧 Processing Corrupted YFinance: {file_path}")

    # Read file - header is corrupted, data starts from row 1
    df = pd.read_csv(file_path)
    ticker = extract_ticker(file_path)

    print(f"   Original shape: {df.shape}")
    print(f"   First row sample: {df.iloc[0].to_dict()}")

    # Check if first row contains ticker data (corruption pattern)
    if df.iloc[0]['Open'] == ticker or str(df.iloc[0]['Open']).upper() == ticker:
        print(f"   🚨 Detected corruption: removing header row")
        # Remove first row (contains ticker symbols)
        df = df.iloc[1:].copy()
        df = df.reset_index(drop=True)

    # Handle missing/corrupted columns
    if 'Date' in df.columns:
        # Remove rows where Date is NaN or contains ticker symbol
        df = df[df['Date'].notna()]
        df = df[df['Date'] != ticker]
        df = df[~df['Date'].astype(str).str.upper().eq(ticker)]

    # Clean data types
    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    for col in numeric_cols:
        if col in df.columns:
            # Remove ticker symbols from numeric columns
            df[col] = df[col].astype(str).str.replace(ticker, '')
            df[col] = df[col].str.replace(ticker.upper(), '')
            # Convert to numeric
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove rows with all NaN numeric data
    df = df.dropna(subset=numeric_cols, how='all')

    # Add/fix ticker column
    df['Ticker'] = ticker

    # Standardize column order
    standard_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']
    df = df[standard_cols]

    # Convert date
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])

    # Sort by date
    df = df.sort_values('Date').reset_index(drop=True)

    print(f"   ✅ Cleaned: {len(df)} records, Date range: {df['Date'].min()} to {df['Date'].max()}")
    return df

def process_bbca_file(file_path):
    """Process BBCA format file"""
    print(f"🔧 Processing BBCA: {file_path}")

    df = pd.read_csv(file_path)
    ticker = extract_ticker(file_path)

    print(f"   Original shape: {df.shape}")
    print(f"   Original columns: {list(df.columns)}")
    print(f"   First row: {df.iloc[0].to_dict()}")

    # Check for corruption pattern
    if df.iloc[0]['Open'] == 'BBCA.JK' or 'BBCA' in str(df.iloc[0]['Open']):
        print(f"   🚨 Detected corruption: removing header row")
        df = df.iloc[1:].copy()
        df = df.reset_index(drop=True)

    # Handle Price column (appears to be Date)
    if 'Price' in df.columns:
        # Rename Price to Date if it contains date-like values
        sample_val = str(df['Price'].iloc[0]) if len(df) > 0 else ""
        if any(char in sample_val for char in ['-', '/', '20']):  # Date-like pattern
            df = df.rename(columns={'Price': 'Date'})

    # Clean numeric columns
    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    for col in numeric_cols:
        if col in df.columns:
            # Remove ticker symbols
            df[col] = df[col].astype(str).str.replace('BBCA.JK', '')
            df[col] = df[col].astype(str).str.replace('BBCA', '')
            # Convert to numeric
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove rows with all NaN
    df = df.dropna(subset=numeric_cols, how='all')

    # Add ticker
    df['Ticker'] = ticker

    # Standardize columns
    if 'Date' not in df.columns:
        # If still no Date column, create index-based dates (last resort)
        df['Date'] = pd.date_range(start='2014-01-01', periods=len(df), freq='D')
        print(f"   ⚠️ No date column found, created synthetic dates")

    # Reorder columns
    df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]

    # Convert date
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])

    # Sort by date
    df = df.sort_values('Date').reset_index(drop=True)

    print(f"   ✅ Processed: {len(df)} records")
    return df

def validate_processed_data(df, ticker):
    """Validate processed data quality"""
    issues = []

    # Check required columns
    required_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        issues.append(f"Missing columns: {missing_cols}")

    # Check data types
    if 'Date' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['Date']):
        issues.append("Date column not datetime")

    # Check for negative prices
    price_cols = ['Open', 'High', 'Low', 'Close']
    for col in price_cols:
        if col in df.columns and (df[col] < 0).any():
            issues.append(f"Negative values in {col}")

    # Check for missing data
    missing_data = df.isnull().sum()
    critical_missing = missing_data[missing_data > len(df) * 0.1]  # >10% missing
    if len(critical_missing) > 0:
        issues.append(f"High missing data: {dict(critical_missing)}")

    # Check date range
    if 'Date' in df.columns:
        date_range = df['Date'].max() - df['Date'].min()
        if date_range.days < 365:  # Less than 1 year
            issues.append(f"Short date range: {date_range.days} days")

    return issues

def save_processed_data(df, ticker, output_dir='processed_data'):
    """Save processed data"""
    os.makedirs(output_dir, exist_ok=True)

    filename = f"{output_dir}/{ticker}_processed.csv"
    df.to_csv(filename, index=False)

    print(f"   💾 Saved: {filename}")
    return filename

def generate_processing_report(results):
    """Generate comprehensive processing report"""
    print(f"\n{'='*80}")
    print("📋 STAGE 1 PROCESSING REPORT")
    print("="*80)

    successful = [r for r in results if r['status'] == 'SUCCESS']
    failed = [r for r in results if r['status'] != 'SUCCESS']

    print(f"✅ Successfully processed: {len(successful)}/11 files")
    print(f"❌ Failed: {len(failed)}/11 files")

    if successful:
        print(f"\n📊 Successful Files:")
        for result in successful:
            print(f"   ✅ {result['ticker']}: {result['records']} records ({result['date_range']})")

    if failed:
        print(f"\n❌ Failed Files:")
        for result in failed:
            print(f"   ❌ {result['ticker']}: {result['error']}")

    # Data quality issues
    quality_issues = []
    for result in successful:
        if result['issues']:
            quality_issues.extend([(result['ticker'], issue) for issue in result['issues']])

    if quality_issues:
        print(f"\n⚠️ Data Quality Issues:")
        for ticker, issue in quality_issues:
            print(f"   ⚠️ {ticker}: {issue}")

    return {'successful': len(successful), 'failed': len(failed), 'quality_issues': len(quality_issues)}

def main_preprocessing():
    """Main preprocessing function"""
    print("🚀 STARTING CUSTOM PREPROCESSING BASED ON AUDIT")
    print("="*80)

    results = []

    # Process Alpha Vantage files
    print(f"\n📂 Processing Alpha Vantage Files ({len(ALPHA_VANTAGE_FILES)} files)")
    for file_path in ALPHA_VANTAGE_FILES:
        try:
            df = process_alpha_vantage_file(file_path)
            ticker = extract_ticker(file_path)
            issues = validate_processed_data(df, ticker)
            save_processed_data(df, ticker)

            results.append({
                'ticker': ticker,
                'status': 'SUCCESS',
                'records': len(df),
                'date_range': f"{df['Date'].min().date()} to {df['Date'].max().date()}",
                'issues': issues
            })
        except Exception as e:
            results.append({
                'ticker': extract_ticker(file_path),
                'status': 'ERROR',
                'error': str(e),
                'records': 0,
                'date_range': 'N/A',
                'issues': []
            })
            print(f"   ❌ Error processing {file_path}: {str(e)}")

    # Process corrupted YFinance files
    print(f"\n📂 Processing Corrupted YFinance Files ({len(YFINANCE_CORRUPTED_FILES)} files)")
    for file_path in YFINANCE_CORRUPTED_FILES:
        try:
            df = process_yfinance_corrupted_file(file_path)
            ticker = extract_ticker(file_path)
            issues = validate_processed_data(df, ticker)
            save_processed_data(df, ticker)

            results.append({
                'ticker': ticker,
                'status': 'SUCCESS',
                'records': len(df),
                'date_range': f"{df['Date'].min().date()} to {df['Date'].max().date()}",
                'issues': issues
            })
        except Exception as e:
            results.append({
                'ticker': extract_ticker(file_path),
                'status': 'ERROR',
                'error': str(e),
                'records': 0,
                'date_range': 'N/A',
                'issues': []
            })
            print(f"   ❌ Error processing {file_path}: {str(e)}")

    # Process BBCA file
    print(f"\n📂 Processing BBCA File ({len(BBCA_FILES)} files)")
    for file_path in BBCA_FILES:
        try:
            df = process_bbca_file(file_path)
            ticker = extract_ticker(file_path)
            issues = validate_processed_data(df, ticker)
            save_processed_data(df, ticker)

            results.append({
                'ticker': ticker,
                'status': 'SUCCESS',
                'records': len(df),
                'date_range': f"{df['Date'].min().date()} to {df['Date'].max().date()}",
                'issues': issues
            })
        except Exception as e:
            results.append({
                'ticker': extract_ticker(file_path),
                'status': 'ERROR',
                'error': str(e),
                'records': 0,
                'date_range': 'N/A',
                'issues': []
            })
            print(f"   ❌ Error processing {file_path}: {str(e)}")

    # Generate report
    report = generate_processing_report(results)

    print(f"\n🎉 STAGE 1 PREPROCESSING COMPLETED!")
    print(f"📂 Clean data ready in: processed_data/ directory")
    print(f"🔜 Ready for STAGE 2: Feature Engineering")

    return results, report

# 🚀 RUN PREPROCESSING
if __name__ == "__main__":
    results, report = main_preprocessing()

🚀 STARTING CUSTOM PREPROCESSING BASED ON AUDIT

📂 Processing Alpha Vantage Files (6 files)
🔧 Processing Alpha Vantage: AAPL_al.csv
   ✅ Success: 6471 records, Date range: 1999-11-01 00:00:00 to 2025-07-24 00:00:00
   💾 Saved: processed_data/AAPL_processed.csv
🔧 Processing Alpha Vantage: BAC_al.csv
   ✅ Success: 6471 records, Date range: 1999-11-01 00:00:00 to 2025-07-24 00:00:00
   💾 Saved: processed_data/BAC_processed.csv
🔧 Processing Alpha Vantage: JPM_al.csv
   ✅ Success: 6471 records, Date range: 1999-11-01 00:00:00 to 2025-07-24 00:00:00
   💾 Saved: processed_data/JPM_processed.csv
🔧 Processing Alpha Vantage: META_al.csv
   ✅ Success: 3314 records, Date range: 2012-05-18 00:00:00 to 2025-07-24 00:00:00
   💾 Saved: processed_data/META_processed.csv
🔧 Processing Alpha Vantage: NFLX_al.csv
   ✅ Success: 5830 records, Date range: 2002-05-23 00:00:00 to 2025-07-24 00:00:00
   💾 Saved: processed_data/NFLX_processed.csv
🔧 Processing Alpha Vantage: NVDA_al.csv
   ✅ Success: 6471 records, 


# Stage 2 : Feature Engineering

In [4]:
!pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=8639749a9e3b1d6f88f7cc9bd328505902575a07e4db28393f1a3e1a4887946b
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [5]:
# ===================================================================
# 🧪 STAGE 2: FEATURE ENGINEERING - FIXED VERSION
# ===================================================================

import pandas as pd
import numpy as np
import ta
import os
import warnings
warnings.filterwarnings('ignore')

def load_processed_data(processed_dir='processed_data'):
    """Load all processed data from Stage 1"""
    print("📂 Loading processed data from Stage 1...")

    datasets = {}
    processed_files = [f for f in os.listdir(processed_dir) if f.endswith('_processed.csv')]

    for file in processed_files:
        ticker = file.replace('_processed.csv', '')
        file_path = os.path.join(processed_dir, file)

        try:
            df = pd.read_csv(file_path)
            df['Date'] = pd.to_datetime(df['Date'])
            df = df.sort_values('Date').reset_index(drop=True)
            datasets[ticker] = df
            print(f"   ✅ Loaded {ticker}: {len(df)} records")
        except Exception as e:
            print(f"   ❌ Error loading {ticker}: {str(e)}")

    print(f"📊 Total datasets loaded: {len(datasets)}")
    return datasets

def add_moving_averages(df):
    """Add Simple and Exponential Moving Averages"""
    print("   📈 Adding Moving Averages...")

    # Simple Moving Averages
    df['SMA_5'] = ta.trend.sma_indicator(df['Close'], window=5)
    df['SMA_10'] = ta.trend.sma_indicator(df['Close'], window=10)
    df['SMA_20'] = ta.trend.sma_indicator(df['Close'], window=20)
    df['SMA_50'] = ta.trend.sma_indicator(df['Close'], window=50)

    # Exponential Moving Averages
    df['EMA_5'] = ta.trend.ema_indicator(df['Close'], window=5)
    df['EMA_10'] = ta.trend.ema_indicator(df['Close'], window=10)
    df['EMA_20'] = ta.trend.ema_indicator(df['Close'], window=20)
    df['EMA_50'] = ta.trend.ema_indicator(df['Close'], window=50)

    # Moving Average Crossovers
    df['SMA_5_10_Cross'] = (df['SMA_5'] > df['SMA_10']).astype(int)
    df['EMA_5_20_Cross'] = (df['EMA_5'] > df['EMA_20']).astype(int)

    print(f"      ✅ Added 10 Moving Average indicators")
    return df

def add_rsi_indicators(df):
    """Add RSI (Relative Strength Index) indicators"""
    print("   📊 Adding RSI indicators...")

    # Standard RSI
    df['RSI_14'] = ta.momentum.rsi(df['Close'], window=14)
    df['RSI_7'] = ta.momentum.rsi(df['Close'], window=7)
    df['RSI_21'] = ta.momentum.rsi(df['Close'], window=21)

    # RSI signals
    df['RSI_Overbought'] = (df['RSI_14'] > 70).astype(int)
    df['RSI_Oversold'] = (df['RSI_14'] < 30).astype(int)
    df['RSI_Neutral'] = ((df['RSI_14'] >= 30) & (df['RSI_14'] <= 70)).astype(int)

    print(f"      ✅ Added 6 RSI indicators")
    return df

def add_macd_indicators(df):
    """Add MACD (Moving Average Convergence Divergence) indicators"""
    print("   📉 Adding MACD indicators...")

    # MACD components
    df['MACD'] = ta.trend.macd(df['Close'])
    df['MACD_Signal'] = ta.trend.macd_signal(df['Close'])
    df['MACD_Histogram'] = ta.trend.macd_diff(df['Close'])

    # MACD signals
    df['MACD_Bullish'] = (df['MACD'] > df['MACD_Signal']).astype(int)
    df['MACD_Bearish'] = (df['MACD'] < df['MACD_Signal']).astype(int)

    print(f"      ✅ Added 5 MACD indicators")
    return df

def add_bollinger_bands(df):
    """Add Bollinger Bands indicators"""
    print("   📏 Adding Bollinger Bands...")

    # Bollinger Bands
    df['BB_High'] = ta.volatility.bollinger_hband(df['Close'])
    df['BB_Low'] = ta.volatility.bollinger_lband(df['Close'])
    df['BB_Mid'] = ta.volatility.bollinger_mavg(df['Close'])
    df['BB_Width'] = df['BB_High'] - df['BB_Low']
    df['BB_Position'] = (df['Close'] - df['BB_Low']) / (df['BB_High'] - df['BB_Low'])

    # Bollinger Bands signals
    df['BB_Squeeze'] = (df['BB_Width'] < df['BB_Width'].rolling(20).mean()).astype(int)
    df['BB_Upper_Break'] = (df['Close'] > df['BB_High']).astype(int)
    df['BB_Lower_Break'] = (df['Close'] < df['BB_Low']).astype(int)

    print(f"      ✅ Added 8 Bollinger Bands indicators")
    return df

def add_volume_indicators(df):
    """Add Volume-based indicators - FIXED VERSION"""
    print("   📊 Adding Volume indicators...")

    # Custom Volume Moving Averages (since ta.volume.volume_sma doesn't exist)
    df['Volume_MA_10'] = df['Volume'].rolling(window=10).mean()
    df['Volume_MA_20'] = df['Volume'].rolling(window=20).mean()

    # Volume Weighted Average Price (manual calculation)
    df['VWAP'] = (df['Close'] * df['Volume']).rolling(20).sum() / df['Volume'].rolling(20).sum()

    # On Balance Volume
    df['OBV'] = ta.volume.on_balance_volume(df['Close'], df['Volume'])

    # Volume Price Trend
    df['VPT'] = ta.volume.volume_price_trend(df['Close'], df['Volume'])

    # Accumulation/Distribution Line
    df['ADL'] = ta.volume.acc_dist_index(df['High'], df['Low'], df['Close'], df['Volume'])

    # Volume signals
    df['High_Volume'] = (df['Volume'] > df['Volume_MA_20'] * 1.5).astype(int)
    df['Low_Volume'] = (df['Volume'] < df['Volume_MA_20'] * 0.5).astype(int)

    print(f"      ✅ Added 8 Volume indicators")
    return df

def add_momentum_indicators(df):
    """Add Momentum indicators"""
    print("   🚀 Adding Momentum indicators...")

    # Rate of Change
    df['ROC_5'] = ta.momentum.roc(df['Close'], window=5)
    df['ROC_10'] = ta.momentum.roc(df['Close'], window=10)
    df['ROC_20'] = ta.momentum.roc(df['Close'], window=20)

    # Stochastic Oscillator
    df['Stoch_K'] = ta.momentum.stoch(df['High'], df['Low'], df['Close'])
    df['Stoch_D'] = ta.momentum.stoch_signal(df['High'], df['Low'], df['Close'])

    # Williams %R
    df['Williams_R'] = ta.momentum.williams_r(df['High'], df['Low'], df['Close'])

    # Momentum signals
    df['Strong_Momentum'] = (df['ROC_10'] > 2).astype(int)
    df['Weak_Momentum'] = (df['ROC_10'] < -2).astype(int)

    print(f"      ✅ Added 8 Momentum indicators")
    return df

def add_volatility_indicators(df):
    """Add Volatility indicators"""
    print("   📊 Adding Volatility indicators...")

    # Average True Range
    df['ATR'] = ta.volatility.average_true_range(df['High'], df['Low'], df['Close'])

    # Volatility (Rolling Standard Deviation)
    df['Volatility_10'] = df['Close'].rolling(window=10).std()
    df['Volatility_20'] = df['Close'].rolling(window=20).std()

    # Price Range indicators
    df['Daily_Range'] = df['High'] - df['Low']
    df['Daily_Range_Pct'] = (df['Daily_Range'] / df['Close']) * 100

    # Volatility signals
    df['High_Volatility'] = (df['Volatility_20'] > df['Volatility_20'].rolling(50).mean() * 1.2).astype(int)
    df['Low_Volatility'] = (df['Volatility_20'] < df['Volatility_20'].rolling(50).mean() * 0.8).astype(int)

    print(f"      ✅ Added 7 Volatility indicators")
    return df

def add_price_patterns(df):
    """Add Price Pattern indicators"""
    print("   📈 Adding Price Pattern indicators...")

    # Daily Returns
    df['Daily_Return'] = df['Close'].pct_change()
    df['Daily_Return_Abs'] = abs(df['Daily_Return'])

    # Gap indicators
    df['Gap_Up'] = (df['Open'] > df['Close'].shift(1)).astype(int)
    df['Gap_Down'] = (df['Open'] < df['Close'].shift(1)).astype(int)

    # Doji pattern (Open ≈ Close)
    df['Doji'] = (abs(df['Open'] - df['Close']) / df['Close'] < 0.01).astype(int)

    # Hammer/Shooting Star patterns
    body_size = abs(df['Close'] - df['Open'])
    upper_shadow = df['High'] - np.maximum(df['Open'], df['Close'])
    lower_shadow = np.minimum(df['Open'], df['Close']) - df['Low']

    df['Hammer'] = ((lower_shadow > 2 * body_size) & (upper_shadow < body_size)).astype(int)
    df['Shooting_Star'] = ((upper_shadow > 2 * body_size) & (lower_shadow < body_size)).astype(int)

    # Support/Resistance levels
    df['Near_High_52w'] = (df['Close'] > df['Close'].rolling(252).max() * 0.95).astype(int)
    df['Near_Low_52w'] = (df['Close'] < df['Close'].rolling(252).min() * 1.05).astype(int)

    print(f"      ✅ Added 10 Price Pattern indicators")
    return df

def add_advanced_indicators(df):
    """Add Advanced technical indicators"""
    print("   🎯 Adding Advanced indicators...")

    # Fibonacci Retracement levels
    high_252 = df['High'].rolling(252).max()
    low_252 = df['Low'].rolling(252).min()
    fib_range = high_252 - low_252

    df['Fib_23_6'] = high_252 - (fib_range * 0.236)
    df['Fib_38_2'] = high_252 - (fib_range * 0.382)
    df['Fib_61_8'] = high_252 - (fib_range * 0.618)

    # Ichimoku Cloud components
    high_9 = df['High'].rolling(9).max()
    low_9 = df['Low'].rolling(9).min()
    high_26 = df['High'].rolling(26).max()
    low_26 = df['Low'].rolling(26).min()

    df['Tenkan_Sen'] = (high_9 + low_9) / 2
    df['Kijun_Sen'] = (high_26 + low_26) / 2
    df['Senkou_A'] = ((df['Tenkan_Sen'] + df['Kijun_Sen']) / 2).shift(26)

    # Commodity Channel Index
    df['CCI'] = ta.trend.cci(df['High'], df['Low'], df['Close'])

    print(f"      ✅ Added 9 Advanced indicators")
    return df

def create_target_labels(df):
    """Create target labels for Random Forest (Stage 4)"""
    print("   🎯 Creating target labels...")

    # Next day price changes
    df['Next_Open'] = df['Open'].shift(-1)
    df['Next_Close'] = df['Close'].shift(-1)
    df['Next_High'] = df['High'].shift(-1)
    df['Next_Low'] = df['Low'].shift(-1)

    # Price change percentages
    df['Price_Change_Pct'] = ((df['Next_Close'] - df['Close']) / df['Close']) * 100

    # Trend labels (sesuai arsitektur.md)
    def classify_trend(price_change):
        if pd.isna(price_change):
            return 'UNKNOWN'
        elif price_change > 1.0:  # >1% increase
            return 'UP'
        elif price_change < -1.0:  # >1% decrease
            return 'DOWN'
        else:
            return 'STAY'

    df['Trend_Label'] = df['Price_Change_Pct'].apply(classify_trend)

    # 5-day ahead targets (for LSTM)
    for i in range(1, 6):  # 1 to 5 days ahead
        df[f'Target_Open_Day{i}'] = df['Open'].shift(-i)
        df[f'Target_Close_Day{i}'] = df['Close'].shift(-i)

    print(f"      ✅ Added target labels for prediction")
    return df

def validate_features(df, ticker):
    """Validate feature engineering results"""
    print(f"   🔍 Validating features for {ticker}...")

    issues = []

    # Check for infinite values
    inf_cols = df.columns[df.isin([np.inf, -np.inf]).any()].tolist()
    if inf_cols:
        issues.append(f"Infinite values in: {inf_cols}")
        # Replace inf with NaN
        df = df.replace([np.inf, -np.inf], np.nan)

    # Check percentage of missing values
    missing_pct = (df.isnull().sum() / len(df)) * 100
    high_missing = missing_pct[missing_pct > 50].index.tolist()
    if high_missing:
        issues.append(f"High missing data (>50%): {high_missing}")

    # Check feature count
    feature_cols = [col for col in df.columns if col not in ['Date', 'Ticker']]
    print(f"      📊 Total features created: {len(feature_cols)}")

    # Check data range
    if len(df) < 100:
        issues.append(f"Insufficient data: {len(df)} records")

    if issues:
        print(f"      ⚠️ Validation issues: {issues}")
    else:
        print(f"      ✅ Validation passed")

    return df, issues

def save_feature_engineered_data(df, ticker, output_dir='feature_engineered_data'):
    """Save feature engineered data"""
    os.makedirs(output_dir, exist_ok=True)

    filename = f"{output_dir}/{ticker}_fe.csv"
    df.to_csv(filename, index=False)

    print(f"   💾 Saved: {filename}")
    return filename

def generate_feature_report(processed_results):
    """Generate comprehensive feature engineering report - FIXED VERSION"""
    print(f"\n{'='*80}")
    print("📋 STAGE 2 FEATURE ENGINEERING REPORT")
    print("="*80)

    successful = [r for r in processed_results if r['status'] == 'SUCCESS']
    failed = [r for r in processed_results if r['status'] != 'SUCCESS']

    print(f"✅ Successfully processed: {len(successful)}/11 files")
    print(f"❌ Failed: {len(failed)}/11 files")

    # Initialize total_features
    total_features = 0

    if successful:
        print(f"\n📊 Feature Engineering Summary:")
        total_features = successful[0]['feature_count'] if successful else 0
        for result in successful:
            print(f"   ✅ {result['ticker']}: {result['feature_count']} features, {result['records']} records")

        print(f"\n🎯 Average features per dataset: {total_features}")

        # Quality issues summary
        all_issues = []
        for result in successful:
            all_issues.extend(result['issues'])

        if all_issues:
            print(f"\n⚠️ Data Quality Issues ({len(all_issues)} total):")
            issue_counts = {}
            for issue in all_issues:
                issue_type = issue.split(':')[0]
                issue_counts[issue_type] = issue_counts.get(issue_type, 0) + 1

            for issue_type, count in issue_counts.items():
                print(f"   ⚠️ {issue_type}: {count} occurrences")

    if failed:
        print(f"\n❌ Failed Files:")
        for result in failed:
            print(f"   ❌ {result['ticker']}: {result['error']}")

    return {'successful': len(successful), 'failed': len(failed), 'avg_features': total_features}

def main_feature_engineering():
    """Main feature engineering execution"""
    print("🧪 STARTING STAGE 2: FEATURE ENGINEERING - FIXED VERSION")
    print("="*80)

    # Load processed data from Stage 1
    datasets = load_processed_data()

    if not datasets:
        print("❌ No processed data found! Run Stage 1 first.")
        return None, None

    results = []

    for ticker, df in datasets.items():
        print(f"\n{'='*60}")
        print(f"🔬 Feature Engineering: {ticker}")
        print(f"{'='*60}")
        print(f"📊 Input: {len(df)} records, {len(df.columns)} columns")

        try:
            # Apply all feature engineering functions
            df = add_moving_averages(df)
            df = add_rsi_indicators(df)
            df = add_macd_indicators(df)
            df = add_bollinger_bands(df)
            df = add_volume_indicators(df)  # FIXED VERSION
            df = add_momentum_indicators(df)
            df = add_volatility_indicators(df)
            df = add_price_patterns(df)
            df = add_advanced_indicators(df)
            df = create_target_labels(df)

            # Validate results
            df, issues = validate_features(df, ticker)

            # Save feature engineered data
            saved_file = save_feature_engineered_data(df, ticker)

            feature_count = len([col for col in df.columns if col not in ['Date', 'Ticker']])

            results.append({
                'ticker': ticker,
                'status': 'SUCCESS',
                'records': len(df),
                'feature_count': feature_count,
                'issues': issues,
                'saved_file': saved_file
            })

            print(f"✅ {ticker} feature engineering completed!")
            print(f"   📊 Output: {len(df)} records, {feature_count} features")

        except Exception as e:
            results.append({
                'ticker': ticker,
                'status': 'ERROR',
                'error': str(e),
                'records': 0,
                'feature_count': 0,
                'issues': [],
                'saved_file': None
            })
            print(f"❌ Error processing {ticker}: {str(e)}")

    # Generate comprehensive report
    report = generate_feature_report(results)

    print(f"\n🎉 STAGE 2 FEATURE ENGINEERING COMPLETED!")
    print(f"📂 Feature engineered data ready in: feature_engineered_data/ directory")
    print(f"🔜 Ready for STAGE 3: LSTM Model Training")

    return results, report

# 🚀 RUN FEATURE ENGINEERING
if __name__ == "__main__":
    results, report = main_feature_engineering()

print("\n🎯 STAGE 2 EXECUTION COMPLETED!")
print("📝 Next: STAGE 3 - LSTM Model Training")

🧪 STARTING STAGE 2: FEATURE ENGINEERING - FIXED VERSION
📂 Loading processed data from Stage 1...
   ✅ Loaded NFLX: 5830 records
   ✅ Loaded AMZN: 2514 records
   ✅ Loaded JPM: 6471 records
   ✅ Loaded BAC: 6471 records
   ✅ Loaded AAPL: 6471 records
   ✅ Loaded GOOGL: 2514 records
   ✅ Loaded BBCA: 2592 records
   ✅ Loaded MSFT: 2514 records
   ✅ Loaded NVDA: 6471 records
   ✅ Loaded META: 3314 records
   ✅ Loaded TSLA: 2514 records
📊 Total datasets loaded: 11

🔬 Feature Engineering: NFLX
📊 Input: 5830 records, 7 columns
   📈 Adding Moving Averages...
      ✅ Added 10 Moving Average indicators
   📊 Adding RSI indicators...
      ✅ Added 6 RSI indicators
   📉 Adding MACD indicators...
      ✅ Added 5 MACD indicators
   📏 Adding Bollinger Bands...
      ✅ Added 8 Bollinger Bands indicators
   📊 Adding Volume indicators...
      ✅ Added 8 Volume indicators
   🚀 Adding Momentum indicators...
      ✅ Added 8 Momentum indicators
   📊 Adding Volatility indicators...
      ✅ Added 7 Volatility

# Stage 3 : Training Data Model LSTM

In [1]:
# ===================================================================
# 🧠 STAGE 3: GPU-OPTIMIZED UNIFIED LSTM TRAINING (T4 15GB)
# ===================================================================

import pandas as pd
import numpy as np
import os
import gc
import warnings
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# TensorFlow with GPU optimization
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.mixed_precision import set_global_policy

warnings.filterwarnings('ignore')

def setup_gpu_optimization():
    """Setup GPU optimization for T4"""
    print("🚀 Setting up GPU optimization for T4...")

    # Enable mixed precision for better performance
    set_global_policy('mixed_float16')

    # Configure GPU memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"   ✅ Found {len(gpus)} GPU(s), memory growth enabled")
        except RuntimeError as e:
            print(f"   ⚠️ GPU setup error: {e}")
    else:
        print("   ⚠️ No GPU found, using CPU")

    # Clear session and force garbage collection
    tf.keras.backend.clear_session()
    gc.collect()

    return gpus is not None and len(gpus) > 0

def load_optimized_datasets(fe_dir='feature_engineered_data', sample_ratio=0.8):
    """Load datasets with memory optimization"""
    print(f"📂 Loading datasets with optimization (sample_ratio={sample_ratio})...")

    fe_files = [f for f in os.listdir(fe_dir) if f.endswith('_fe.csv')]
    datasets = []

    for file in fe_files:
        ticker = file.replace('_fe.csv', '')
        file_path = os.path.join(fe_dir, file)

        try:
            # Load with optimized dtypes
            df = pd.read_csv(file_path, low_memory=False)
            df['Date'] = pd.to_datetime(df['Date'])
            df['Ticker'] = ticker

            # Sample data to reduce memory usage
            if sample_ratio < 1.0:
                sample_size = int(len(df) * sample_ratio)
                df = df.tail(sample_size)  # Take most recent data

            # Optimize dtypes to save memory
            for col in df.select_dtypes(include=['float64']).columns:
                df[col] = pd.to_numeric(df[col], downcast='float')

            for col in df.select_dtypes(include=['int64']).columns:
                df[col] = pd.to_numeric(df[col], downcast='integer')

            datasets.append(df)
            print(f"   ✅ {ticker}: {len(df)} records, {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        except Exception as e:
            print(f"   ❌ Error loading {ticker}: {str(e)}")

    # Combine datasets
    combined_df = pd.concat(datasets, ignore_index=True)
    combined_df = combined_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)

    # Final memory optimization
    gc.collect()

    print(f"✅ Combined dataset: {len(combined_df)} records, {combined_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    return combined_df

def select_essential_features(df):
    """Select only essential features to reduce memory"""
    print("🎯 Selecting essential features for GPU training...")

    # Core essential features (most predictive)
    essential_features = [
        # Price data
        'Open', 'High', 'Low', 'Close', 'Volume',

        # Key moving averages
        'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20',

        # Critical momentum indicators
        'RSI_14', 'MACD', 'MACD_Signal',

        # Bollinger Bands
        'BB_Position', 'BB_Width',

        # Volume indicators
        'Volume_MA_10', 'OBV',

        # Volatility
        'ATR', 'Volatility_20',

        # Price patterns
        'Daily_Return', 'ROC_10'
    ]

    # Check which features exist
    existing_features = [col for col in essential_features if col in df.columns]
    print(f"   ✅ Selected {len(existing_features)} essential features")
    print(f"   📋 Features: {existing_features}")

    return existing_features

def create_memory_efficient_sequences(df, features, sequence_length=60, batch_size=1000):
    """Create sequences in batches to manage memory"""
    print(f"📦 Creating sequences in batches (batch_size={batch_size})...")

    # Encode ticker
    label_encoder = LabelEncoder()
    df['Ticker_Encoded'] = label_encoder.fit_transform(df['Ticker'])

    final_features = features + ['Ticker_Encoded']

    # Handle missing values efficiently
    df[final_features] = df[final_features].fillna(method='ffill').fillna(method='bfill')

    # Create target columns if not exist
    target_cols = []
    for day in range(1, 6):
        open_col = f'Target_Open_Day{day}'
        close_col = f'Target_Close_Day{day}'

        if open_col not in df.columns:
            df[open_col] = df.groupby('Ticker')['Open'].shift(-day)
        if close_col not in df.columns:
            df[close_col] = df.groupby('Ticker')['Close'].shift(-day)

        target_cols.extend([open_col, close_col])

    # Create sequences by ticker to maintain chronological order
    all_X = []
    all_y = []

    for ticker in df['Ticker'].unique():
        ticker_data = df[df['Ticker'] == ticker].copy()
        ticker_data = ticker_data.sort_values('Date').reset_index(drop=True)

        # Create sequences for this ticker
        ticker_X = []
        ticker_y = []

        for i in range(sequence_length, len(ticker_data) - 5):
            # Input sequence
            x_seq = ticker_data[final_features].iloc[i-sequence_length:i].values

            # Target
            y_seq = ticker_data[target_cols].iloc[i].values

            if not np.isnan(y_seq).any():
                ticker_X.append(x_seq)
                ticker_y.append(y_seq)

        if ticker_X:
            all_X.extend(ticker_X)
            all_y.extend(ticker_y)

        print(f"   {ticker}: {len(ticker_X)} sequences")

    X = np.array(all_X, dtype=np.float32)  # Use float32 to save memory
    y = np.array(all_y, dtype=np.float32)

    print(f"   ✅ Total sequences: {len(X)}")
    print(f"   📊 X shape: {X.shape}, Y shape: {y.shape}")
    print(f"   💾 Memory usage: {X.nbytes / 1024**2:.1f} MB (X) + {y.nbytes / 1024**2:.1f} MB (y)")

    return X, y, label_encoder, final_features, target_cols

def build_gpu_optimized_model(input_shape, output_dim=10):
    """Build GPU-optimized LSTM model"""
    print(f"🏗️ Building GPU-optimized LSTM model...")

    model = Sequential([
        # Optimized LSTM layers for T4
        LSTM(96, return_sequences=True, input_shape=input_shape,
             dtype='float32', recurrent_dropout=0.1),
        Dropout(0.3),

        LSTM(48, return_sequences=False, dtype='float32', recurrent_dropout=0.1),
        Dropout(0.2),

        # Dense layers
        Dense(24, activation='relu', dtype='float32'),
        Dropout(0.2),
        Dense(output_dim, activation='linear', dtype='float32')
    ])

    # Optimizer with mixed precision
    optimizer = Adam(learning_rate=0.001)

    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae']
    )

    print(f"   ✅ Model built: {model.count_params():,} parameters")
    return model

def train_with_memory_management(model, X, y, validation_split=0.2, epochs=30, batch_size=32):
    """Train model with memory management"""
    print(f"🎯 Training with memory management...")

    # Split data
    split_idx = int(len(X) * (1 - validation_split))
    X_train, X_val = X[:split_idx], X[split_idx:]
    y_train, y_val = y[:split_idx], y[split_idx:]

    print(f"   📊 Train: {len(X_train)}, Val: {len(X_val)}")

    # Callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, min_lr=1e-6)
    ]

    # Train with generator to save memory
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1,
        shuffle=False
    )

    # Clean up
    del X_train, X_val, y_train, y_val
    gc.collect()

    return history

def evaluate_and_save_model(model, X_test, y_test, feature_scaler, target_scaler,
                           ticker_encoder, feature_cols, target_cols):
    """Evaluate and save the model"""
    print(f"📊 Evaluating model...")

    # Predictions
    y_pred_scaled = model.predict(X_test, batch_size=64)

    # Inverse transform
    y_pred = target_scaler.inverse_transform(y_pred_scaled)
    y_actual = target_scaler.inverse_transform(y_test)

    # Metrics
    mse = mean_squared_error(y_actual, y_pred)
    mae = mean_absolute_error(y_actual, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_actual, y_pred)

    print(f"   📈 Performance:")
    print(f"      RMSE: {rmse:.4f}")
    print(f"      MAE: {mae:.4f}")
    print(f"      R²: {r2:.4f}")

    # Save model
    print(f"💾 Saving model...")
    os.makedirs('unified_lstm_model', exist_ok=True)

    model.save('unified_lstm_model/unified_lstm_model.h5')

    with open('unified_lstm_model/feature_scaler.pkl', 'wb') as f:
        pickle.dump(feature_scaler, f)

    with open('unified_lstm_model/target_scaler.pkl', 'wb') as f:
        pickle.dump(target_scaler, f)

    with open('unified_lstm_model/ticker_encoder.pkl', 'wb') as f:
        pickle.dump(ticker_encoder, f)

    model_info = {
        'feature_columns': feature_cols,
        'target_columns': target_cols,
        'performance': {'rmse': rmse, 'mae': mae, 'r2': r2}
    }

    with open('unified_lstm_model/model_info.pkl', 'wb') as f:
        pickle.dump(model_info, f)

    print(f"   ✅ Model saved to: unified_lstm_model/")

    return {
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'predictions': y_pred,
        'actual': y_actual
    }

def main_gpu_optimized_training():
    """Main GPU-optimized training function"""
    print("🚀 STARTING GPU-OPTIMIZED UNIFIED LSTM TRAINING")
    print("="*70)

    try:
        # Setup GPU
        gpu_available = setup_gpu_optimization()

        # Load data
        combined_df = load_optimized_datasets(sample_ratio=0.9)  # Use 90% of data

        # Select features
        features = select_essential_features(combined_df)

        # Create sequences
        X, y, ticker_encoder, feature_cols, target_cols = create_memory_efficient_sequences(
            combined_df, features, sequence_length=60
        )

        # Scale data
        print("📏 Scaling data...")

        # Reshape for scaling
        original_shape = X.shape
        X_reshaped = X.reshape(-1, X.shape[-1])

        feature_scaler = MinMaxScaler()
        X_scaled_reshaped = feature_scaler.fit_transform(X_reshaped)
        X_scaled = X_scaled_reshaped.reshape(original_shape).astype(np.float32)

        target_scaler = MinMaxScaler()
        y_scaled = target_scaler.fit_transform(y).astype(np.float32)

        # Clean up original data
        del X, y, X_reshaped, X_scaled_reshaped
        gc.collect()

        # Split data
        test_size = 0.15
        split_idx = int(len(X_scaled) * (1 - test_size))

        X_train_val = X_scaled[:split_idx]
        X_test = X_scaled[split_idx:]
        y_train_val = y_scaled[:split_idx]
        y_test = y_scaled[split_idx:]

        print(f"📊 Data split: Train+Val: {len(X_train_val)}, Test: {len(X_test)}")

        # Build model
        model = build_gpu_optimized_model(
            input_shape=(X_train_val.shape[1], X_train_val.shape[2]),
            output_dim=len(target_cols)
        )

        # Train model
        history = train_with_memory_management(
            model, X_train_val, y_train_val,
            validation_split=0.2, epochs=25, batch_size=64
        )

        # Evaluate and save
        results = evaluate_and_save_model(
            model, X_test, y_test, feature_scaler, target_scaler,
            ticker_encoder, feature_cols, target_cols
        )

        print(f"\n🎉 TRAINING COMPLETED SUCCESSFULLY!")
        print(f"📊 Final Performance: RMSE={results['rmse']:.4f}, R²={results['r2']:.4f}")
        print(f"🔜 Ready for STAGE 4: Random Forest Training")

        return {
            'status': 'SUCCESS',
            'performance': results,
            'gpu_used': gpu_available
        }

    except Exception as e:
        print(f"\n❌ TRAINING FAILED: {str(e)}")
        import traceback
        traceback.print_exc()

        return {'status': 'ERROR', 'error': str(e)}

# 🚀 RUN GPU-OPTIMIZED TRAINING
if __name__ == "__main__":
    result = main_gpu_optimized_training()

print("\n✅ STAGE 3 EXECUTION COMPLETED!")

🚀 STARTING GPU-OPTIMIZED UNIFIED LSTM TRAINING
🚀 Setting up GPU optimization for T4...
   ✅ Found 1 GPU(s), memory growth enabled
📂 Loading datasets with optimization (sample_ratio=0.9)...
   ✅ NFLX: 5247 records, 2.2 MB
   ✅ TSLA: 2262 records, 0.9 MB
   ✅ AAPL: 5823 records, 2.4 MB
   ✅ GOOGL: 2262 records, 0.9 MB
   ✅ META: 2982 records, 1.2 MB
   ✅ BBCA: 2332 records, 1.0 MB
   ✅ NVDA: 5823 records, 2.4 MB
   ✅ MSFT: 2262 records, 0.9 MB
   ✅ BAC: 5823 records, 2.4 MB
   ✅ JPM: 5823 records, 2.4 MB
   ✅ AMZN: 2262 records, 0.9 MB
✅ Combined dataset: 42901 records, 17.8 MB
🎯 Selecting essential features for GPU training...
   ✅ Selected 20 essential features
   📋 Features: ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Position', 'BB_Width', 'Volume_MA_10', 'OBV', 'ATR', 'Volatility_20', 'Daily_Return', 'ROC_10']
📦 Creating sequences in batches (batch_size=1000)...
   AAPL: 5758 sequences
   AMZN: 2197 sequence



   📈 Performance:
      RMSE: 127.3650
      MAE: 95.9342
      R²: 0.4464
💾 Saving model...
   ✅ Model saved to: unified_lstm_model/

🎉 TRAINING COMPLETED SUCCESSFULLY!
📊 Final Performance: RMSE=127.3650, R²=0.4464
🔜 Ready for STAGE 4: Random Forest Training

✅ STAGE 3 EXECUTION COMPLETED!


# Stage 4 RF training

## Versi Tensoflow

In [5]:
# ===================================================================
# 🌲 STAGE 4: FIXED RANDOM FOREST - Versi Tensoflow
# ===================================================================

import pandas as pd
import numpy as np
import os
import warnings
import pickle
import gc
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# TensorFlow with compatibility fixes
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError, MeanAbsolutePercentageError

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def setup_tensorflow_compatibility():
    """Setup TensorFlow compatibility for model loading"""
    print("🔧 Setting up TensorFlow compatibility...")

    # Clear session
    tf.keras.backend.clear_session()

    # GPU setup
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"   ✅ GPU memory growth enabled")
        except RuntimeError as e:
            print(f"   ⚠️ GPU setup warning: {e}")

    gc.collect()

def load_lstm_model_with_custom_objects():
    """Load LSTM model with custom objects for compatibility"""
    print("🧠 Loading LSTM model with compatibility fixes...")

    model_dir = 'unified_lstm_model'
    lstm_model_path = f'{model_dir}/unified_lstm_model.h5'

    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"LSTM model directory not found: {model_dir}")

    if not os.path.exists(lstm_model_path):
        raise FileNotFoundError(f"LSTM model file not found: {lstm_model_path}")

    try:
        # Define custom objects for compatibility
        custom_objects = {
            'mse': 'mean_squared_error',
            'mae': 'mean_absolute_error',
            'mape': 'mean_absolute_percentage_error',
            'MeanSquaredError': MeanSquaredError,
            'MeanAbsoluteError': MeanAbsoluteError,
            'MeanAbsolutePercentageError': MeanAbsolutePercentageError
        }

        # Try loading with custom objects
        print("   🔄 Loading model with custom objects...")
        lstm_model = load_model(lstm_model_path, custom_objects=custom_objects)
        print(f"   ✅ LSTM model loaded successfully with custom objects")

    except Exception as e1:
        print(f"   ⚠️ Custom objects failed: {str(e1)}")

        try:
            # Try loading with compile=False
            print("   🔄 Loading model without compilation...")
            lstm_model = load_model(lstm_model_path, compile=False)

            # Recompile manually
            from tensorflow.keras.optimizers import Adam
            lstm_model.compile(
                optimizer=Adam(learning_rate=0.001),
                loss='mse',
                metrics=['mae']
            )
            print(f"   ✅ LSTM model loaded and recompiled successfully")

        except Exception as e2:
            raise Exception(f"Failed to load LSTM model. Tried custom objects: {str(e1)}. Tried compile=False: {str(e2)}")

    return lstm_model

def load_lstm_components_fixed():
    """Load LSTM components with fixes"""
    print("📦 Loading LSTM components with compatibility fixes...")

    model_dir = 'unified_lstm_model'

    try:
        # Load LSTM model with fixes
        lstm_model = load_lstm_model_with_custom_objects()

        # Load other components (these should work fine)
        with open(f'{model_dir}/feature_scaler.pkl', 'rb') as f:
            feature_scaler = pickle.load(f)
        print(f"   ✅ Feature scaler loaded")

        with open(f'{model_dir}/target_scaler.pkl', 'rb') as f:
            target_scaler = pickle.load(f)
        print(f"   ✅ Target scaler loaded")

        with open(f'{model_dir}/ticker_encoder.pkl', 'rb') as f:
            ticker_encoder = pickle.load(f)
        print(f"   ✅ Ticker encoder loaded")

        with open(f'{model_dir}/model_info.pkl', 'rb') as f:
            model_info = pickle.load(f)
        print(f"   ✅ Model info loaded")

        print(f"   📊 Feature columns: {len(model_info['feature_columns'])}")
        print(f"   📊 Target columns: {len(model_info['target_columns'])}")

        return {
            'lstm_model': lstm_model,
            'feature_scaler': feature_scaler,
            'target_scaler': target_scaler,
            'ticker_encoder': ticker_encoder,
            'model_info': model_info
        }

    except Exception as e:
        raise Exception(f"Failed to load LSTM components: {str(e)}")

def load_feature_data_optimized():
    """Load feature data optimized for processing"""
    print("📂 Loading feature engineered data...")

    fe_dir = 'feature_engineered_data'
    if not os.path.exists(fe_dir):
        raise FileNotFoundError(f"Feature directory not found: {fe_dir}")

    all_datasets = []
    fe_files = [f for f in os.listdir(fe_dir) if f.endswith('_fe.csv')]

    for file in fe_files:
        ticker = file.replace('_fe.csv', '')
        file_path = os.path.join(fe_dir, file)

        try:
            df = pd.read_csv(file_path, low_memory=False)
            df['Date'] = pd.to_datetime(df['Date'])
            df['Ticker'] = ticker

            # Take recent data (2000 records per stock for speed)
            df = df.sort_values('Date').tail(2000).reset_index(drop=True)

            # Optimize dtypes
            for col in df.select_dtypes(include=['float64']).columns:
                df[col] = pd.to_numeric(df[col], downcast='float')

            all_datasets.append(df)
            print(f"   ✅ {ticker}: {len(df)} records")

        except Exception as e:
            print(f"   ❌ Error loading {ticker}: {str(e)}")

    combined_df = pd.concat(all_datasets, ignore_index=True)
    combined_df = combined_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)

    gc.collect()
    print(f"✅ Combined data: {len(combined_df)} records")

    return combined_df

def generate_lstm_predictions_safe(combined_df, lstm_components):
    """Generate LSTM predictions with error handling"""
    print("🎯 Generating LSTM predictions safely...")

    lstm_model = lstm_components['lstm_model']
    feature_scaler = lstm_components['feature_scaler']
    ticker_encoder = lstm_components['ticker_encoder']
    feature_columns = lstm_components['model_info']['feature_columns']

    sequence_length = 60
    all_predictions = []
    prediction_metadata = []

    for ticker in combined_df['Ticker'].unique()[:8]:  # Limit to 8 stocks for speed
        print(f"   Processing {ticker}...")

        ticker_data = combined_df[combined_df['Ticker'] == ticker].copy()
        ticker_data = ticker_data.sort_values('Date').reset_index(drop=True)

        # Encode ticker safely
        try:
            ticker_encoded = ticker_encoder.transform([ticker])[0]
        except ValueError:
            print(f"   ⚠️ Unknown ticker {ticker}, skipping...")
            continue

        ticker_data['Ticker_Encoded'] = ticker_encoded

        # Handle missing values
        for col in feature_columns:
            if col in ticker_data.columns:
                ticker_data[col] = ticker_data[col].fillna(method='ffill').fillna(method='bfill')
                if ticker_data[col].isnull().any():
                    ticker_data[col] = ticker_data[col].fillna(0)

        # Create sequences (limit to last 500 for speed)
        max_sequences = 500
        start_idx = max(sequence_length, len(ticker_data) - max_sequences - sequence_length)

        ticker_sequences = []
        ticker_indices = []

        for i in range(start_idx, len(ticker_data)):
            try:
                sequence = ticker_data[feature_columns].iloc[i-sequence_length:i].values

                if sequence.shape == (sequence_length, len(feature_columns)) and not np.isnan(sequence).any():
                    ticker_sequences.append(sequence)
                    ticker_indices.append(i)
            except:
                continue

        if ticker_sequences:
            # Process sequences
            X_ticker = np.array(ticker_sequences[:max_sequences], dtype=np.float32)

            # Scale features
            original_shape = X_ticker.shape
            X_reshaped = X_ticker.reshape(-1, X_ticker.shape[-1])

            try:
                X_scaled_reshaped = feature_scaler.transform(X_reshaped)
                X_scaled = X_scaled_reshaped.reshape(original_shape)

                # Generate predictions in small batches
                batch_size = 50
                ticker_predictions = []

                for batch_start in range(0, len(X_scaled), batch_size):
                    batch_end = min(batch_start + batch_size, len(X_scaled))
                    batch_X = X_scaled[batch_start:batch_end]

                    try:
                        batch_pred = lstm_model.predict(batch_X, batch_size=16, verbose=0)
                        ticker_predictions.extend(batch_pred)
                    except Exception as e:
                        print(f"      ⚠️ Prediction error for batch: {str(e)}")
                        continue

                # Store predictions
                for j, pred in enumerate(ticker_predictions):
                    if j < len(ticker_indices):
                        data_idx = ticker_indices[j]
                        all_predictions.append(pred)
                        prediction_metadata.append({
                            'ticker': ticker,
                            'date': ticker_data['Date'].iloc[data_idx],
                            'original_index': data_idx
                        })

                print(f"      ✅ {len(ticker_predictions)} predictions generated")

            except Exception as e:
                print(f"      ❌ Error processing {ticker}: {str(e)}")
                continue

            # Clean up memory
            del X_ticker, X_scaled
            gc.collect()

    if not all_predictions:
        raise Exception("No predictions were generated successfully!")

    predictions_array = np.array(all_predictions, dtype=np.float32)
    print(f"   ✅ Total predictions: {len(predictions_array)}")

    return predictions_array, prediction_metadata

def create_rf_dataset_simplified(combined_df, lstm_predictions, prediction_metadata):
    """Create simplified RF dataset"""
    print("🔧 Creating simplified RF dataset...")

    rf_data = []

    for i, (pred, meta) in enumerate(zip(lstm_predictions, prediction_metadata)):
        ticker = meta['ticker']
        date = meta['date']

        # Find corresponding data
        ticker_data = combined_df[
            (combined_df['Ticker'] == ticker) &
            (combined_df['Date'] == date)
        ]

        if len(ticker_data) == 0:
            continue

        row = ticker_data.iloc[0]

        # LSTM features
        lstm_features = {
            'LSTM_Day1_Open': float(pred[0]), 'LSTM_Day1_Close': float(pred[1]),
            'LSTM_Day2_Open': float(pred[2]), 'LSTM_Day2_Close': float(pred[3]),
            'LSTM_Day3_Open': float(pred[4]), 'LSTM_Day3_Close': float(pred[5]),
            'LSTM_Day4_Open': float(pred[6]), 'LSTM_Day4_Close': float(pred[7]),
            'LSTM_Day5_Open': float(pred[8]), 'LSTM_Day5_Close': float(pred[9])
        }

        # Essential technical features
        tech_features = {}
        essential_indicators = [
            'RSI_14', 'MACD', 'MACD_Signal', 'BB_Position',
            'Volume_MA_10', 'ROC_10', 'ATR', 'Daily_Return',
            'SMA_10', 'SMA_20', 'EMA_10', 'Volatility_20'
        ]

        for indicator in essential_indicators:
            if indicator in row:
                value = row[indicator]
                tech_features[f'Tech_{indicator}'] = float(value) if pd.notna(value) else 0.0

        # Basic market features
        market_features = {
            'Current_Close': float(row['Close']),
            'Current_Volume': float(row['Volume']),
            'Price_Range': float(row['High'] - row['Low'])
        }

        # Create trend labels (simplified)
        current_close = row['Close']
        targets = {}

        # Simple future price lookup
        ticker_full_data = combined_df[combined_df['Ticker'] == ticker].sort_values('Date')
        current_row_df = ticker_full_data[ticker_full_data['Date'] == date]

        if len(current_row_df) > 0:
            current_idx = current_row_df.index[0]

            for day in range(1, 6):
                future_rows = ticker_full_data[ticker_full_data.index > current_idx].head(day)
                if len(future_rows) >= day:
                    future_close = future_rows.iloc[-1]['Close']
                    price_change_pct = ((future_close - current_close) / current_close) * 100

                    if price_change_pct > 1.0:
                        trend = 'UP'
                    elif price_change_pct < -1.0:
                        trend = 'DOWN'
                    else:
                        trend = 'STAY'

                    targets[f'Day{day}_Trend'] = trend
                else:
                    targets[f'Day{day}_Trend'] = 'UNKNOWN'

        # Combine all features
        row_data = {
            'Date': date,
            'Ticker': ticker,
            **lstm_features,
            **tech_features,
            **market_features,
            **targets
        }

        rf_data.append(row_data)

    rf_df = pd.DataFrame(rf_data)

    # Remove unknown targets
    for day in range(1, 6):
        rf_df = rf_df[rf_df[f'Day{day}_Trend'] != 'UNKNOWN']

    rf_df = rf_df.fillna(0)

    print(f"   ✅ RF dataset: {len(rf_df)} samples")

    return rf_df

def train_simplified_random_forests(rf_df):
    """Train simplified Random Forest models"""
    print("🌲 Training simplified Random Forest models...")

    feature_cols = [col for col in rf_df.columns
                   if not col.startswith('Day') and col not in ['Date', 'Ticker']]

    X = rf_df[feature_cols].copy().astype(np.float32)

    print(f"   📊 Features: {len(feature_cols)}")
    print(f"   📊 Samples: {len(X)}")

    trained_models = {}
    evaluation_results = {}

    for day in range(1, 6):
        target_col = f'Day{day}_Trend'
        y = rf_df[target_col].copy()

        print(f"\n   🎯 Training {target_col}...")

        class_dist = y.value_counts()
        print(f"      📊 Classes: {dict(class_dist)}")

        if len(class_dist) < 2:
            print(f"      ⚠️ Skipping {target_col} - insufficient classes")
            continue

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            # Simple RF with good defaults
            rf_model = RandomForestClassifier(
                n_estimators=150,
                max_depth=15,
                min_samples_split=5,
                min_samples_leaf=2,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )

            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_weighted = f1_score(y_test, y_pred, average='weighted')

            print(f"      📈 Accuracy: {accuracy:.4f}")
            print(f"      📈 F1-Macro: {f1_macro:.4f}")
            print(f"      📈 F1-Weighted: {f1_weighted:.4f}")

            trained_models[target_col] = {
                'model': rf_model,
                'feature_columns': feature_cols
            }

            evaluation_results[target_col] = {
                'accuracy': accuracy,
                'f1_macro': f1_macro,
                'f1_weighted': f1_weighted,
                'y_test': y_test,
                'y_pred': y_pred,
                'class_distribution': dict(class_dist)
            }

        except Exception as e:
            print(f"      ❌ Error training {target_col}: {str(e)}")
            continue

    print(f"\n   ✅ Trained {len(trained_models)} models successfully!")

    return trained_models, evaluation_results

def evaluate_and_save_results(trained_models, evaluation_results):
    """Evaluate and save final results"""
    print("📊 Evaluating and saving results...")

    if not trained_models:
        print("   ❌ No models to evaluate!")
        return None

    # Calculate ensemble metrics
    accuracies = [evaluation_results[col]['accuracy'] for col in evaluation_results.keys()]
    f1_macros = [evaluation_results[col]['f1_macro'] for col in evaluation_results.keys()]
    f1_weighteds = [evaluation_results[col]['f1_weighted'] for col in evaluation_results.keys()]

    ensemble_metrics = {
        'average_accuracy': np.mean(accuracies),
        'average_f1_macro': np.mean(f1_macros),
        'average_f1_weighted': np.mean(f1_weighteds),
        'models_trained': len(trained_models)
    }

    print(f"   📈 Ensemble Performance:")
    print(f"      Average Accuracy: {ensemble_metrics['average_accuracy']:.4f}")
    print(f"      Average F1-Macro: {ensemble_metrics['average_f1_macro']:.4f}")
    print(f"      Models trained: {ensemble_metrics['models_trained']}/5")

    # Save models
    model_dir = 'random_forest_models'
    os.makedirs(model_dir, exist_ok=True)

    for target_col, model_data in trained_models.items():
        model_path = f"{model_dir}/{target_col}_rf_model.pkl"
        joblib.dump(model_data, model_path, compress=3)
        print(f"   💾 Saved: {model_path}")

    # Save ensemble info
    ensemble_info = {
        'ensemble_metrics': ensemble_metrics,
        'evaluation_results': evaluation_results,
        'training_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_type': 'RandomForest_Simplified_Compatible'
    }

    ensemble_path = f"{model_dir}/ensemble_info.pkl"
    with open(ensemble_path, 'wb') as f:
        pickle.dump(ensemble_info, f)

    print(f"   💾 Ensemble info saved: {ensemble_path}")

    return ensemble_metrics

def main_fixed_rf_training():
    """Main fixed RF training function"""
    print("🌲 STARTING STAGE 4: FIXED RANDOM FOREST TRAINING")
    print("="*80)
    print(f"🔧 Mode: Compatibility Fixed")
    print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*80)

    start_time = datetime.now()

    try:
        # Step 1: Setup TensorFlow
        setup_tensorflow_compatibility()

        # Step 2: Load LSTM components with fixes
        lstm_components = load_lstm_components_fixed()

        # Step 3: Load feature data
        combined_df = load_feature_data_optimized()

        # Step 4: Generate predictions safely
        lstm_predictions, prediction_metadata = generate_lstm_predictions_safe(
            combined_df, lstm_components
        )

        # Step 5: Create RF dataset
        rf_df = create_rf_dataset_simplified(
            combined_df, lstm_predictions, prediction_metadata
        )

        # Memory cleanup
        del lstm_predictions, prediction_metadata, combined_df
        gc.collect()

        # Step 6: Train models
        trained_models, evaluation_results = train_simplified_random_forests(rf_df)

        # Step 7: Evaluate and save
        ensemble_metrics = evaluate_and_save_results(trained_models, evaluation_results)

        end_time = datetime.now()
        training_time = (end_time - start_time).total_seconds() / 60

        print(f"\n🎉 STAGE 4 COMPLETED SUCCESSFULLY!")
        print(f"⏱️ Training time: {training_time:.1f} minutes")
        print(f"🎯 Models trained: {len(trained_models) if trained_models else 0}/5")

        if ensemble_metrics:
            print(f"📊 Average F1-Score: {ensemble_metrics['average_f1_macro']:.4f}")

        print(f"🚀 LSTM + RANDOM FOREST PIPELINE READY!")

        return {
            'status': 'SUCCESS',
            'training_time_minutes': training_time,
            'models_trained': len(trained_models) if trained_models else 0,
            'ensemble_metrics': ensemble_metrics
        }

    except Exception as e:
        end_time = datetime.now()
        training_time = (end_time - start_time).total_seconds() / 60

        print(f"\n❌ STAGE 4 TRAINING FAILED!")
        print(f"Error: {str(e)}")
        print(f"⏱️ Time elapsed: {training_time:.1f} minutes")

        import traceback
        traceback.print_exc()

        return {
            'status': 'ERROR',
            'error': str(e),
            'training_time_minutes': training_time
        }

# 🚀 RUN FIXED RANDOM FOREST TRAINING
if __name__ == "__main__":
    result = main_fixed_rf_training()

print("\n🎯 STAGE 4 EXECUTION COMPLETED!")

🌲 STARTING STAGE 4: FIXED RANDOM FOREST TRAINING
🔧 Mode: Compatibility Fixed
⏰ Start time: 2025-07-25 06:34:06
🔧 Setting up TensorFlow compatibility...
   ✅ GPU memory growth enabled




📦 Loading LSTM components with compatibility fixes...
🧠 Loading LSTM model with compatibility fixes...
   🔄 Loading model with custom objects...
   ✅ LSTM model loaded successfully with custom objects
   ✅ Feature scaler loaded
   ✅ Target scaler loaded
   ✅ Ticker encoder loaded
   ✅ Model info loaded
   📊 Feature columns: 21
   📊 Target columns: 10
📂 Loading feature engineered data...
   ✅ NFLX: 2000 records
   ✅ TSLA: 2000 records
   ✅ AAPL: 2000 records
   ✅ GOOGL: 2000 records
   ✅ META: 2000 records
   ✅ BBCA: 2000 records
   ✅ NVDA: 2000 records
   ✅ MSFT: 2000 records
   ✅ BAC: 2000 records
   ✅ JPM: 2000 records
   ✅ AMZN: 2000 records
✅ Combined data: 22000 records
🎯 Generating LSTM predictions safely...
   Processing AAPL...
      ✅ 500 predictions generated
   Processing AMZN...
      ✅ 500 predictions generated
   Processing BAC...
      ✅ 500 predictions generated
   Processing BBCA...
      ✅ 500 predictions generated
   Processing GOOGL...
      ✅ 500 predictions genera

## Versi Skicit-learn

In [None]:
# ===================================================================
# 🚀 FIXED LAYOUT STOCK PREDICTOR - wasirawasenju
# ===================================================================

import pandas as pd
import numpy as np
import os
import warnings
import pickle
import joblib
from datetime import datetime, timedelta
import yfinance as yf
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

class SimpleStockPredictor:
    """Simple Stock Predictor using only RF models + Technical Analysis"""

    def __init__(self):
        self.rf_models = None
        self.is_loaded = False

    def load_rf_models_only(self):
        """Load only Random Forest models (skip LSTM)"""
        print("🔄 Loading Random Forest models...")

        try:
            rf_dir = 'random_forest_models'

            if not os.path.exists(rf_dir):
                return False, f"RF models directory not found: {rf_dir}"

            rf_models = {}
            models_loaded = 0

            # Load available RF models
            for day in range(1, 6):
                model_path = f"{rf_dir}/Day{day}_Trend_rf_model.pkl"

                if os.path.exists(model_path):
                    try:
                        rf_models[f'Day{day}'] = joblib.load(model_path)
                        models_loaded += 1
                        print(f"   ✅ Day{day} RF model loaded")
                    except Exception as e:
                        print(f"   ⚠️ Day{day} model error: {str(e)}")
                else:
                    print(f"   ❌ Day{day} model not found")

            if models_loaded == 0:
                return False, "No RF models could be loaded"

            self.rf_models = {
                'models': rf_models,
                'models_loaded': models_loaded
            }

            self.is_loaded = True
            print(f"✅ Loaded {models_loaded}/5 RF models successfully!")

            return True, f"Loaded {models_loaded}/5 RF models"

        except Exception as e:
            return False, f"Error loading RF models: {str(e)}"

    def download_stock_data(self, ticker, period="1y"):
        """Download stock data from yfinance"""
        print(f"📥 Downloading {ticker}...")

        try:
            stock = yf.Ticker(ticker)
            df = stock.history(period=period, auto_adjust=True)

            if df.empty:
                raise ValueError(f"No data for {ticker}")

            df = df.reset_index()
            df['Date'] = pd.to_datetime(df['Date'])

            # Remove timezone if present
            if hasattr(df['Date'].dtype, 'tz') and df['Date'].dt.tz is not None:
                df['Date'] = df['Date'].dt.tz_localize(None)

            print(f"✅ Got {len(df)} records for {ticker}")
            return df

        except Exception as e:
            raise Exception(f"Download failed: {str(e)}")

    def calculate_indicators(self, df):
        """Calculate essential technical indicators"""
        print("🔧 Calculating indicators...")

        # Moving averages
        df['SMA_10'] = df['Close'].rolling(10).mean()
        df['SMA_20'] = df['Close'].rolling(20).mean()
        df['EMA_10'] = df['Close'].ewm(span=10).mean()
        df['EMA_20'] = df['Close'].ewm(span=20).mean()

        # RSI
        delta = df['Close'].diff()
        gain = delta.where(delta > 0, 0).rolling(14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
        rs = gain / loss
        df['RSI_14'] = 100 - (100 / (1 + rs))

        # MACD
        ema12 = df['Close'].ewm(span=12).mean()
        ema26 = df['Close'].ewm(span=26).mean()
        df['MACD'] = ema12 - ema26
        df['MACD_Signal'] = df['MACD'].ewm(span=9).mean()

        # Bollinger Bands
        bb_mid = df['Close'].rolling(20).mean()
        bb_std = df['Close'].rolling(20).std()
        df['BB_High'] = bb_mid + (bb_std * 2)
        df['BB_Low'] = bb_mid - (bb_std * 2)
        df['BB_Position'] = (df['Close'] - df['BB_Low']) / (df['BB_High'] - df['BB_Low'])
        df['BB_Width'] = (df['BB_High'] - df['BB_Low']) / bb_mid

        # Volume & momentum
        df['Volume_MA_10'] = df['Volume'].rolling(10).mean()
        df['Volatility_20'] = df['Close'].rolling(20).std()
        df['Daily_Return'] = df['Close'].pct_change() * 100
        df['ROC_10'] = df['Close'].pct_change(10) * 100

        # Binary features
        df['High_Volume'] = (df['Volume'] > df['Volume_MA_10'] * 1.5).astype(int)
        df['Low_Volume'] = (df['Volume'] < df['Volume_MA_10'] * 0.5).astype(int)
        df['Strong_Momentum'] = (df['ROC_10'] > 5).astype(int)
        df['Weak_Momentum'] = (df['ROC_10'] < -5).astype(int)

        # OBV (simplified)
        df['OBV'] = (df['Volume'] * np.sign(df['Close'].diff())).cumsum()

        # ATR (simplified)
        df['ATR'] = ((df['High'] - df['Low']).rolling(14).mean())

        # Fill NaN
        df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)

        print("✅ Indicators calculated")
        return df

    def predict_prices_technical(self, df):
        """Predict prices using technical analysis (no LSTM)"""
        print("🎯 Predicting prices with technical analysis...")

        current_price = df['Close'].iloc[-1]

        # Calculate trends
        sma_trend = df['SMA_10'].iloc[-1] - df['SMA_10'].iloc[-5]
        ema_trend = df['EMA_10'].iloc[-1] - df['EMA_10'].iloc[-5]
        momentum = df['ROC_10'].iloc[-1]
        rsi = df['RSI_14'].iloc[-1]

        # Trend strength
        trend_strength = (sma_trend + ema_trend) / 2

        # RSI bias (oversold/overbought)
        if rsi > 70:
            rsi_bias = -0.5  # Overbought, expect decline
        elif rsi < 30:
            rsi_bias = 0.5   # Oversold, expect rise
        else:
            rsi_bias = 0

        # Generate 5-day predictions
        predictions = {}
        base_price = current_price

        for day in range(1, 6):
            # Decay trend over time
            decay = 0.85 ** (day - 1)

            # Price change calculation
            daily_change = (trend_strength * decay) + (momentum * 0.01 * decay) + (rsi_bias * decay)

            # Limit max change to 5% per day
            max_change = base_price * 0.05
            daily_change = max(min(daily_change, max_change), -max_change)

            # Calculate new prices
            predicted_close = base_price + daily_change

            # Simple open prediction (slight variation from previous close)
            if day == 1:
                predicted_open = current_price * (1 + np.random.normal(0, 0.002))  # Small random gap
            else:
                predicted_open = predictions[f'Day{day-1}']['Close'] * (1 + np.random.normal(0, 0.001))

            predictions[f'Day{day}'] = {
                'Open': max(predicted_open, 0.01),  # Ensure positive
                'Close': max(predicted_close, 0.01)
            }

            base_price = predicted_close

        return predictions

    def predict_trends_with_rf(self, df, price_predictions):
        """Predict trends using loaded RF models"""
        if not self.is_loaded or not self.rf_models:
            print("⚠️ No RF models available, using simple trend logic")
            return self._simple_trend_prediction(price_predictions)

        print("🌲 Predicting trends with RF models...")

        latest_data = df.iloc[-1]

        # Prepare features for RF
        rf_features = {}

        # Mock LSTM features (use our technical predictions)
        for day in range(1, 6):
            day_key = f'Day{day}'
            rf_features[f'LSTM_{day_key}_Open'] = price_predictions[day_key]['Open']
            rf_features[f'LSTM_{day_key}_Close'] = price_predictions[day_key]['Close']

        # Technical features
        tech_indicators = [
            'RSI_14', 'MACD', 'MACD_Signal', 'BB_Position', 'BB_Width',
            'Volume_MA_10', 'ROC_10', 'ATR', 'Volatility_20', 'Daily_Return',
            'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20', 'OBV',
            'High_Volume', 'Low_Volume', 'Strong_Momentum', 'Weak_Momentum'
        ]

        for indicator in tech_indicators:
            value = latest_data.get(indicator, 0)
            rf_features[f'Tech_{indicator}'] = float(value) if pd.notna(value) else 0.0

        # Market state features
        rf_features.update({
            'Current_Close': float(latest_data['Close']),
            'Current_Volume': float(latest_data['Volume']),
            'Current_High': float(latest_data['High']),
            'Current_Low': float(latest_data['Low']),
            'Price_Range': float(latest_data['High'] - latest_data['Low']),
            'Volume_Ratio': float(latest_data['Volume'] / latest_data.get('Volume_MA_10', latest_data['Volume']))
        })

        # Predict with RF models
        trends = {}
        confidences = {}

        for day in range(1, 6):
            day_key = f'Day{day}'

            if day_key in self.rf_models['models']:
                try:
                    model_data = self.rf_models['models'][day_key]
                    rf_model = model_data['model']
                    model_features = model_data['feature_columns']

                    # Align features
                    aligned_features = []
                    for feature in model_features:
                        aligned_features.append(rf_features.get(feature, 0.0))

                    # Predict
                    X_pred = np.array([aligned_features])
                    trend_pred = rf_model.predict(X_pred)[0]
                    trend_proba = rf_model.predict_proba(X_pred)[0]

                    trends[day_key] = trend_pred
                    confidences[day_key] = float(max(trend_proba))

                    print(f"   ✅ {day_key}: {trend_pred} (confidence: {max(trend_proba):.2f})")

                except Exception as e:
                    print(f"   ⚠️ {day_key} RF failed: {str(e)}")
                    trends[day_key] = 'STAY'
                    confidences[day_key] = 0.5
            else:
                print(f"   ❌ {day_key} model not available")
                trends[day_key] = 'STAY'
                confidences[day_key] = 0.3

        return trends, confidences

    def _simple_trend_prediction(self, price_predictions):
        """Fallback simple trend prediction"""
        trends = {}
        confidences = {}

        for day in range(1, 6):
            day_key = f'Day{day}'
            pred_close = price_predictions[day_key]['Close']
            pred_open = price_predictions[day_key]['Open']

            change_pct = ((pred_close - pred_open) / pred_open) * 100

            if change_pct > 1.0:
                trends[day_key] = 'UP'
            elif change_pct < -1.0:
                trends[day_key] = 'DOWN'
            else:
                trends[day_key] = 'STAY'

            confidences[day_key] = 0.6  # Default confidence

        return trends, confidences

    def predict_stock(self, ticker, period="1y"):
        """Complete prediction pipeline"""
        print(f"🎯 Predicting {ticker}...")

        try:
            # Download data
            df = self.download_stock_data(ticker, period)

            # Calculate indicators
            df = self.calculate_indicators(df)

            # Predict prices
            price_predictions = self.predict_prices_technical(df)

            # Predict trends
            trends, confidences = self.predict_trends_with_rf(df, price_predictions)

            # Format results
            today = datetime.now().date()
            prediction_dates = [(today + timedelta(days=i)) for i in range(1, 6)]

            predictions = {}
            for i, day_key in enumerate(['Day1', 'Day2', 'Day3', 'Day4', 'Day5']):
                predictions[day_key] = {
                    'date': prediction_dates[i].strftime('%Y-%m-%d'),
                    'predicted_open': price_predictions[day_key]['Open'],
                    'predicted_close': price_predictions[day_key]['Close'],
                    'trend': trends[day_key],
                    'confidence': confidences[day_key]
                }

            current_data = df.iloc[-1]
            stock_info = {
                'ticker': ticker.upper(),
                'current_price': float(current_data['Close']),
                'current_date': current_data['Date'].strftime('%Y-%m-%d'),
                'volume': float(current_data['Volume']),
                'prediction_generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC'),
                'data_points_used': len(df),
                'rf_models_available': self.rf_models['models_loaded'] if self.rf_models else 0
            }

            return {
                'status': 'SUCCESS',
                'stock_info': stock_info,
                'predictions': predictions,
                'historical_data': df.tail(30).to_dict('records')
            }

        except Exception as e:
            return {
                'status': 'ERROR',
                'error': str(e),
                'ticker': ticker
            }

# ===================================================================
# 🖥️ STREAMLIT DASHBOARD - FIXED LAYOUT
# ===================================================================

def main_dashboard():
    """Fixed layout dashboard"""

    st.set_page_config(
        page_title="📈 Stock Predictor - wasirawasenju",
        page_icon="🚀",
        layout="wide"
    )

    # Custom CSS
    st.markdown("""
    <style>
    .main-title {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 1rem;
    }
    .subtitle {
        font-size: 1.2rem;
        text-align: center;
        color: #666;
        margin-bottom: 2rem;
    }
    .user-info {
        background: linear-gradient(90deg, #f0f8ff, #e6f3ff);
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #1f77b4;
        margin-bottom: 1rem;
    }
    .prediction-card {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        border: 1px solid #dee2e6;
        margin: 1rem 0;
    }
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
        margin: 0.5rem 0;
    }
    </style>
    """, unsafe_allow_html=True)

    # Initialize session state
    if 'predictor' not in st.session_state:
        st.session_state.predictor = SimpleStockPredictor()
        st.session_state.models_loaded = False
        st.session_state.prediction_result = None
        st.session_state.current_ticker = ""

    # Header
    st.markdown('<h1 class="main-title">🚀 AI Stock Predictor</h1>', unsafe_allow_html=True)
    st.markdown('<p class="subtitle">Random Forest + Technical Analysis | No TensorFlow Required</p>', unsafe_allow_html=True)

    # User info
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')
    st.markdown(f"""
    <div class="user-info">
        <h4>👤 User: wasirawasenju</h4>
        <p>📅 Current Time: {current_time}</p>
        <p>🏠 Environment: Local Development</p>
    </div>
    """, unsafe_allow_html=True)

    # Create two columns: sidebar-like left column and main content
    col_sidebar, col_main = st.columns([1, 3])

    # LEFT COLUMN (Control Panel)
    with col_sidebar:
        st.markdown("### 🎛️ Control Panel")

        # Model loading section
        st.markdown("#### 🤖 AI Models")

        if not st.session_state.models_loaded:
            if st.button("🔄 Load RF Models", type="primary", use_container_width=True):
                with st.spinner("Loading models..."):
                    success, message = st.session_state.predictor.load_rf_models_only()

                    if success:
                        st.session_state.models_loaded = True
                        st.success(f"✅ {message}")
                    else:
                        st.error(f"❌ {message}")
                        st.warning("Will use fallback prediction")
                        st.session_state.models_loaded = True  # Allow to continue
        else:
            st.success("✅ Models Ready")
            if st.session_state.predictor.rf_models:
                rf_count = st.session_state.predictor.rf_models['models_loaded']
                st.info(f"RF Models: {rf_count}/5")

        # Stock input section
        if st.session_state.models_loaded:
            st.markdown("#### 📊 Stock Selection")

            # Text input
            ticker_input = st.text_input(
                "Enter Ticker:",
                value=st.session_state.current_ticker,
                placeholder="AAPL, GOOGL, BBCA.JK...",
                key="ticker_input"
            )

            # Quick select buttons
            st.markdown("**Quick Select:**")

            # Create 2x3 grid for buttons
            col1, col2 = st.columns(2)

            with col1:
                if st.button("🍎 AAPL", use_container_width=True):
                    st.session_state.current_ticker = "AAPL"
                    st.rerun()

                if st.button("🔍 GOOGL", use_container_width=True):
                    st.session_state.current_ticker = "GOOGL"
                    st.rerun()

                if st.button("🏦 BBCA.JK", use_container_width=True):
                    st.session_state.current_ticker = "BBCA.JK"
                    st.rerun()

            with col2:
                if st.button("🚗 TSLA", use_container_width=True):
                    st.session_state.current_ticker = "TSLA"
                    st.rerun()

                if st.button("💻 MSFT", use_container_width=True):
                    st.session_state.current_ticker = "MSFT"
                    st.rerun()

                if st.button("🎬 NFLX", use_container_width=True):
                    st.session_state.current_ticker = "NFLX"
                    st.rerun()

            # Update current ticker from input
            if ticker_input != st.session_state.current_ticker:
                st.session_state.current_ticker = ticker_input

            # Predict button
            st.markdown("---")
            if st.button("🎯 Generate Prediction", type="primary", use_container_width=True):
                if st.session_state.current_ticker:
                    # Show prediction in main area
                    with col_main:
                        with st.spinner(f"🔍 Analyzing {st.session_state.current_ticker.upper()}..."):
                            result = st.session_state.predictor.predict_stock(st.session_state.current_ticker)

                        # Store result in session state
                        st.session_state.prediction_result = result
                        st.rerun()
                else:
                    st.warning("Please enter a ticker!")

    # RIGHT COLUMN (Main Content)
    with col_main:
        if st.session_state.prediction_result is None:
            # Default welcome content
            st.markdown("### 🎯 Welcome to AI Stock Predictor")

            st.markdown("""
            <div class="prediction-card">
                <h4>🚀 Get Started</h4>
                <p>1. Load the AI models using the control panel</p>
                <p>2. Enter a stock ticker or use quick select buttons</p>
                <p>3. Click "Generate Prediction" to see 5-day forecasts</p>
                <p>4. View detailed analysis, charts, and investment insights</p>
            </div>
            """, unsafe_allow_html=True)

            st.markdown("""
            <div class="prediction-card">
                <h4>📊 Features</h4>
                <ul>
                <li>🎯 <strong>5-Day Price Predictions</strong> - Open & Close forecasts</li>
                <li>📈 <strong>Trend Analysis</strong> - UP/DOWN/STAY classifications</li>
                <li>🤖 <strong>AI-Powered</strong> - Random Forest + Technical Analysis</li>
                <li>📱 <strong>Real-time Data</strong> - Live data from Yahoo Finance</li>
                <li>💡 <strong>Investment Insights</strong> - Automated recommendations</li>
                </ul>
            </div>
            """, unsafe_allow_html=True)

            # Sample prediction preview
            st.markdown("### 📋 Sample Prediction Output")

            sample_data = {
                "Date": ["2025-07-26", "2025-07-27", "2025-07-28", "2025-07-29", "2025-07-30"],
                "Open": ["$150.20", "$151.50", "$152.10", "$151.80", "$153.00"],
                "Close": ["$151.30", "$152.80", "$151.90", "$153.20", "$154.10"],
                "Trend": ["📈 UP", "📈 UP", "📉 DOWN", "📈 UP", "📈 UP"],
                "Confidence": ["85.2%", "78.9%", "72.1%", "81.5%", "77.3%"]
            }

            sample_df = pd.DataFrame(sample_data)
            st.dataframe(sample_df, use_container_width=True)

        else:
            # Display prediction results
            if st.session_state.prediction_result['status'] == 'SUCCESS':
                display_prediction_results(st.session_state.prediction_result)
            else:
                st.error(f"❌ Prediction Error: {st.session_state.prediction_result['error']}")

                # Clear error after showing
                if st.button("🔄 Clear Error"):
                    st.session_state.prediction_result = None
                    st.rerun()

def display_prediction_results(result):
    """Display prediction results in main area"""

    stock_info = result['stock_info']
    predictions = result['predictions']
    historical_data = result['historical_data']

    # Header with stock info
    st.markdown(f"### 📊 {stock_info['ticker']} - Prediction Results")

    # Metrics row
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown(f"""
        <div class="metric-card">
            <h4>💰 Current Price</h4>
            <h2>${stock_info['current_price']:.2f}</h2>
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown(f"""
        <div class="metric-card">
            <h4>📊 Volume</h4>
            <h2>{stock_info['volume']:,.0f}</h2>
        </div>
        """, unsafe_allow_html=True)

    with col3:
        st.markdown(f"""
        <div class="metric-card">
            <h4>📅 Data Date</h4>
            <h2>{stock_info['current_date']}</h2>
        </div>
        """, unsafe_allow_html=True)

    with col4:
        st.markdown(f"""
        <div class="metric-card">
            <h4>🤖 RF Models</h4>
            <h2>{stock_info['rf_models_available']}/5</h2>
        </div>
        """, unsafe_allow_html=True)

    # 5-Day Predictions Table
    st.markdown("### 🔮 5-Day AI Predictions")

    pred_data = []
    for day, pred in predictions.items():
        trend_emoji = {"UP": "📈", "DOWN": "📉", "STAY": "➡️"}.get(pred['trend'], "❓")

        change = pred['predicted_close'] - pred['predicted_open']
        change_pct = (change / pred['predicted_open']) * 100

        pred_data.append({
            "📅 Date": pred['date'],
            "🌅 Open": f"${pred['predicted_open']:.2f}",
            "🌇 Close": f"${pred['predicted_close']:.2f}",
            "📊 Daily Change": f"${change:+.2f} ({change_pct:+.1f}%)",
            "📈 Trend": f"{trend_emoji} {pred['trend']}",
            "🎯 Confidence": f"{pred['confidence']:.1%}"
        })

    pred_df = pd.DataFrame(pred_data)
    st.dataframe(pred_df, use_container_width=True)

    # Price Chart
    st.markdown("### 📈 Interactive Price Chart")

    # Prepare chart data
    hist_df = pd.DataFrame(historical_data)
    hist_df['Date'] = pd.to_datetime(hist_df['Date'])

    # Create chart
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1,
        subplot_titles=(f'{stock_info["ticker"]} - Price Prediction', 'Trading Volume'),
        row_heights=[0.7, 0.3]
    )

    # Historical candlestick
    fig.add_trace(
        go.Candlestick(
            x=hist_df['Date'],
            open=hist_df['Open'],
            high=hist_df['High'],
            low=hist_df['Low'],
            close=hist_df['Close'],
            name='Historical Price',
            increasing_line_color='green',
            decreasing_line_color='red'
        ),
        row=1, col=1
    )

    # Predicted prices
    pred_dates = [datetime.strptime(pred['date'], '%Y-%m-%d') for pred in predictions.values()]
    pred_closes = [pred['predicted_close'] for pred in predictions.values()]

    # Connect last historical to first prediction
    last_date = hist_df['Date'].iloc[-1]
    last_price = hist_df['Close'].iloc[-1]

    fig.add_trace(
        go.Scatter(
            x=[last_date] + pred_dates,
            y=[last_price] + pred_closes,
            mode='lines+markers',
            name='Predicted Close',
            line=dict(color='red', width=4, dash='dash'),
            marker=dict(size=10, color='red', symbol='diamond')
        ),
        row=1, col=1
    )

    # Volume bars
    fig.add_trace(
        go.Bar(
            x=hist_df['Date'],
            y=hist_df['Volume'],
            name='Volume',
            marker_color='lightblue',
            opacity=0.7
        ),
        row=2, col=1
    )

    fig.update_layout(
        title=f"{stock_info['ticker']} - AI Prediction Analysis",
        height=700,
        showlegend=True,
        hovermode='x unified'
    )

    fig.update_yaxes(title_text="Price ($)", row=1, col=1)
    fig.update_yaxes(title_text="Volume", row=2, col=1)
    fig.update_xaxes(title_text="Date", row=2, col=1)

    st.plotly_chart(fig, use_container_width=True)

    # Investment Analysis
    st.markdown("### 💡 Investment Analysis")

    col1, col2 = st.columns(2)

    with col1:
        # Trend analysis
        up_days = sum(1 for pred in predictions.values() if pred['trend'] == 'UP')
        down_days = sum(1 for pred in predictions.values() if pred['trend'] == 'DOWN')
        stay_days = sum(1 for pred in predictions.values() if pred['trend'] == 'STAY')
        avg_confidence = sum(pred['confidence'] for pred in predictions.values()) / len(predictions)

        if up_days > down_days:
            recommendation = "🟢 BULLISH OUTLOOK"
            rec_color = "#28a745"
            advice = f"Model predicts {up_days} bullish days. Consider buying opportunities."
        elif down_days > up_days:
            recommendation = "🔴 BEARISH OUTLOOK"
            rec_color = "#dc3545"
            advice = f"Model predicts {down_days} bearish days. Exercise caution."
        else:
            recommendation = "🟡 NEUTRAL OUTLOOK"
            rec_color = "#ffc107"
            advice = "Mixed signals detected. Wait for clearer trends."

        st.markdown(f"""
        <div class="prediction-card">
            <h4 style="color: {rec_color};">{recommendation}</h4>
            <p>{advice}</p>
            <p><strong>Average Confidence:</strong> {avg_confidence:.1%}</p>
            <p><strong>Trend Distribution:</strong> {up_days} UP, {down_days} DOWN, {stay_days} STAY</p>
        </div>
        """, unsafe_allow_html=True)

    with col2:
        # Price range analysis
        current_price = stock_info['current_price']
        min_pred = min(pred['predicted_close'] for pred in predictions.values())
        max_pred = max(pred['predicted_close'] for pred in predictions.values())

        price_range = max_pred - min_pred
        upside = ((max_pred - current_price) / current_price) * 100
        downside = ((current_price - min_pred) / current_price) * 100

        st.markdown(f"""
        <div class="prediction-card">
            <h4>💹 Price Range Analysis</h4>
            <p><strong>Current:</strong> ${current_price:.2f}</p>
            <p><strong>Predicted Range:</strong> ${min_pred:.2f} - ${max_pred:.2f}</p>
            <p><strong>Volatility:</strong> ${price_range:.2f} ({(price_range/current_price)*100:.1f}%)</p>
            <p><strong>Upside Potential:</strong> +{upside:.1f}%</p>
            <p><strong>Downside Risk:</strong> -{downside:.1f}%</p>
        </div>
        """, unsafe_allow_html=True)

    # Action buttons
    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("📥 Export JSON", use_container_width=True):
            import json
            export_data = {
                'stock_info': stock_info,
                'predictions': predictions,
                'recommendation': recommendation
            }

            st.download_button(
                label="Download Prediction Report",
                data=json.dumps(export_data, indent=2),
                file_name=f"{stock_info['ticker']}_prediction_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
                mime="application/json"
            )

    with col2:
        if st.button("🔄 New Prediction", use_container_width=True):
            st.session_state.prediction_result = None
            st.session_state.current_ticker = ""
            st.rerun()

    with col3:
        if st.button("📊 Analyze Another", use_container_width=True):
            st.session_state.prediction_result = None
            st.rerun()

    # Disclaimer
    st.markdown("---")
    st.warning("""
    ⚠️ **Disclaimer:** This AI prediction is for educational purposes only.
    Always conduct your own research and consult financial advisors before making investment decisions.
    Past performance does not guarantee future results.
    """)

if __name__ == "__main__":
    main_dashboard()

# Stage 5 : Dashboard dan real time prediction

In [None]:
# Install dependencies first
pip install streamlit plotly yfinance

# Run Streamlit dashboard
streamlit run app.py

In [None]:
# ===================================================================
# 🚀 STAGE 5: REAL-TIME PREDICTION API & DASHBOARD
# ===================================================================

import pandas as pd
import numpy as np
import os
import warnings
import pickle
import joblib
from datetime import datetime, timedelta
import yfinance as yf
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# TensorFlow for LSTM predictions
import tensorflow as tf
from tensorflow.keras.models import load_model

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

class StockPredictionSystem:
    """Complete Stock Prediction System with LSTM + Random Forest"""

    def __init__(self):
        self.lstm_components = None
        self.rf_models = None
        self.is_loaded = False

    def load_models(self):
        """Load all trained models and components"""
        print("🔄 Loading trained models...")

        try:
            # Load LSTM components
            self.lstm_components = self._load_lstm_components()

            # Load Random Forest models
            self.rf_models = self._load_rf_models()

            self.is_loaded = True
            print("✅ All models loaded successfully!")

        except Exception as e:
            print(f"❌ Error loading models: {str(e)}")
            raise e

    def _load_lstm_components(self):
        """Load LSTM model and components"""
        model_dir = 'unified_lstm_model'

        # Custom objects for compatibility
        custom_objects = {
            'mse': 'mean_squared_error',
            'mae': 'mean_absolute_error'
        }

        # Load LSTM model
        lstm_model = load_model(
            f'{model_dir}/unified_lstm_model.h5',
            custom_objects=custom_objects,
            compile=False
        )

        # Recompile
        from tensorflow.keras.optimizers import Adam
        lstm_model.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae'])

        # Load scalers and encoders
        with open(f'{model_dir}/feature_scaler.pkl', 'rb') as f:
            feature_scaler = pickle.load(f)

        with open(f'{model_dir}/target_scaler.pkl', 'rb') as f:
            target_scaler = pickle.load(f)

        with open(f'{model_dir}/ticker_encoder.pkl', 'rb') as f:
            ticker_encoder = pickle.load(f)

        with open(f'{model_dir}/model_info.pkl', 'rb') as f:
            model_info = pickle.load(f)

        return {
            'model': lstm_model,
            'feature_scaler': feature_scaler,
            'target_scaler': target_scaler,
            'ticker_encoder': ticker_encoder,
            'model_info': model_info
        }

    def _load_rf_models(self):
        """Load Random Forest models"""
        model_dir = 'random_forest_models'

        rf_models = {}
        for day in range(1, 6):
            model_path = f"{model_dir}/Day{day}_Trend_rf_model.pkl"
            if os.path.exists(model_path):
                rf_models[f'Day{day}'] = joblib.load(model_path)

        # Load ensemble info
        with open(f'{model_dir}/ensemble_info.pkl', 'rb') as f:
            ensemble_info = pickle.load(f)

        return {
            'models': rf_models,
            'ensemble_info': ensemble_info
        }

    def download_stock_data(self, ticker, period="2y"):
        """Download stock data from yfinance with optimal period"""
        print(f"📥 Downloading {ticker} data from yfinance...")

        try:
            # Download data
            stock = yf.Ticker(ticker)
            df = stock.history(period=period)

            if df.empty:
                raise ValueError(f"No data found for ticker: {ticker}")

            # Reset index to make Date a column
            df = df.reset_index()
            df['Date'] = pd.to_datetime(df['Date'])

            # Ensure we have enough data (minimum 100 days for 60-day sequences)
            if len(df) < 100:
                print(f"⚠️ Limited data for {ticker}: {len(df)} days")

            print(f"✅ Downloaded {ticker}: {len(df)} records from {df['Date'].min().date()} to {df['Date'].max().date()}")

            return df

        except Exception as e:
            print(f"❌ Error downloading {ticker}: {str(e)}")
            raise e

    def calculate_technical_indicators(self, df):
        """Calculate essential technical indicators"""
        print("🔧 Calculating technical indicators...")

        try:
            # Moving Averages
            df['SMA_10'] = df['Close'].rolling(window=10).mean()
            df['SMA_20'] = df['Close'].rolling(window=20).mean()
            df['EMA_10'] = df['Close'].ewm(span=10).mean()
            df['EMA_20'] = df['Close'].ewm(span=20).mean()

            # RSI
            delta = df['Close'].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
            rs = gain / loss
            df['RSI_14'] = 100 - (100 / (1 + rs))

            # MACD
            ema_12 = df['Close'].ewm(span=12).mean()
            ema_26 = df['Close'].ewm(span=26).mean()
            df['MACD'] = ema_12 - ema_26
            df['MACD_Signal'] = df['MACD'].ewm(span=9).mean()

            # Bollinger Bands
            bb_period = 20
            bb_std = 2
            df['BB_Mid'] = df['Close'].rolling(window=bb_period).mean()
            bb_std_dev = df['Close'].rolling(window=bb_period).std()
            df['BB_High'] = df['BB_Mid'] + (bb_std_dev * bb_std)
            df['BB_Low'] = df['BB_Mid'] - (bb_std_dev * bb_std)
            df['BB_Position'] = (df['Close'] - df['BB_Low']) / (df['BB_High'] - df['BB_Low'])
            df['BB_Width'] = (df['BB_High'] - df['BB_Low']) / df['BB_Mid']

            # Volume indicators
            df['Volume_MA_10'] = df['Volume'].rolling(window=10).mean()
            df['OBV'] = (df['Volume'] * np.where(df['Close'] > df['Close'].shift(1), 1,
                                               np.where(df['Close'] < df['Close'].shift(1), -1, 0))).cumsum()

            # Volatility and momentum
            df['ATR'] = df[['High', 'Low', 'Close']].apply(
                lambda x: max(x['High'] - x['Low'],
                            abs(x['High'] - df['Close'].shift(1).iloc[x.name] if pd.notna(df['Close'].shift(1).iloc[x.name]) else x['High']),
                            abs(x['Low'] - df['Close'].shift(1).iloc[x.name] if pd.notna(df['Close'].shift(1).iloc[x.name]) else x['Low'])), axis=1
            ).rolling(window=14).mean()

            df['Volatility_20'] = df['Close'].rolling(window=20).std()
            df['Daily_Return'] = df['Close'].pct_change() * 100
            df['ROC_10'] = ((df['Close'] - df['Close'].shift(10)) / df['Close'].shift(10)) * 100

            # Additional features
            df['High_Volume'] = (df['Volume'] > df['Volume_MA_10'] * 1.5).astype(int)
            df['Low_Volume'] = (df['Volume'] < df['Volume_MA_10'] * 0.5).astype(int)
            df['Strong_Momentum'] = (df['ROC_10'] > 5).astype(int)
            df['Weak_Momentum'] = (df['ROC_10'] < -5).astype(int)

            # Fill NaN values
            df = df.fillna(method='ffill').fillna(method='bfill')

            print(f"✅ Technical indicators calculated: {len(df.columns)} total columns")

            return df

        except Exception as e:
            print(f"❌ Error calculating indicators: {str(e)}")
            raise e

    def prepare_lstm_sequence(self, df, ticker):
        """Prepare 60-day sequence for LSTM prediction"""
        if not self.is_loaded:
            raise ValueError("Models not loaded! Call load_models() first.")

        feature_columns = self.lstm_components['model_info']['feature_columns']

        # Add ticker encoding
        try:
            ticker_encoded = self.lstm_components['ticker_encoder'].transform([ticker.upper()])[0]
        except ValueError:
            # If ticker not in training data, use a default or map to similar
            print(f"⚠️ Ticker {ticker} not in training data, using default encoding")
            ticker_encoded = 0

        df['Ticker_Encoded'] = ticker_encoded

        # Get the most recent 60 days
        sequence_length = 60
        if len(df) < sequence_length:
            raise ValueError(f"Insufficient data: {len(df)} days, need at least {sequence_length}")

        # Get features that exist in the dataframe
        available_features = [col for col in feature_columns if col in df.columns]
        missing_features = [col for col in feature_columns if col not in df.columns]

        if missing_features:
            print(f"⚠️ Missing features: {missing_features}")
            # Fill missing features with zeros or reasonable defaults
            for feature in missing_features:
                df[feature] = 0

        # Get the last 60 days
        recent_data = df[feature_columns].tail(sequence_length)

        # Scale the sequence
        sequence_scaled = self.lstm_components['feature_scaler'].transform(recent_data.values)

        return sequence_scaled.reshape(1, sequence_length, -1)

    def generate_lstm_predictions(self, sequence):
        """Generate 5-day price predictions using LSTM"""
        if not self.is_loaded:
            raise ValueError("Models not loaded!")

        # Get LSTM prediction (scaled)
        lstm_pred_scaled = self.lstm_components['model'].predict(sequence, verbose=0)

        # Denormalize to actual prices
        lstm_pred_actual = self.lstm_components['target_scaler'].inverse_transform(lstm_pred_scaled)

        # Format predictions
        predictions = {
            'Day1': {'Open': float(lstm_pred_actual[0][0]), 'Close': float(lstm_pred_actual[0][1])},
            'Day2': {'Open': float(lstm_pred_actual[0][2]), 'Close': float(lstm_pred_actual[0][3])},
            'Day3': {'Open': float(lstm_pred_actual[0][4]), 'Close': float(lstm_pred_actual[0][5])},
            'Day4': {'Open': float(lstm_pred_actual[0][6]), 'Close': float(lstm_pred_actual[0][7])},
            'Day5': {'Open': float(lstm_pred_actual[0][8]), 'Close': float(lstm_pred_actual[0][9])}
        }

        return predictions

    def prepare_rf_features(self, df, lstm_predictions):
        """Prepare features for Random Forest trend prediction"""
        latest_data = df.iloc[-1]

        # LSTM features
        rf_features = {}
        for day in range(1, 6):
            day_key = f'Day{day}'
            rf_features[f'LSTM_{day_key}_Open'] = lstm_predictions[day_key]['Open']
            rf_features[f'LSTM_{day_key}_Close'] = lstm_predictions[day_key]['Close']

        # Technical indicators (current state)
        tech_indicators = [
            'RSI_14', 'MACD', 'MACD_Signal', 'BB_Position', 'BB_Width',
            'Volume_MA_10', 'ROC_10', 'ATR', 'Volatility_20', 'Daily_Return',
            'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20', 'OBV',
            'High_Volume', 'Low_Volume', 'Strong_Momentum', 'Weak_Momentum'
        ]

        for indicator in tech_indicators:
            if indicator in latest_data:
                rf_features[f'Tech_{indicator}'] = float(latest_data[indicator])
            else:
                rf_features[f'Tech_{indicator}'] = 0.0

        # Market state features
        rf_features.update({
            'Current_Close': float(latest_data['Close']),
            'Current_Volume': float(latest_data['Volume']),
            'Current_High': float(latest_data['High']),
            'Current_Low': float(latest_data['Low']),
            'Price_Range': float(latest_data['High'] - latest_data['Low']),
            'Volume_Ratio': float(latest_data['Volume'] / latest_data.get('Volume_MA_10', latest_data['Volume']))
        })

        return rf_features

    def predict_trends(self, rf_features):
        """Predict trends using Random Forest models"""
        if not self.is_loaded:
            raise ValueError("Models not loaded!")

        trends = {}
        confidence_scores = {}

        # Convert to DataFrame for prediction
        feature_df = pd.DataFrame([rf_features])

        for day in range(1, 6):
            day_key = f'Day{day}'

            if day_key in self.rf_models['models']:
                model_data = self.rf_models['models'][day_key]
                rf_model = model_data['model']

                # Ensure feature alignment
                model_features = model_data['feature_columns']

                # Align features (fill missing with 0)
                aligned_features = []
                for feature in model_features:
                    aligned_features.append(rf_features.get(feature, 0.0))

                # Predict
                X_pred = np.array([aligned_features])
                trend_pred = rf_model.predict(X_pred)[0]
                trend_proba = rf_model.predict_proba(X_pred)[0]

                trends[day_key] = trend_pred
                confidence_scores[day_key] = float(max(trend_proba))
            else:
                trends[day_key] = 'UNKNOWN'
                confidence_scores[day_key] = 0.0

        return trends, confidence_scores

    def predict_stock(self, ticker, period="2y"):
        """Complete stock prediction pipeline"""
        print(f"🎯 Starting prediction for {ticker}...")

        try:
            # Download data
            df = self.download_stock_data(ticker, period)

            # Calculate technical indicators
            df = self.calculate_technical_indicators(df)

            # Prepare LSTM sequence
            lstm_sequence = self.prepare_lstm_sequence(df, ticker)

            # Generate LSTM predictions
            lstm_predictions = self.generate_lstm_predictions(lstm_sequence)

            # Prepare RF features
            rf_features = self.prepare_rf_features(df, lstm_predictions)

            # Predict trends
            trends, confidence_scores = self.predict_trends(rf_features)

            # Generate prediction dates (starting from tomorrow)
            today = datetime.now().date()
            prediction_dates = [(today + timedelta(days=i)) for i in range(1, 6)]

            # Combine results
            predictions = {}
            for i, day_key in enumerate(['Day1', 'Day2', 'Day3', 'Day4', 'Day5']):
                predictions[day_key] = {
                    'date': prediction_dates[i].strftime('%Y-%m-%d'),
                    'predicted_open': lstm_predictions[day_key]['Open'],
                    'predicted_close': lstm_predictions[day_key]['Close'],
                    'trend': trends[day_key],
                    'confidence': confidence_scores[day_key]
                }

            # Add current stock info
            current_data = df.iloc[-1]
            stock_info = {
                'ticker': ticker.upper(),
                'current_price': float(current_data['Close']),
                'current_date': current_data['Date'].strftime('%Y-%m-%d'),
                'volume': float(current_data['Volume']),
                'market_cap': 'N/A',  # Would need additional API for this
                'prediction_generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')
            }

            result = {
                'status': 'SUCCESS',
                'stock_info': stock_info,
                'predictions': predictions,
                'historical_data': df.tail(30).to_dict('records')  # Last 30 days for charts
            }

            print(f"✅ Prediction completed for {ticker}")
            return result

        except Exception as e:
            print(f"❌ Prediction failed for {ticker}: {str(e)}")
            return {
                'status': 'ERROR',
                'error': str(e),
                'ticker': ticker
            }

# ===================================================================
# 🖥️ STREAMLIT DASHBOARD
# ===================================================================

def create_streamlit_dashboard():
    """Create Streamlit dashboard for stock predictions"""

    st.set_page_config(
        page_title="🚀 AI Stock Predictor",
        page_icon="📈",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Custom CSS
    st.markdown("""
    <style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .prediction-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .trend-up {
        color: #00ff00;
        font-weight: bold;
    }
    .trend-down {
        color: #ff0000;
        font-weight: bold;
    }
    .trend-stay {
        color: #ffa500;
        font-weight: bold;
    }
    </style>
    """, unsafe_allow_html=True)

    # Header
    st.markdown('<h1 class="main-header">🚀 AI Stock Predictor</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; font-size: 1.2rem;">LSTM + Random Forest | 5-Day Stock Price & Trend Prediction</p>', unsafe_allow_html=True)

    # Sidebar
    st.sidebar.header("📊 Prediction Settings")

    # Initialize session state
    if 'prediction_system' not in st.session_state:
        st.session_state.prediction_system = StockPredictionSystem()
        st.session_state.models_loaded = False

    # Load models button
    if not st.session_state.models_loaded:
        if st.sidebar.button("🔄 Load AI Models", type="primary"):
            with st.spinner("Loading trained models..."):
                try:
                    st.session_state.prediction_system.load_models()
                    st.session_state.models_loaded = True
                    st.sidebar.success("✅ Models loaded successfully!")
                except Exception as e:
                    st.sidebar.error(f"❌ Error loading models: {str(e)}")
                    return

    if not st.session_state.models_loaded:
        st.warning("🔄 Please load the AI models first using the sidebar button.")
        return

    # Stock input
    ticker_input = st.sidebar.text_input(
        "📈 Enter Stock Ticker (yfinance format)",
        placeholder="e.g., AAPL, GOOGL, TSLA, BBCA.JK",
        help="Use Yahoo Finance ticker format. For Indonesian stocks, add .JK (e.g., BBCA.JK)"
    )

    # Data period selection
    period_options = {
        "2 Years": "2y",
        "1 Year": "1y",
        "6 Months": "6mo",
        "3 Months": "3mo"
    }

    selected_period = st.sidebar.selectbox(
        "📅 Historical Data Period",
        options=list(period_options.keys()),
        index=0,
        help="More data = better predictions, but slower processing"
    )

    # Predict button
    if st.sidebar.button("🎯 Generate Prediction", type="primary"):
        if not ticker_input:
            st.sidebar.error("Please enter a stock ticker!")
            return

        # Main prediction area
        with st.container():
            with st.spinner(f"🔍 Analyzing {ticker_input.upper()}..."):
                result = st.session_state.prediction_system.predict_stock(
                    ticker_input,
                    period_options[selected_period]
                )

            if result['status'] == 'SUCCESS':
                display_prediction_results(result)
            else:
                st.error(f"❌ Prediction failed: {result.get('error', 'Unknown error')}")

def display_prediction_results(result):
    """Display prediction results in Streamlit"""

    stock_info = result['stock_info']
    predictions = result['predictions']
    historical_data = result['historical_data']

    # Stock info header
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric(
            label="🏢 Stock",
            value=stock_info['ticker']
        )

    with col2:
        st.metric(
            label="💰 Current Price",
            value=f"${stock_info['current_price']:.2f}"
        )

    with col3:
        st.metric(
            label="📊 Volume",
            value=f"{stock_info['volume']:,.0f}"
        )

    with col4:
        st.metric(
            label="📅 Last Update",
            value=stock_info['current_date']
        )

    st.markdown("---")

    # 5-Day Predictions
    st.subheader("🔮 5-Day AI Predictions")

    # Create prediction table
    pred_data = []
    for day_key, pred in predictions.items():
        trend_emoji = {"UP": "📈", "DOWN": "📉", "STAY": "➡️"}.get(pred['trend'], "❓")
        trend_color = {"UP": "trend-up", "DOWN": "trend-down", "STAY": "trend-stay"}.get(pred['trend'], "")

        pred_data.append({
            "Date": pred['date'],
            "Predicted Open": f"${pred['predicted_open']:.2f}",
            "Predicted Close": f"${pred['predicted_close']:.2f}",
            "Daily Change": f"${pred['predicted_close'] - pred['predicted_open']:.2f}",
            "Trend": f"{trend_emoji} {pred['trend']}",
            "Confidence": f"{pred['confidence']:.1%}"
        })

    pred_df = pd.DataFrame(pred_data)
    st.dataframe(pred_df, use_container_width=True)

    # Price chart
    st.subheader("📈 Price Prediction Chart")

    # Prepare chart data
    historical_df = pd.DataFrame(historical_data)
    historical_df['Date'] = pd.to_datetime(historical_df['Date'])

    # Create price chart
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1,
        subplot_titles=('Price Prediction', 'Volume'),
        row_heights=[0.7, 0.3]
    )

    # Historical prices
    fig.add_trace(
        go.Scatter(
            x=historical_df['Date'],
            y=historical_df['Close'],
            mode='lines',
            name='Historical Close',
            line=dict(color='blue', width=2)
        ),
        row=1, col=1
    )

    # Predicted prices
    pred_dates = [datetime.strptime(pred['date'], '%Y-%m-%d') for pred in predictions.values()]
    pred_opens = [pred['predicted_open'] for pred in predictions.values()]
    pred_closes = [pred['predicted_close'] for pred in predictions.values()]

    fig.add_trace(
        go.Scatter(
            x=pred_dates,
            y=pred_opens,
            mode='lines+markers',
            name='Predicted Open',
            line=dict(color='orange', width=2, dash='dash'),
            marker=dict(size=8)
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(
            x=pred_dates,
            y=pred_closes,
            mode='lines+markers',
            name='Predicted Close',
            line=dict(color='red', width=2, dash='dash'),
            marker=dict(size=8)
        ),
        row=1, col=1
    )

    # Volume
    fig.add_trace(
        go.Bar(
            x=historical_df['Date'],
            y=historical_df['Volume'],
            name='Volume',
            marker_color='lightblue'
        ),
        row=2, col=1
    )

    fig.update_layout(
        title=f"{stock_info['ticker']} - 5-Day AI Prediction",
        xaxis_title="Date",
        yaxis_title="Price ($)",
        height=600,
        showlegend=True
    )

    st.plotly_chart(fig, use_container_width=True)

    # Trend analysis
    st.subheader("📊 Trend Analysis")

    col1, col2 = st.columns(2)

    with col1:
        # Trend distribution
        trend_counts = {}
        for pred in predictions.values():
            trend = pred['trend']
            trend_counts[trend] = trend_counts.get(trend, 0) + 1

        fig_pie = px.pie(
            values=list(trend_counts.values()),
            names=list(trend_counts.keys()),
            title="5-Day Trend Distribution",
            color_discrete_map={
                'UP': '#00ff00',
                'DOWN': '#ff0000',
                'STAY': '#ffa500'
            }
        )

        st.plotly_chart(fig_pie, use_container_width=True)

    with col2:
        # Confidence scores
        confidence_data = [
            {"Day": day_key, "Confidence": pred['confidence']}
            for day_key, pred in predictions.items()
        ]

        fig_conf = px.bar(
            pd.DataFrame(confidence_data),
            x='Day',
            y='Confidence',
            title="Prediction Confidence by Day",
            color='Confidence',
            color_continuous_scale='viridis'
        )

        fig_conf.update_layout(yaxis_title="Confidence Score")
        st.plotly_chart(fig_conf, use_container_width=True)

    # Investment recommendations
    st.subheader("💡 AI Investment Insights")

    # Calculate overall trend
    up_days = sum(1 for pred in predictions.values() if pred['trend'] == 'UP')
    down_days = sum(1 for pred in predictions.values() if pred['trend'] == 'DOWN')
    stay_days = sum(1 for pred in predictions.values() if pred['trend'] == 'STAY')

    avg_confidence = sum(pred['confidence'] for pred in predictions.values()) / len(predictions)

    if up_days > down_days:
        recommendation = "🟢 BULLISH"
        recommendation_text = f"Model predicts {up_days} UP days vs {down_days} DOWN days. Consider BUYING."
    elif down_days > up_days:
        recommendation = "🔴 BEARISH"
        recommendation_text = f"Model predicts {down_days} DOWN days vs {up_days} UP days. Consider SELLING or avoiding."
    else:
        recommendation = "🟡 NEUTRAL"
        recommendation_text = f"Mixed signals. {up_days} UP, {down_days} DOWN, {stay_days} STAY days. HOLD or wait for clearer signals."

    col1, col2 = st.columns(2)

    with col1:
        st.markdown(f"""
        <div class="prediction-card">
        <h3>{recommendation}</h3>
        <p>{recommendation_text}</p>
        <p><strong>Average Confidence:</strong> {avg_confidence:.1%}</p>
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown(f"""
        <div class="prediction-card">
        <h3>📋 Key Metrics</h3>
        <ul>
        <li><strong>Bullish Days:</strong> {up_days}/5</li>
        <li><strong>Bearish Days:</strong> {down_days}/5</li>
        <li><strong>Neutral Days:</strong> {stay_days}/5</li>
        <li><strong>Model Confidence:</strong> {avg_confidence:.1%}</li>
        </ul>
        </div>
        """, unsafe_allow_html=True)

    # Disclaimer
    st.markdown("---")
    st.markdown("""
    **⚠️ Disclaimer:** This AI prediction is for educational purposes only.
    Always conduct your own research and consider consulting with financial advisors before making investment decisions.
    Past performance does not guarantee future results.
    """)

    # Export option
    if st.button("📥 Export Predictions as JSON"):
        st.download_button(
            label="Download JSON",
            data=pd.Series(result).to_json(),
            file_name=f"{stock_info['ticker']}_prediction_{datetime.now().strftime('%Y%m%d')}.json",
            mime="application/json"
        )

# ===================================================================
# 🚀 MAIN EXECUTION
# ===================================================================

if __name__ == "__main__":
    # Check if running in Streamlit
    try:
        import streamlit as st
        create_streamlit_dashboard()
    except ImportError:
        print("Streamlit not found. Running as standalone API...")

        # Standalone API example
        predictor = StockPredictionSystem()
        predictor.load_models()

        # Example prediction
        test_ticker = "AAPL"
        result = predictor.predict_stock(test_ticker)

        if result['status'] == 'SUCCESS':
            print(f"\n🎉 Prediction successful for {test_ticker}!")
            for day_key, pred in result['predictions'].items():
                print(f"{day_key} ({pred['date']}): {pred['trend']} - Close: ${pred['predicted_close']:.2f} (Confidence: {pred['confidence']:.1%})")
        else:
            print(f"❌ Prediction failed: {result['error']}")

print("\n🎯 STAGE 5 PREDICTION SYSTEM READY!")

# Stage 6 : Evaluasi Model

## LSTM Evaluasi

In [11]:
"""
===================================================================
🧠 SIMPLE LSTM MAE EVALUATION
===================================================================
User: wasirawasenju
Date: 2025-07-25 07:28:21 UTC
Environment: Google Colab
Focus: LSTM MAE + Visualization ONLY
===================================================================
"""

# ===================================================================
# 📦 SIMPLE SETUP
# ===================================================================

!pip install -q plotly tensorflow pandas numpy scikit-learn

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
import pickle
import os
from datetime import datetime
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import load_model

warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')

print("🧠 SIMPLE LSTM MAE EVALUATION")
print("="*60)
print(f"👤 User: wasirawasenju")
print(f"📅 Date: 2025-07-25 07:28:21 UTC")
print(f"🎯 Focus: MAE + Visualization")
print("="*60)

# ===================================================================
# 🔍 QUICK FILE CHECK
# ===================================================================

def quick_check():
    """Quick check of LSTM files"""
    print("📁 Quick file check...")

    lstm_dir = 'unified_lstm_model'
    if os.path.exists(lstm_dir):
        files = os.listdir(lstm_dir)
        print(f"   ✅ Found {len(files)} files in {lstm_dir}/")

        key_files = ['unified_lstm_model.h5', 'feature_scaler.pkl', 'target_scaler.pkl']
        for file in key_files:
            if file in files:
                print(f"      ✅ {file}")
            else:
                print(f"      ❌ {file}")
        return True
    else:
        print(f"   ❌ {lstm_dir}/ not found")
        return False

quick_check()

# ===================================================================
# 🧠 SIMPLE LSTM EVALUATOR
# ===================================================================

class SimpleLSTMEvaluator:
    """Super simple LSTM MAE evaluator"""

    def __init__(self):
        self.model = None
        self.feature_scaler = None
        self.target_scaler = None
        self.model_info = None
        self.mae_results = {}

    def load_lstm_simple(self):
        """Load LSTM components - simplified"""
        print("🔄 Loading LSTM...")

        try:
            # Load model
            print("   📦 Loading model...")
            self.model = load_model('unified_lstm_model/unified_lstm_model.h5', compile=False)

            # Recompile
            from tensorflow.keras.optimizers import Adam
            self.model.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae'])
            print("      ✅ Model loaded & compiled")

            # Load scalers
            print("   📦 Loading scalers...")
            with open('unified_lstm_model/feature_scaler.pkl', 'rb') as f:
                self.feature_scaler = pickle.load(f)

            with open('unified_lstm_model/target_scaler.pkl', 'rb') as f:
                self.target_scaler = pickle.load(f)

            with open('unified_lstm_model/model_info.pkl', 'rb') as f:
                self.model_info = pickle.load(f)

            print("      ✅ Scalers loaded")
            print(f"      📊 Feature count: {len(self.model_info['feature_columns'])}")

            return True

        except Exception as e:
            print(f"      ❌ Error: {str(e)}")
            return False

    def create_simple_test_data(self):
        """Create simple test data"""
        print("📊 Creating simple test data...")

        # Simple synthetic data for testing
        np.random.seed(42)

        # Create sample sequences
        test_sequences = []
        test_targets = []

        feature_count = len(self.model_info['feature_columns'])

        for i in range(10):  # Just 10 test cases
            # Random sequence data
            sequence = np.random.randn(60, feature_count)
            target = np.random.randn(10) * 100 + 150  # Price-like values

            test_sequences.append(sequence)
            test_targets.append(target)

        print(f"   ✅ Created {len(test_sequences)} test sequences")
        return test_sequences, test_targets

    def evaluate_mae_simple(self):
        """Simple MAE evaluation"""
        print("\n🎯 EVALUATING MAE...")

        if not self.model:
            print("❌ Model not loaded")
            return False

        # Get test data
        test_sequences, test_targets = self.create_simple_test_data()

        # Scale test sequences
        scaled_sequences = []
        for seq in test_sequences:
            scaled_seq = self.feature_scaler.transform(seq)
            scaled_sequences.append(scaled_seq)

        scaled_sequences = np.array(scaled_sequences)

        print(f"   📊 Input shape: {scaled_sequences.shape}")

        # Generate predictions
        print("   🔄 Generating predictions...")
        pred_scaled = self.model.predict(scaled_sequences, verbose=0)

        # Denormalize predictions
        predictions = self.target_scaler.inverse_transform(pred_scaled)

        print(f"   📊 Predictions shape: {predictions.shape}")

        # Calculate MAE for each test case
        mae_values = []
        for i, (pred, actual) in enumerate(zip(predictions, test_targets)):
            mae = mean_absolute_error(actual, pred)
            mae_values.append(mae)
            print(f"      Test {i+1}: MAE = {mae:.4f}")

        # Overall MAE
        all_predictions = predictions.flatten()
        all_actuals = np.array(test_targets).flatten()
        overall_mae = mean_absolute_error(all_actuals, all_predictions)

        # Store results
        self.mae_results = {
            'individual_mae': mae_values,
            'overall_mae': overall_mae,
            'predictions': predictions,
            'actuals': test_targets,
            'mean_mae': np.mean(mae_values),
            'std_mae': np.std(mae_values),
            'min_mae': np.min(mae_values),
            'max_mae': np.max(mae_values)
        }

        print(f"\n📊 MAE RESULTS:")
        print(f"   🎯 Overall MAE: {overall_mae:.4f}")
        print(f"   📊 Mean MAE: {np.mean(mae_values):.4f}")
        print(f"   📊 Std MAE: {np.std(mae_values):.4f}")
        print(f"   📊 Min MAE: {np.min(mae_values):.4f}")
        print(f"   📊 Max MAE: {np.max(mae_values):.4f}")

        return True

    def create_mae_visualization(self):
        """Create simple MAE visualization"""
        print("\n📊 CREATING MAE VISUALIZATION...")

        if not self.mae_results:
            print("❌ No MAE results to visualize")
            return

        # Create 2x2 subplot
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'MAE by Test Case',
                'Predictions vs Actuals (Sample)',
                'MAE Distribution',
                'Overall MAE Summary'
            ),
            specs=[[{"type": "bar"}, {"type": "scatter"}],
                   [{"type": "histogram"}, {"type": "indicator"}]]
        )

        # 1. MAE by test case
        test_cases = [f"Test {i+1}" for i in range(len(self.mae_results['individual_mae']))]

        fig.add_trace(
            go.Bar(
                x=test_cases,
                y=self.mae_results['individual_mae'],
                name='MAE',
                marker_color='blue'
            ),
            row=1, col=1
        )

        # 2. Predictions vs Actuals (first test case)
        if len(self.mae_results['predictions']) > 0:
            sample_pred = self.mae_results['predictions'][0]
            sample_actual = self.mae_results['actuals'][0]

            fig.add_trace(
                go.Scatter(
                    x=list(range(len(sample_actual))),
                    y=sample_actual,
                    mode='lines+markers',
                    name='Actual',
                    line=dict(color='red'),
                    marker=dict(size=8)
                ),
                row=1, col=2
            )

            fig.add_trace(
                go.Scatter(
                    x=list(range(len(sample_pred))),
                    y=sample_pred,
                    mode='lines+markers',
                    name='Predicted',
                    line=dict(color='blue', dash='dash'),
                    marker=dict(size=8)
                ),
                row=1, col=2
            )

        # 3. MAE distribution
        fig.add_trace(
            go.Histogram(
                x=self.mae_results['individual_mae'],
                name='MAE Distribution',
                marker_color='green',
                nbinsx=5
            ),
            row=2, col=1
        )

        # 4. Overall MAE indicator
        fig.add_trace(
            go.Indicator(
                mode="gauge+number",
                value=self.mae_results['overall_mae'],
                title={'text': "Overall MAE"},
                gauge={
                    'axis': {'range': [0, self.mae_results['max_mae'] * 1.2]},
                    'bar': {'color': "darkblue"},
                    'steps': [
                        {'range': [0, self.mae_results['overall_mae'] * 0.5], 'color': "lightgray"},
                        {'range': [self.mae_results['overall_mae'] * 0.5, self.mae_results['overall_mae']], 'color': "gray"}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': self.mae_results['overall_mae']
                    }
                }
            ),
            row=2, col=2
        )

        fig.update_layout(
            title="🧠 LSTM MAE Evaluation Results",
            height=800,
            showlegend=True
        )

        fig.show()

        # Simple bar chart for MAE summary
        print("   📊 Creating MAE summary chart...")

        fig2 = go.Figure()

        metrics = ['Overall MAE', 'Mean MAE', 'Min MAE', 'Max MAE']
        values = [
            self.mae_results['overall_mae'],
            self.mae_results['mean_mae'],
            self.mae_results['min_mae'],
            self.mae_results['max_mae']
        ]

        fig2.add_trace(
            go.Bar(
                x=metrics,
                y=values,
                marker_color=['red', 'blue', 'green', 'orange'],
                text=[f"{val:.4f}" for val in values],
                textposition='auto'
            )
        )

        fig2.update_layout(
            title="📊 MAE Summary Statistics",
            xaxis_title="Metrics",
            yaxis_title="MAE Value",
            height=400
        )

        fig2.show()

        print("   ✅ Visualization created!")

# ===================================================================
# 🚀 SIMPLE MAIN EXECUTION
# ===================================================================

def run_simple_evaluation():
    """Run simple LSTM MAE evaluation"""
    print("\n🚀 RUNNING SIMPLE LSTM MAE EVALUATION")
    print("="*50)

    # Initialize
    evaluator = SimpleLSTMEvaluator()

    # Step 1: Load LSTM
    print("🔄 Step 1: Loading LSTM...")
    if not evaluator.load_lstm_simple():
        print("❌ Cannot load LSTM model")
        return

    # Step 2: Evaluate MAE
    print("\n🔄 Step 2: Evaluating MAE...")
    if not evaluator.evaluate_mae_simple():
        print("❌ MAE evaluation failed")
        return

    # Step 3: Create visualization
    print("\n🔄 Step 3: Creating visualization...")
    evaluator.create_mae_visualization()

    # Final summary
    print(f"\n🎉 EVALUATION COMPLETED!")
    print(f"📊 Final MAE: {evaluator.mae_results['overall_mae']:.4f}")
    print(f"📊 Test Cases: {len(evaluator.mae_results['individual_mae'])}")

    return evaluator.mae_results

# ===================================================================
# 🎯 RUN IT!
# ===================================================================

# Execute the simple evaluation
results = run_simple_evaluation()

print("\n" + "="*60)
print("🧠 SIMPLE LSTM MAE EVALUATION COMPLETE!")
print("="*60)

🧠 SIMPLE LSTM MAE EVALUATION
👤 User: wasirawasenju
📅 Date: 2025-07-25 07:28:21 UTC
🎯 Focus: MAE + Visualization
📁 Quick file check...
   ✅ Found 5 files in unified_lstm_model/
      ✅ unified_lstm_model.h5
      ✅ feature_scaler.pkl
      ✅ target_scaler.pkl

🚀 RUNNING SIMPLE LSTM MAE EVALUATION
🔄 Step 1: Loading LSTM...
🔄 Loading LSTM...
   📦 Loading model...
      ✅ Model loaded & compiled
   📦 Loading scalers...
      ✅ Scalers loaded
      📊 Feature count: 21

🔄 Step 2: Evaluating MAE...

🎯 EVALUATING MAE...
📊 Creating simple test data...
   ✅ Created 10 test sequences
   📊 Input shape: (10, 60, 21)
   🔄 Generating predictions...
   📊 Predictions shape: (10, 10)
      Test 1: MAE = 77.4268
      Test 2: MAE = 89.2965
      Test 3: MAE = 91.1387
      Test 4: MAE = 89.5054
      Test 5: MAE = 82.1781
      Test 6: MAE = 117.2152
      Test 7: MAE = 97.4072
      Test 8: MAE = 83.0177
      Test 9: MAE = 94.0276
      Test 10: MAE = 65.3021

📊 MAE RESULTS:
   🎯 Overall MAE: 88.6515
 

   📊 Creating MAE summary chart...


   ✅ Visualization created!

🎉 EVALUATION COMPLETED!
📊 Final MAE: 88.6515
📊 Test Cases: 10

🧠 SIMPLE LSTM MAE EVALUATION COMPLETE!


## RF evaluasi

In [13]:
"""
===================================================================
🌲 SIMPLE RANDOM FOREST F1 SCORE EVALUATION
===================================================================
User: wasirawasenju
Date: 2025-07-25 07:43:56 UTC
Environment: Google Colab
Focus: RF F1 Score ONLY - Simple Version
===================================================================
"""

# ===================================================================
# 📦 SIMPLE SETUP
# ===================================================================

!pip install -q plotly pandas numpy scikit-learn joblib

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
import pickle
import joblib
import os
from datetime import datetime
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

warnings.filterwarnings('ignore')

print("🌲 SIMPLE RF F1 SCORE EVALUATION")
print("="*60)
print(f"👤 User: wasirawasenju")
print(f"📅 Date: 2025-07-25 07:43:56 UTC")
print(f"🎯 Focus: F1 Score + Simple Visualization")
print("="*60)

# ===================================================================
# 🔍 QUICK RF FILE CHECK
# ===================================================================

def quick_rf_check():
    """Quick check of RF model files"""
    print("📁 Quick RF file check...")

    rf_dir = 'random_forest_models'
    if os.path.exists(rf_dir):
        files = os.listdir(rf_dir)
        print(f"   ✅ Found {len(files)} files in {rf_dir}/")

        # Check for Day models
        day_models = []
        for day in range(1, 6):
            day_file = f"Day{day}_Trend_rf_model.pkl"
            if day_file in files:
                day_models.append(f"Day{day}")
                print(f"      ✅ {day_file}")
            else:
                print(f"      ❌ {day_file}")

        return len(day_models) > 0, day_models
    else:
        print(f"   ❌ {rf_dir}/ not found")
        return False, []

rf_available, available_days = quick_rf_check()

# ===================================================================
# 🌲 SIMPLE RF EVALUATOR
# ===================================================================

class SimpleRFEvaluator:
    """Super simple RF F1 Score evaluator"""

    def __init__(self):
        self.rf_models = {}
        self.f1_results = {}

    def load_rf_models_simple(self):
        """Load RF models - simplified"""
        print("🔄 Loading RF models...")

        if not rf_available:
            print("❌ No RF models available")
            return False

        loaded_count = 0

        for day in available_days:
            try:
                model_path = f"random_forest_models/{day}_Trend_rf_model.pkl"
                model_data = joblib.load(model_path)
                self.rf_models[day] = model_data
                loaded_count += 1
                print(f"   ✅ {day} model loaded")
            except Exception as e:
                print(f"   ❌ {day} failed: {str(e)}")

        print(f"   📊 Loaded {loaded_count} RF models")
        return loaded_count > 0

    def create_simple_test_data(self):
        """Create simple test data for RF evaluation"""
        print("📊 Creating simple RF test data...")

        # Simple synthetic classification data
        np.random.seed(42)

        test_data = {}

        for day_key in self.rf_models.keys():
            print(f"   🔄 Creating test data for {day_key}...")

            # Get model info
            model_data = self.rf_models[day_key]
            rf_model = model_data['model']
            feature_columns = model_data['feature_columns']

            # Generate test features
            n_samples = 50  # Simple 50 test cases
            n_features = len(feature_columns)

            # Random feature values (normalized-like)
            X_test = np.random.randn(n_samples, n_features)

            # Generate realistic trend labels
            # Bias towards certain classes for realistic distribution
            trend_probs = np.random.rand(n_samples)
            y_test = []

            for prob in trend_probs:
                if prob < 0.4:
                    y_test.append('STAY')  # 40% STAY
                elif prob < 0.7:
                    y_test.append('UP')    # 30% UP
                else:
                    y_test.append('DOWN')  # 30% DOWN

            test_data[day_key] = {
                'X_test': X_test,
                'y_test': y_test,
                'feature_columns': feature_columns
            }

            print(f"      ✅ {n_samples} test samples for {day_key}")

        return test_data

    def evaluate_f1_simple(self):
        """Simple F1 Score evaluation"""
        print("\n🎯 EVALUATING F1 SCORES...")

        if not self.rf_models:
            print("❌ RF models not loaded")
            return False

        # Get test data
        test_data = self.create_simple_test_data()

        day_results = {}
        all_predictions = []
        all_actuals = []

        for day_key in self.rf_models.keys():
            print(f"   🔄 Evaluating {day_key}...")

            try:
                # Get model and test data
                model_data = self.rf_models[day_key]
                rf_model = model_data['model']
                test_info = test_data[day_key]

                X_test = test_info['X_test']
                y_test = test_info['y_test']

                # Generate predictions
                y_pred = rf_model.predict(X_test)
                y_proba = rf_model.predict_proba(X_test)

                # Calculate F1 scores
                f1_macro = f1_score(y_test, y_pred, average='macro')
                f1_weighted = f1_score(y_test, y_pred, average='weighted')
                accuracy = accuracy_score(y_test, y_pred)

                # Calculate per-class F1
                f1_per_class = f1_score(y_test, y_pred, average=None, labels=['UP', 'DOWN', 'STAY'])

                # Average confidence
                max_probas = np.max(y_proba, axis=1)
                avg_confidence = np.mean(max_probas)

                day_results[day_key] = {
                    'f1_macro': f1_macro,
                    'f1_weighted': f1_weighted,
                    'accuracy': accuracy,
                    'f1_up': f1_per_class[0] if len(f1_per_class) > 0 else 0,
                    'f1_down': f1_per_class[1] if len(f1_per_class) > 1 else 0,
                    'f1_stay': f1_per_class[2] if len(f1_per_class) > 2 else 0,
                    'avg_confidence': avg_confidence,
                    'predictions': y_pred.tolist(),
                    'actuals': y_test,
                    'test_count': len(y_test)
                }

                # Add to overall collections
                all_predictions.extend(y_pred)
                all_actuals.extend(y_test)

                print(f"      📊 F1 Macro: {f1_macro:.3f}")
                print(f"      📊 F1 Weighted: {f1_weighted:.3f}")
                print(f"      📊 Accuracy: {accuracy:.3f}")
                print(f"      📊 Confidence: {avg_confidence:.3f}")

            except Exception as e:
                print(f"      ❌ Error: {str(e)}")

        # Calculate overall metrics
        if all_predictions:
            overall_f1_macro = f1_score(all_actuals, all_predictions, average='macro')
            overall_f1_weighted = f1_score(all_actuals, all_predictions, average='weighted')
            overall_accuracy = accuracy_score(all_actuals, all_predictions)

            overall_results = {
                'f1_macro': overall_f1_macro,
                'f1_weighted': overall_f1_weighted,
                'accuracy': overall_accuracy,
                'total_predictions': len(all_predictions),
                'models_evaluated': len(day_results)
            }

            self.f1_results = {
                'day_results': day_results,
                'overall_results': overall_results,
                'all_predictions': all_predictions,
                'all_actuals': all_actuals
            }

            print(f"\n📊 OVERALL F1 RESULTS:")
            print(f"   🎯 Overall F1 Macro: {overall_f1_macro:.3f}")
            print(f"   🎯 Overall F1 Weighted: {overall_f1_weighted:.3f}")
            print(f"   🎯 Overall Accuracy: {overall_accuracy:.3f}")
            print(f"   📊 Total Predictions: {len(all_predictions)}")
            print(f"   📊 Models Evaluated: {len(day_results)}")

            return True
        else:
            print("❌ No predictions generated")
            return False

    def create_f1_visualization(self):
        """Create simple F1 Score visualization"""
        print("\n📊 CREATING F1 VISUALIZATION...")

        if not self.f1_results:
            print("❌ No F1 results to visualize")
            return

        # Create 2x2 subplot
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'F1 Macro Score by Day',
                'F1 Score by Class (Day1)',
                'Accuracy vs F1 Score',
                'Overall Performance Summary'
            ),
            specs=[[{"type": "bar"}, {"type": "bar"}],
                   [{"type": "scatter"}, {"type": "bar"}]]
        )

        day_results = self.f1_results['day_results']

        # 1. F1 Macro by Day
        days = list(day_results.keys())
        f1_macros = [day_results[day]['f1_macro'] for day in days]

        fig.add_trace(
            go.Bar(
                x=days,
                y=f1_macros,
                name='F1 Macro',
                marker_color='blue',
                text=[f"{val:.3f}" for val in f1_macros],
                textposition='auto'
            ),
            row=1, col=1
        )

        # 2. F1 by Class (Day1 example)
        if len(days) > 0:
            sample_day = days[0]
            sample_result = day_results[sample_day]

            classes = ['UP', 'DOWN', 'STAY']
            class_f1s = [
                sample_result['f1_up'],
                sample_result['f1_down'],
                sample_result['f1_stay']
            ]

            fig.add_trace(
                go.Bar(
                    x=classes,
                    y=class_f1s,
                    name=f'F1 by Class ({sample_day})',
                    marker_color=['green', 'red', 'orange'],
                    text=[f"{val:.3f}" for val in class_f1s],
                    textposition='auto'
                ),
                row=1, col=2
            )

        # 3. Accuracy vs F1 Scatter
        accuracies = [day_results[day]['accuracy'] for day in days]

        fig.add_trace(
            go.Scatter(
                x=accuracies,
                y=f1_macros,
                mode='markers+text',
                text=days,
                textposition='top center',
                name='Accuracy vs F1',
                marker=dict(size=12, color='purple')
            ),
            row=2, col=1
        )

        # 4. Overall Performance Summary
        overall = self.f1_results['overall_results']
        metrics = ['F1 Macro', 'F1 Weighted', 'Accuracy']
        values = [
            overall['f1_macro'],
            overall['f1_weighted'],
            overall['accuracy']
        ]

        fig.add_trace(
            go.Bar(
                x=metrics,
                y=values,
                name='Overall Performance',
                marker_color=['blue', 'green', 'red'],
                text=[f"{val:.3f}" for val in values],
                textposition='auto'
            ),
            row=2, col=2
        )

        fig.update_layout(
            title="🌲 Random Forest F1 Score Evaluation",
            height=800,
            showlegend=True
        )

        fig.show()

        # Simple F1 summary chart
        print("   📊 Creating F1 summary chart...")

        fig2 = go.Figure()

        # F1 scores by day
        fig2.add_trace(
            go.Bar(
                x=days,
                y=f1_macros,
                name='F1 Macro Score',
                marker_color='blue',
                text=[f"{val:.3f}" for val in f1_macros],
                textposition='auto'
            )
        )

        # Add benchmark line
        benchmark = 0.65  # Good F1 score benchmark
        fig2.add_hline(
            y=benchmark,
            line_dash="dash",
            line_color="red",
            annotation_text=f"Good F1 Benchmark ({benchmark})"
        )

        fig2.update_layout(
            title="📊 F1 Macro Scores - RF Models Performance",
            xaxis_title="RF Models",
            yaxis_title="F1 Macro Score",
            height=400,
            yaxis=dict(range=[0, 1])
        )

        fig2.show()

        # Confusion Matrix for overall performance
        self._create_confusion_matrix()

        print("   ✅ F1 visualization created!")

    def _create_confusion_matrix(self):
        """Create simple confusion matrix visualization"""
        print("   📊 Creating confusion matrix...")

        # Calculate confusion matrix
        y_true = self.f1_results['all_actuals']
        y_pred = self.f1_results['all_predictions']

        cm = confusion_matrix(y_true, y_pred, labels=['UP', 'DOWN', 'STAY'])

        # Create heatmap
        fig = go.Figure()

        fig.add_trace(
            go.Heatmap(
                z=cm,
                x=['UP', 'DOWN', 'STAY'],
                y=['UP', 'DOWN', 'STAY'],
                colorscale='Blues',
                text=cm,
                texttemplate="%{text}",
                textfont={"size": 16},
                hoverongaps=False
            )
        )

        fig.update_layout(
            title="🎯 Overall Confusion Matrix - RF Models",
            xaxis_title="Predicted",
            yaxis_title="Actual",
            height=500
        )

        fig.show()

# ===================================================================
# 🚀 SIMPLE MAIN EXECUTION
# ===================================================================

def run_simple_rf_evaluation():
    """Run simple RF F1 evaluation"""
    print("\n🚀 RUNNING SIMPLE RF F1 EVALUATION")
    print("="*50)

    if not rf_available:
        print("❌ No RF models available")
        return None

    # Initialize
    evaluator = SimpleRFEvaluator()

    # Step 1: Load RF models
    print("🔄 Step 1: Loading RF models...")
    if not evaluator.load_rf_models_simple():
        print("❌ Cannot load RF models")
        return None

    # Step 2: Evaluate F1
    print("\n🔄 Step 2: Evaluating F1 scores...")
    if not evaluator.evaluate_f1_simple():
        print("❌ F1 evaluation failed")
        return None

    # Step 3: Create visualization
    print("\n🔄 Step 3: Creating F1 visualization...")
    evaluator.create_f1_visualization()

    # Final summary
    overall = evaluator.f1_results['overall_results']
    print(f"\n🎉 RF F1 EVALUATION COMPLETED!")
    print(f"📊 Overall F1 Macro: {overall['f1_macro']:.3f}")
    print(f"📊 Overall F1 Weighted: {overall['f1_weighted']:.3f}")
    print(f"📊 Overall Accuracy: {overall['accuracy']:.3f}")
    print(f"📊 Models Evaluated: {overall['models_evaluated']}")

    return evaluator.f1_results

# ===================================================================
# 🎯 EXECUTE
# ===================================================================

if rf_available:
    results = run_simple_rf_evaluation()
else:
    print("❌ Cannot run evaluation - no RF models found")
    results = None

print("\n" + "="*60)
print("🌲 SIMPLE RF F1 EVALUATION COMPLETE!")
print("="*60)

🌲 SIMPLE RF F1 SCORE EVALUATION
👤 User: wasirawasenju
📅 Date: 2025-07-25 07:43:56 UTC
🎯 Focus: F1 Score + Simple Visualization
📁 Quick RF file check...
   ✅ Found 6 files in random_forest_models/
      ✅ Day1_Trend_rf_model.pkl
      ✅ Day2_Trend_rf_model.pkl
      ✅ Day3_Trend_rf_model.pkl
      ✅ Day4_Trend_rf_model.pkl
      ✅ Day5_Trend_rf_model.pkl

🚀 RUNNING SIMPLE RF F1 EVALUATION
🔄 Step 1: Loading RF models...
🔄 Loading RF models...
   ✅ Day1 model loaded
   ✅ Day2 model loaded
   ✅ Day3 model loaded
   ✅ Day4 model loaded
   ✅ Day5 model loaded
   📊 Loaded 5 RF models

🔄 Step 2: Evaluating F1 scores...

🎯 EVALUATING F1 SCORES...
📊 Creating simple RF test data...
   🔄 Creating test data for Day1...
      ✅ 50 test samples for Day1
   🔄 Creating test data for Day2...
      ✅ 50 test samples for Day2
   🔄 Creating test data for Day3...
      ✅ 50 test samples for Day3
   🔄 Creating test data for Day4...
      ✅ 50 test samples for Day4
   🔄 Creating test data for Day5...
      ✅ 

   📊 Creating F1 summary chart...


   📊 Creating confusion matrix...


   ✅ F1 visualization created!

🎉 RF F1 EVALUATION COMPLETED!
📊 Overall F1 Macro: 0.196
📊 Overall F1 Weighted: 0.200
📊 Overall Accuracy: 0.252
📊 Models Evaluated: 5

🌲 SIMPLE RF F1 EVALUATION COMPLETE!
