In [5]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


Collecting pyarrow>=14.0.0 (from -r requirements.txt (line 6))
  Downloading pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting lightgbm>=4.0.0 (from -r requirements.txt (line 9))
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting xgboost>=2.0.0 (from -r requirements.txt (line 10))
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting scikit-learn>=1.3.0 (from -r requirements.txt (line 11))
  Downloading scikit_learn-1.7.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting catboost>=1.2.0 (from -r requirements.txt (line 12))
  Downloading catboost-1.2.8-cp313-cp313-manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tensorflow>=2.13.0 (from -r requirements.txt (line 15))
  Downloading tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting ta-lib (from -r requirements.txt (line 20))
  Downlo

1.load data 

In [3]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
import time
import os
import warnings
warnings.filterwarnings('ignore')

class TeslaDataCollector:
    """
    Collect 6 months of 5-minute bar data for Tesla
    Note: yfinance has limitations on historical 5-minute data
    """
    def __init__(self, ticker='TSLA'):
        self.ticker = ticker
        self.data_cache_dir = 'tesla_data_cache'
        
        # Create cache directory
        if not os.path.exists(self.data_cache_dir):
            os.makedirs(self.data_cache_dir)
    
    def fetch_5min_data_polygon(self, start_date, end_date, api_key=None):
        """
        Fetch 5-minute data using Polygon.io API (recommended for 6 months)
        Requires API key but provides reliable historical data
        """
        if not api_key:
            print("‚ö†Ô∏è Polygon API key not provided. Using fallback method...")
            return None
        
        try:
            from polygon import RESTClient
            client = RESTClient(api_key)
            
            bars = []
            current_date = start_date
            
            while current_date < end_date:
                next_date = min(current_date + timedelta(days=30), end_date)
                
                print(f"  Fetching {current_date.date()} to {next_date.date()}...")
                
                aggs = client.get_aggs(
                    ticker=self.ticker,
                    multiplier=5,
                    timespan="minute",
                    from_=current_date.strftime('%Y-%m-%d'),
                    to=next_date.strftime('%Y-%m-%d'),
                    limit=50000
                )
                
                bars.extend(aggs)
                current_date = next_date
                time.sleep(0.2)  # Rate limiting
            
            df = pd.DataFrame(bars)
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
            
            return df
            
        except Exception as e:
            print(f"‚ùå Polygon API error: {e}")
            return None
    
    def fetch_5min_data_alpaca(self, start_date, end_date, api_key=None, secret_key=None):
        """
        Fetch 5-minute data using Alpaca API (free with account)
        Good alternative for 6 months of 5-minute data
        """
        if not api_key or not secret_key:
            print("‚ö†Ô∏è Alpaca API credentials not provided. Using fallback method...")
            return None
        
        try:
            from alpaca.data.historical import StockHistoricalDataClient
            from alpaca.data.requests import StockBarsRequest
            from alpaca.data.timeframe import TimeFrame
            
            client = StockHistoricalDataClient(api_key, secret_key)
            
            request_params = StockBarsRequest(
                symbol_or_symbols=self.ticker,
                timeframe=TimeFrame.Minute,
                start=start_date,
                end=end_date
            )
            
            bars = client.get_stock_bars(request_params)
            df = bars.df.reset_index()
            
            # Resample to 5-minute bars
            df = df.set_index('timestamp')
            df_5min = df.resample('5T').agg({
                'open': 'first',
                'high': 'max',
                'low': 'min',
                'close': 'last',
                'volume': 'sum',
                'trade_count': 'sum'
            })
            
            return df_5min.reset_index()
            
        except Exception as e:
            print(f"‚ùå Alpaca API error: {e}")
            return None
    
    def fetch_5min_data_fallback(self, months_back=6):
        """
        Fallback method: Combine multiple data sources
        Since yfinance limits 5-minute data to 60 days
        """
        print("Using fallback method (limited to 60 days of 5-min data)")
        
        all_data = []
        
        # 1. Get last 60 days of 5-minute data from yfinance
        print("Fetching recent 60 days of 5-minute data...")
        
        end_date = datetime.now()
        start_date = end_date - timedelta(days=60)
        
        ticker = yf.Ticker(self.ticker)
        df_5min = ticker.history(
            start=start_date,
            end=end_date,
            interval='5m'
        )
        
        if not df_5min.empty:
            df_5min = df_5min.reset_index()
            df_5min.columns = [col.lower() for col in df_5min.columns]
            
            if 'datetime' in df_5min.columns:
                df_5min = df_5min.rename(columns={'datetime': 'timestamp'})
            elif 'index' in df_5min.columns:
                df_5min = df_5min.rename(columns={'index': 'timestamp'})
            
            all_data.append(df_5min)
            print(f"  ‚úÖ Fetched {len(df_5min)} 5-minute bars")
        
        # 2. Get older data at 15-minute intervals and interpolate
        print("Fetching older data at 15-minute intervals...")
        
        older_end = start_date
        older_start = older_end - timedelta(days=120)  # 4 more months
        
        df_15min = ticker.history(
            start=older_start,
            end=older_end,
            interval='15m'
        )
        
        if not df_15min.empty:
            df_15min = df_15min.reset_index()
            df_15min.columns = [col.lower() for col in df_15min.columns]
            
            if 'datetime' in df_15min.columns:
                df_15min = df_15min.rename(columns={'datetime': 'timestamp'})
            elif 'index' in df_15min.columns:
                df_15min = df_15min.rename(columns={'index': 'timestamp'})
            
            # Interpolate to approximate 5-minute bars
            df_15min_resampled = self.interpolate_to_5min(df_15min)
            all_data.append(df_15min_resampled)
            print(f"  ‚úÖ Interpolated {len(df_15min_resampled)} 5-minute bars from 15-min data")
        
        # Combine all data
        if all_data:
            df_combined = pd.concat(all_data, axis=0)
            df_combined = df_combined.drop_duplicates(subset=['timestamp'])
            df_combined = df_combined.sort_values('timestamp')
            return df_combined
        
        return pd.DataFrame()
    
    def interpolate_to_5min(self, df_15min):
        """
        Interpolate 15-minute bars to approximate 5-minute bars
        This is an approximation for backtesting when real 5-min data isn't available
        """
        df = df_15min.set_index('timestamp')
        
        # Resample to 5-minute intervals
        df_5min = pd.DataFrame()
        
        # For each 15-minute bar, create three 5-minute bars
        for idx, row in df.iterrows():
            # Simulate intraday price movement
            open_price = row['open']
            close_price = row['close']
            high_price = row['high']
            low_price = row['low']
            volume = row['volume']
            
            # Create 3 intermediate 5-minute bars
            mid1 = open_price + (close_price - open_price) * 0.33
            mid2 = open_price + (close_price - open_price) * 0.67
            
            bars = []
            
            # First 5-min bar
            bars.append({
                'timestamp': idx,
                'open': open_price,
                'high': max(open_price, mid1) + np.random.uniform(0, (high_price - close_price) * 0.3),
                'low': min(open_price, mid1) - np.random.uniform(0, (open_price - low_price) * 0.3),
                'close': mid1,
                'volume': volume * 0.33
            })
            
            # Second 5-min bar
            bars.append({
                'timestamp': idx + timedelta(minutes=5),
                'open': mid1,
                'high': max(mid1, mid2) + np.random.uniform(0, (high_price - close_price) * 0.3),
                'low': min(mid1, mid2) - np.random.uniform(0, (open_price - low_price) * 0.3),
                'close': mid2,
                'volume': volume * 0.33
            })
            
            # Third 5-min bar
            bars.append({
                'timestamp': idx + timedelta(minutes=10),
                'open': mid2,
                'high': max(mid2, close_price) + np.random.uniform(0, (high_price - close_price) * 0.3),
                'low': min(mid2, close_price) - np.random.uniform(0, (open_price - low_price) * 0.3),
                'close': close_price,
                'volume': volume * 0.34
            })
            
            df_5min = pd.concat([df_5min, pd.DataFrame(bars)])
        
        return df_5min.reset_index(drop=True)
    
    def fetch_complete_dataset(self, api_key=None, api_secret=None):
        """
        Main method to fetch 6 months of 5-minute data
        Tries multiple sources in order of preference
        """
        print("\n" + "="*60)
        print("Fetching 6 Months of 5-Minute Bar Data")
        print("="*60 + "\n")
        
        end_date = datetime.now()
        start_date = end_date - timedelta(days=180)  # 6 months
        
        # Check cache first
        cache_file = os.path.join(self.data_cache_dir, f'{self.ticker}_5min_6months.csv')
        
        if os.path.exists(cache_file):
            # Check if cache is recent (less than 1 day old)
            file_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
            if datetime.now() - file_time < timedelta(days=1):
                print("üìÅ Loading from cache...")
                df = pd.read_csv(cache_file)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                print(f"‚úÖ Loaded {len(df)} bars from cache")
                return df
        
        # Try Polygon API first (best option)
        if api_key and 'polygon' in api_key.lower():
            print("Attempting to fetch via Polygon API...")
            df = self.fetch_5min_data_polygon(start_date, end_date, api_key)
            if df is not None and not df.empty:
                df.to_csv(cache_file, index=False)
                print(f"‚úÖ Successfully fetched {len(df)} bars via Polygon")
                return df
        
        # Try Alpaca API second
        if api_key and api_secret:
            print("Attempting to fetch via Alpaca API...")
            df = self.fetch_5min_data_alpaca(start_date, end_date, api_key, api_secret)
            if df is not None and not df.empty:
                df.to_csv(cache_file, index=False)
                print(f"‚úÖ Successfully fetched {len(df)} bars via Alpaca")
                return df
        
        # Use fallback method
        print("Using fallback method (yfinance + interpolation)...")
        df = self.fetch_5min_data_fallback(months_back=6)
        
        if not df.empty:
            df.to_csv(cache_file, index=False)
            print(f"‚úÖ Successfully created {len(df)} bars via fallback method")
        
        return df
    
    def add_market_features(self, df):
        """
        Add VIX, sentiment, and other market features
        """
        print("\nAdding market features...")
        
        # Fetch VIX data
        vix = yf.Ticker("^VIX")
        end_date = df['timestamp'].max()
        start_date = df['timestamp'].min()
        
        vix_data = vix.history(
            start=start_date,
            end=end_date,
            interval='1d'  # Daily VIX is sufficient
        )
        
        if not vix_data.empty:
            # Resample VIX to match our 5-minute data
            vix_data = vix_data['Close']
            vix_data = vix_data.resample('5T').fillna(method='ffill')
            
            df = df.set_index('timestamp')
            df['vix'] = vix_data
            df = df.reset_index()
        else:
            # Default VIX value
            df['vix'] = 20.0
        
        # Add sentiment score (simplified - in production use news API)
        df['sentiment_score'] = 0.5 + 0.3 * np.sin(np.arange(len(df)) / 1000)
        df['sentiment_score'] = np.clip(df['sentiment_score'], -1, 1)
        
        # Add transaction count estimate
        df['transactions'] = (df['volume'] / 100).astype(int) + np.random.randint(50, 200, len(df))
        
        # Forward fill any NaN values
        df = df.fillna(method='ffill').fillna(method='bfill')
        
        print(f"‚úÖ Added market features")
        
        return df
    
    def validate_data(self, df):
        """
        Validate and clean the data
        """
        print("\nValidating data...")
        
        # Check for required columns
        required_cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        for col in required_cols:
            if col not in df.columns:
                print(f"‚ùå Missing required column: {col}")
                return None
        
        # Remove any rows where OHLC relationship is violated
        invalid_mask = (df['high'] < df['low']) | \
                      (df['high'] < df['open']) | \
                      (df['high'] < df['close']) | \
                      (df['low'] > df['open']) | \
                      (df['low'] > df['close'])
        
        if invalid_mask.any():
            print(f"  Removing {invalid_mask.sum()} invalid OHLC rows")
            df = df[~invalid_mask]
        
        # Remove duplicates
        df = df.drop_duplicates(subset=['timestamp'])
        
        # Sort by timestamp
        df = df.sort_values('timestamp')
        
        # Calculate some statistics
        date_range = (df['timestamp'].max() - df['timestamp'].min()).days
        bars_per_day = len(df) / max(date_range, 1)
        
        print(f"\nüìä Data Statistics:")
        print(f"  Date range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}")
        print(f"  Total bars: {len(df):,}")
        print(f"  Trading days: ~{date_range}")
        print(f"  Bars per day: ~{bars_per_day:.0f}")
        print(f"  Expected (78 bars/day): {date_range * 78:,}")
        
        if len(df) < date_range * 50:  # Less than 50 bars per day suggests missing data
            print("  ‚ö†Ô∏è Warning: Data might be incomplete")
        
        return df

# Main function to load data
def load_6months_5min_data(api_key=None, api_secret=None):
    """
    Main function to load 6 months of 5-minute bar data
    
    Args:
        api_key: Optional API key for Polygon or Alpaca
        api_secret: Optional API secret for Alpaca
    
    Returns:
        DataFrame with 6 months of 5-minute bars
    """
    
    print("\n" + "="*80)
    print(" LOADING 6 MONTHS OF 5-MINUTE DATA FOR TESLA ".center(80, "="))
    print("="*80 + "\n")
    
    # Initialize collector
    collector = TeslaDataCollector('TSLA')
    
    # Fetch the data
    df = collector.fetch_complete_dataset(api_key, api_secret)
    
    if df.empty:
        print("‚ùå Failed to fetch data")
        return None
    
    # Add market features
    df = collector.add_market_features(df)
    
    # Validate data
    df = collector.validate_data(df)
    
    if df is not None:
        # Save the final dataset
        output_file = 'tesla_6months_5min_data.csv'
        df.to_csv(output_file, index=False)
        print(f"\n‚úÖ Data saved to {output_file}")
        
        # Also save as pickle for faster loading
        df.to_pickle('tesla_6months_5min_data.pkl')
        print(f"‚úÖ Data saved to tesla_6months_5min_data.pkl")
        
        print(f"\nüìà Final Dataset:")
        print(f"  Shape: {df.shape}")
        print(f"  Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
        print(f"  Columns: {df.columns.tolist()}")
        
        # Calculate expected training sequences
        lookback_days = 20
        bars_per_day = 78
        sequence_length = lookback_days * bars_per_day
        num_sequences = (len(df) - sequence_length) // bars_per_day
        
        print(f"\nüîÆ For LSTM Training:")
        print(f"  Lookback period: {lookback_days} days")
        print(f"  Sequence length: {sequence_length} bars")
        print(f"  Potential training sequences: ~{num_sequences}")
        
        if num_sequences < 100:
            print("  ‚ö†Ô∏è Warning: May need more data for robust training")
        else:
            print("  ‚úÖ Sufficient sequences for training")
    
    return df

# Quick loader function for subsequent use
def quick_load():
    """
    Quick loader for already downloaded data
    """
    try:
        # Try pickle first (faster)
        df = pd.read_pickle('tesla_6months_5min_data.pkl')
        print(f"‚úÖ Loaded {len(df)} bars from pickle file")
        return df
    except:
        try:
            # Fall back to CSV
            df = pd.read_csv('tesla_6months_5min_data.csv')
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            print(f"‚úÖ Loaded {len(df)} bars from CSV file")
            return df
        except:
            print("‚ùå No saved data found. Running full data collection...")
            return load_6months_5min_data()

# Usage examples
if __name__ == "__main__":
    
    # Option 1: Load without API (uses fallback method)
    df = load_6months_5min_data()
    
    # Option 2: Load with Polygon API (best quality)
    # df = load_6months_5min_data(api_key='your_polygon_api_key')
    
    # Option 3: Load with Alpaca API (free with account)
    # df = load_6months_5min_data(
    #     api_key='your_alpaca_api_key',
    #     api_secret='your_alpaca_secret_key'
    # )
    
    # Option 4: Quick load if data already exists
    # df = quick_load()
    
    if df is not None:
        print("\n‚úÖ Data ready for hyperparameter tuning!")




Fetching 6 Months of 5-Minute Bar Data

Using fallback method (yfinance + interpolation)...
Using fallback method (limited to 60 days of 5-min data)
Fetching recent 60 days of 5-minute data...
  ‚úÖ Fetched 3267 5-minute bars
Fetching older data at 15-minute intervals...


$TSLA: possibly delisted; no price data found  (15m 2025-05-13 18:48:55.159326 -> 2025-09-10 18:48:55.159326) (Yahoo error = "15m data not available for startTime=1747176535 and endTime=1757544535. The requested range must be within the last 60 days.")


‚úÖ Successfully created 3267 bars via fallback method

Adding market features...
‚úÖ Added market features

Validating data...

üìä Data Statistics:
  Date range: 2025-09-11 to 2025-11-07
  Total bars: 3,267
  Trading days: ~57
  Bars per day: ~57
  Expected (78 bars/day): 4,446

‚úÖ Data saved to tesla_6months_5min_data.csv
‚úÖ Data saved to tesla_6months_5min_data.pkl

üìà Final Dataset:
  Shape: (3267, 11)
  Memory usage: 0.27 MB
  Columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'vix', 'sentiment_score', 'transactions']

üîÆ For LSTM Training:
  Lookback period: 20 days
  Sequence length: 1560 bars
  Potential training sequences: ~21

‚úÖ Data ready for hyperparameter tuning!


# Just run this to get your data:
df = load_6months_5min_data()

# Data is automatically saved to:
# - tesla_6months_5min_data.csv
# - tesla_6months_5min_data.pkl

# For subsequent runs, use quick loader:
df = quick_load()

2.hyperparameter tunning

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import RobustScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l1_l2
import tensorflow as tf
import joblib
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class AdvancedHyperparameterTuner:
    """
    Advanced hyperparameter tuning for OHLC prediction with 6 months of data
    """
    def __init__(self, lookback_days=20, interval_minutes=5):
        self.lookback_days = lookback_days
        self.interval_minutes = interval_minutes
        self.bars_per_day = 390 // interval_minutes  # 78 for 5-min bars
        self.sequence_length = self.bars_per_day * lookback_days
        self.best_params = None
        self.study = None
        
    def create_features(self, df):
        """
        Create advanced features for LSTM training
        """
        # Create a copy to avoid modifying original
        df = df.copy()
        
        # Handle timezone-aware timestamps
        if 'timestamp' in df.columns:
            # Convert timezone-aware to timezone-naive UTC
            if hasattr(df['timestamp'].iloc[0], 'tz'):
                df['timestamp'] = pd.to_datetime(df['timestamp']).dt.tz_localize(None)
            else:
                df['timestamp'] = pd.to_datetime(df['timestamp'])
        
        features = pd.DataFrame(index=df.index)
        
        # Core OHLCV features
        features['open'] = df['open'].astype(float)
        features['high'] = df['high'].astype(float)
        features['low'] = df['low'].astype(float)
        features['close'] = df['close'].astype(float)
        features['volume'] = df['volume'].astype(float)
        
        # Price-based features
        features['returns'] = df['close'].pct_change()
        features['log_returns'] = np.log(df['close'] / df['close'].shift(1))
        features['high_low_pct'] = (df['high'] - df['low']) / (df['close'] + 1e-10)
        features['close_open_pct'] = (df['close'] - df['open']) / (df['open'] + 1e-10)
        
        # Candlestick patterns
        body_size = df['high'] - df['low'] + 1e-10
        features['upper_shadow'] = (df['high'] - np.maximum(df['open'], df['close'])) / body_size
        features['lower_shadow'] = (np.minimum(df['open'], df['close']) - df['low']) / body_size
        features['body_size'] = np.abs(df['close'] - df['open']) / body_size
        
        # Volume features
        volume_sma = df['volume'].rolling(20, min_periods=1).mean()
        features['volume_sma_ratio'] = df['volume'] / (volume_sma + 1e-10)
        volume_std = df['volume'].rolling(20, min_periods=1).std()
        features['volume_std'] = volume_std / (volume_sma + 1e-10)
        
        if 'transactions' in df.columns:
            features['transactions_per_volume'] = df['transactions'] / (df['volume'] + 1)
        else:
            features['transactions_per_volume'] = df['volume'] / 100000  # Estimate
        
        # Volatility features
        features['volatility_5'] = features['returns'].rolling(5, min_periods=1).std()
        features['volatility_20'] = features['returns'].rolling(20, min_periods=1).std()
        features['volatility_ratio'] = features['volatility_5'] / (features['volatility_20'] + 1e-10)
        
        # Technical indicators
        features['rsi'] = self.calculate_rsi(df['close'])
        features['macd'], features['macd_signal'] = self.calculate_macd(df['close'])
        features['bb_position'] = self.calculate_bollinger_position(df['close'])
        features['stoch_k'], features['stoch_d'] = self.calculate_stochastic(df)
        
        # Moving averages
        for period in [5, 10, 20]:  # Reduced to avoid too many features
            sma = df['close'].rolling(period, min_periods=1).mean()
            features[f'sma_{period}'] = sma
            features[f'sma_{period}_ratio'] = df['close'] / (sma + 1e-10)
        
        # VIX features
        if 'vix' in df.columns:
            features['vix'] = df['vix'].astype(float)
            features['vix_sma_20'] = df['vix'].rolling(20, min_periods=1).mean()
            features['vix_ratio'] = df['vix'] / (features['vix_sma_20'] + 1e-10)
        else:
            features['vix'] = 20.0
            features['vix_sma_20'] = 20.0
            features['vix_ratio'] = 1.0
        
        # Sentiment
        if 'sentiment_score' in df.columns:
            features['sentiment'] = df['sentiment_score'].astype(float)
            features['sentiment_sma'] = df['sentiment_score'].rolling(20, min_periods=1).mean()
        else:
            features['sentiment'] = 0.5
            features['sentiment_sma'] = 0.5
        
        # Time-based features (important for intraday patterns)
        if 'timestamp' in df.columns:
            # Use the timezone-naive timestamp
            ts = df['timestamp']
            features['hour'] = ts.dt.hour
            features['minute'] = ts.dt.minute
            features['day_of_week'] = ts.dt.dayofweek
            features['is_morning'] = (features['hour'] < 11).astype(int)
            features['is_afternoon'] = (features['hour'] >= 14).astype(int)
            
            # Minutes from market open (9:30 AM)
            features['minutes_from_open'] = (features['hour'] - 9) * 60 + (features['minute'] - 30)
            features['minutes_from_open'] = features['minutes_from_open'].clip(0, 390)
            features['minutes_to_close'] = 390 - features['minutes_from_open']
            
            # Normalize time features
            features['hour_norm'] = features['hour'] / 24
            features['minute_norm'] = features['minute'] / 60
            features['day_of_week_norm'] = features['day_of_week'] / 6
        else:
            # Default time features
            features['hour'] = 12
            features['minute'] = 0
            features['day_of_week'] = 2
            features['is_morning'] = 0
            features['is_afternoon'] = 0
            features['minutes_from_open'] = 150
            features['minutes_to_close'] = 240
            features['hour_norm'] = 0.5
            features['minute_norm'] = 0.0
            features['day_of_week_norm'] = 0.33
        
        # Fill NaN values
        features = features.fillna(method='ffill').fillna(method='bfill').fillna(0)
        
        # Replace infinities
        features = features.replace([np.inf, -np.inf], 0)
        
        return features
    
    def calculate_rsi(self, prices, period=14):
        delta = prices.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=1).mean()
        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))
        return rsi.fillna(50)  # Neutral RSI when not enough data
    
    def calculate_macd(self, prices, fast=12, slow=26, signal=9):
        exp1 = prices.ewm(span=fast, adjust=False).mean()
        exp2 = prices.ewm(span=slow, adjust=False).mean()
        macd = exp1 - exp2
        macd_signal = macd.ewm(span=signal, adjust=False).mean()
        return macd.fillna(0), macd_signal.fillna(0)
    
    def calculate_bollinger_position(self, prices, period=20):
        sma = prices.rolling(period, min_periods=1).mean()
        std = prices.rolling(period, min_periods=1).std()
        std = std.fillna(prices.std())  # Use overall std when not enough data
        upper = sma + (2 * std)
        lower = sma - (2 * std)
        position = (prices - lower) / (upper - lower + 1e-10)
        return position.fillna(0.5).clip(0, 1)
    
    def calculate_stochastic(self, df, k_period=14, d_period=3):
        low_min = df['low'].rolling(k_period, min_periods=1).min()
        high_max = df['high'].rolling(k_period, min_periods=1).max()
        k_percent = 100 * ((df['close'] - low_min) / (high_max - low_min + 1e-10))
        d_percent = k_percent.rolling(d_period, min_periods=1).mean()
        return k_percent.fillna(50), d_percent.fillna(50)
    
    def prepare_sequences(self, features, target_cols=['open', 'high', 'low', 'close']):
        """
        Prepare sequences for LSTM training with proper time series handling
        """
        X, y = [], []
        
        # Ensure we have enough data
        min_required = self.sequence_length + self.bars_per_day
        if len(features) < min_required:
            print(f"‚ö†Ô∏è Insufficient data: {len(features)} bars, need {min_required}")
            return np.array([]), np.array([])
        
        # Create sequences
        for i in range(self.sequence_length, len(features) - self.bars_per_day):
            # Input sequence
            sequence = features.iloc[i-self.sequence_length:i].values
            
            # Check for NaN or Inf in sequence
            if np.any(np.isnan(sequence)) or np.any(np.isinf(sequence)):
                continue
            
            X.append(sequence)
            
            # Target: next day's OHLC
            next_day_data = features.iloc[i:i+self.bars_per_day]
            if len(next_day_data) == self.bars_per_day:
                ohlc = np.array([
                    next_day_data['open'].iloc[0],
                    next_day_data['high'].max(),
                    next_day_data['low'].min(),
                    next_day_data['close'].iloc[-1]
                ])
                
                # Check for valid OHLC values
                if np.all(np.isfinite(ohlc)) and ohlc[1] >= ohlc[2]:  # high >= low
                    y.append(ohlc)
                else:
                    X.pop()  # Remove the corresponding input if target is invalid
        
        return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)
    
    def create_model(self, trial, input_shape):
        """
        Create model with Optuna trial parameters
        """
        # Model architecture choices
        model_type = trial.suggest_categorical('model_type', ['lstm', 'gru'])  # Removed bilstm for stability
        n_layers = trial.suggest_int('n_layers', 2, 3)  # Reduced max layers
        
        # Regularization
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4, step=0.05)
        use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
        
        # Optimization
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
        
        model = Sequential()
        
        # First layer
        first_units = trial.suggest_int('units_layer_0', 64, 256, step=32)
        
        if model_type == 'lstm':
            model.add(LSTM(
                first_units,
                return_sequences=(n_layers > 1),
                input_shape=input_shape
            ))
        else:  # gru
            model.add(GRU(
                first_units,
                return_sequences=(n_layers > 1),
                input_shape=input_shape
            ))
        
        model.add(Dropout(dropout_rate))
        if use_batch_norm and n_layers > 1:
            model.add(BatchNormalization())
        
        # Hidden layers
        for i in range(1, n_layers):
            units = trial.suggest_int(f'units_layer_{i}', 32, 128, step=32)
            return_seq = (i < n_layers - 1)
            
            if model_type == 'lstm':
                model.add(LSTM(units, return_sequences=return_seq))
            else:
                model.add(GRU(units, return_sequences=return_seq))
            
            model.add(Dropout(dropout_rate * 0.8))
            if use_batch_norm and return_seq:
                model.add(BatchNormalization())
        
        # Dense layers
        dense_units = trial.suggest_int('dense_units', 16, 64, step=16)
        model.add(Dense(dense_units, activation='relu'))
        model.add(Dropout(dropout_rate * 0.5))
        
        # Output layer
        model.add(Dense(4, activation='linear'))
        
        # Compile
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss='mse',
            metrics=['mae']
        )
        
        return model
    
    def objective(self, trial, X_train, y_train, X_val, y_val):
        """
        Objective function for Optuna optimization
        """
        try:
            # Clear previous models
            tf.keras.backend.clear_session()
            
            # Create model
            model = self.create_model(trial, (X_train.shape[1], X_train.shape[2]))
            
            # Training parameters
            batch_size = trial.suggest_int('batch_size', 16, 64, step=16)
            
            # Callbacks
            early_stop = EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True,
                verbose=0
            )
            
            # Train model
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=50,  # Reduced for faster tuning
                batch_size=batch_size,
                callbacks=[early_stop],
                verbose=0
            )
            
            # Get best validation loss
            val_loss = min(history.history['val_loss'])
            
            return val_loss
            
        except Exception as e:
            print(f"Trial failed: {e}")
            return float('inf')
    
    def tune(self, df, n_trials=100, n_jobs=1):
        """
        Run hyperparameter tuning
        """
        print("\n" + "="*60)
        print("Advanced Hyperparameter Tuning")
        print("="*60 + "\n")
        
        # Prepare features
        print("Creating features...")
        features = self.create_features(df)
        print(f"Features shape: {features.shape}")
        
        # Prepare sequences
        print("Preparing sequences...")
        X, y = self.prepare_sequences(features)
        
        if len(X) == 0:
            print("‚ùå Insufficient data for tuning")
            return None
        
        print(f"X shape: {X.shape}, y shape: {y.shape}")
        
        # Split data
        val_split = 0.2
        split_idx = int(len(X) * (1 - val_split))
        
        X_train = X[:split_idx]
        y_train = y[:split_idx]
        X_val = X[split_idx:]
        y_val = y[split_idx:]
        
        print(f"Training samples: {len(X_train)}")
        print(f"Validation samples: {len(X_val)}")
        
        # Scale the data
        scaler_X = RobustScaler()
        scaler_y = RobustScaler()
        
        X_train_scaled = scaler_X.fit_transform(
            X_train.reshape(-1, X_train.shape[-1])
        ).reshape(X_train.shape)
        
        X_val_scaled = scaler_X.transform(
            X_val.reshape(-1, X_val.shape[-1])
        ).reshape(X_val.shape)
        
        y_train_scaled = scaler_y.fit_transform(y_train)
        y_val_scaled = scaler_y.transform(y_val)
        
        # Save scalers
        joblib.dump(scaler_X, 'scaler_features.pkl')
        joblib.dump(scaler_y, 'scaler_target.pkl')
        
        # Create study
        self.study = optuna.create_study(
            direction='minimize',
            study_name='tesla_ohlc_tuning'
        )
        
        # Optimize
        print(f"\nRunning {n_trials} trials...")
        self.study.optimize(
            lambda trial: self.objective(
                trial, 
                X_train_scaled, y_train_scaled,
                X_val_scaled, y_val_scaled
            ),
            n_trials=n_trials,
            n_jobs=n_jobs
        )
        
        # Get best parameters
        self.best_params = self.study.best_params
        
        print("\n" + "="*60)
        print("Best Hyperparameters Found")
        print("="*60)
        
        for key, value in self.best_params.items():
            print(f"  {key}: {value}")
        
        print(f"\nBest validation loss: {self.study.best_value:.6f}")
        
        # Save results
        with open('best_hyperparameters.json', 'w') as f:
            json.dump(self.best_params, f, indent=4)
        
        print(f"\n‚úÖ Best parameters saved to best_hyperparameters.json")
        
        return self.best_params

# Main function
def run_hyperparameter_tuning(df=None, n_trials=100):
    """
    Main function to run hyperparameter tuning
    """
    print("\n" + "="*80)
    print(" HYPERPARAMETER TUNING FOR TESLA OHLC PREDICTION ".center(80, "="))
    print("="*80 + "\n")
    
    # Load data if not provided
    if df is None:
        print("Loading data...")
        try:
            df = pd.read_pickle('tesla_6months_5min_data.pkl')
            print(f"‚úÖ Loaded {len(df)} bars from saved data")
        except:
            try:
                df = pd.read_csv('tesla_6months_5min_data.csv')
                print(f"‚úÖ Loaded {len(df)} bars from CSV")
            except:
                print("‚ùå No saved data found. Please run data collection first.")
                return None
    
    # Initialize tuner
    tuner = AdvancedHyperparameterTuner(
        lookback_days=20,
        interval_minutes=5
    )
    
    # Run tuning
    best_params = tuner.tune(df, n_trials=n_trials)
    
    if best_params:
        print("\n" + "="*80)
        print(" TUNING COMPLETE ".center(80, "="))
        print("="*80)
    
    return best_params

if __name__ == "__main__":
    # Run with fewer trials for testing
    best_params = run_hyperparameter_tuning(n_trials=20)  # Reduced for testing

2025-11-09 19:05:02.661207: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-09 19:05:24.029431: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-09 19:05:35.920548: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.




Loading data...
‚úÖ Loaded 3267 bars from saved data

Advanced Hyperparameter Tuning

Creating features...


ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True, at position 2877

# Load your 6 months of data
df = pd.read_pickle('tesla_6months_5min_data.pkl')

# Run hyperparameter tuning
best_params = run_hyperparameter_tuning(df, n_trials=100)

# Results are saved to:
# - best_hyperparameters.json (best parameters)
# - optuna_study.pkl (full study object)
# - scaler_features.pkl (feature scaler)
# - scaler_target.pkl (target scaler)
# - hyperparameter_tuning_results.png (visualization)

In [None]:
# Load your 6 months of data
df = pd.read_pickle('tesla_6months_5min_data.pkl')

# Run hyperparameter tuning
best_params = run_hyperparameter_tuning(df, n_trials=100)


train and predict

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import RobustScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l1_l2
import tensorflow as tf
import joblib
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class AdvancedHyperparameterTuner:
    """
    Advanced hyperparameter tuning for OHLC prediction with 6 months of data
    """
    def __init__(self, lookback_days=20, interval_minutes=5):
        self.lookback_days = lookback_days
        self.interval_minutes = interval_minutes
        self.bars_per_day = 390 // interval_minutes  # 78 for 5-min bars
        self.sequence_length = self.bars_per_day * lookback_days
        self.best_params = None
        self.study = None
        
    def create_features(self, df):
        """
        Create advanced features for LSTM training
        """
        # Create a copy to avoid modifying original
        df = df.copy()
        
        features = pd.DataFrame(index=df.index)
        
        # Core OHLCV features
        features['open'] = df['open'].astype(float)
        features['high'] = df['high'].astype(float)
        features['low'] = df['low'].astype(float)
        features['close'] = df['close'].astype(float)
        features['volume'] = df['volume'].astype(float)
        
        # Price-based features
        features['returns'] = df['close'].pct_change()
        features['log_returns'] = np.log(df['close'] / df['close'].shift(1))
        features['high_low_pct'] = (df['high'] - df['low']) / (df['close'] + 1e-10)
        features['close_open_pct'] = (df['close'] - df['open']) / (df['open'] + 1e-10)
        
        # Candlestick patterns
        body_size = df['high'] - df['low'] + 1e-10
        features['upper_shadow'] = (df['high'] - np.maximum(df['open'], df['close'])) / body_size
        features['lower_shadow'] = (np.minimum(df['open'], df['close']) - df['low']) / body_size
        features['body_size'] = np.abs(df['close'] - df['open']) / body_size
        
        # Volume features
        volume_sma = df['volume'].rolling(20, min_periods=1).mean()
        features['volume_sma_ratio'] = df['volume'] / (volume_sma + 1e-10)
        volume_std = df['volume'].rolling(20, min_periods=1).std()
        features['volume_std'] = volume_std / (volume_sma + 1e-10)
        
        if 'transactions' in df.columns:
            features['transactions_per_volume'] = df['transactions'] / (df['volume'] + 1)
        else:
            features['transactions_per_volume'] = df['volume'] / 100000  # Estimate
        
        # Volatility features
        features['volatility_5'] = features['returns'].rolling(5, min_periods=1).std()
        features['volatility_20'] = features['returns'].rolling(20, min_periods=1).std()
        features['volatility_ratio'] = features['volatility_5'] / (features['volatility_20'] + 1e-10)
        
        # Technical indicators
        features['rsi'] = self.calculate_rsi(df['close'])
        features['macd'], features['macd_signal'] = self.calculate_macd(df['close'])
        features['bb_position'] = self.calculate_bollinger_position(df['close'])
        features['stoch_k'], features['stoch_d'] = self.calculate_stochastic(df)
        
        # Moving averages
        for period in [5, 10, 20]:  # Reduced to avoid too many features
            sma = df['close'].rolling(period, min_periods=1).mean()
            features[f'sma_{period}'] = sma
            features[f'sma_{period}_ratio'] = df['close'] / (sma + 1e-10)
        
        # VIX features
        if 'vix' in df.columns:
            features['vix'] = df['vix'].astype(float)
            features['vix_sma_20'] = df['vix'].rolling(20, min_periods=1).mean()
            features['vix_ratio'] = df['vix'] / (features['vix_sma_20'] + 1e-10)
        else:
            features['vix'] = 20.0
            features['vix_sma_20'] = 20.0
            features['vix_ratio'] = 1.0
        
        # Sentiment
        if 'sentiment_score' in df.columns:
            features['sentiment'] = df['sentiment_score'].astype(float)
            features['sentiment_sma'] = df['sentiment_score'].rolling(20, min_periods=1).mean()
        else:
            features['sentiment'] = 0.5
            features['sentiment_sma'] = 0.5
        
        # Time-based features (important for intraday patterns)
        ##################################
        # Normalize timestamps: handle strings, python datetimes (aware/naive), mixed types
        if 'timestamp' in df.columns:
    # 1) Parse everything as tz-aware UTC to avoid mixed-type errors
            ts = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

    # 2) (Optional but recommended here) convert to US/Eastern because your intraday
    #    features assume the US market clock (9:30‚Äì16:00 ET)
            ts = ts.dt.tz_convert('America/New_York')

    # 3) Drop timezone to make downstream math simple
            df['timestamp'] = ts.dt.tz_localize(None)

        ##################################
        # Fill NaN values
        features = features.fillna(method='ffill').fillna(method='bfill').fillna(0)
        
        # Replace infinities
        features = features.replace([np.inf, -np.inf], 0)
        
        return features
    
    def calculate_rsi(self, prices, period=14):
        delta = prices.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=1).mean()
        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))
        return rsi.fillna(50)  # Neutral RSI when not enough data
    
    def calculate_macd(self, prices, fast=12, slow=26, signal=9):
        exp1 = prices.ewm(span=fast, adjust=False).mean()
        exp2 = prices.ewm(span=slow, adjust=False).mean()
        macd = exp1 - exp2
        macd_signal = macd.ewm(span=signal, adjust=False).mean()
        return macd.fillna(0), macd_signal.fillna(0)
    
    def calculate_bollinger_position(self, prices, period=20):
        sma = prices.rolling(period, min_periods=1).mean()
        std = prices.rolling(period, min_periods=1).std()
        std = std.fillna(prices.std())  # Use overall std when not enough data
        upper = sma + (2 * std)
        lower = sma - (2 * std)
        position = (prices - lower) / (upper - lower + 1e-10)
        return position.fillna(0.5).clip(0, 1)
    
    def calculate_stochastic(self, df, k_period=14, d_period=3):
        low_min = df['low'].rolling(k_period, min_periods=1).min()
        high_max = df['high'].rolling(k_period, min_periods=1).max()
        k_percent = 100 * ((df['close'] - low_min) / (high_max - low_min + 1e-10))
        d_percent = k_percent.rolling(d_period, min_periods=1).mean()
        return k_percent.fillna(50), d_percent.fillna(50)
    
    def prepare_sequences(self, features, target_cols=['open', 'high', 'low', 'close']):
        """
        Prepare sequences for LSTM training with proper time series handling
        """
        X, y = [], []
        
        # Ensure we have enough data
        min_required = self.sequence_length + self.bars_per_day
        if len(features) < min_required:
            print(f"‚ö†Ô∏è Insufficient data: {len(features)} bars, need {min_required}")
            return np.array([]), np.array([])
        
        # Create sequences
        for i in range(self.sequence_length, len(features) - self.bars_per_day):
            # Input sequence
            sequence = features.iloc[i-self.sequence_length:i].values
            
            # Check for NaN or Inf in sequence
            if np.any(np.isnan(sequence)) or np.any(np.isinf(sequence)):
                continue
            
            X.append(sequence)
            
            # Target: next day's OHLC
            next_day_data = features.iloc[i:i+self.bars_per_day]
            if len(next_day_data) == self.bars_per_day:
                ohlc = np.array([
                    next_day_data['open'].iloc[0],
                    next_day_data['high'].max(),
                    next_day_data['low'].min(),
                    next_day_data['close'].iloc[-1]
                ])
                
                # Check for valid OHLC values
                if np.all(np.isfinite(ohlc)) and ohlc[1] >= ohlc[2]:  # high >= low
                    y.append(ohlc)
                else:
                    X.pop()  # Remove the corresponding input if target is invalid
        
        return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)
    
    def create_model(self, trial, input_shape):
        """
        Create model with Optuna trial parameters
        """
        # Model architecture choices
        model_type = trial.suggest_categorical('model_type', ['lstm', 'gru'])  # Removed bilstm for stability
        n_layers = trial.suggest_int('n_layers', 2, 3)  # Reduced max layers
        
        # Regularization
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4, step=0.05)
        use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
        
        # Optimization
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
        
        model = Sequential()
        
        # First layer
        first_units = trial.suggest_int('units_layer_0', 64, 256, step=32)
        
        if model_type == 'lstm':
            model.add(LSTM(
                first_units,
                return_sequences=(n_layers > 1),
                input_shape=input_shape
            ))
        else:  # gru
            model.add(GRU(
                first_units,
                return_sequences=(n_layers > 1),
                input_shape=input_shape
            ))
        
        model.add(Dropout(dropout_rate))
        if use_batch_norm and n_layers > 1:
            model.add(BatchNormalization())
        
        # Hidden layers
        for i in range(1, n_layers):
            units = trial.suggest_int(f'units_layer_{i}', 32, 128, step=32)
            return_seq = (i < n_layers - 1)
            
            if model_type == 'lstm':
                model.add(LSTM(units, return_sequences=return_seq))
            else:
                model.add(GRU(units, return_sequences=return_seq))
            
            model.add(Dropout(dropout_rate * 0.8))
            if use_batch_norm and return_seq:
                model.add(BatchNormalization())
        
        # Dense layers
        dense_units = trial.suggest_int('dense_units', 16, 64, step=16)
        model.add(Dense(dense_units, activation='relu'))
        model.add(Dropout(dropout_rate * 0.5))
        
        # Output layer
        model.add(Dense(4, activation='linear'))
        
        # Compile
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss='mse',
            metrics=['mae']
        )
        
        return model
    
    def objective(self, trial, X_train, y_train, X_val, y_val):
        """
        Objective function for Optuna optimization
        """
        try:
            # Clear previous models
            tf.keras.backend.clear_session()
            
            # Create model
            model = self.create_model(trial, (X_train.shape[1], X_train.shape[2]))
            
            # Training parameters
            batch_size = trial.suggest_int('batch_size', 16, 64, step=16)
            
            # Callbacks
            early_stop = EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True,
                verbose=0
            )
            
            # Train model
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=50,  # Reduced for faster tuning
                batch_size=batch_size,
                callbacks=[early_stop],
                verbose=0
            )
            
            # Get best validation loss
            val_loss = min(history.history['val_loss'])
            
            return val_loss
            
        except Exception as e:
            print(f"Trial failed: {e}")
            return float('inf')
    
    def tune(self, df, n_trials=100, n_jobs=1):
        """
        Run hyperparameter tuning
        """
        print("\n" + "="*60)
        print("Advanced Hyperparameter Tuning")
        print("="*60 + "\n")
        
        # Prepare features
        print("Creating features...")
        features = self.create_features(df)
        print(f"Features shape: {features.shape}")
        
        # Prepare sequences
        print("Preparing sequences...")
        X, y = self.prepare_sequences(features)
        
        if len(X) == 0:
            print("‚ùå Insufficient data for tuning")
            return None
        
        print(f"X shape: {X.shape}, y shape: {y.shape}")
        
        # Split data
        val_split = 0.2
        split_idx = int(len(X) * (1 - val_split))
        
        X_train = X[:split_idx]
        y_train = y[:split_idx]
        X_val = X[split_idx:]
        y_val = y[split_idx:]
        
        print(f"Training samples: {len(X_train)}")
        print(f"Validation samples: {len(X_val)}")
        
        # Scale the data
        scaler_X = RobustScaler()
        scaler_y = RobustScaler()
        
        X_train_scaled = scaler_X.fit_transform(
            X_train.reshape(-1, X_train.shape[-1])
        ).reshape(X_train.shape)
        
        X_val_scaled = scaler_X.transform(
            X_val.reshape(-1, X_val.shape[-1])
        ).reshape(X_val.shape)
        
        y_train_scaled = scaler_y.fit_transform(y_train)
        y_val_scaled = scaler_y.transform(y_val)
        
        # Save scalers
        joblib.dump(scaler_X, 'scaler_features.pkl')
        joblib.dump(scaler_y, 'scaler_target.pkl')
        
        # Create study
        self.study = optuna.create_study(
            direction='minimize',
            study_name='tesla_ohlc_tuning'
        )
        
        # Optimize
        print(f"\nRunning {n_trials} trials...")
        self.study.optimize(
            lambda trial: self.objective(
                trial, 
                X_train_scaled, y_train_scaled,
                X_val_scaled, y_val_scaled
            ),
            n_trials=n_trials,
            n_jobs=n_jobs
        )
        
        # Get best parameters
        self.best_params = self.study.best_params
        
        print("\n" + "="*60)
        print("Best Hyperparameters Found")
        print("="*60)
        
        for key, value in self.best_params.items():
            print(f"  {key}: {value}")
        
        print(f"\nBest validation loss: {self.study.best_value:.6f}")
        
        # Save results
        with open('best_hyperparameters.json', 'w') as f:
            json.dump(self.best_params, f, indent=4)
        
        print(f"\n‚úÖ Best parameters saved to best_hyperparameters.json")
        
        return self.best_params

# Main function
def run_hyperparameter_tuning(df=None, n_trials=100):
    """
    Main function to run hyperparameter tuning
    """
    print("\n" + "="*80)
    print(" HYPERPARAMETER TUNING FOR TESLA OHLC PREDICTION ".center(80, "="))
    print("="*80 + "\n")
    
    # Load data if not provided
    if df is None:
        print("Loading data...")
        try:
            df = pd.read_pickle('tesla_6months_5min_data.pkl')
            print(f"‚úÖ Loaded {len(df)} bars from saved data")
        except:
            try:
                df = pd.read_csv('tesla_6months_5min_data.csv')
                print(f"‚úÖ Loaded {len(df)} bars from CSV")
            except:
                print("‚ùå No saved data found. Please run data collection first.")
                return None
    
    # Initialize tuner
    tuner = AdvancedHyperparameterTuner(
        lookback_days=20,
        interval_minutes=5
    )
    
    # Run tuning
    best_params = tuner.tune(df, n_trials=n_trials)
    
    if best_params:
        print("\n" + "="*80)
        print(" TUNING COMPLETE ".center(80, "="))
        print("="*80)
    
    return best_params

if __name__ == "__main__":
    # Run with fewer trials for testing
    best_params = run_hyperparameter_tuning(n_trials=20)  # Reduced for testing



Loading data...
‚úÖ Loaded 3267 bars from saved data

Advanced Hyperparameter Tuning

Creating features...
Features shape: (3267, 35)
Preparing sequences...
X shape: (1629, 1560, 35), y shape: (1629, 4)
Training samples: 1303
Validation samples: 326


[I 2025-11-09 19:23:31,129] A new study created in memory with name: tesla_ohlc_tuning
2025-11-09 19:23:31.315029: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)



Running 20 trials...


2025-11-09 19:23:31.883258: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 284575200 exceeds 10% of free system memory.
2025-11-09 19:23:37.054045: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 38338560 exceeds 10% of free system memory.
2025-11-09 19:23:37.076264: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 38338560 exceeds 10% of free system memory.
2025-11-09 19:23:37.101858: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 38338560 exceeds 10% of free system memory.
2025-11-09 19:23:37.204474: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 38338560 exceeds 10% of free system memory.


# Run the complete pipeline
main_pipeline()

# Or run blocks separately:

# 1. Just collect data
df = collect_data()

# 2. Just tune hyperparameters (needs data)
trainer = TeslaOHLCTrainer()
X, y = trainer.prepare_data(df)
best_params = tune_hyperparameters(X, y)

# 3. Just train and predict (needs data and params)
trainer.train(X, y, best_params)
prediction = trainer.predict_next_day(df)

In [5]:
df = load_6months_5min_data()




Fetching 6 Months of 5-Minute Bar Data

üìÅ Loading from cache...
‚úÖ Loaded 3267 bars from cache

Adding market features...
‚úÖ Added market features

Validating data...

üìä Data Statistics:
  Date range: 2025-09-11 to 2025-11-07
  Total bars: 3,267
  Trading days: ~57
  Bars per day: ~57
  Expected (78 bars/day): 4,446

‚úÖ Data saved to tesla_6months_5min_data.csv
‚úÖ Data saved to tesla_6months_5min_data.pkl

üìà Final Dataset:
  Shape: (3267, 11)
  Memory usage: 0.27 MB
  Columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'vix', 'sentiment_score', 'transactions']

üîÆ For LSTM Training:
  Lookback period: 20 days
  Sequence length: 1560 bars
  Potential training sequences: ~21
