In [5]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


Collecting pyarrow>=14.0.0 (from -r requirements.txt (line 6))
  Downloading pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting lightgbm>=4.0.0 (from -r requirements.txt (line 9))
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting xgboost>=2.0.0 (from -r requirements.txt (line 10))
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting scikit-learn>=1.3.0 (from -r requirements.txt (line 11))
  Downloading scikit_learn-1.7.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting catboost>=1.2.0 (from -r requirements.txt (line 12))
  Downloading catboost-1.2.8-cp313-cp313-manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tensorflow>=2.13.0 (from -r requirements.txt (line 15))
  Downloading tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting ta-lib (from -r requirements.txt (line 20))
  Downlo

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
import requests
import time
from typing import Dict, List, Optional
import json
import warnings
warnings.filterwarnings('ignore')

class TeslaDataFetcher:
    """Collect one session of 1-minute TSLA bars enriched with VIX and sentiment.

    Live data is requested from yfinance (no API key needed). Whenever a
    download raises or returns no rows, a synthetic session for
    ``target_date`` is generated instead, so callers always receive a
    populated frame. Synthetic/estimated values are random; pass ``seed`` to
    make them reproducible.
    """

    # Columns that combine_all_data() guarantees on its merged result.
    # 'volume' precedes 'transactions' because the latter is derived from it.
    REQUIRED_COLUMNS = (
        'ticker', 'timestamp', 'open', 'high', 'low', 'close',
        'volume', 'vwap', 'transactions', 'vix', 'sentiment_score',
    )
    # Assumed shares per trade when estimating a trade count from volume.
    SHARES_PER_TRADE = 100
    # VIX level used when not a single VIX observation could be matched.
    DEFAULT_VIX = 18.5
    # Placeholder sentiment returned by get_sentiment_score().
    DEFAULT_SENTIMENT = 0.65
    # Synthetic sessions start at 09:30 New York time and span 390 one-minute bars.
    SESSION_START = "09:30:00"
    SESSION_MINUTES = 390
    MARKET_TZ = 'America/New_York'

    def __init__(self, target_date: str = "2025-11-07", seed: Optional[int] = None):
        """Initialize data fetcher for Tesla without API key.

        Args:
            target_date: Session date (``YYYY-MM-DD``) used for synthetic data
                and for the sanity check on downloaded bars.
            seed: Optional seed for the random estimates. ``None`` keeps using
                NumPy's global random state, exactly as before.
        """
        self.ticker = "TSLA"
        self.target_date = target_date
        # Both np.random (module) and RandomState expose randn/randint/gamma.
        self._rng = np.random.RandomState(seed) if seed is not None else np.random

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _session_timestamps(self) -> pd.DatetimeIndex:
        """Return the minute timestamps of a synthetic session on target_date."""
        start = pd.Timestamp(f"{self.target_date} {self.SESSION_START}", tz=self.MARKET_TZ)
        return pd.date_range(start=start, periods=self.SESSION_MINUTES, freq='1min')

    @staticmethod
    def _standardize_history(df: pd.DataFrame, label: str) -> pd.DataFrame:
        """Turn a yfinance history frame into lower-case columns plus 'timestamp' and 'ticker'."""
        df = df.reset_index()
        df.columns = [str(col).lower() for col in df.columns]

        # The former index may surface under different names.
        if 'timestamp' not in df.columns:
            for candidate in ('datetime', 'date', 'index'):
                if candidate in df.columns:
                    df = df.rename(columns={candidate: 'timestamp'})
                    break

        df['ticker'] = label
        return df

    def _estimate_transactions(self, volume: pd.Series) -> pd.Series:
        """Estimate a per-bar trade count from volume (an estimate, not real data)."""
        # Missing volume counts as zero so the integer cast cannot fail.
        base = (volume.fillna(0) / self.SHARES_PER_TRADE).astype(int)
        return base + self._rng.randint(10, 50, len(volume))

    def _warn_if_not_target_date(self, df: pd.DataFrame) -> None:
        """Print a warning when none of the fetched bars fall on target_date."""
        if 'timestamp' not in df.columns:
            return
        stamps = df['timestamp']
        if not pd.api.types.is_datetime64_any_dtype(stamps):
            return
        wanted = pd.Timestamp(self.target_date).strftime('%Y-%m-%d')
        days = stamps.dt.strftime('%Y-%m-%d')
        if not (days == wanted).any():
            print(f"Warning: fetched bars are dated {days.iloc[0]}, not the requested {wanted}")

    def _attach_vix(self, tesla_df: pd.DataFrame, vix_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of tesla_df with a gap-free 'vix' column added."""
        if 'timestamp' in tesla_df.columns and 'timestamp' in vix_df.columns:
            vix_subset = vix_df[['timestamp', 'close']].rename(columns={'close': 'vix'})
            # A repeated VIX timestamp would duplicate Tesla rows in a left merge.
            vix_subset = vix_subset.drop_duplicates(subset='timestamp', keep='last')

            # Bring both keys to the same time zone so equal instants match.
            left_tz = getattr(tesla_df['timestamp'].dtype, 'tz', None)
            right_tz = getattr(vix_subset['timestamp'].dtype, 'tz', None)
            if left_tz is not None and right_tz is not None and str(left_tz) != str(right_tz):
                vix_subset['timestamp'] = vix_subset['timestamp'].dt.tz_convert(left_tz)

            merged_df = pd.merge(tesla_df, vix_subset, on='timestamp', how='left')
        else:
            print("Warning: timestamp column missing, using index merge")
            merged_df = tesla_df.copy()
            # Align by position; pad with NaN when VIX has fewer rows and
            # truncate when it has more, instead of raising a length mismatch.
            by_position = vix_df['close'].reset_index(drop=True).reindex(pd.RangeIndex(len(merged_df)))
            merged_df['vix'] = by_position.to_numpy()

        # Forward fill gaps, then back fill so leading gaps are covered as well.
        merged_df['vix'] = merged_df['vix'].ffill().bfill()
        if merged_df['vix'].isna().any():
            # Only possible when no VIX value matched at all.
            print(f"Warning: no VIX values could be aligned, using default {self.DEFAULT_VIX}")
            merged_df['vix'] = self.DEFAULT_VIX

        return merged_df

    def _ensure_required_columns(self, merged_df: pd.DataFrame) -> None:
        """Add any missing REQUIRED_COLUMNS to merged_df in place."""
        for col in self.REQUIRED_COLUMNS:
            if col in merged_df.columns:
                continue
            if col == 'transactions':
                merged_df[col] = self._estimate_transactions(merged_df['volume'])
            elif col == 'vwap':
                merged_df[col] = (merged_df['high'] + merged_df['low'] + merged_df['close']) / 3
            else:
                print(f"Warning: Missing column {col}, adding default values")
                merged_df[col] = np.nan

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_minute_data_yfinance(self) -> pd.DataFrame:
        """
        Fetch 1-minute Tesla data using yfinance (free alternative)
        Note: yfinance provides limited historical 1-minute data (last 30 days max)

        The request asks for the most recent session (``period="1d"``); a
        warning is printed when that session is not ``target_date``. Falls
        back to synthetic data when the download fails or is empty.

        Returns:
            Frame with timestamp, OHLC, volume, ticker, estimated
            'transactions' and 'vwap' columns.
        """
        print(f"Fetching 1-minute data for {self.ticker}...")

        try:
            # Get the most recent 1-minute data available
            df = yf.Ticker(self.ticker).history(period="1d", interval="1m")

            if not df.empty:
                df = self._standardize_history(df, self.ticker)

                # Add transaction count estimation
                df['transactions'] = self._estimate_transactions(df['volume'])

                # OHLC bars cannot yield a true VWAP; the typical price
                # (high + low + close) / 3 is used as a proxy.
                if 'vwap' not in df.columns:
                    df['vwap'] = (df['high'] + df['low'] + df['close']) / 3

                self._warn_if_not_target_date(df)
                print(f"Retrieved {len(df)} minute bars for {self.ticker}")
                return df

            print("Note: yfinance returned no 1-minute rows.")
            print("Creating sample data structure for demonstration...")

        except Exception as e:
            print(f"Note: Real-time 1-minute data not available: {e}")
            print("Creating sample data structure for demonstration...")

        return self._create_sample_minute_data()

    def _create_sample_minute_data(self) -> pd.DataFrame:
        """
        Create sample 1-minute data structure for Tesla on target_date
        (Nov 7, 2025 by default). Prices follow a noisy sine-shaped walk.
        """
        base_price = 250.00
        data = []

        # The order of random draws is kept stable so seeded runs reproduce.
        for i, ts in enumerate(self._session_timestamps()):
            noise = self._rng.randn() * 0.5
            trend = np.sin(i / 50) * 2

            open_price = base_price + trend + noise
            close_price = open_price + self._rng.randn() * 0.3
            high_price = max(open_price, close_price) + abs(self._rng.randn() * 0.2)
            low_price = min(open_price, close_price) - abs(self._rng.randn() * 0.2)

            volume = int(self._rng.gamma(2, 500000))
            transactions = int(volume / 1000) + self._rng.randint(50, 200)

            data.append({
                'ticker': self.ticker,
                'timestamp': ts,
                'open': round(open_price, 2),
                'high': round(high_price, 2),
                'low': round(low_price, 2),
                'close': round(close_price, 2),
                'volume': volume,
                'vwap': round((open_price + high_price + low_price + close_price) / 4, 2),
                'transactions': transactions
            })

            # The next bar starts from this bar's close.
            base_price = close_price

        return pd.DataFrame(data)

    def get_vix_data(self) -> pd.DataFrame:
        """
        Fetch VIX data (Volatility Index)

        Returns the most recent session of 1-minute ^VIX bars from yfinance,
        or synthetic VIX data when the download fails or is empty.
        """
        print("Fetching VIX 1-minute data...")

        try:
            df = yf.Ticker("^VIX").history(period="1d", interval="1m")

            if not df.empty:
                df = self._standardize_history(df, 'VIX')

                print(f"Retrieved {len(df)} VIX data points")
                print(f"VIX DataFrame columns: {df.columns.tolist()}")

                return df

            print("Note: yfinance returned no VIX rows, creating sample VIX data")

        except Exception as e:
            print(f"Creating sample VIX data: {e}")

        # Create sample VIX data if real data not available
        return self._create_sample_vix_data()

    def _create_sample_vix_data(self) -> pd.DataFrame:
        """Create sample VIX data (random walk around 18.5) for target_date"""
        vix_data = []
        base_vix = self.DEFAULT_VIX

        for ts in self._session_timestamps():
            vix_value = base_vix + self._rng.randn() * 0.5
            vix_data.append({
                'ticker': 'VIX',
                'timestamp': ts,
                'close': round(vix_value, 2),
                'open': round(vix_value - 0.1, 2),
                'high': round(vix_value + 0.2, 2),
                'low': round(vix_value - 0.2, 2)
            })
            base_vix = vix_value

        return pd.DataFrame(vix_data)

    def get_sentiment_score(self) -> float:
        """
        Get sentiment score for Tesla (just the numerical score)

        NOTE: placeholder - no sentiment source is queried; a fixed value is
        returned. Scale: -1 (very bearish) to 1 (very bullish).
        """
        print("Fetching Tesla sentiment data...")

        return self.DEFAULT_SENTIMENT

    def combine_all_data(self) -> pd.DataFrame:
        """
        Combine all data sources into a single DataFrame

        Returns:
            Tesla minute bars with 'vix' and 'sentiment_score' added and every
            column of REQUIRED_COLUMNS present. If merging fails, the Tesla
            frame with a randomly estimated VIX is returned instead. An empty
            frame is returned only when a source frame is empty.
        """
        day = pd.Timestamp(self.target_date)
        print("\n" + "="*50)
        print(f"Starting Tesla Data Collection for {day.strftime('%b')} {day.day}, {day.year}")
        print("="*50 + "\n")

        # Get Tesla 1-minute data
        tesla_df = self.get_minute_data_yfinance()

        # Get VIX data
        vix_df = self.get_vix_data()

        # Get sentiment score (just the numerical value)
        sentiment_score = self.get_sentiment_score()

        # Debug print to see what columns we have
        print(f"\nTesla DataFrame columns: {tesla_df.columns.tolist()}")
        print(f"VIX DataFrame columns: {vix_df.columns.tolist()}")

        if tesla_df.empty or vix_df.empty:
            return pd.DataFrame()

        try:
            merged_df = self._attach_vix(tesla_df, vix_df)

            # Add sentiment score (constant for the day, no label)
            merged_df['sentiment_score'] = sentiment_score

            self._ensure_required_columns(merged_df)

            print(f"\nSuccessfully merged data. Final shape: {merged_df.shape}")
            return merged_df

        except Exception as e:
            # Deliberate best effort: keep the Tesla bars even if merging failed.
            print(f"Error during merge: {e}")
            print("Returning Tesla data with estimated VIX")
            tesla_df['vix'] = self.DEFAULT_VIX + self._rng.randn(len(tesla_df)) * 0.5
            tesla_df['sentiment_score'] = sentiment_score
            return tesla_df

def save_dataframe(df, base_filename):
    """
    Persist ``df`` as CSV, Parquet and pickle next to each other.

    Each format is attempted independently: a failure is reported on stdout
    and the remaining formats are still tried.

    Args:
        df: DataFrame to write.
        base_filename: Target path without extension.

    Returns:
        List of the file names that were written, in CSV/Parquet/pickle order.
    """
    def _write_csv(path):
        df.to_csv(path, index=False)

    def _write_parquet(path):
        # Timestamps are stored as strings to avoid serialization issues.
        frame = df.copy()
        if 'timestamp' in frame.columns:
            frame['timestamp'] = frame['timestamp'].astype(str)
        frame.to_parquet(path, engine='pyarrow')

    def _write_pickle(path):
        df.to_pickle(path)

    # (extension, writer, success message, failure message)
    plan = (
        ("csv", _write_csv,
         "✅ Data saved to {}", "Warning: Could not save CSV: {}"),
        ("parquet", _write_parquet,
         "✅ Data cached to {}", "Note: Parquet save skipped (not critical): {}"),
        ("pkl", _write_pickle,
         "✅ Data pickled to {}", "Warning: Could not save pickle: {}"),
    )

    written = []
    for extension, write, ok_message, fail_message in plan:
        try:
            target = f"{base_filename}.{extension}"
            write(target)
            written.append(target)
            print(ok_message.format(target))
        except Exception as exc:
            print(fail_message.format(exc))

    return written

def main():
    """Fetch the combined Tesla dataset, cache it to disk and print a report.

    Returns:
        The combined DataFrame (empty when no data was retrieved).
    """
    rule = "=" * 50

    def _heading(title):
        # Blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    # Fetch everything through a fresh fetcher.
    frame = TeslaDataFetcher().combine_all_data()

    if frame.empty:
        print("❌ No data retrieved")
        return frame

    # Save with multiple formats
    saved_files = save_dataframe(frame, "tesla_data_20251107")

    # Summary statistics
    _heading("Data Summary")
    print(f"Total minute bars: {len(frame)}")
    print(f"Date range: {frame['timestamp'].min()} to {frame['timestamp'].max()}")
    print(f"Average transactions per minute: {frame['transactions'].mean():.0f}")
    print(f"Average VIX level: {frame['vix'].mean():.2f}")
    print(f"Daily sentiment score: {frame['sentiment_score'].iloc[0]:.3f}")

    print("\nFirst 5 rows of data:")
    print(frame.head())

    print("\nLast 5 rows of data:")
    print(frame.tail())

    print("\nData shape:", frame.shape)
    print("\nColumns:", frame.columns.tolist())

    # Day-level trading metrics
    _heading("Tesla Trading Metrics for the Day")
    first_bar = frame.iloc[0]
    print(f"Opening Price: ${first_bar['open']:.2f}")
    last_bar = frame.iloc[-1]
    print(f"Closing Price: ${last_bar['close']:.2f}")
    print(f"Day High: ${frame['high'].max():.2f}")
    print(f"Day Low: ${frame['low'].min():.2f}")
    print(f"Total Volume: {frame['volume'].sum():,.0f}")
    print(f"Total Transactions: {frame['transactions'].sum():,.0f}")
    print(f"VIX Range: {frame['vix'].min():.2f} - {frame['vix'].max():.2f}")
    print(f"Sentiment Score: {frame['sentiment_score'].iloc[0]:.3f} (scale: -1 to 1)")

    _heading("Files Saved:")
    for path in saved_files:
        print(f"  - {path}")

    return frame

# Function to load the cached data
def load_cached_data(filename="tesla_data_20251107"):
    """
    Load a cached dataset, trying CSV, then Parquet, then pickle.

    Args:
        filename: Cache path without extension.

    Returns:
        The first DataFrame that loads successfully, or None when no cache
        file could be read. CSV and Parquet caches get their 'timestamp'
        column parsed back to datetimes.
    """
    # (extension, reader, whether 'timestamp' must be re-parsed)
    # NOTE: read_pickle can execute arbitrary code - only load trusted caches.
    readers = (
        ('.csv', lambda path: pd.read_csv(path), True),
        ('.parquet', lambda path: pd.read_parquet(path), True),
        ('.pkl', lambda path: pd.read_pickle(path), False),
    )

    for ext, read, reparse_timestamp in readers:
        try:
            file_path = f"{filename}{ext}"
            df = read(file_path)
            if reparse_timestamp:
                df['timestamp'] = pd.to_datetime(df['timestamp'])

            print(f"✅ Loaded data from {file_path}")
            return df
        except FileNotFoundError:
            # This format was never written; try the next one.
            continue
        except Exception as e:
            print(f"Warning loading {file_path}: {e}")
            continue

    print("❌ No cached data found")
    return None

if __name__ == "__main__":
    # Fetch, cache and report; keep the result in `df` for later cells.
    df = main()

    # Closing note describing the columns of the resulting frame.
    rule = "=" * 50
    print("\n" + rule)
    print("DATA STRUCTURE FOR YOUR ML MODEL:")
    print(rule)
    print("""
    Your DataFrame contains these features per minute:
    
    1. ticker: 'TSLA' (constant)
    2. timestamp: DateTime for each minute
    3. open: Opening price for that minute
    4. high: High price for that minute
    5. low: Low price for that minute
    6. close: Closing price for that minute
    7. volume: Trading volume for that minute
    8. vwap: Volume-weighted average price
    9. transactions: Number of trades in that minute
    10. vix: Volatility index value at that minute
    11. sentiment_score: Daily sentiment (-1 to 1, where 1 is very bullish)
    
    Perfect for your ensemble model with LightGBM/XGBoost + LSTM/GRU!
    """)


Starting Tesla Data Collection for Nov 7, 2025

Fetching 1-minute data for TSLA...
Retrieved 390 minute bars for TSLA
Fetching VIX 1-minute data...
Retrieved 389 VIX data points
VIX DataFrame columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'ticker']
Fetching Tesla sentiment data...

Tesla DataFrame columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'ticker', 'transactions', 'vwap']
VIX DataFrame columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'ticker']

Successfully merged data. Final shape: (390, 13)
✅ Data saved to tesla_data_20251107.csv
Note: Parquet save skipped (not critical): A type extension with name pandas.period already defined
✅ Data pickled to tesla_data_20251107.pkl

Data Summary
Total minute bars: 390
Date range: 2025-11-07 09:30:00-05:00 to 2025-11-07 15:59:00-05:00
Average transactions per minute: 3472
Average VIX level: 21.12
Daily se