In [1]:
# Cell 1: Setup environment and load libraries
import os
import sys
from dotenv import load_dotenv
import warnings
import pandas as pd
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import random

# Silence library warnings so the notebook output stays readable.
# NOTE: this is process-wide and also hides deprecation warnings.
warnings.filterwarnings('ignore')

# Load environment variables if available.
# load_dotenv() does not raise when there is no .env file; it returns False
# when nothing was loaded, so success is reported from the return value.
try:
    if load_dotenv():
        print("Environment variables loaded successfully")
    else:
        print("Note: no variables loaded from a .env file - continuing with defaults")
except Exception as e:
    print(f"Note: {e} - continuing without .env file")

# Create results directory
output_dir = os.getenv('OUTPUT_DIR', './results')
os.makedirs(output_dir, exist_ok=True)

# Get configuration from environment or set defaults.
# A malformed MAX_SYMBOLS (e.g. empty or non-numeric) falls back to the
# default instead of aborting the whole cell with a ValueError.
_raw_max_symbols = os.getenv('MAX_SYMBOLS', '10')
try:
    MAX_SYMBOLS = int(_raw_max_symbols)
except ValueError:
    print(f"Note: invalid MAX_SYMBOLS value {_raw_max_symbols!r} - using default of 10")
    MAX_SYMBOLS = 10
print(f"Will analyze up to {MAX_SYMBOLS} stocks")

# Cell 2: Data fetching and preprocessing functions
def validate_data_structure(data, symbol, min_rows=50):
    """Validate and clean a price DataFrame without modifying the input.

    Args:
        data: DataFrame of price history (for example the result of
            ``yf.download``), or None.
        symbol: Ticker symbol; used only in diagnostic messages.
        min_rows: Minimum number of rows that must remain after cleaning.
            Defaults to 50.

    Returns:
        A new DataFrame with flattened column labels, numeric OHLCV columns
        and no rows containing NaN, or None when the input is None/empty,
        a required column is missing, or fewer than ``min_rows`` rows remain.
    """
    if data is None or len(data) == 0:
        return None

    # Work on a copy so the caller's DataFrame keeps its original columns
    # and values; the cleaned frame is only visible through the return value.
    data = data.copy()

    # Handle MultiIndex columns from yfinance by keeping the first level
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = list(data.columns.get_level_values(0))

    # Check for required columns
    required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    missing_columns = [col for col in required_columns if col not in data.columns]

    if missing_columns:
        print(f"Error: Missing columns {missing_columns} for {symbol}")
        return None

    # Convert to numeric; unparseable values become NaN and are dropped below
    for col in required_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Remove rows with any NaN values (in any column, not only the required ones)
    initial_rows = len(data)
    data = data.dropna()
    if len(data) < initial_rows:
        print(f"Removed {initial_rows - len(data)} rows with NaN values for {symbol}")

    # Check if we still have enough data
    if len(data) < min_rows:
        print(f"Error: Insufficient data for {symbol}. Only {len(data)} rows available.")
        return None

    return data

def get_sp500_symbols():
    """Fetch S&P 500 symbols from Wikipedia.

    Returns:
        A list of ticker strings taken from the first cell of each data row
        of the first ``wikitable`` on the page. If the request fails, the
        table cannot be found, or no symbols are parsed, a fixed fallback
        list of ten large-cap tickers is returned instead.
    """
    print("Fetching S&P 500 symbols...")
    try:
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, timeout=30)
        # Fail fast on 4xx/5xx rather than parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': 'wikitable'})
        if table is None:
            raise ValueError("constituents table not found in page")

        symbols = []
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            # Skip rows without data cells (e.g. extra header rows) so one
            # odd row does not discard every symbol parsed so far.
            if not cells:
                continue
            symbol = cells[0].text.strip()
            if symbol:
                symbols.append(symbol)

        if not symbols:
            raise ValueError("no symbols parsed from constituents table")

        # NOTE(review): symbols are returned exactly as written on the page;
        # confirm whether callers need any ticker normalization for yfinance.
        print(f"Found {len(symbols)} S&P 500 symbols")
        return symbols
    except Exception as e:
        # Deliberate best-effort: any failure degrades to a small static list.
        print(f"Error fetching S&P 500 symbols: {e}")
        # Return a small fallback list of major companies
        return ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'NVDA', 'JPM', 'JNJ', 'V']

def get_historical_data(symbol, days=360):
    """Download and validate recent price history for one ticker.

    Args:
        symbol: Ticker symbol passed to ``yf.download``.
        days: Length of the look-back window in calendar days, counted
            back from the current time. Defaults to 360.

    Returns:
        The result of ``validate_data_structure`` for the downloaded data,
        or None if downloading or validation raised an exception.
    """
    window_end = datetime.now()
    window_start = window_end - timedelta(days=days)

    try:
        raw_frame = yf.download(
            symbol,
            start=window_start,
            end=window_end,
            progress=False,
        )
        # Validation stays inside the try so its errors are reported too.
        return validate_data_structure(raw_frame, symbol)
    except Exception as exc:
        print(f"Error downloading data for {symbol}: {exc}")
        return None

Environment variables loaded successfully
Will analyze up to 10 stocks
