In [1]:
# Cell 1: Setup environment and load libraries
import os
import sys
from dotenv import load_dotenv
import warnings
import pandas as pd
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import random
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Load environment variables if available
try:
    load_dotenv()
    print("Environment variables loaded successfully")
except Exception as e:
    print(f"Note: {e} - continuing without .env file")

# Create results directory
output_dir = os.getenv('OUTPUT_DIR', './results')
os.makedirs(output_dir, exist_ok=True)

# Get configuration from environment or set defaults
MAX_SYMBOLS = int(os.getenv('MAX_SYMBOLS', '10'))
print(f"Will analyze up to {MAX_SYMBOLS} stocks")

Environment variables loaded successfully
Will analyze up to 10 stocks


In [2]:
# Cell 2: FinBERT model setup for sentiment analysis
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Set device (CPU or GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} for inference")

def setup_finbert():
    """Setup FinBERT model for financial sentiment analysis"""
    try:
        # Load FinBERT model for financial sentiment analysis
        model_name = "ProsusAI/finbert"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        
        # Move model to GPU if available
        model = model.to(device)
        
        # Create sentiment analysis pipeline
        nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
        print("✓ Successfully loaded FinBERT model")
        return nlp
    except Exception as e:
        print(f"Error loading FinBERT: {e}")
        print("Falling back to alternative model...")
        
        try:
            # Fallback to another financial model or general sentiment model
            model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # General sentiment model
            nlp = pipeline("sentiment-analysis", model=model_name, device=0 if device == "cuda" else -1)
            print("✓ Successfully loaded fallback sentiment model")
            return nlp
        except Exception as e2:
            print(f"Error loading fallback model: {e2}")
            print("Unable to load any sentiment model. Will use rule-based sentiment.")
            return None

# Initialize sentiment model
sentiment_model = setup_finbert()

Using cpu for inference


Device set to use cpu


✓ Successfully loaded FinBERT model


In [3]:
# Cell 3: Data fetching and preprocessing functions
def validate_data_structure(data, symbol):
    """Validate and clean the data structure"""
    if data is None or len(data) == 0:
        return None
    
    # Handle MultiIndex columns from yfinance
    if isinstance(data.columns, pd.MultiIndex):
        # Flatten columns by taking the first level
        data.columns = [col[0] if isinstance(col, tuple) else col for col in data.columns]
    
    # Check for required columns
    required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    missing_columns = [col for col in required_columns if col not in data.columns]
    
    if missing_columns:
        print(f"Error: Missing columns {missing_columns} for {symbol}")
        return None
    
    # Convert to numeric and handle any potential issues
    for col in required_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    
    # Remove rows with any NaN values
    initial_rows = len(data)
    data = data.dropna()
    if len(data) < initial_rows:
        print(f"Removed {initial_rows - len(data)} rows with NaN values for {symbol}")
    
    # Check if we still have enough data
    if len(data) < 50:
        print(f"Error: Insufficient data for {symbol}. Only {len(data)} rows available.")
        return None
    
    return data

def get_sp500_symbols():
    """Fetch S&P 500 symbols from Wikipedia"""
    print("Fetching S&P 500 symbols...")
    try:
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': 'wikitable'})
        
        symbols = []
        for row in table.findAll('tr')[1:]:
            symbol = row.findAll('td')[0].text.strip()
            symbols.append(symbol)
        
        print(f"Found {len(symbols)} S&P 500 symbols")
        return symbols
    except Exception as e:
        print(f"Error fetching S&P 500 symbols: {e}")
        # Return a small fallback list of major companies
        return ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'NVDA', 'JPM', 'JNJ', 'V']

def get_historical_data(symbol, days=360):
    """Download historical stock data using yfinance"""
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days)
    
    try:
        # Download data
        data = yf.download(symbol, start=start_date, end=end_date, progress=False)
        
        # Validate and clean the data structure
        data = validate_data_structure(data, symbol)
        
        return data
    except Exception as e:
        print(f"Error downloading data for {symbol}: {e}")
        return None

In [4]:
# Cell 4: Technical indicator calculation functions
def ensure_series(data, column_name=None):
    """Ensure data is a pandas Series"""
    if isinstance(data, pd.DataFrame):
        if column_name and column_name in data.columns:
            return data[column_name]
        else:
            return data.iloc[:, 0]  # Take first column
    elif isinstance(data, pd.Series):
        return data
    else:
        return pd.Series(data)

def calculate_rsi(prices, period=14):
    prices = ensure_series(prices)
    delta = prices.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    
    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()
    
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(prices, fast=12, slow=26, signal=9):
    prices = ensure_series(prices)
    ema_fast = prices.ewm(span=fast, adjust=False).mean()
    ema_slow = prices.ewm(span=slow, adjust=False).mean()
    macd = ema_fast - ema_slow
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    histogram = macd - signal_line
    return macd, signal_line, histogram

def calculate_atr(high, low, close, period=14):
    high = ensure_series(high)
    low = ensure_series(low)
    close = ensure_series(close)
    
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    atr = tr.rolling(window=period).mean()
    return atr

def calculate_bollinger_bands(prices, period=20, std_dev=2):
    prices = ensure_series(prices)
    middle = prices.rolling(window=period).mean()
    std = prices.rolling(window=period).std()
    upper = middle + (std * std_dev)
    lower = middle - (std * std_dev)
    width = (upper - lower) / middle
    return upper, middle, lower, width

def calculate_obv(close, volume):
    close = ensure_series(close)
    volume = ensure_series(volume)
    obv = pd.Series(0.0, index=close.index)
    for i in range(1, len(close)):
        if close.iloc[i] > close.iloc[i-1]:
            obv.iloc[i] = obv.iloc[i-1] + volume.iloc[i]
        elif close.iloc[i] < close.iloc[i-1]:
            obv.iloc[i] = obv.iloc[i-1] - volume.iloc[i]
        else:
            obv.iloc[i] = obv.iloc[i-1]
    return obv

def calculate_stochastic(high, low, close, k_period=14, d_period=3):
    high = ensure_series(high)
    low = ensure_series(low)
    close = ensure_series(close)
    
    high_roll = high.rolling(window=k_period).max()
    low_roll = low.rolling(window=k_period).min()
    stoch_k = 100 * (close - low_roll) / (high_roll - low_roll)
    stoch_d = stoch_k.rolling(window=d_period).mean()
    return stoch_k, stoch_d

def calculate_adx(high, low, close, period=14):
    high = ensure_series(high)
    low = ensure_series(low)
    close = ensure_series(close)
    
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    
    atr = tr.rolling(window=period).mean()
    
    plus_di = 100 * (plus_dm.rolling(window=period).mean() / atr)
    minus_di = 100 * (abs(minus_dm.rolling(window=period).mean()) / atr)
    
    dx = 100 * (abs(plus_di - minus_di) / (plus_di + minus_di))
    adx = dx.rolling(window=period).mean()
    
    return adx

def calculate_flexible_technical_sentiment(df):
    """Calculate a sentiment score based on technical indicators"""
    # Initialize sentiment series
    sentiment = pd.Series(0.0, index=df.index)
    
    # Ensure all required columns exist and are Series
    required_cols = ['RSI', 'MACD', 'Signal_Line', 'Close', 'SMA20', 'SMA50', 'Volume_Ratio', 'Daily_Return', 'BB_Lower', 'BB_Upper']
    for col in required_cols:
        if col not in df.columns:
            print(f"Warning: {col} not found in dataframe for sentiment calculation")
            return sentiment
    
    # RSI component (-0.4 to 0.4)
    rsi_sentiment = pd.Series(0.0, index=df.index)
    try:
        rsi_sentiment[df['RSI'] < 30] = 0.4  # Oversold - positive sentiment
        rsi_sentiment[(df['RSI'] >= 30) & (df['RSI'] <= 70)] = (df['RSI'] - 50) / 50 * 0.2  # Neutral zone
        rsi_sentiment[df['RSI'] > 70] = -0.4  # Overbought - negative sentiment
    except Exception as e:
        print(f"Error calculating RSI sentiment: {e}")
        rsi_sentiment = pd.Series(0.0, index=df.index)
    
    # MACD component (-0.2 to 0.2)
    macd_sentiment = pd.Series(0.0, index=df.index)
    try:
        macd_sentiment[df['MACD'] > df['Signal_Line']] = 0.2  # Bullish
        macd_sentiment[df['MACD'] <= df['Signal_Line']] = -0.2  # Bearish
    except Exception as e:
        print(f"Error calculating MACD sentiment: {e}")
        macd_sentiment = pd.Series(0.0, index=df.index)
    
    # Moving Average component (-0.2 to 0.2)
    ma_sentiment = pd.Series(0.0, index=df.index)
    try:
        ma_bullish_count = ((df['Close'] > df['SMA20']) & 
                            (df['Close'] > df['SMA50']) & 
                            (df['SMA20'] > df['SMA50'])).astype(int)
        ma_bearish_count = ((df['Close'] < df['SMA20']) & 
                            (df['Close'] < df['SMA50']) & 
                            (df['SMA20'] < df['SMA50'])).astype(int)
        ma_sentiment = (ma_bullish_count - ma_bearish_count) * 0.2
    except Exception as e:
        print(f"Error calculating MA sentiment: {e}")
        ma_sentiment = pd.Series(0.0, index=df.index)
    
    # Bollinger Bands component (-0.1 to 0.1)
    bb_sentiment = pd.Series(0.0, index=df.index)
    try:
        bb_sentiment[df['Close'] < df['BB_Lower']] = 0.1  # Below lower band - potential buy
        bb_sentiment[df['Close'] > df['BB_Upper']] = -0.1  # Above upper band - potential sell
    except Exception as e:
        print(f"Error calculating BB sentiment: {e}")
        bb_sentiment = pd.Series(0.0, index=df.index)
    
    # Volume component (-0.1 to 0.1)
    vol_sentiment = pd.Series(0.0, index=df.index)
    try:
        vol_sentiment[df['Volume_Ratio'] > 1.5] = 0.1 * np.sign(df['Daily_Return'])  # High volume in direction of move
    except Exception as e:
        print(f"Error calculating volume sentiment: {e}")
        vol_sentiment = pd.Series(0.0, index=df.index)
    
    # Combine all components
    sentiment = rsi_sentiment + macd_sentiment + ma_sentiment + bb_sentiment + vol_sentiment
    
    # Ensure sentiment is between -1 and 1
    sentiment = sentiment.clip(-1, 1)
    
    return sentiment

def get_technical_sentiment(data):
    """Calculate sentiment based on technical indicators"""
    if data is None or len(data) < 5:
        return 0.0
    
    try:
        # Get the latest data point
        latest = data.iloc[-1]
        
        # Initialize sentiment components
        ma_sentiment = 0.0
        momentum_sentiment = 0.0
        volatility_sentiment = 0.0
        
        # Moving Average component (-0.4 to 0.4)
        ma_signals = 0
        ma_count = 0
        
        # Price vs moving averages
        if 'SMA20' in latest.index and pd.notna(latest['SMA20']):
            ma_signals += 1 if latest['Close'] > latest['SMA20'] else -1
            ma_count += 1
            
        if 'SMA50' in latest.index and pd.notna(latest['SMA50']):
            ma_signals += 1 if latest['Close'] > latest['SMA50'] else -1
            ma_count += 1
            
        if 'SMA200' in latest.index and pd.notna(latest['SMA200']):
            ma_signals += 1 if latest['Close'] > latest['SMA200'] else -1
            ma_count += 1
        
        # Moving average crossovers
        if 'SMA20' in latest.index and 'SMA50' in latest.index:
            if pd.notna(latest['SMA20']) and pd.notna(latest['SMA50']):
                ma_signals += 1 if latest['SMA20'] > latest['SMA50'] else -1
                ma_count += 1
        
        if 'SMA50' in latest.index and 'SMA200' in latest.index:
            if pd.notna(latest['SMA50']) and pd.notna(latest['SMA200']):
                ma_signals += 1 if latest['SMA50'] > latest['SMA200'] else -1
                ma_count += 1
        
        # Calculate moving average sentiment
        if ma_count > 0:
            ma_sentiment = (ma_signals / ma_count) * 0.4  # Scale to -0.4 to 0.4
        
        # Momentum indicators component (-0.4 to 0.4)
        momentum_signals = 0
        momentum_count = 0
        
        # RSI
        if 'RSI' in latest.index and pd.notna(latest['RSI']):
            rsi = latest['RSI']
            if rsi < 30:
                momentum_signals += 1  # Oversold - positive for future
            elif rsi > 70:
                momentum_signals -= 1  # Overbought - negative for future
            momentum_count += 1
        
        # MACD
        if 'MACD' in latest.index and 'Signal_Line' in latest.index:
            if pd.notna(latest['MACD']) and pd.notna(latest['Signal_Line']):
                momentum_signals += 1 if latest['MACD'] > latest['Signal_Line'] else -1
                momentum_count += 1
        
        # ROC (Rate of Change)
        if 'ROC10' in latest.index and pd.notna(latest['ROC10']):
            roc = latest['ROC10']
            if roc > 0:  # Rising price = positive momentum
                momentum_signals += 1
            else:  # Falling price = negative momentum
                momentum_signals -= 1
            momentum_count += 1
        
        # Calculate momentum sentiment
        if momentum_count > 0:
            momentum_sentiment = (momentum_signals / momentum_count) * 0.4  # Scale to -0.4 to 0.4
        
        # Volatility and Other Indicators component (-0.2 to 0.2)
        vol_signals = 0
        vol_count = 0
        
        # Bollinger Bands
        if all(item in latest.index for item in ['Close', 'BB_Upper', 'BB_Lower']):
            if all(pd.notna(latest[item]) for item in ['Close', 'BB_Upper', 'BB_Lower']):
                if latest['Close'] < latest['BB_Lower']:  # Oversold
                    vol_signals += 1
                elif latest['Close'] > latest['BB_Upper']:  # Overbought
                    vol_signals -= 1
                vol_count += 1
        
        # ATR (high volatility can be concerning)
        if 'ATR' in latest.index and 'Close' in latest.index:
            if pd.notna(latest['ATR']) and pd.notna(latest['Close']):
                atr_pct = latest['ATR'] / latest['Close'] * 100
                if atr_pct > 3:  # High volatility
                    vol_signals -= 1
                vol_count += 1
        
        # Volume indicators
        if 'Volume_Ratio' in latest.index and pd.notna(latest['Volume_Ratio']):
            volume_ratio = latest['Volume_Ratio']
            if volume_ratio > 1.5:  # Higher than average volume
                if 'Daily_Return' in latest.index and pd.notna(latest['Daily_Return']):
                    # Volume in direction of price move
                    vol_signals += 1 if latest['Daily_Return'] > 0 else -1
                vol_count += 1
        
        # Calculate volatility sentiment
        if vol_count > 0:
            volatility_sentiment = (vol_signals / vol_count) * 0.2  # Scale to -0.2 to 0.2
        
        # Combine all components
        total_sentiment = ma_sentiment + momentum_sentiment + volatility_sentiment
        
        # Ensure result is between -1 and 1
        return max(-1.0, min(1.0, total_sentiment))
        
    except Exception as e:
        print(f"Error calculating technical sentiment: {e}")
        return 0.0

In [5]:
# Cell 5: Main feature engineering function (Fixed for smaller datasets)
def calculate_technical_features(data):
    """Calculate comprehensive technical indicators with adaptive periods"""
    if data is None or len(data) < 30:
        print(f"  Insufficient data: only {len(data) if data is not None else 0} rows")
        return None
    
    try:
        df = data.copy()
        initial_length = len(df)
        print(f"  Starting with {initial_length} days of data")
        
        # Ensure we have the right column structure
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [col[0] if isinstance(col, tuple) else col for col in df.columns]
        
        # Verify required columns exist
        required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"  Error: Missing required columns: {missing_columns}")
            return None
        
        # Adaptive indicator calculation based on available data
        data_length = len(df)
        print(f"  Calculating indicators for {data_length} days...")
        
        # Always calculate these basic indicators
        df['Daily_Return'] = df['Close'].pct_change()
        df['Price_Range'] = (df['High'] - df['Low']) / df['Close'].replace(0, np.nan)
        
        # Short-term indicators (require less data)
        if data_length >= 10:
            df['SMA5'] = df['Close'].rolling(window=5).mean()
            df['SMA10'] = df['Close'].rolling(window=10).mean()
            df['EMA5'] = df['Close'].ewm(span=5, adjust=False).mean()
            df['EMA10'] = df['Close'].ewm(span=10, adjust=False).mean()
            df['ROC5'] = df['Close'].pct_change(periods=5) * 100
            df['Volatility_5'] = df['Daily_Return'].rolling(window=5).std() * np.sqrt(252)
        
        # Medium-term indicators
        if data_length >= 20:
            df['SMA20'] = df['Close'].rolling(window=20).mean()
            df['EMA20'] = df['Close'].ewm(span=20, adjust=False).mean()
            df['ROC10'] = df['Close'].pct_change(periods=10) * 100
            df['ROC20'] = df['Close'].pct_change(periods=20) * 100
            
            # RSI (needs at least 15 days)
            df['RSI'] = calculate_rsi(df['Close'], period=14)
            
            # MACD
            df['MACD'], df['Signal_Line'], df['MACD_Histogram'] = calculate_macd(df['Close'])
            
            # Bollinger Bands
            df['BB_Upper'], df['BB_Middle'], df['BB_Lower'], df['BB_Width'] = calculate_bollinger_bands(df['Close'], period=20)
            
            # ATR
            df['ATR'] = calculate_atr(df['High'], df['Low'], df['Close'], period=14)
            
            # Volume indicators
            df['Volume_SMA20'] = df['Volume'].rolling(window=20).mean()
            df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA20'].replace(0, np.nan)
            df['OBV'] = calculate_obv(df['Close'], df['Volume'])
            
            # Stochastic
            df['K_percent'], df['D_percent'] = calculate_stochastic(df['High'], df['Low'], df['Close'])
            
            # Calculate ratios for short/medium term MAs
            if 'SMA10' in df.columns:
                df['Close_SMA10_Ratio'] = df['Close'] / df['SMA10'].replace(0, np.nan)
            if 'SMA20' in df.columns:
                df['Close_SMA20_Ratio'] = df['Close'] / df['SMA20'].replace(0, np.nan)
        
        # Longer-term indicators (only if we have enough data)
        if data_length >= 50:
            df['SMA50'] = df['Close'].rolling(window=50).mean()
            df['EMA50'] = df['Close'].ewm(span=50, adjust=False).mean()
            df['Close_SMA50_Ratio'] = df['Close'] / df['SMA50'].replace(0, np.nan)
            df['ROC50'] = df['Close'].pct_change(periods=50) * 100
            df['Volatility_30'] = df['Daily_Return'].rolling(window=30).std() * np.sqrt(252)
            
            # ADX (needs more data)
            df['ADX'] = calculate_adx(df['High'], df['Low'], df['Close'], period=14)
        
        # Very long-term indicators (only with lots of data)
        if data_length >= 100:
            df['SMA100'] = df['Close'].rolling(window=100).mean()
            df['EMA100'] = df['Close'].ewm(span=100, adjust=False).mean()
            df['Close_SMA100_Ratio'] = df['Close'] / df['SMA100'].replace(0, np.nan)
        
        if data_length >= 220:  # Only with 200+ extra buffer
            df['SMA200'] = df['Close'].rolling(window=200).mean()
            df['EMA200'] = df['Close'].ewm(span=200, adjust=False).mean()
            df['Close_SMA200_Ratio'] = df['Close'] / df['SMA200'].replace(0, np.nan)
        
        # Create target variables
        df['Next_Close'] = df['Close'].shift(-1)
        df['Target'] = ((df['Next_Close'] > df['Close']) * 1).astype(int)
        df['Target_Return'] = ((df['Next_Close'] - df['Close']) / df['Close'].replace(0, np.nan)) * 100
        
        # Create lagged variables (only for available features)
        lag_features = ['Close', 'Volume']
        if 'RSI' in df.columns:
            lag_features.append('RSI')
        if 'MACD' in df.columns:
            lag_features.append('MACD')
            
        for feature in lag_features:
            if feature in df.columns:
                df[f'{feature}_Lag1'] = df[feature].shift(1)
                df[f'{feature}_Lag2'] = df[feature].shift(2)
        
        # Calculate technical sentiment based on available indicators
        df['Tech_Sentiment'] = calculate_flexible_technical_sentiment(df)
        
        # SMART NaN removal - be more aggressive about keeping data
        print(f"  Before NaN removal: {len(df)} rows")
        
        # Drop rows where the target is NaN (last row)
        df = df[df['Target'].notna()]
        
        # For other NaN values, be more selective
        # Only require the most basic indicators to be non-NaN
        essential_features = ['Close', 'Daily_Return']
        if 'RSI' in df.columns:
            essential_features.append('RSI')
        if 'SMA20' in df.columns:
            essential_features.append('SMA20')
        
        # Drop rows where essential features are NaN
        for feature in essential_features:
            if feature in df.columns:
                initial_rows = len(df)
                df = df[df[feature].notna()]
                dropped = initial_rows - len(df)
                if dropped > 0:
                    print(f"    Dropped {dropped} rows due to NaN in {feature}")
        
        # Fill remaining NaN values with forward-fill then back-fill
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(method='bfill')
        
        print(f"  After processing: {len(df)} rows with {len(df.columns)} features")
        
        # Final check - ensure we have enough data for meaningful analysis
        if len(df) < 20:
            print(f"  Warning: Very little data remaining ({len(df)} rows). Consider using more historical data.")
            if len(df) < 10:
                print(f"  Insufficient data for analysis.")
                return None
        
        return df
        
    except Exception as e:
        print(f"  Error in calculate_technical_features: {e}")
        import traceback
        traceback.print_exc()
        return None

In [19]:
t re
                        pct_match = re.search(r'([-+]?)([0-9.]+)%', text)
                        if pct_match:
                            sign = pct_match.group(1)
                            value = float(pct_match.group(2))
                            pct_change = value if sign != '-' else -value
                            
                            # Convert percentage change to sentiment
                            # +5% or more = positive, -5% or less = negative
                            if pct_change > 5:
                                sentiment = 0.5
                            elif pct_change > 2:
                                sentiment = 0.25
                            elif pct_change < -5:
                                sentiment = -0.5
                            elif pct_change < -2:
                                sentiment = -0.25
                            else:
                                sentiment = 0.0
                            
                            print(f"    Estimated analyst sentiment from price change ({pct_change:.1f}%): {sentiment:.2f}")
                            return sentiment
                    except (ValueError, AttributeError):
                        continue
                        
    except Exception as e:
        print(f"    Fallback analyst method failed: {e}")
    
    print("    All analyst sentiment methods failed, returning neutral")
    return 0.0  # Neutral if not found

def analyze_news_sentiment_rule_based(news_data):
    """Simple rule-based sentiment analysis as fallback"""
    if not news_data:
        return 0.0
    
    # Define positive and negative word lists for financial context
    positive_words = [
        'gain', 'gains', 'up', 'rise', 'rises', 'rising', 'rose', 'bullish', 'outperform',
        'buy', 'growth', 'profit', 'profits', 'positive', 'strong', 'strength', 'higher',
        'record', 'upgrade', 'upgraded', 'beat', 'beats', 'exceed', 'exceeds', 'success',
        'successful', 'increase', 'increases', 'increased', 'boost', 'boosts', 'boosted',
        'opportunity', 'opportunities', 'potential', 'promising', 'optimistic', 'confident',
        'momentum', 'surge', 'rally', 'advance', 'advances', 'breakthrough', 'expansion',
        'outperforming', 'soars', 'jumped', 'climbed', 'accelerate', 'accelerating'
    ]
    
    negative_words = [
        'loss', 'losses', 'down', 'fall', 'falls', 'falling', 'fell', 'bearish', 'underperform',
        'sell', 'decline', 'declines', 'declined', 'negative', 'weak', 'weakness', 'lower',
        'downgrade', 'downgraded', 'miss', 'misses', 'missed', 'fail', 'fails', 'failed',
        'decrease', 'decreases', 'decreased', 'cut', 'cuts', 'risk', 'risks', 'risky',
        'concern', 'concerns', 'warning', 'problem', 'problems', 'threat', 'threats',
        'disappointing', 'crash', 'plunge', 'plummeted', 'volatile', 'uncertainty',
        'recession', 'bear', 'correction', 'selloff', 'slump', 'struggle', 'struggling'
    ]
    
    # Intensifier words that modify sentiment
    intensifiers = {
        'very': 1.5, 'extremely': 2.0, 'significantly': 1.7, 'substantially': 1.7,
        'dramatically': 2.0, 'sharply': 1.8, 'strongly': 1.6, 'heavily': 1.5,
        'massive': 2.0, 'huge': 1.8, 'major': 1.5, 'significant': 1.3
    }
    
    total_sentiment = 0
    article_count = 0
    
    for article in news_data:
        text = (article['headline'] + " " + article.get('summary', '')).lower()
        words = text.split()
        
        # Count positive and negative words
        positive_count = 0
        negative_count = 0
        
        for i, word in enumerate(words):
            # Clean the word
            clean_word = word.strip('.,!?;:"()[]')
            
            # Check for intensifiers
            intensifier = 1.0
            if i > 0:
                prev_word = words[i-1].strip('.,!?;:"()[]')
                if prev_word in intensifiers:
                    intensifier = intensifiers[prev_word]
            
            # Count sentiment words with intensifier
            if clean_word in positive_words:
                positive_count += intensifier
            elif clean_word in negative_words:
                negative_count += intensifier
        
        # Calculate sentiment for this article
        if positive_count > 0 or negative_count > 0:
            # Normalize by total sentiment words found
            total_sentiment_words = positive_count + negative_count
            article_sentiment = (positive_count - negative_count) / total_sentiment_words
            
            # Scale to reasonable range
            article_sentiment = max(-1.0, min(1.0, article_sentiment))
            total_sentiment += article_sentiment
            article_count += 1
    
    # Average sentiment across all articles
    if article_count > 0:
        average_sentiment = total_sentiment / article_count
        return average_sentiment
    else:
        return 0.0

def analyze_news_sentiment_with_finbert(news_data, nlp):
    """Analyze sentiment using FinBERT with enhanced fallbacks"""
    if not news_data:
        return 0.0  # Neutral sentiment if no news
    
    # If FinBERT model is not available, use rule-based approach
    if nlp is None:
        print("    Using rule-based sentiment analysis (FinBERT not available)")
        return analyze_news_sentiment_rule_based(news_data)
    
    sentiments = []
    
    for article in news_data:
        text = article['headline']
        if len(article.get('summary', '')) > 0:
            text += " " + article['summary']
        
        # Skip empty text
        if not text.strip():
            continue
            
        try:
            # Try FinBERT analysis
            result = nlp(text[:500])  # Limit text length
            
            if result and len(result) > 0:
                # Map FinBERT sentiment labels to scores
                sent_label = result[0]['label'].lower()
                sent_score = result[0]['score']
                
                if 'positive' in sent_label:
                    sentiments.append(sent_score)
                elif 'negative' in sent_label:
                    sentiments.append(-sent_score)
                else:  # neutral
                    sentiments.append(0.0)
            else:
                # Fallback to rule-based for this article
                rule_sentiment = analyze_news_sentiment_rule_based([article])
                sentiments.append(rule_sentiment)
                
        except Exception as e:
            # If FinBERT fails, use rule-based for this article
            try:
                rule_sentiment = analyze_news_sentiment_rule_based([article])
                sentiments.append(rule_sentiment)
            except:
                sentiments.append(0.0)
    
    # Average all sentiment scores
    if sentiments:
        avg_sentiment = sum(sentiments) / len(sentiments)
        # Scale to ensure [-1, 1] range
        return max(-1.0, min(1.0, avg_sentiment))
    else:
        return 0.0

# Update the main news function
def get_stock_news(symbol, max_articles=5):
    """Get recent news articles about a stock symbol"""
    return get_stock_news_robust(symbol, max_articles)

def get_sentiment_analysis(symbol, data=None):
    """Get comprehensive sentiment analysis for a stock using FinBERT"""
    print(f"  Analyzing sentiment for {symbol}...")
    
    # Initialize sentiment components
    news_sentiment = 0.0
    analyst_sentiment = 0.0
    tech_sentiment = 0.0
    
    # Get news sentiment using FinBERT
    try:
        news_data = get_stock_news(symbol)
        news_sentiment = analyze_news_sentiment_with_finbert(news_data, sentiment_model)
        print(f"    News sentiment: {news_sentiment:.2f} (from {len(news_data)} articles)")
    except Exception as e:
        print(f"    News sentiment failed: {e}")
        news_sentiment = 0.0
    
    # Get analyst ratings
    try:
        analyst_sentiment = get_analyst_ratings(symbol)
        print(f"    Analyst sentiment: {analyst_sentiment:.2f}")
    except Exception as e:
        print(f"    Analyst sentiment failed: {e}")
        analyst_sentiment = 0.0
    
    # Get technical sentiment
    try:
        if data is not None:
            tech_sentiment = get_technical_sentiment(data)
            print(f"    Technical sentiment: {tech_sentiment:.2f}")
        else:
            tech_sentiment = 0.0
    except Exception as e:
        print(f"    Technical sentiment failed: {e}")
        tech_sentiment = 0.0
    
    # Weight the components with fallbacks
    # If we have all three: 40% news, 30% analyst, 30% technical
    # If missing news: 50% analyst, 50% technical
    # If missing analyst: 60% news, 40% technical
    # If only technical: 100% technical
    
    components = []
    weights = []
    
    if abs(news_sentiment) > 0.001:  # We have meaningful news sentiment
        components.append(news_sentiment)
        weights.append(0.4)
    
    if abs(analyst_sentiment) > 0.001:  # We have meaningful analyst sentiment
        components.append(analyst_sentiment)
        weights.append(0.3)
    
    if abs(tech_sentiment) > 0.001:  # We have meaningful technical sentiment
        components.append(tech_sentiment)
        weights.append(0.3)
    
    # Normalize weights
    if weights:
        total_weight = sum(weights)
        weights = [w/total_weight for w in weights]
        combined_sentiment = sum(c*w for c, w in zip(components, weights))
    else:
        # Fallback to neutral sentiment
        combined_sentiment = 0.0
    
    print(f"    Combined sentiment: {combined_sentiment:.2f}")
    
    # Add a small random variation to avoid all stocks having identical sentiment scores
    final_sentiment = combined_sentiment + (random.uniform(-0.05, 0.05))
    final_sentiment = max(-1.0, min(1.0, final_sentiment))
    
    # Add a small delay to avoid hitting rate limits
    time.sleep(random.uniform(1, 2))
    
    return final_sentiment

In [7]:
# Cell 7: Enhanced ML Model Training Functions (Fixed for small datasets)
def create_ensemble_models_adaptive(data_size):
    """Create models adapted to the size of available data"""
    if data_size < 50:
        # Very small dataset - use simple models
        models = {
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=100,
                solver='liblinear',
                C=0.1  # More regularization
            )
        }
        print(f"    Using simple model for small dataset ({data_size} samples)")
    elif data_size < 100:
        # Small dataset - use regularized models
        models = {
            'random_forest': RandomForestClassifier(
                n_estimators=20,
                max_depth=5,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=200,
                solver='liblinear',
                C=0.1
            )
        }
        print(f"    Using regularized models for medium dataset ({data_size} samples)")
    else:
        # Larger dataset - use full ensemble
        models = {
            'random_forest': RandomForestClassifier(
                n_estimators=50,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=50,
                max_depth=6,
                learning_rate=0.1,
                random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=500,
                solver='liblinear',
                C=1.0
            )
        }
        
        # Add XGBoost for larger datasets
        try:
            models['xgboost'] = xgb.XGBClassifier(
                n_estimators=50,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                use_label_encoder=False,
                eval_metric='logloss',
                verbosity=0
            )
        except Exception as e:
            print(f"    XGBoost not available: {e}")
        
        print(f"    Using full ensemble for large dataset ({data_size} samples)")
    
    return models

def prepare_ml_data_robust(data):
    """Prepare and clean data for ML training with enhanced small dataset handling"""
    try:
        print(f"  Preparing ML data from {len(data)} rows...")
        
        # Select features for ML, excluding target and non-predictive columns
        exclude_cols = [
            'Target', 'Target_Return', 'Next_Close', 
            'Tech_Sentiment',  # Used separately as External_Sentiment
        ]
        
        # Get all potential feature columns
        all_feature_cols = [col for col in data.columns if col not in exclude_cols]
        
        # Keep only numeric columns
        numeric_cols = []
        for col in all_feature_cols:
            if pd.api.types.is_numeric_dtype(data[col]):
                numeric_cols.append(col)
        
        print(f"    Found {len(numeric_cols)} numeric features")
        
        # Create feature matrix and target
        X = data[numeric_cols].copy()
        y = data['Target'].copy()
        
        # Handle remaining NaN values more aggressively
        print(f"    Handling missing values...")
        
        # Count NaN values per column
        nan_counts = X.isna().sum()
        
        # Remove columns with too many NaN values (>50% missing)
        threshold = len(X) * 0.5
        good_cols = []
        for col in X.columns:
            if nan_counts[col] <= threshold:
                good_cols.append(col)
            else:
                print(f"      Removing {col}: {nan_counts[col]}/{len(X)} missing values")
        
        X = X[good_cols]
        
        # Fill remaining NaN values with median
        for col in X.columns:
            if X[col].isna().any():
                median_val = X[col].median()
                if pd.isna(median_val):  # If median is also NaN, use 0
                    median_val = 0
                n_filled = X[col].isna().sum()
                X[col].fillna(median_val, inplace=True)
                print(f"      Filled {n_filled} NaN values in {col} with {median_val:.3f}")
        
        # Handle infinite values
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(X.median())
        X = X.fillna(0)  # Final fallback
        
        # Remove features with zero variance
        from sklearn.feature_selection import VarianceThreshold
        variance_selector = VarianceThreshold(threshold=0.0)
        X_filtered = variance_selector.fit_transform(X)
        selected_features = X.columns[variance_selector.get_support()].tolist()
        X = pd.DataFrame(X_filtered, columns=selected_features, index=X.index)
        
        if len(selected_features) < len(numeric_cols):
            removed = len(numeric_cols) - len(selected_features)
            print(f"      Removed {removed} zero-variance features")
        
        # Ensure we have valid target values
        valid_mask = y.notna()
        X = X[valid_mask]
        y = y[valid_mask]
        
        # Final data checks
        print(f"    Final dataset: {X.shape[0]} samples, {X.shape[1]} features")
        
        # Check target distribution
        target_counts = y.value_counts()
        print(f"    Target distribution: {target_counts.to_dict()}")
        
        return X, y, selected_features
        
    except Exception as e:
        print(f"    Error preparing ML data: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None

def train_and_predict_robust(data):
    """Enhanced ML training with small dataset support"""
    if data is None:
        print("  No data provided for ML training")
        return None
    
    print(f"  Starting ML training with {len(data)} data points...")
    
    # Lower the minimum data requirement
    if len(data) < 20:
        print(f"  Insufficient data for ML training (need at least 20, got {len(data)})")
        return None
    
    try:
        # Prepare the data
        X, y, feature_cols = prepare_ml_data_robust(data)
        
        if X is None or len(X) < 15:
            print("  Failed to prepare sufficient data for ML")
            return None
        
        # Check class balance
        class_counts = y.value_counts()
        if len(class_counts) < 2:
            print(f"  Only one class present: {class_counts.to_dict()}")
            return None
        
        min_class_count = class_counts.min()
        if min_class_count < 3:
            print(f"  Insufficient class balance for ML: {class_counts.to_dict()}")
            print("  (Need at least 3 samples of each class)")
            return None
        
        print(f"    Class balance OK: {class_counts.to_dict()}")
        
        # Use adaptive test size based on total data
        if len(X) < 30:
            test_size = 0.2  # Keep more for training
        elif len(X) < 60:
            test_size = 0.25
        else:
            test_size = 0.3
        
        # Ensure minimum test size
        min_test_samples = max(3, min_class_count // 2)
        actual_test_size = max(test_size, min_test_samples / len(X))
        
        print(f"    Using test size: {actual_test_size:.2f}")
        
        # Split data
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, 
                test_size=actual_test_size, 
                stratify=y, 
                random_state=42
            )
        except ValueError as e:
            print(f"    Stratified split failed ({e}), using random split")
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, 
                test_size=actual_test_size, 
                random_state=42
            )
        
        print(f"    Train: {len(X_train)} samples, Test: {len(X_test)} samples")
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Get adaptive models
        models = create_ensemble_models_adaptive(len(X_train))
        model_results = {}
        successful_models = 0
        
        for name, model in models.items():
            try:
                print(f"      Training {name}...")
                
                # Train the model
                model.fit(X_train_scaled, y_train)
                
                # Make predictions
                if len(X_test) > 0:
                    test_pred = model.predict(X_test_scaled)
                    test_proba = model.predict_proba(X_test_scaled)[:, 1]
                    test_accuracy = accuracy_score(y_test, test_pred)
                else:
                    # If no test set, use train set for evaluation (not ideal but better than failure)
                    test_pred = model.predict(X_train_scaled)
                    test_proba = model.predict_proba(X_train_scaled)[:, 1]
                    test_accuracy = accuracy_score(y_train, test_pred)
                    print(f"        Warning: Using training set for evaluation")
                
                # Train accuracy
                train_pred = model.predict(X_train_scaled)
                train_accuracy = accuracy_score(y_train, train_pred)
                
                # Check for overfitting
                overfitting = train_accuracy - test_accuracy
                
                model_results[name] = {
                    'model': model,
                    'train_accuracy': train_accuracy,
                    'test_accuracy': test_accuracy,
                    'prediction': test_pred[-1] if len(test_pred) > 0 else train_pred[-1],
                    'probability': test_proba[-1] if len(test_proba) > 0 else 0.5,
                    'overfitting': overfitting
                }
                
                successful_models += 1
                print(f"        ✓ Train={train_accuracy:.3f}, Test={test_accuracy:.3f}")
                
                if overfitting > 0.3:
                    print(f"        ⚠ High overfitting: {overfitting:.3f}")
                
            except Exception as e:
                print(f"        ❌ {name} failed: {e}")
                continue
        
        if successful_models == 0:
            print("  All ML models failed to train")
            return None
        
        # Ensemble prediction
        predictions = []
        probabilities = []
        weights = []
        
        for name, result in model_results.items():
            # Weight by test accuracy, penalize overfitting
            weight = result['test_accuracy'] * (1 - min(result['overfitting'], 0.5))
            weight = max(weight, 0.1)  # Minimum weight
            weights.append(weight)
            predictions.append(result['prediction'])
            probabilities.append(result['probability'])
        
        # Weighted ensemble
        weights = np.array(weights)
        weights = weights / weights.sum()
        
        ensemble_prediction = np.average(predictions, weights=weights)
        ensemble_probability = np.average(probabilities, weights=weights)
        average_accuracy = np.average([r['test_accuracy'] for r in model_results.values()])
        
        # Get feature importance from best model
        best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['test_accuracy'])
        best_model = model_results[best_model_name]['model']
        
        importance_dict = {}
        if hasattr(best_model, 'feature_importances_'):
            importance_dict[best_model_name] = dict(zip(feature_cols, best_model.feature_importances_))
        elif hasattr(best_model, 'coef_'):
            importance_dict[best_model_name] = dict(zip(feature_cols, np.abs(best_model.coef_[0])))
        
        result = {
            'prediction': ensemble_prediction,
            'probability': ensemble_probability,
            'accuracy': average_accuracy,
            'model_results': model_results,
            'feature_importance': importance_dict,
            'successful_models': successful_models,
            'best_model': best_model_name,
            'data_size': len(X)
        }
        
        print(f"  ✓ ML training complete: {successful_models} models, accuracy={average_accuracy:.3f}")
        return result
        
    except Exception as e:
        print(f"  ML training error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Update the main training function
def train_and_predict(data):
    """Main ML training function with enhanced robustness for small datasets"""
    return train_and_predict_robust(data)

In [8]:
# Cell 8: Generate stock recommendations (Fixed to be more responsive)
def generate_enhanced_recommendation(technical_data, sentiment_score, ml_result):
    """Generate trading recommendations based on technical, sentiment, and ML data"""
    if technical_data is None or len(technical_data) < 5:
        return "INSUFFICIENT_DATA"
    
    latest = technical_data.iloc[-1]
    
    # Technical analysis score (more detailed)
    technical_score = 0
    
    # Moving Average signals (stronger weight for clear trends)
    ma_signals = 0
    ma_count = 0
    
    # Check available moving averages
    if 'SMA20' in latest.index and pd.notna(latest['SMA20']):
        if latest['Close'] > latest['SMA20']:
            ma_signals += 1
        else:
            ma_signals -= 1
        ma_count += 1
    
    if 'SMA50' in latest.index and pd.notna(latest['SMA50']):
        if latest['Close'] > latest['SMA50']:
            ma_signals += 1.5  # More weight for longer MA
        else:
            ma_signals -= 1.5
        ma_count += 1
    
    if 'SMA200' in latest.index and pd.notna(latest['SMA200']):
        if latest['Close'] > latest['SMA200']:
            ma_signals += 2  # Strong weight for long-term trend
        else:
            ma_signals -= 2
        ma_count += 1
    
    # Check MA crossovers
    if 'SMA20' in latest.index and 'SMA50' in latest.index:
        if pd.notna(latest['SMA20']) and pd.notna(latest['SMA50']):
            if latest['SMA20'] > latest['SMA50']:
                ma_signals += 1
            else:
                ma_signals -= 1
            ma_count += 1
    
    # Normalize MA signals
    if ma_count > 0:
        technical_score += ma_signals / ma_count * 3  # Scale up the MA influence
    
    # Momentum signals (RSI)
    if 'RSI' in latest.index and pd.notna(latest['RSI']):
        rsi = latest['RSI']
        if rsi < 25:  # Very oversold - strong buy signal
            technical_score += 3
        elif rsi < 35:  # Oversold - buy signal
            technical_score += 2
        elif rsi > 75:  # Very overbought - strong sell signal
            technical_score -= 3
        elif rsi > 65:  # Overbought - sell signal
            technical_score -= 2
        # Neutral zone (35-65) adds no score
    
    # MACD signals
    if 'MACD' in latest.index and 'Signal_Line' in latest.index:
        if pd.notna(latest['MACD']) and pd.notna(latest['Signal_Line']):
            if latest['MACD'] > latest['Signal_Line']:
                technical_score += 2
            else:
                technical_score -= 2
    
    # Trend strength (ADX)
    if 'ADX' in latest.index and pd.notna(latest['ADX']):
        adx = latest['ADX']
        if adx > 40:  # Strong trend
            # Determine trend direction from price vs MAs
            if 'SMA20' in latest.index and pd.notna(latest['SMA20']):
                if latest['Close'] > latest['SMA20']:
                    technical_score += 1  # Strong uptrend
                else:
                    technical_score -= 1  # Strong downtrend
    
    # Price momentum (recent performance)
    if 'ROC10' in latest.index and pd.notna(latest['ROC10']):
        roc = latest['ROC10']
        if roc > 5:  # Strong positive momentum
            technical_score += 2
        elif roc > 2:  # Positive momentum
            technical_score += 1
        elif roc < -5:  # Strong negative momentum
            technical_score -= 2
        elif roc < -2:  # Negative momentum
            technical_score -= 1
    
    # Normalize technical score (-10 to +10 range)
    technical_score = max(-10, min(10, technical_score))
    normalized_technical = technical_score / 10.0  # Scale to -1 to 1
    
    print(f"    Technical score: {technical_score}/10 ({normalized_technical:.2f})")
    
    # ML prediction weight
    ml_weight = 0
    ml_confidence = 0.5
    
    if ml_result:
        # Scale ML probability to -1 to 1 range
        ml_weight = (ml_result['probability'] - 0.5) * 2
        ml_confidence = ml_result.get('accuracy', 0.5)
        print(f"    ML weight: {ml_weight:.2f} (prob: {ml_result['probability']:.2f}, accuracy: {ml_confidence:.2f})")
    
    # Sentiment weight (already in -1 to 1 range)
    print(f"    Sentiment: {sentiment_score:.2f}")
    
    # Combine all scores with weights
    # Increased weights to make system more responsive
    if ml_confidence >= 0.6:
        # High ML confidence: 35% technical, 45% ML, 20% sentiment
        final_score = (normalized_technical * 0.35) + (ml_weight * 0.45) + (sentiment_score * 0.20)
    else:
        # Low ML confidence: 50% technical, 25% ML, 25% sentiment
        final_score = (normalized_technical * 0.50) + (ml_weight * 0.25) + (sentiment_score * 0.25)
    
    print(f"    Final combined score: {final_score:.3f}")
    
    # Generate recommendation with more aggressive thresholds
    if ml_confidence < 0.5:
        # Very low confidence - be very conservative
        if final_score > 0.6:
            return "WEAK_BUY"
        elif final_score < -0.6:
            return "WEAK_SELL"
        else:
            return "HOLD"
    elif ml_confidence < 0.65:
        # Medium confidence - somewhat conservative
        if final_score > 0.4:
            return "BUY"
        elif final_score > 0.15:
            return "WEAK_BUY"
        elif final_score < -0.4:
            return "SELL"
        elif final_score < -0.15:
            return "WEAK_SELL"
        else:
            return "HOLD"
    else:
        # High confidence - more aggressive recommendations
        if final_score > 0.25:
            return "STRONG_BUY"
        elif final_score > 0.1:
            return "BUY"
        elif final_score > 0.05:
            return "WEAK_BUY"
        elif final_score < -0.25:
            return "STRONG_SELL"
        elif final_score < -0.1:
            return "SELL"
        elif final_score < -0.05:
            return "WEAK_SELL"
        else:
            return "HOLD"

In [9]:
# Updated Cell 9: Main analysis function (with better debugging)
def analyze_stock(symbol, show_details=True):
    """Analyze a single stock with detailed output using FinBERT sentiment analysis"""
    print(f"Analyzing {symbol}...")
    
    # Get historical data
    historical_data = get_historical_data(symbol)
    if historical_data is None or len(historical_data) < 100:
        print(f"❌ Insufficient historical data for {symbol}")
        return None
    
    # Calculate technical features
    technical_data = calculate_technical_features(historical_data)
    if technical_data is None:
        print(f"❌ Failed to calculate technical indicators for {symbol}")
        return None
    
    # Get sentiment analysis using FinBERT
    sentiment_score = get_sentiment_analysis(symbol, technical_data)
    print(f"✓ Sentiment score: {sentiment_score:.2f}")
    
    # Add sentiment to the data for ML
    technical_data['External_Sentiment'] = sentiment_score
    
    # Train ML models and predict
    ml_result = train_and_predict(technical_data)
    if ml_result:
        print(f"✓ ML model accuracy: {ml_result['accuracy']:.2f}")
        print(f"✓ ML prediction probability: {ml_result['probability']:.2f}")
    else:
        print("❌ ML prediction failed")
    
    # Generate recommendation with detailed scoring
    print("  Generating recommendation:")
    recommendation = generate_enhanced_recommendation(technical_data, sentiment_score, ml_result)
    print(f"✓ Final recommendation: {recommendation}")
    
    # Create result summary
    result = {
        'Symbol': symbol,
        'Last_Price': technical_data['Close'].iloc[-1],
        'RSI': technical_data['RSI'].iloc[-1] if 'RSI' in technical_data.columns else None,
        'SMA20': technical_data['SMA20'].iloc[-1] if 'SMA20' in technical_data.columns else None,
        'SMA50': technical_data['SMA50'].iloc[-1] if 'SMA50' in technical_data.columns else None,
        'SMA200': technical_data['SMA200'].iloc[-1] if 'SMA200' in technical_data.columns else None,
        'MACD': technical_data['MACD'].iloc[-1] if 'MACD' in technical_data.columns else None,
        'ADX': technical_data['ADX'].iloc[-1] if 'ADX' in technical_data.columns else None,
        'Sentiment': sentiment_score,
        'Recommendation': recommendation
    }
    
    # Add ML results if available
    if ml_result:
        result['ML_Prediction'] = ml_result['prediction']
        result['ML_Probability'] = ml_result['probability']
        result['ML_Accuracy'] = ml_result['accuracy']
    
    # Show additional details if requested
    if show_details:
        print(f"\n  Detailed Analysis for {symbol}:")
        print(f"  Current Price: ${result['Last_Price']:.2f}")
        if result['RSI']:
            print(f"  RSI: {result['RSI']:.1f}")
        if result['SMA20']:
            print(f"  Price vs SMA20: {((result['Last_Price']/result['SMA20']-1)*100):+.1f}%")
        if result['SMA50']:
            print(f"  Price vs SMA50: {((result['Last_Price']/result['SMA50']-1)*100):+.1f}%")
        
        # Show feature importance if available
        if ml_result and 'feature_importance' in ml_result and show_details:
            best_model = ml_result.get('best_model', 'random_forest')
            if best_model in ml_result['feature_importance']:
                importances = ml_result['feature_importance'][best_model]
                top_features = dict(sorted(importances.items(), key=lambda x: x[1], reverse=True)[:5])
                print(f"  Top 5 features ({best_model}):")
                for feature, importance in top_features.items():
                    print(f"    {feature}: {importance:.3f}")
    
    return result

In [10]:
# Cell 10: Process multiple stocks
def analyze_multiple_stocks(symbols, max_stocks=None):
    """Analyze multiple stocks and return a DataFrame of results"""
    if max_stocks:
        symbols = symbols[:max_stocks]
    
    results = []
    
    for i, symbol in enumerate(symbols):
        print(f"Analyzing {symbol} ({i+1}/{len(symbols)})")
        
        try:
            result = analyze_stock(symbol, show_details=False)
            if result:
                results.append(result)
        except Exception as e:
            print(f"Error analyzing {symbol}: {e}")
            continue
        
        # Rate limiting to avoid being blocked by web services
        if (i + 1) % 3 == 0 and i < len(symbols) - 1:
            sleep_time = 10 + random.randint(1, 5)
            print(f"Pausing for {sleep_time} seconds to avoid rate limiting...")
            time.sleep(sleep_time)
    
    return pd.DataFrame(results)

In [18]:
# Cell 11: Visualization and results with robust column handling
def plot_results(results_df):
    """Create visualizations and summary of results with better error handling"""
    # Display recommendations summary
    print("\nRecommendation Summary:")
    rec_counts = results_df['Recommendation'].value_counts()
    display(rec_counts)

    # Plot recommendation distribution
    plt.figure(figsize=(10, 6))
    rec_order = [
        'STRONG_BUY', 'BUY', 'WEAK_BUY', 
        'HOLD', 
        'WEAK_SELL', 'SELL', 'STRONG_SELL'
    ]
    # Only include categories that exist in the data
    existing_categories = [cat for cat in rec_order if cat in rec_counts.index]
    
    if existing_categories:
        sns.countplot(x='Recommendation', data=results_df, order=existing_categories)
        plt.title('Stock Recommendations Distribution')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("No recommendation data to plot")

    # Ensure we have required columns for displays
    required_cols = ['Symbol', 'Last_Price', 'Recommendation']
    missing_cols = [col for col in required_cols if col not in results_df.columns]
    
    if missing_cols:
        print(f"Missing required columns for display: {missing_cols}")
        return results_df, results_df  # Return empty DataFrames as placeholders
        
    # Optional columns to display if available
    display_cols = ['Symbol', 'Last_Price', 'Recommendation']
    for col in ['RSI', 'ADX', 'Sentiment', 'ML_Probability', 'ML_Accuracy']:
        if col in results_df.columns:
            display_cols.append(col)
    
    # Show top buy recommendations
    buy_stocks = results_df[results_df['Recommendation'].isin(['STRONG_BUY', 'BUY'])].sort_values('Last_Price', ascending=True)
    if 'ML_Probability' in results_df.columns:
        buy_stocks = results_df[results_df['Recommendation'].isin(['STRONG_BUY', 'BUY'])].sort_values('ML_Probability', ascending=False)
    
    print("\nTOP BUY RECOMMENDATIONS:")
    if len(buy_stocks) > 0:
        display(buy_stocks[display_cols])
    else:
        print("No buy recommendations found")

    # Show top sell recommendations
    sell_stocks = results_df[results_df['Recommendation'].isin(['STRONG_SELL', 'SELL'])].sort_values('Last_Price', ascending=True)
    if 'ML_Probability' in results_df.columns:
        sell_stocks = results_df[results_df['Recommendation'].isin(['STRONG_SELL', 'SELL'])].sort_values('ML_Probability', ascending=True)
    
    print("\nTOP SELL RECOMMENDATIONS:")
    if len(sell_stocks) > 0:
        display(sell_stocks[display_cols])
    else:
        print("No sell recommendations found")

    # Only create scatter plot if we have the necessary columns
    if 'Sentiment' in results_df.columns and 'ML_Probability' in results_df.columns:
        try:
            # Create scatter plot of Sentiment vs ML_Probability colored by Recommendation
            plt.figure(figsize=(12, 8))
            recommendation_colors = {
                'STRONG_BUY': 'darkgreen',
                'BUY': 'green',
                'WEAK_BUY': 'lightgreen',
                'HOLD': 'blue',
                'WEAK_SELL': 'salmon',
                'SELL': 'red',
                'STRONG_SELL': 'darkred'
            }

            for rec in recommendation_colors:
                subset = results_df[results_df['Recommendation'] == rec]
                if len(subset) > 0:
                    plt.scatter(subset['Sentiment'], subset['ML_Probability'], 
                              c=recommendation_colors[rec], label=rec, s=100, alpha=0.7)

            for i, row in results_df.iterrows():
                plt.annotate(row['Symbol'], 
                            (row['Sentiment'], row['ML_Probability']),
                            xytext=(5, 5), textcoords='offset points')

            plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.3)
            plt.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
            plt.xlabel('Sentiment Score')
            plt.ylabel('ML Probability (Higher = More Bullish)')
            plt.title('Stock Analysis: Sentiment vs ML Prediction')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            plt.savefig(os.path.join(output_dir, f"sentiment_ml_plot_{timestamp}.png"))
            plt.show()
        except Exception as e:
            print(f"Error creating scatter plot: {e}")
    else:
        print("Cannot create scatter plot: missing required columns (Sentiment and/or ML_Probability)")
    
    return buy_stocks, sell_stocks



In [17]:
# Cell 12: Save results and create a styled dataframe
def save_and_display_results(results_df):
    """Save results to files and create styled display with better error handling"""
    if len(results_df) == 0:
        print("No results to save or display")
        return results_df.style  # Return empty styled DataFrame
        
    # Create output directory if it doesn't exist
    output_dir = os.getenv('OUTPUT_DIR', './results')
    os.makedirs(output_dir, exist_ok=True)

    # Save results to CSV
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = os.path.join(output_dir, f"sp500_analysis_{timestamp}.csv")
    results_df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")
    
    # Create HTML report (with error handling for missing columns)
    try:
        # Count recommendations
        rec_counts = {
            'STRONG_BUY': len(results_df[results_df['Recommendation'] == 'STRONG_BUY']),
            'BUY': len(results_df[results_df['Recommendation'] == 'BUY']), 
            'HOLD': len(results_df[results_df['Recommendation'] == 'HOLD']),
            'SELL': len(results_df[results_df['Recommendation'] == 'SELL']),
            'STRONG_SELL': len(results_df[results_df['Recommendation'] == 'STRONG_SELL'])
        }
        
        # Get buy and sell recommendations
        buy_recs = results_df[results_df['Recommendation'].isin(['STRONG_BUY', 'BUY'])]
        sell_recs = results_df[results_df['Recommendation'].isin(['STRONG_SELL', 'SELL'])]
        
        html_report = f"""
        <html>
        <head>
            <title>S&P 500 Analysis Report - {datetime.now().strftime('%Y-%m-%d')}</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: center; }}
                th {{ background-color: #4CAF50; color: white; }}
                .buy {{ color: green; font-weight: bold; }}
                .sell {{ color: red; font-weight: bold; }}
                .hold {{ color: blue; font-weight: bold; }}
                .summary {{ margin: 20px 0; }}
            </style>
        </head>
        <body>
            <h1>S&P 500 Analysis Report</h1>
            <p>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            
            <div class="summary">
                <h2>Summary</h2>
                <p>Total Stocks Analyzed: {len(results_df)}</p>
                <ul>
                    <li>Strong Buy: {rec_counts['STRONG_BUY']}</li>
                    <li>Buy: {rec_counts['BUY']}</li>
                    <li>Hold: {rec_counts['HOLD']}</li>
                    <li>Sell: {rec_counts['SELL']}</li>
                    <li>Strong Sell: {rec_counts['STRONG_SELL']}</li>
                </ul>
            </div>
            
            <h2>Top Buy Recommendations</h2>
            {buy_recs.to_html(classes='data', index=False) if len(buy_recs) > 0 else "<p>No buy recommendations found.</p>"}
            
            <h2>Top Sell Recommendations</h2>
            {sell_recs.to_html(classes='data', index=False) if len(sell_recs) > 0 else "<p>No sell recommendations found.</p>"}
        </body>
        </html>
        """
        
        html_file = os.path.join(output_dir, f"analysis_report_{timestamp}.html")
        with open(html_file, 'w') as f:
            f.write(html_report)
        
        print(f"HTML report saved to: {html_file}")
    except Exception as e:
        print(f"Error creating HTML report: {e}")
    
    # Style DataFrame for display
    def color_recommendation(val):
        if val == 'STRONG_BUY':
            return 'background-color: darkgreen; color: white'
        elif val == 'BUY':
            return 'background-color: green; color: white'
        elif val == 'WEAK_BUY':
            return 'background-color: lightgreen'
        elif val == 'HOLD':
            return 'background-color: lightblue'
        elif val == 'WEAK_SELL':
            return 'background-color: salmon'
        elif val == 'SELL':
            return 'background-color: red; color: white'
        elif val == 'STRONG_SELL':
            return 'background-color: darkred; color: white'
        return ''

    # Return styled DataFrame
    try:
        return results_df.style.applymap(color_recommendation, subset=['Recommendation'])
    except Exception as e:
        print(f"Error styling DataFrame: {e}")
        return results_df  # Return unstyles DataFrame as fallback

In [13]:
# Cell 13: Interactive analysis of a single stock (optional)
def interactive_single_stock_analysis(symbol="AAPL"):
    """Interactive analysis of a single stock with detailed output"""
    result = analyze_stock(symbol, show_details=True)
    if result:
        # Plot recent price history
        historical_data = get_historical_data(symbol, days=360)
        if historical_data is not None:
            plt.figure(figsize=(12, 6))
            plt.plot(historical_data.index, historical_data['Close'])
            plt.title(f"{symbol} Price History - Last 360 Days")
            plt.xlabel("Date")
            plt.ylabel("Price ($)")
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
            
            # Technical indicators plot
            fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 12), sharex=True)
            
            # Price and moving averages
            ax1.plot(historical_data.index, historical_data['Close'], label='Close')
            ax1.plot(historical_data.index, historical_data['Close'].rolling(window=20).mean(), label='SMA20')
            ax1.plot(historical_data.index, historical_data['Close'].rolling(window=50).mean(), label='SMA50')
            ax1.plot(historical_data.index, historical_data['Close'].rolling(window=200).mean(), label='SMA200')
            ax1.set_title(f"{symbol} Price and Moving Averages")
            ax1.set_ylabel("Price ($)")
            ax1.legend()
            ax1.grid(True, alpha=0.3)
            
            # RSI
            rsi = calculate_rsi(historical_data['Close'])
            ax2.plot(historical_data.index, rsi, color='purple')
            ax2.axhline(y=70, color='r', linestyle='-', alpha=0.3)
            ax2.axhline(y=30, color='g', linestyle='-', alpha=0.3)
            ax2.set_title("RSI (14)")
            ax2.set_ylabel("RSI")
            ax2.set_ylim(0, 100)
            ax2.grid(True, alpha=0.3)
            
            # MACD
            macd, signal, hist = calculate_macd(historical_data['Close'])
            ax3.plot(historical_data.index, macd, label='MACD')
            ax3.plot(historical_data.index, signal, label='Signal')
            ax3.bar(historical_data.index, hist, label='Histogram', alpha=0.5)
            ax3.set_title("MACD")
            ax3.set_ylabel("MACD")
            ax3.set_xlabel("Date")
            ax3.legend()
            ax3.grid(True, alpha=0.3)
            
            plt.tight_layout()
            plt.show()
            
            # Show final recommendation with styled color
            recommendation = result['Recommendation']
            rec_color = {
                'STRONG_BUY': '🟢 <span style="color:darkgreen;font-weight:bold;">STRONG BUY</span>',
                'BUY': '🟢 <span style="color:green;font-weight:bold;">BUY</span>',
                'WEAK_BUY': '🟢 <span style="color:lightgreen;font-weight:bold;">WEAK BUY</span>',
                'HOLD': '🔵 <span style="color:blue;font-weight:bold;">HOLD</span>',
                'WEAK_SELL': '🔴 <span style="color:salmon;font-weight:bold;">WEAK SELL</span>',
                'SELL': '🔴 <span style="color:red;font-weight:bold;">SELL</span>',
                'STRONG_SELL': '🔴 <span style="color:darkred;font-weight:bold;">STRONG SELL</span>'
            }
            
            display(HTML(f"<h2>Final recommendation for {symbol}: {rec_color.get(recommendation, recommendation)}</h2>"))
            
            return result
    return None

In [14]:
# Cell 14: Testing and validation functions
def test_single_stock_analysis(symbol="AAPL"):
    """Test the analysis pipeline with a single stock"""
    print(f"Testing analysis pipeline with {symbol}...")
    
    try:
        # Step 1: Download data
        print("1. Downloading historical data...")
        historical_data = get_historical_data(symbol, days=360)
        if historical_data is None:
            print("❌ Failed to download data")
            return False
        print(f"✓ Downloaded {len(historical_data)} days of data")
        print(f"  Columns: {list(historical_data.columns)}")
        print(f"  Data types: {historical_data.dtypes.to_dict()}")
        
        # Step 2: Calculate technical features
        print("2. Calculating technical features...")
        technical_data = calculate_technical_features(historical_data)
        if technical_data is None:
            print("❌ Failed to calculate technical features")
            return False
        print(f"✓ Calculated {len(technical_data.columns)} technical features")
        print(f"  Final dataset shape: {technical_data.shape}")
        
        # Step 3: Test sentiment analysis
        print("3. Testing sentiment analysis...")
        try:
            sentiment_score = get_sentiment_analysis(symbol, technical_data)
            print(f"✓ Sentiment analysis completed: {sentiment_score:.3f}")
        except Exception as e:
            print(f"⚠ Sentiment analysis failed: {e}")
            sentiment_score = 0.0
        
        # Step 4: Test ML prediction
        print("4. Testing ML prediction...")
        technical_data['External_Sentiment'] = sentiment_score
        ml_result = train_and_predict(technical_data)
        if ml_result:
            print(f"✓ ML prediction successful")
            print(f"  Ensemble accuracy: {ml_result['accuracy']:.3f}")
            print(f"  Prediction probability: {ml_result['probability']:.3f}")
        else:
            print("⚠ ML prediction failed")
        
        # Step 5: Generate recommendation
        print("5. Generating recommendation...")
        recommendation = generate_enhanced_recommendation(technical_data, sentiment_score, ml_result)
        print(f"✓ Final recommendation: {recommendation}")
        
        print(f"\n✅ Test completed successfully for {symbol}")
        return True
        
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False

In [15]:
# Cell 15: Quick analysis function with error handling
def quick_analyze_stock(symbol, show_details=False):
    """Quick analysis with comprehensive error handling"""
    result = {
        'Symbol': symbol,
        'Status': 'Failed',
        'Error': None,
        'Last_Price': None,
        'RSI': None,
        'MACD': None,
        'Sentiment': None,
        'ML_Probability': None,
        'Recommendation': 'INSUFFICIENT_DATA'
    }
    
    try:
        # Get data with timeout and retry logic
        print(f"Analyzing {symbol}...")
        
        for attempt in range(2):  # Try twice
            historical_data = get_historical_data(symbol)
            if historical_data is not None:
                break
            if attempt == 0:
                print(f"  Retry attempt for {symbol}...")
                time.sleep(2)
        
        if historical_data is None:
            result['Error'] = 'Failed to download data'
            return result
        
        # Calculate features with error handling
        try:
            technical_data = calculate_technical_features(historical_data)
            if technical_data is None:
                result['Error'] = 'Failed to calculate technical features'
                return result
        except Exception as e:
            result['Error'] = f'Technical analysis error: {str(e)}'
            return result
        
        # Extract basic metrics
        latest = technical_data.iloc[-1]
        result['Last_Price'] = latest['Close']
        result['RSI'] = latest.get('RSI', None)
        result['MACD'] = latest.get('MACD', None)
        
        # Get sentiment with fallback
        try:
            sentiment_score = get_sentiment_analysis(symbol, technical_data)
            result['Sentiment'] = sentiment_score
        except Exception as e:
            print(f"  Sentiment analysis failed, using technical sentiment: {e}")
            result['Sentiment'] = get_technical_sentiment(technical_data)
        
        # ML prediction with fallback
        technical_data['External_Sentiment'] = result['Sentiment']
        try:
            ml_result = train_and_predict(technical_data)
            if ml_result:
                result['ML_Probability'] = ml_result['probability']
                result['ML_Accuracy'] = ml_result['accuracy']
            else:
                # Simple prediction based on technical indicators
                result['ML_Probability'] = 0.5 + (result['Sentiment'] * 0.3)
        except Exception as e:
            print(f"  ML prediction failed, using simple heuristic: {e}")
            result['ML_Probability'] = 0.5 + (result['Sentiment'] * 0.3)
        
        # Generate recommendation
        try:
            recommendation = generate_enhanced_recommendation(
                technical_data, 
                result['Sentiment'], 
                ml_result if 'ml_result' in locals() else None
            )
            result['Recommendation'] = recommendation
        except Exception as e:
            # Simple recommendation based on sentiment and basic indicators
            if result['Sentiment'] > 0.3 and result['RSI'] < 70:
                result['Recommendation'] = 'BUY'
            elif result['Sentiment'] < -0.3 and result['RSI'] > 30:
                result['Recommendation'] = 'SELL'
            else:
                result['Recommendation'] = 'HOLD'
        
        result['Status'] = 'Success'
        print(f"✓ {symbol}: {result['Recommendation']} (Sentiment: {result['Sentiment']:.2f})")
        
        if show_details:
            print(f"  Price: ${result['Last_Price']:.2f}")
            print(f"  RSI: {result['RSI']:.1f}" if result['RSI'] else "  RSI: N/A")
            print(f"  ML Prob: {result['ML_Probability']:.3f}" if result['ML_Probability'] else "  ML Prob: N/A")
        
    except Exception as e:
        result['Error'] = str(e)
        print(f"❌ {symbol}: Analysis failed - {e}")
    
    return result

In [23]:
# Cell 16: Main execution cell with enhanced error handling
def analyze_multiple_stocks_robust(symbols, max_stocks=None):
    """Robust analysis of multiple stocks with error handling"""
    if max_stocks:
        symbols = symbols[:max_stocks]
    
    results = []
    successful = 0
    failed = 0
    
    print(f"Starting analysis of {len(symbols)} stocks...")
    
    for i, symbol in enumerate(symbols):
        print(f"\n[{i+1}/{len(symbols)}] Processing {symbol}...")
        
        try:
            result = quick_analyze_stock(symbol, show_details=False)
            results.append(result)
            
            if result['Status'] == 'Success':
                successful += 1
            else:
                failed += 1
                print(f"  ⚠ Failed: {result['Error']}")
                
        except Exception as e:
            print(f"  ❌ Unexpected error: {e}")
            failed += 1
            # Add failed entry to maintain count
            results.append({
                'Symbol': symbol,
                'Status': 'Failed',
                'Error': str(e),
                'Recommendation': 'INSUFFICIENT_DATA'
            })
        
        # Rate limiting with variable delays
        if (i + 1) % 3 == 0 and i < len(symbols) - 1:
            delay = random.randint(8, 15)
            print(f"  Pausing {delay}s to avoid rate limits...")
            time.sleep(delay)
        elif i < len(symbols) - 1:
            time.sleep(random.uniform(1, 3))  # Small random delay between requests
    
    print(f"\n📊 Analysis Summary:")
    print(f"  ✅ Successful: {successful}")
    print(f"  ❌ Failed: {failed}")
    print(f"  📈 Success Rate: {successful/len(symbols)*100:.1f}%")
    
    # Convert to DataFrame, only including successful analyses
    successful_results = [r for r in results if r['Status'] == 'Success']
    if successful_results:
        df = pd.DataFrame(successful_results)
        
        # Clean up the DataFrame
        columns_to_keep = [
            'Symbol', 'Last_Price', 'RSI', 'MACD', 'Sentiment', 
            'ML_Probability', 'ML_Accuracy', 'Recommendation'
        ]
        existing_columns = [col for col in columns_to_keep if col in df.columns]
        df = df[existing_columns]
        
        return df
    else:
        print("⚠ No successful analyses to return")
        return pd.DataFrame()

# Main execution
if __name__ == "__main__" or '__ipython__' in globals():
    # Test with a single stock first
    print("🧪 Running initial test...")
    test_success = test_single_stock_analysis("BTC")
    
    if not test_success:
        print("❌ Initial test failed. Please check the errors above.")
    else:
        print("✅ Initial test passed. Proceeding with full analysis...\n")
        
        # Get S&P 500 symbols
        sp500_symbols = get_sp500_symbols()
        
        # Get configuration
        max_stocks = int(os.getenv('MAX_SYMBOLS', '10'))
        print(f"Will analyze {max_stocks} stocks from S&P 500")

        sp500_symbols = ['AAPL']
        
        # Run analysis
        results_df = analyze_multiple_stocks_robust(sp500_symbols, max_stocks=max_stocks)
        
        if len(results_df) > 0:
            print(f"\n📈 Analysis Results:")
            print(f"Successfully analyzed {len(results_df)} stocks")
            
            # Display summary statistics
            print(f"\nRecommendation Distribution:")
            print(results_df['Recommendation'].value_counts())
            
            # Show top recommendations
            buy_stocks = results_df[results_df['Recommendation'].isin(['STRONG_BUY', 'BUY'])]
            if len(buy_stocks) > 0:
                print(f"\n🟢 BUY Recommendations ({len(buy_stocks)}):")
                buy_display = buy_stocks.nlargest(5, 'ML_Probability') if 'ML_Probability' in buy_stocks.columns else buy_stocks.head(5)
                for _, row in buy_display.iterrows():
                    sentiment_str = f"{row['Sentiment']:.2f}" if pd.notna(row['Sentiment']) else "N/A"
                    ml_str = f"{row['ML_Probability']:.3f}" if pd.notna(row.get('ML_Probability')) else "N/A"
                    print(f"  {row['Symbol']}: ${row['Last_Price']:.2f} | Sentiment: {sentiment_str} | ML: {ml_str} | {row['Recommendation']}")
            
            sell_stocks = results_df[results_df['Recommendation'].isin(['STRONG_SELL', 'SELL'])]
            if len(sell_stocks) > 0:
                print(f"\n🔴 SELL Recommendations ({len(sell_stocks)}):")
                sell_display = sell_stocks.nsmallest(5, 'ML_Probability') if 'ML_Probability' in sell_stocks.columns else sell_stocks.head(5)
                for _, row in sell_display.iterrows():
                    sentiment_str = f"{row['Sentiment']:.2f}" if pd.notna(row['Sentiment']) else "N/A"
                    ml_str = f"{row['ML_Probability']:.3f}" if pd.notna(row.get('ML_Probability')) else "N/A"
                    print(f"  {row['Symbol']}: ${row['Last_Price']:.2f} | Sentiment: {sentiment_str} | ML: {ml_str} | {row['Recommendation']}")
            
            # Save results
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            output_file = os.path.join(output_dir, f"sp500_analysis_{timestamp}.csv")
            results_df.to_csv(output_file, index=False)
            print(f"\n💾 Results saved to: {output_file}")
            
            # Create visualizations if we have enough data
            if len(results_df) >= 3:
                try:
                    buy_stocks, sell_stocks = plot_results(results_df)
                    styled_df = save_and_display_results(results_df)
                    display(styled_df)
                except Exception as e:
                    print(f"⚠ Visualization failed: {e}")
                    # Just display the basic DataFrame
                    display(results_df)
            else:
                display(results_df)
                
            print("\n🎉 Analysis completed successfully!")
        else:
            print("\n❌ No successful stock analyses. Please check the errors above.")

# Interactive single stock analysis (uncomment to use)
# print("\n" + "="*50)
# print("Interactive Single Stock Analysis")
# print("="*50)
# interactive_single_stock_analysis("MSFT")

🧪 Running initial test...
Testing analysis pipeline with BTC...
1. Downloading historical data...
✓ Downloaded 202 days of data
  Columns: ['Close', 'High', 'Low', 'Open', 'Volume']
  Data types: {'Close': dtype('float64'), 'High': dtype('float64'), 'Low': dtype('float64'), 'Open': dtype('float64'), 'Volume': dtype('int64')}
2. Calculating technical features...
  Starting with 202 days of data
  Calculating indicators for 202 days...
  Before NaN removal: 202 rows
    Dropped 1 rows due to NaN in Daily_Return
    Dropped 12 rows due to NaN in RSI
    Dropped 6 rows due to NaN in SMA20
  After processing: 183 rows with 54 features
✓ Calculated 54 technical features
  Final dataset shape: (183, 54)
3. Testing sentiment analysis...
  Analyzing sentiment for BTC...
    Trying yahoo_finance...
    ⚠ No articles from yahoo_finance
    Trying finviz...
    ✓ Found 5 articles from finviz
    News sentiment: 0.00 (from 5 articles)
    Estimated analyst sentiment from price change (-0.8%): 0.00


Unnamed: 0,Symbol,Last_Price,RSI,MACD,Sentiment,ML_Probability,ML_Accuracy,Recommendation
0,AAPL,206.860001,43.441106,0.659609,0.517623,0.49764,0.554348,HOLD



🎉 Analysis completed successfully!
