# In [None]:
# Cell 1: Import required libraries and setup FinBERT
import random
import re
import time

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

from data_exploration import get_historical_data

# NOTE(review): `requests`, `BeautifulSoup` and `get_technical_sentiment` are used
# below but are not imported/defined in this file — confirm they are provided
# by another cell or module before running this one standalone.

# Choose the inference device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} for inference")

def setup_finbert():
    """Build the sentiment-analysis pipeline, preferring FinBERT.

    The "ProsusAI/finbert" checkpoint is tried first; if anything goes wrong
    while loading it, a general-purpose SST-2 DistilBERT pipeline is tried
    instead. Progress and errors are reported with ``print``.

    Returns:
        The pipeline object on success, or ``None`` when neither model could
        be loaded (callers then use the rule-based scorer).
    """
    # --- First choice: FinBERT, placed on the module-level `device` ---
    try:
        finbert_checkpoint = "ProsusAI/finbert"
        finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
        finbert_model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)
        finbert_model = finbert_model.to(device)

        # The pipeline API takes a device index: 0 = first GPU, -1 = CPU
        finbert_pipeline = pipeline(
            "sentiment-analysis",
            model=finbert_model,
            tokenizer=finbert_tokenizer,
            device=-1 if device != "cuda" else 0,
        )
        print("✓ Successfully loaded FinBERT model")
        return finbert_pipeline
    except Exception as finbert_error:
        print(f"Error loading FinBERT: {finbert_error}")
        print("Falling back to alternative model...")

    # --- Second choice: general (non-financial) sentiment model ---
    try:
        general_pipeline = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1 if device != "cuda" else 0,
        )
        print("✓ Successfully loaded fallback sentiment model")
        return general_pipeline
    except Exception as fallback_error:
        print(f"Error loading fallback model: {fallback_error}")
        print("Unable to load any sentiment model. Will use rule-based sentiment.")
        return None

# Initialize the sentiment model once, when this module/cell is executed.
# setup_finbert() returns None if no model could be loaded, so this may be None.
sentiment_model = setup_finbert()

# Cell 2: News and sentiment analysis functions
def get_stock_news_robust(symbol, max_articles=5, sources=None):
    """Collect news for ``symbol`` by trying several sources in order.

    Each source is asked only for the number of articles still missing. A
    source that raises is reported and skipped. If nothing at all is found, a
    single neutral placeholder entry (``'source': 'synthetic'``) is returned
    so downstream scoring always has something to analyze.

    Args:
        symbol: Ticker symbol passed to every source function.
        max_articles: Upper bound on the number of real articles collected.
        sources: Optional list of ``(name, func)`` pairs where
            ``func(symbol, limit)`` returns a list of article dicts. Defaults
            to Yahoo Finance, Finviz, MarketWatch and Seeking Alpha.

    Returns:
        A list of dicts with ``'headline'``, ``'summary'`` and ``'source'``.
    """
    if sources is None:
        sources = [
            ('yahoo_finance', get_yahoo_news),
            ('finviz', get_finviz_news),
            ('marketwatch', get_marketwatch_news),
            ('seeking_alpha', get_seeking_alpha_news)
        ]

    news_data = []

    for index, (source_name, source_func) in enumerate(sources):
        remaining = max_articles - len(news_data)
        if remaining <= 0:
            break

        # Small delay between sources — only paid when another request is
        # actually about to be made (not after the last source / full quota)
        if index > 0:
            time.sleep(0.5)

        try:
            print(f"    Trying {source_name}...")
            source_news = source_func(symbol, remaining)
            if source_news:
                # Guard against a source returning more than it was asked for
                source_news = list(source_news)[:remaining]
                news_data.extend(source_news)
                print(f"    ✓ Found {len(source_news)} articles from {source_name}")
            else:
                print(f"    ⚠ No articles from {source_name}")
        except Exception as e:
            print(f"    ❌ {source_name} failed: {e}")

    # If no news found, create synthetic news entry for basic sentiment
    if not news_data:
        print(f"    Using fallback sentiment approach for {symbol}")
        news_data.append({
            'headline': f'{symbol} market update',
            'summary': f'Current market analysis for {symbol} shows mixed signals with ongoing price discovery.',
            'source': 'synthetic'
        })
    return news_data

def get_yahoo_news(symbol, max_articles=3):
    """Scrape up to ``max_articles`` headlines for ``symbol`` from Yahoo Finance.

    Best-effort: any request or parsing failure is reported with ``print``
    and whatever was collected so far (possibly nothing) is returned.

    Args:
        symbol: Ticker symbol inserted into the Yahoo Finance news URL.
        max_articles: Maximum number of articles to return.

    Returns:
        A list of dicts with ``'headline'``, ``'summary'`` (may be empty) and
        ``'source'`` set to ``'yahoo_finance'``.
    """
    news_data = []
    if max_articles <= 0:
        return news_data

    try:
        url = f"https://finance.yahoo.com/quote/{symbol}/news"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return news_data

        soup = BeautifulSoup(response.text, 'html.parser')

        # Candidate container selectors, tried in order; the first one that
        # matches anything wins
        selectors = [
            'li[data-testid="stream-item"]',
            'div[data-testid="card-container"]',
            'li.js-stream-content',
            'div.Mb\\(14px\\)',
            'div[data-test-locator="stream-item"]'
        ]
        headline_selectors = [
            'h3', 'h4', 'h5',
            'a[data-testid="clamp-container"]',
            '.C\\(\\$c-link\\)',
            'div[data-testid="clamp-container"] a',
            'a[href*="/news/"]'
        ]
        summary_selectors = ['p', 'div.E\\(n\\)', 'span.C\\(\\$c-fuji-grey-l\\)']

        items = []
        for selector in selectors:
            items = soup.select(selector)
            if items:
                print(f"      Found {len(items)} items with selector: {selector}")
                break

        for item in items:
            # Apply the quota to accepted articles, not to raw containers, so
            # rejected items do not use up slots
            if len(news_data) >= max_articles:
                break

            try:
                headline = None
                for h_selector in headline_selectors:
                    h_element = item.select_one(h_selector)
                    if h_element and h_element.get_text().strip():
                        headline = h_element.get_text().strip()
                        break

                if not headline or len(headline) <= 10:  # Only valid headlines
                    continue

                # Keep a summary only when it is long enough to be meaningful
                summary = ""
                for s_selector in summary_selectors:
                    s_element = item.select_one(s_selector)
                    if s_element:
                        candidate = s_element.get_text().strip()
                        if len(candidate) > 20:
                            summary = candidate
                            break

                news_data.append({
                    'headline': headline,
                    'summary': summary,
                    'source': 'yahoo_finance'
                })
            except Exception:
                # A single malformed item must not abort the rest of the page
                continue

    except Exception as e:
        # Best-effort scraper: report instead of silently swallowing
        print(f"      Yahoo Finance news fetch failed: {e}")

    return news_data

def get_finviz_news(symbol, max_articles=3):
    """Get up to ``max_articles`` news headlines for ``symbol`` from Finviz.

    Best-effort: any request or parsing failure is reported with ``print``
    and whatever was collected so far (possibly nothing) is returned.

    Args:
        symbol: Ticker symbol inserted into the Finviz quote URL.
        max_articles: Maximum number of articles to return.

    Returns:
        A list of dicts with ``'headline'``, an empty ``'summary'`` and
        ``'source'`` set to ``'finviz'``.
    """
    news_data = []
    if max_articles <= 0:
        return news_data

    try:
        url = f"https://finviz.com/quote.ashx?t={symbol}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return news_data

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find news table in Finviz (by class first, then by id)
        news_table = soup.find('table', {'class': 'fullview-news-outer'})
        if not news_table:
            news_table = soup.find('table', id='news-table')

        if news_table:
            for row in news_table.find_all('tr'):
                # Count accepted headlines, not raw rows, against the quota
                if len(news_data) >= max_articles:
                    break
                try:
                    link_tag = row.find('a')
                    if not link_tag:
                        continue
                    headline = link_tag.get_text().strip()
                    if headline and len(headline) > 10:
                        news_data.append({
                            'headline': headline,
                            'summary': '',
                            'source': 'finviz'
                        })
                except Exception:
                    # Skip a malformed row but keep processing the table
                    continue

    except Exception as e:
        # Best-effort scraper: report instead of silently swallowing
        print(f"      Finviz news fetch failed: {e}")

    return news_data

def get_marketwatch_news(symbol, max_articles=2):
    """Get up to ``max_articles`` news headlines for ``symbol`` from MarketWatch.

    Best-effort: any request or parsing failure is reported with ``print``
    and whatever was collected so far (possibly nothing) is returned.

    Args:
        symbol: Ticker symbol; lower-cased before being put into the URL.
        max_articles: Maximum number of articles to return.

    Returns:
        A list of dicts with ``'headline'``, an empty ``'summary'`` and
        ``'source'`` set to ``'marketwatch'``.
    """
    news_data = []
    if max_articles <= 0:
        return news_data

    try:
        url = f"https://www.marketwatch.com/investing/stock/{symbol.lower()}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return news_data

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try different selectors for MarketWatch; the first that matches wins
        selectors = [
            '.element--article',
            '.article-wrap',
            '.headline',
            'h3.no-margin'
        ]

        items = []
        matched_selector = None
        for selector in selectors:
            items = soup.select(selector)
            if items:
                matched_selector = selector
                break

        for item in items:
            # Count accepted headlines, not raw elements, against the quota
            if len(news_data) >= max_articles:
                break
            try:
                if matched_selector == '.element--article':
                    # Article containers hold the headline in a child element
                    headline_tag = item.select_one('.headline') or item.select_one('h3')
                else:
                    headline_tag = item

                if not headline_tag:
                    continue

                headline = headline_tag.get_text().strip()
                if headline and len(headline) > 10:
                    news_data.append({
                        'headline': headline,
                        'summary': '',
                        'source': 'marketwatch'
                    })
            except Exception:
                # Skip a malformed element but keep processing the page
                continue

    except Exception as e:
        # Best-effort scraper: report instead of silently swallowing
        print(f"      MarketWatch news fetch failed: {e}")

    return news_data

def get_seeking_alpha_news(symbol, max_articles=2):
    """Get up to ``max_articles`` news headlines for ``symbol`` from Seeking Alpha.

    Best-effort: any request or parsing failure is reported with ``print``
    and whatever was collected so far (possibly nothing) is returned.
    Seeking Alpha has anti-bot measures, so this is only a basic attempt.

    Args:
        symbol: Ticker symbol inserted into the Seeking Alpha news URL.
        max_articles: Maximum number of articles to return.

    Returns:
        A list of dicts with ``'headline'``, an empty ``'summary'`` and
        ``'source'`` set to ``'seeking_alpha'``.
    """
    news_data = []
    if max_articles <= 0:
        return news_data

    try:
        url = f"https://seekingalpha.com/symbol/{symbol}/news"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=8)
        if response.status_code != 200:
            return news_data

        soup = BeautifulSoup(response.text, 'html.parser')

        # Scan every anchor until the quota is met, instead of only the first
        # few links on the page
        seen_headlines = set()
        for link in soup.find_all('a', href=True):
            if len(news_data) >= max_articles:
                break
            try:
                href = link['href']
                if '/article/' not in href and '/news/' not in href:
                    continue

                headline = link.get_text().strip()
                if not headline or not (15 < len(headline) < 200):
                    continue
                if headline in seen_headlines:
                    # The same story can be linked more than once on a page
                    continue

                seen_headlines.add(headline)
                news_data.append({
                    'headline': headline,
                    'summary': '',
                    'source': 'seeking_alpha'
                })
            except Exception:
                # Skip a malformed link but keep scanning
                continue

    except Exception as e:
        # Best-effort scraper: report instead of silently swallowing
        print(f"      Seeking Alpha news fetch failed: {e}")

    return news_data

def _rating_to_sentiment(rating):
    """Map a 1-5 analyst mean rating onto the [-1, 1] sentiment scale.

    Uses ``(3 - rating) / 2`` so that 1 -> 1.0, 3 -> 0.0 and 5 -> -1.0, which
    agrees with the text mapping used below (strong buy = 1.0, hold = 0.0,
    strong sell = -1.0). Returns None for values outside [1, 5].
    """
    if 1.0 <= rating <= 5.0:
        return (3.0 - rating) / 2.0
    return None


def _yahoo_analyst_sentiment(symbol):
    """Scrape the Yahoo Finance analysis page; return a sentiment or None."""
    url = f"https://finance.yahoo.com/quote/{symbol}/analysis"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Method 1a: a table cell mentioning "recommendation"/"mean" followed by
    # a numeric cell within the next two cells of the same row
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            for i, cell in enumerate(cells):
                text = cell.get_text().lower()
                if 'recommendation' not in text and 'mean' not in text:
                    continue
                for candidate in cells[i + 1:i + 3]:
                    try:
                        rating = float(candidate.get_text().strip())
                    except ValueError:
                        continue
                    normalized = _rating_to_sentiment(rating)
                    if normalized is not None:
                        print(f"    Found analyst rating from table: {rating} -> {normalized:.2f}")
                        return normalized

    # Method 1b: recommendation wording in spans/divs/cells.
    # NOTE(review): plain substring matching is loose (e.g. "household"
    # contains "hold"); behavior kept as-is.
    recommendation_keywords = ['strong buy', 'buy', 'hold', 'sell', 'strong sell']
    for element in soup.find_all(['span', 'div', 'td']):
        text = element.get_text().lower().strip()
        if not any(keyword in text for keyword in recommendation_keywords):
            continue

        # Prefer a numeric rating found in the parent's text
        parent = element.parent
        if parent:
            for num_str in re.findall(r'\d+\.?\d*', parent.get_text()):
                try:
                    num = float(num_str)
                except ValueError:
                    continue
                normalized = _rating_to_sentiment(num)
                if normalized is not None:
                    print(f"    Found analyst rating from text: {num} -> {normalized:.2f}")
                    return normalized

        # Otherwise map the wording itself
        if 'strong buy' in text:
            print("    Found 'strong buy' recommendation")
            return 1.0
        elif 'buy' in text and 'strong' not in text:
            print("    Found 'buy' recommendation")
            return 0.75
        elif 'hold' in text:
            print("    Found 'hold' recommendation")
            return 0.0
        elif 'strong sell' in text:
            print("    Found 'strong sell' recommendation")
            return -1.0
        elif 'sell' in text and 'strong' not in text:
            print("    Found 'sell' recommendation")
            return -0.75

    # Method 1c: recommendation mean embedded in inline script data
    patterns = [
        r'"recommendationMean":\s*\{\s*"raw":\s*([0-9.]+)',
        r'"recommendationMean":\s*([0-9.]+)',
        r'recommendationMean["\']:\s*([0-9.]+)',
        r'recommendation["\']:\s*([0-9.]+)'
    ]
    for script in soup.find_all('script'):
        script_text = script.string
        if not script_text:
            continue
        for pattern in patterns:
            match = re.search(pattern, script_text)
            if not match:
                continue
            try:
                rating = float(match.group(1))
            except (ValueError, IndexError):
                continue
            normalized = _rating_to_sentiment(rating)
            if normalized is not None:
                print(f"    Found analyst rating from script: {rating} -> {normalized:.2f}")
                return normalized

    return None


def _finviz_analyst_sentiment(symbol):
    """Read the "Recom" value from the Finviz quote page; return a sentiment or None."""
    url = f"https://finviz.com/quote.ashx?t={symbol}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=8)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    for table in soup.find_all('table'):
        cells = table.find_all('td')
        for i, cell in enumerate(cells):
            # 'Recom' also covers the longer 'Recommendation' label
            if 'Recom' not in cell.get_text().strip():
                continue
            if i + 1 >= len(cells):
                continue
            # The value is expected in the cell right after the label
            try:
                rating = float(cells[i + 1].get_text().strip())
            except ValueError:
                continue
            normalized = _rating_to_sentiment(rating)
            if normalized is not None:
                print(f"    Found Finviz analyst rating: {rating} -> {normalized:.2f}")
                return normalized

    return None


def _price_momentum_sentiment(symbol):
    """Estimate sentiment from a percentage change on the Yahoo quote page.

    Uses the first ``<span>`` whose text contains a signed percentage.
    NOTE(review): presumably this is the stock's own daily change, but the
    page could show another figure first — verify against the live markup.
    Returns None when no usable percentage is found.
    """
    url = f"https://finance.yahoo.com/quote/{symbol}"
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=5)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    for span in soup.find_all('span'):
        text = span.get_text()
        if '%' not in text or ('+' not in text and '-' not in text):
            continue
        try:
            pct_match = re.search(r'([-+]?)([0-9.]+)%', text)
            if not pct_match:
                continue
            value = float(pct_match.group(2))
        except (ValueError, AttributeError):
            continue
        pct_change = -value if pct_match.group(1) == '-' else value

        # Bucket the move: beyond +/-5% is a strong signal, beyond +/-2% mild
        if pct_change > 5:
            sentiment = 0.5
        elif pct_change > 2:
            sentiment = 0.25
        elif pct_change < -5:
            sentiment = -0.5
        elif pct_change < -2:
            sentiment = -0.25
        else:
            sentiment = 0.0

        print(f"    Estimated analyst sentiment from price change ({pct_change:.1f}%): {sentiment:.2f}")
        return sentiment

    return None


def get_analyst_ratings(symbol):
    """Get analyst sentiment for a stock symbol with multiple fallback methods.

    Tries, in order: the Yahoo Finance analysis page, the Finviz quote page,
    and finally an estimate from a price-change percentage on the Yahoo quote
    page. A method that raises is reported with ``print`` and the next one is
    tried.

    Args:
        symbol: Ticker symbol inserted into the scraped URLs.

    Returns:
        A float in [-1, 1] (positive = bullish); 0.0 when every method fails.
    """
    methods = (
        ("Yahoo Finance analyst", _yahoo_analyst_sentiment),
        ("Finviz analyst", _finviz_analyst_sentiment),
        ("Fallback analyst", _price_momentum_sentiment),
    )

    for label, method in methods:
        try:
            sentiment = method(symbol)
        except Exception as e:
            print(f"    {label} method failed: {e}")
            continue
        # 0.0 is a legitimate result, so test against None explicitly
        if sentiment is not None:
            return sentiment

    print("    All analyst sentiment methods failed, returning neutral")
    return 0.0  # Neutral if not found

def analyze_news_sentiment_rule_based(news_data):
    """Score news with a keyword lexicon (fallback when no model is available).

    For every article the headline and summary are lower-cased and split on
    whitespace. Each word found in the positive or negative lexicon counts
    1.0, or the multiplier of the intensifier word directly before it. The
    article score is ``(pos - neg) / (pos + neg)``; articles containing no
    lexicon word are ignored.

    Args:
        news_data: List of dicts with a ``'headline'`` key and an optional
            ``'summary'`` key.

    Returns:
        The mean article score in [-1, 1], or 0.0 when there is no input or
        no article contained a lexicon word.
    """
    if not news_data:
        return 0.0

    # Financial-context lexicons
    bullish_terms = frozenset((
        'gain', 'gains', 'up', 'rise', 'rises', 'rising', 'rose', 'bullish', 'outperform',
        'buy', 'growth', 'profit', 'profits', 'positive', 'strong', 'strength', 'higher',
        'record', 'upgrade', 'upgraded', 'beat', 'beats', 'exceed', 'exceeds', 'success',
        'successful', 'increase', 'increases', 'increased', 'boost', 'boosts', 'boosted',
        'opportunity', 'opportunities', 'potential', 'promising', 'optimistic', 'confident',
        'momentum', 'surge', 'rally', 'advance', 'advances', 'breakthrough', 'expansion',
        'outperforming', 'soars', 'jumped', 'climbed', 'accelerate', 'accelerating',
    ))
    bearish_terms = frozenset((
        'loss', 'losses', 'down', 'fall', 'falls', 'falling', 'fell', 'bearish', 'underperform',
        'sell', 'decline', 'declines', 'declined', 'negative', 'weak', 'weakness', 'lower',
        'downgrade', 'downgraded', 'miss', 'misses', 'missed', 'fail', 'fails', 'failed',
        'decrease', 'decreases', 'decreased', 'cut', 'cuts', 'risk', 'risks', 'risky',
        'concern', 'concerns', 'warning', 'problem', 'problems', 'threat', 'threats',
        'disappointing', 'crash', 'plunge', 'plummeted', 'volatile', 'uncertainty',
        'recession', 'bear', 'correction', 'selloff', 'slump', 'struggle', 'struggling',
    ))

    # Multipliers applied to the sentiment word that immediately follows
    boosters = {
        'very': 1.5, 'extremely': 2.0, 'significantly': 1.7, 'substantially': 1.7,
        'dramatically': 2.0, 'sharply': 1.8, 'strongly': 1.6, 'heavily': 1.5,
        'massive': 2.0, 'huge': 1.8, 'major': 1.5, 'significant': 1.3,
    }
    trim_chars = '.,!?;:"()[]'

    article_scores = []
    for entry in news_data:
        combined = (entry['headline'] + " " + entry.get('summary', '')).lower()
        tokens = [raw.strip(trim_chars) for raw in combined.split()]

        bullish_weight = 0
        bearish_weight = 0
        preceding = None  # no predecessor for the first token -> weight 1.0
        for token in tokens:
            weight = boosters.get(preceding, 1.0)
            if token in bullish_terms:
                bullish_weight += weight
            elif token in bearish_terms:
                bearish_weight += weight
            preceding = token

        # Articles without any lexicon hit do not take part in the average
        if not (bullish_weight > 0 or bearish_weight > 0):
            continue

        balance = (bullish_weight - bearish_weight) / (bullish_weight + bearish_weight)
        article_scores.append(max(-1.0, min(1.0, balance)))

    if not article_scores:
        return 0.0
    return sum(article_scores) / len(article_scores)

def analyze_news_sentiment_with_finbert(news_data, nlp):
    """Score news articles with a sentiment pipeline, falling back to rules.

    Each article's headline (plus summary, when present) is truncated to 500
    characters and passed to ``nlp``. A label containing "positive" yields
    ``+score``, one containing "negative" yields ``-score``, anything else
    0.0. If the model call fails or returns nothing for an article, that
    article is scored with ``analyze_news_sentiment_rule_based`` instead.

    Args:
        news_data: List of dicts with ``'headline'`` and optional
            ``'summary'``; missing or ``None`` values are treated as empty.
        nlp: Callable returning ``[{'label': ..., 'score': ...}]`` for a
            string, or ``None`` to use the rule-based scorer for everything.

    Returns:
        The mean article score clamped to [-1, 1]; 0.0 when there is no news
        or no article had any text.
    """
    if not news_data:
        return 0.0  # Neutral sentiment if no news

    # If FinBERT model is not available, use rule-based approach
    if nlp is None:
        print("    Using rule-based sentiment analysis (FinBERT not available)")
        return analyze_news_sentiment_rule_based(news_data)

    sentiments = []

    for article in news_data:
        # Scrapers may leave a field missing or None; treat both as empty
        headline = article.get('headline') or ''
        summary = article.get('summary') or ''
        text = f"{headline} {summary}" if summary else headline

        # Skip empty text
        if not text.strip():
            continue

        score = None
        try:
            result = nlp(text[:500])  # Limit text length
            if result:
                sent_label = result[0]['label'].lower()
                sent_score = result[0]['score']
                if 'positive' in sent_label:
                    score = sent_score
                elif 'negative' in sent_label:
                    score = -sent_score
                else:  # neutral
                    score = 0.0
        except Exception:
            # Model failure for this article: fall through to the rule-based scorer
            score = None

        if score is None:
            try:
                score = analyze_news_sentiment_rule_based(
                    [{'headline': headline, 'summary': summary}]
                )
            except Exception:
                score = 0.0

        sentiments.append(score)

    if not sentiments:
        return 0.0

    # Average all sentiment scores, clamped to the [-1, 1] range
    avg_sentiment = sum(sentiments) / len(sentiments)
    return max(-1.0, min(1.0, avg_sentiment))

def get_stock_news(symbol, max_articles=5):
    """Get recent news articles about a stock symbol.

    Thin wrapper: forwards both arguments to ``get_stock_news_robust`` and
    returns its result unchanged.
    """
    return get_stock_news_robust(symbol, max_articles)

def get_sentiment_analysis(symbol, data=None):
    """Combine news, analyst and technical sentiment into one score.

    Each component is computed independently; a component that raises is
    reported with ``print`` and treated as 0.0. Components whose magnitude
    is at most 0.001 are considered missing and left out of the blend. The
    remaining ones are mixed using base weights 0.4 (news), 0.3 (analyst)
    and 0.3 (technical), renormalised to sum to 1 — e.g. news + technical
    only gives roughly 57% / 43%.

    A random jitter in [-0.05, 0.05] is added to the blended value, the
    result is clamped to [-1, 1], and the call sleeps 1-2 seconds before
    returning to stay under rate limits.

    Args:
        symbol: Ticker symbol to analyze.
        data: Optional input for ``get_technical_sentiment``; when ``None``
            the technical component is 0.0.

    Returns:
        The final sentiment score as a float in [-1, 1].
    """
    print(f"  Analyzing sentiment for {symbol}...")

    # --- News component (module-level sentiment model) ---
    try:
        articles = get_stock_news(symbol)
        news_sentiment = analyze_news_sentiment_with_finbert(articles, sentiment_model)
        print(f"    News sentiment: {news_sentiment:.2f} (from {len(articles)} articles)")
    except Exception as news_error:
        print(f"    News sentiment failed: {news_error}")
        news_sentiment = 0.0

    # --- Analyst component ---
    try:
        analyst_sentiment = get_analyst_ratings(symbol)
        print(f"    Analyst sentiment: {analyst_sentiment:.2f}")
    except Exception as analyst_error:
        print(f"    Analyst sentiment failed: {analyst_error}")
        analyst_sentiment = 0.0

    # --- Technical component (only when price data was supplied) ---
    tech_sentiment = 0.0
    if data is not None:
        try:
            tech_sentiment = get_technical_sentiment(data)
            print(f"    Technical sentiment: {tech_sentiment:.2f}")
        except Exception as tech_error:
            print(f"    Technical sentiment failed: {tech_error}")
            tech_sentiment = 0.0

    # Keep only components that carry a meaningful (non-zero) signal
    base_weighting = (
        (news_sentiment, 0.4),
        (analyst_sentiment, 0.3),
        (tech_sentiment, 0.3),
    )
    active = [(score, weight) for score, weight in base_weighting if abs(score) > 0.001]

    if active:
        # Renormalise the surviving weights so they sum to 1
        weight_total = sum([weight for _, weight in active])
        combined_sentiment = sum(score * (weight / weight_total) for score, weight in active)
    else:
        # Nothing usable: neutral
        combined_sentiment = 0.0

    print(f"    Combined sentiment: {combined_sentiment:.2f}")

    # Small random variation so stocks do not all end up with identical scores
    jitter = random.uniform(-0.05, 0.05)
    final_sentiment = max(-1.0, min(1.0, combined_sentiment + jitter))

    # Small delay to avoid hitting rate limits
    time.sleep(random.uniform(1, 2))

    return final_sentiment