In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import dspy
import json
import time
from datetime import datetime
from zoneinfo import ZoneInfo
import numpy as np
import os
from typing import List, Dict, Optional
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_KEY') or os.getenv('OPENAI')
if not OPENAI_API_KEY:\n                raise RuntimeError("OpenAI API key not found. Set OPENAI_API_KEY in your .env file.")

BASE_URL = "https://www.prnewswire.com/news-releases/all-public-company-news/"

class PRNewsStockPredictor:
    def __init__(self, openai_api_key: str, data_file: str = "stock_predictions.csv"):
        """Initialize the predictor with OpenAI API key and data file path."""
        self.data_file = data_file
        self.df_base = self._load_data()
        
        # Configure DSPY
        dspy.configure(
            lm=dspy.LM(
                model='openai/gpt-5-mini',
                api_key=openai_api_key,
                temperature=1.0,
                max_tokens=16000
            )
        )
        
        # Define prediction signature
        self.predictor = dspy.ChainOfThought(PricePredictionSignature)
    
    def _load_data(self) -> pd.DataFrame:
        """Load existing data or create empty DataFrame."""
        if os.path.exists(self.data_file):
            return pd.read_csv(self.data_file)
        
        columns = [
            'date', 'time', 'title', 'link', 'content', 'has_exchange',
            'company', 'ticker', 'short_run_days', 'short_run_range_low_percent',
            'short_run_range_high_percent', 'long_run_range_percent', 'ai_comments'
        ]
        return pd.DataFrame(columns=columns)
    
    def _save_data(self):
        """Save data to CSV file."""
        self.df_base.to_csv(self.data_file, index=False)
    
    def try_request(self, url: str, timeout: int = 10) -> Optional[requests.Response]:
        """Make HTTP request with error handling."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            return r if r.status_code == 200 else None
        except:
            return None
    
    def extract_time_and_clean_title(self, raw_title: str) -> tuple:
        """Extract timestamp and clean title from raw title."""
        m = re.match(r"^(\d{1,2}:\d{2}\s*(?:[AP]M\s*)?ET)\s*(.*)", raw_title, re.IGNORECASE)
        if m:
            return m.group(1).strip(), m.group(2).strip()
        return None, raw_title.strip()
    
    def get_article_content(self, article_url: str) -> str:
        """Fetch and extract article content."""
        resp = self.try_request(article_url)
        if not resp:
            return ""
        
        soup = BeautifulSoup(resp.content, 'html.parser')
        paragraphs = [
            p.get_text(strip=True)
            for p in soup.find_all('p')
            if len(p.get_text(strip=True)) > 50
        ]
        content = "\n\n".join(paragraphs[:8])  # Get more content for better analysis
        
        # Remove disclaimer section
        cut = content.upper().find("DISCLAIMER")
        if cut != -1:
            content = content[:cut].strip()
        
        return content
    
    def has_exchange_info(self, text: str) -> int:
        """Check if text contains exchange information."""
        pattern = re.compile(r'(NYSE|DOW|NASDAQ|TSX|LSE)', re.IGNORECASE)
        return 1 if pattern.search(text or "") else 0
    
    def fetch_new_articles(self, max_pages: int = 5, sleep_time: float = 0.2) -> pd.DataFrame:
        """Fetch new articles for today."""
        now_ct = datetime.now(ZoneInfo("America/Chicago"))
        year, month, day = now_ct.year, now_ct.month, now_ct.day
        date_str = f"{year:04d}-{month:02d}-{day:02d}"
        
        print(f"Fetching articles for {date_str}...")
        
        # Get existing links for today
        if not self.df_base.empty and "link" in self.df_base.columns and "date" in self.df_base.columns:
            seen_links = set(self.df_base.loc[self.df_base["date"] == date_str, "link"].astype(str))
        else:
            seen_links = set()
        
        collected = []
        seen_this_run = set()
        
        for page in range(1, max_pages + 1):
            list_url = f"{BASE_URL}?month={month:02d}&day={day:02d}&year={year}&page={page}&pagesize=100"
            resp = self.try_request(list_url)
            if not resp:
                break
            
            soup = BeautifulSoup(resp.content, "html.parser")
            anchors = soup.find_all("a", href=re.compile(r"^/news-releases/.*\.html$"))
            if not anchors:
                break
            
            for a in anchors:
                href = a.get("href")
                if not href:
                    continue
                
                full_url = "https://www.prnewswire.com" + href
                
                # Skip if already seen
                if full_url in seen_links or full_url in seen_this_run:
                    continue
                
                seen_this_run.add(full_url)
                
                raw_title = a.get_text(strip=True)
                news_time, title = self.extract_time_and_clean_title(raw_title)
                
                content = self.get_article_content(full_url)
                has_exch = self.has_exchange_info(content)
                
                collected.append({
                    "date": date_str,
                    "time": news_time,
                    "title": title,
                    "link": full_url,
                    "content": content,
                    "has_exchange": has_exch,
                })
                
                time.sleep(sleep_time)
                print(f"Fetched: {title[:50]}...")
        
        return pd.DataFrame(collected)
    
    def predict_stock_impact(self, content: str) -> List[Dict]:
        """Use DSPY to predict stock impact from news content."""
        try:
            pred = self.predictor(text=content)
            raw = getattr(pred, "records_json", "[]") or "[]"
            print(f"Raw prediction: {raw[:100]}...")
            
            # Clean the response if it contains extra text
            if not raw.strip().startswith('['):
                # Try to extract JSON from the response
                json_match = re.search(r'\[.*\]', raw, re.DOTALL)
                if json_match:
                    raw = json_match.group(0)
                else:
                    return []
            
            return json.loads(raw)
        except Exception as e:
            print(f"Prediction error: {e}")
            return []
    
    def process_new_articles(self, limit: Optional[int] = None) -> pd.DataFrame:
        """Process new articles and generate predictions."""
        # Fetch new articles
        df_new = self.fetch_new_articles()
        
        if df_new.empty:
            print("No new articles found.")
            return pd.DataFrame()
        
        # Filter articles with exchange information
        df_focus = df_new[df_new['has_exchange'] == 1]
        print(f"Found {len(df_focus)} articles with exchange information.")
        
        if limit:
            df_focus = df_focus.head(limit)
        
        all_predictions = []
        
        for idx, row in df_focus.iterrows():
            print(f"Processing: {row['title'][:50]}...")
            
            predictions = self.predict_stock_impact(row['content'])
            
            if predictions:
                for pred in predictions:
                    record = {
                        'date': row['date'],
                        'time': row['time'],
                        'title': row['title'],
                        'link': row['link'],
                        'content': row['content'],
                        'has_exchange': row['has_exchange'],
                        'company': pred.get('company', 'N/A'),
                        'ticker': pred.get('ticker', np.nan),
                        'short_run_days': pred.get('short_run_days', np.nan),
                        'short_run_range_low_percent': pred.get('short_run_range_low_percent', np.nan),
                        'short_run_range_high_percent': pred.get('short_run_range_high_percent', np.nan),
                        'long_run_range_percent': pred.get('long_run_range_percent', np.nan),
                        'ai_comments': pred.get('ai_comments', '')
                    }
                    all_predictions.append(record)
            else:
                # Add record without predictions
                record = {
                    'date': row['date'],
                    'time': row['time'],
                    'title': row['title'],
                    'link': row['link'],
                    'content': row['content'],
                    'has_exchange': row['has_exchange'],
                    'company': 'N/A',
                    'ticker': np.nan,
                    'short_run_days': np.nan,
                    'short_run_range_low_percent': np.nan,
                    'short_run_range_high_percent': np.nan,
                    'long_run_range_percent': np.nan,
                    'ai_comments': ''
                }
                all_predictions.append(record)
        
        df_predictions = pd.DataFrame(all_predictions)
        
        # Add to base data
        self.df_base = pd.concat([self.df_base, df_predictions], ignore_index=True)
        self._save_data()
        
        print(f"Added {len(df_predictions)} predictions. Total records: {len(self.df_base)}")
        return df_predictions
    
    def get_predictions_by_ticker(self, ticker: str) -> pd.DataFrame:
        """Get all predictions for a specific ticker."""
        return self.df_base[self.df_base['ticker'].str.upper() == ticker.upper()]
    
    def get_recent_predictions(self, days: int = 1) -> pd.DataFrame:
        """Get predictions from recent days."""
        recent_date = (datetime.now() - pd.Timedelta(days=days)).strftime('%Y-%m-%d')
        return self.df_base[self.df_base['date'] >= recent_date]
    
    def export_predictions(self, filename: str):
        """Export predictions to CSV."""
        self.df_base.to_csv(filename, index=False)
        print(f"Exported {len(self.df_base)} records to {filename}")


class PricePredictionSignature(dspy.Signature):
    """
    You are a senior finance analyst. Analyze the news content and predict stock price impact.
    Return a JSON array with predictions for each company mentioned.
    
    For each company, provide:
    {
      "company": "company name or 'N/A'",
      "ticker": "company ticker or 'N/A'", 
      "ai_comments": "brief analysis of expected stock impact",
      "short_run_days": <number of days for short-term prediction>,
      "short_run_range_low_percent": <lower bound percentage change>,
      "short_run_range_high_percent": <upper bound percentage change>,
      "long_run_range_percent": <long-term percentage change prediction>
    }
    
    Return ONLY the JSON array, no additional text.
    """
    text: str = dspy.InputField(desc="news content to analyze")
    records_json: str = dspy.OutputField(desc="JSON array of predictions")


# Usage example
def main():
    # Initialize predictor (read API key from environment)
    predictor = PRNewsStockPredictor(OPENAI_API_KEY)
    
    # Process new articles (limit to 5 for testing)
    predictions = predictor.process_new_articles(limit = 3)
    
    # Show results
    if not predictions.empty:
        print("\n=== Recent Predictions ===")
        for _, row in predictions.iterrows():
            print(f"Company: {row['company']}")
            print(f"Ticker: {row['ticker']}")
            print(f"Short-term ({row['short_run_days']} days): {row['short_run_range_low_percent']}% to {row['short_run_range_high_percent']}%")
            print(f"Long-term: {row['long_run_range_percent']}%")
            print(f"Analysis: {row['ai_comments']}")
            print("-" * 50)
    
    # Export results
    predictor.export_predictions("stock_predictions_export.csv")
    return predictions


# if __name__ == "__main__":
results_out = main()

Fetching articles for 2025-09-14...
Fetched: Hugel hosts global H.E.L.F. symposium, marking 15 ...
Fetched: ADNOC Gas wird in den FTSE Emerging Index aufgenom...
Fetched: Sep 13, 2025, 14:29 ETDesay SV Shines at IAA Mobil...
Fetched: Sep 13, 2025, 14:17 ETDesay SV Shines at IAA Mobil...
Fetched: Sep 13, 2025, 10:00 ETINVESTOR ALERT: Pomerantz La...
Fetched: Sep 13, 2025, 07:03 ETZoomlion livre sa première g...
Fetched: Sep 13, 2025, 07:00 ETSchneider Electric Reinforce...
Fetched: Sep 13, 2025, 03:00 ETSKY Perfect Modernizes Playo...
Fetched: Sep 12, 2025, 22:59 ETSunwoda stellt 684-Ah- und 5...
Fetched: Sep 12, 2025, 22:27 ETFLYWIRE SHAREHOLDER ALERT BY...
Fetched: Sep 12, 2025, 22:20 ETSNAP INVESTOR NOTICE: Snap I...
Fetched: Sep 12, 2025, 22:00 ETDOW INVESTOR NOTICE: Dow Inc...
Fetched: Sep 12, 2025, 21:38 ETDANAHER INVESTIGATION INITIA...
Fetched: Sep 12, 2025, 21:37 ETUNITED NATURAL FOODS INVESTI...
Fetched: Sep 12, 2025, 21:37 ETAGILON HEALTH INVESTIGATION ...
Fetched: Sep 12, 20



Prediction error: litellm.BadRequestError: GetLLMProvider Exception - 'tuple' object has no attribute 'startswith'

original model: openai/gpt-5-mini
Processing: Sep 13, 2025, 03:00 ETSKY Perfect Modernizes Playo...




Prediction error: litellm.BadRequestError: GetLLMProvider Exception - 'tuple' object has no attribute 'startswith'

original model: openai/gpt-5-mini
Processing: Sep 12, 2025, 22:27 ETFLYWIRE SHAREHOLDER ALERT BY...




Prediction error: litellm.BadRequestError: GetLLMProvider Exception - 'tuple' object has no attribute 'startswith'

original model: openai/gpt-5-mini
Added 3 predictions. Total records: 103

=== Recent Predictions ===
Company: N/A
Ticker: nan
Short-term (nan days): nan% to nan%
Long-term: nan%
Analysis: 
--------------------------------------------------
Company: N/A
Ticker: nan
Short-term (nan days): nan% to nan%
Long-term: nan%
Analysis: 
--------------------------------------------------
Company: N/A
Ticker: nan
Short-term (nan days): nan% to nan%
Long-term: nan%
Analysis: 
--------------------------------------------------
Exported 103 records to stock_predictions_export.csv


In [8]:
results_out

Unnamed: 0,date,time,title,link,content,has_exchange,company,ticker,short_run_days,short_run_range_low_percent,short_run_range_high_percent,long_run_range_percent,ai_comments
0,2025-09-14,,"Sep 13, 2025, 20:30 ETSynopsys, Inc. (SNPS) Sh...",https://www.prnewswire.com/news-releases/synop...,"SAN FRANCISCO,Sept. 13, 2025/PRNewswire/ -- On...",1,,,,,,,
1,2025-09-14,,"Sep 13, 2025, 18:12 ETLINE Investors Have Oppo...",https://www.prnewswire.com/news-releases/line-...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Why: Ro...",1,,,,,,,
2,2025-09-14,,"Sep 13, 2025, 14:45 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
3,2025-09-14,,"Sep 13, 2025, 14:30 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
4,2025-09-14,,"Sep 13, 2025, 14:00 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2025-09-14,,"Sep 13, 2025, 10:00 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
87,2025-09-14,,"Sep 13, 2025, 10:00 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
88,2025-09-14,,"Sep 13, 2025, 10:00 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
89,2025-09-14,,"Sep 13, 2025, 10:00 ETINVESTOR ALERT: Pomerant...",https://www.prnewswire.com/news-releases/inves...,"NEW YORK,Sept. 13, 2025/PRNewswire/ -- Pomeran...",1,,,,,,,
