This script collects financial news and stock price data from:
- Yahoo Finance News (scraping for more and older news)
- Yahoo Finance RSS (for more recent news)
- Google News (scraping for older news)
- SEC (8-K filings covering financial reports and major events)
- yfinance (for stock price data)

It stores the data as CSV (for FinBERT use) and JSONL (for Deepseek-r1).

In [None]:
import yfinance as yf
import requests
import csv
import json
import time
import feedparser
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Stock tickers selected for training and prediction.
#
# General market tickers (for market-wide behavior prediction):
#     SPY (S&P 500 ETF), QQQ (Nasdaq 100), DIA (Dow Jones)
# Individual stocks (for top company stock predictions):
#     Tech:        AAPL (Apple), MSFT (Microsoft), NVDA (Nvidia), GOOGL (Google),
#                  SAP (German international stock), META, PLTR, AMD, INTC
#     Tech/retail: TSLA (Tesla), AMZN (Amazon)
#     Retail:      WMT, COST
#     Finance:     JPM (JP Morgan), GS (Goldman Sachs), MS (Morgan Stanley), HOOD
#     Healthcare:  UNH (UnitedHealth), PFE, TEM
#     Consulting:  ACN, IBM
#     Energy:      XOM, SHEL, CVX
#     Market mover (Warren Buffett's holding company): BRK-B (Berkshire Hathaway)
#
# NOTE: Yahoo Finance / yfinance write share classes with a hyphen, so the
# Berkshire class B symbol is "BRK-B" rather than "BRK.B".
STOCK_TICKERS = [
    "SPY", "QQQ", "DIA",  # General market indices
    "AAPL", "MSFT", "NVDA", "GOOGL", "SAP", "META", "PLTR", "AMD", "INTC",  # Tech
    "TSLA", "AMZN",  # Tech/retail
    "WMT", "COST",  # Consumer retail
    "JPM", "GS", "MS", "HOOD",  # Finance
    "UNH", "PFE", "TEM",  # Health
    "ACN", "IBM",  # Consulting
    "XOM", "SHEL", "CVX",  # Energy
    "BRK-B",  # Others
]
NEWS_YEARS_BACK = 5  # How many years of news history to fetch
STOCK_YEARS_BACK = 10  # How many years of stock data to fetch


# URL templates for each news source; the {placeholders} are filled in with
# str.format() by the corresponding scraper function.
YAHOO_FINANCE_NEWS_URL = "https://finance.yahoo.com/quote/{ticker}/news"
YAHOO_FINANCE_RSS_URL = (
    "https://feeds.finance.yahoo.com/rss/2.0/headline"
    "?s={ticker}&region=US&lang=en-US"
)
GOOGLE_NEWS_SEARCH_URL = (
    "https://www.google.com/search"
    "?q={query}+stock+news"
    "&tbm=nws"
    "&tbs=cdr:1,cd_min:{start},cd_max:{end}"
)
SEC_FILINGS_URL = (
    "https://www.sec.gov/cgi-bin/browse-edgar"
    "?action=getcompany&CIK={ticker}&type=8-K&count=50"
)

# Headers passed to the requests.get() calls made by the scrapers.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape_yahoo_news(ticker, max_articles=50, timeout=10):
    """Scrape headlines from the Yahoo Finance news page of a stock ticker.

    Args:
        ticker: Ticker symbol substituted into ``YAHOO_FINANCE_NEWS_URL``.
        max_articles: Maximum number of ``<li class="js-stream-content">``
            entries to parse.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``"date"`` (first 10 characters of the
        ``<time datetime=...>`` attribute, or ``"unknown"`` when the entry has
        no ``<time>`` tag), ``"headline"`` and ``"url"``. The list is empty
        when the request fails or no matching entries are found.
    """
    # Local import keeps this function self-contained; urljoin handles both
    # site-relative and already-absolute hrefs.
    from urllib.parse import urljoin

    url = YAHOO_FINANCE_NEWS_URL.format(ticker=ticker)
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort source: report and carry on instead of aborting the run.
        print(f"Yahoo news request failed for {ticker}: {exc}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    articles = soup.find_all("li", class_="js-stream-content")

    news_data = []
    for article in articles[:max_articles]:
        try:
            headline = article.find("h3").text.strip()
            link = urljoin("https://finance.yahoo.com", article.find("a")["href"])
            date_tag = article.find("time")
            date_published = date_tag["datetime"][:10] if date_tag else "unknown"
        except (AttributeError, KeyError, TypeError):
            # Entry lacks an <h3>, an <a href>, or a datetime attribute: skip it.
            continue
        news_data.append({"date": date_published, "headline": headline, "url": link})

    return news_data

def fetch_yahoo_rss_news(ticker, max_entries=50):
    """Fetch the latest headlines from Yahoo Finance's RSS feed for a ticker.

    Args:
        ticker: Ticker symbol substituted into ``YAHOO_FINANCE_RSS_URL``.
        max_entries: Maximum number of feed entries to return.

    Returns:
        A list of dicts with the keys ``"date"`` (``YYYY-MM-DD``, or
        ``"unknown"`` when the entry has no parseable publication date),
        ``"headline"`` and ``"url"``. Entries without a title are skipped.
    """
    url = YAHOO_FINANCE_RSS_URL.format(ticker=ticker)
    feed = feedparser.parse(url)

    news_data = []
    for entry in feed.entries[:max_entries]:
        headline = entry.get("title")
        if not headline:
            continue

        # The raw ``published`` string is an RFC 822 date ("Mon, 06 Jan 2025 ..."),
        # so slicing its first 10 characters does not produce a usable date.
        # ``published_parsed`` is feedparser's struct_time form of the same field.
        # NOTE(review): feedparser normalizes parsed dates to UTC, which can
        # differ by a day from the exchange-local trading date; confirm this
        # is acceptable for the price join.
        parsed = entry.get("published_parsed")
        date_published = time.strftime("%Y-%m-%d", parsed) if parsed else "unknown"

        news_data.append({
            "date": date_published,
            "headline": headline,
            "url": entry.get("link", ""),
        })

    return news_data

def scrape_google_news(ticker, max_articles=50, timeout=10):
    """Scrape Google News search results for a ticker over the last NEWS_YEARS_BACK years.

    Args:
        ticker: Ticker symbol used as the search query.
        max_articles: Maximum number of result headlines to return.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``"date"``, ``"headline"`` and ``"url"``.
        The result markup parsed here carries no publication date or link, so
        ``"date"`` is always ``"unknown"`` and ``"url"`` is always
        ``"Google News"``. The list is empty when the request fails.
    """
    end_date = datetime.today()
    start_date = end_date - timedelta(days=365 * NEWS_YEARS_BACK)

    search_url = GOOGLE_NEWS_SEARCH_URL.format(
        query=ticker,
        start=start_date.strftime("%m/%d/%Y"),
        end=end_date.strftime("%m/%d/%Y"),
    )

    try:
        response = requests.get(search_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort source: report and carry on instead of aborting the run.
        print(f"Google News request failed for {ticker}: {exc}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("div", class_="BNeawe vvjwJb AP7Wnd")

    # NOTE(review): every item is dated "unknown", so a consumer that joins
    # headlines to prices by date will never match these rows; confirm whether
    # a real date should be extracted from the result page.
    return [
        {"date": "unknown", "headline": article.text.strip(), "url": "Google News"}
        for article in articles[:max_articles]  # Limit to max_articles headlines
    ]

def scrape_sec_filings(ticker, max_filings=10, timeout=10):
    """Scrape the SEC EDGAR 8-K filing list for a ticker.

    Args:
        ticker: Ticker symbol substituted into ``SEC_FILINGS_URL`` as the CIK.
        max_filings: Maximum number of table rows (after the first row, which
            is skipped) to parse.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``"date"`` (text of the row's fourth
        cell), ``"headline"`` (``"SEC Filing for <ticker>"``) and ``"url"``
        (link found in the row's second cell). Rows that do not have that
        shape are skipped. The list is empty when the request fails.
    """
    url = SEC_FILINGS_URL.format(ticker=ticker)
    try:
        # NOTE(review): SEC asks automated clients to send a User-Agent that
        # identifies the requester; the generic HEADERS value may be rejected.
        # Confirm against SEC's EDGAR access guidelines.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort source: report and carry on instead of aborting the run.
        print(f"SEC filings request failed for {ticker}: {exc}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    news_data = []
    # Skip the first row, then take max_filings rows. The previous slice
    # [1:10] yielded only 9 rows although 10 were intended.
    for row in soup.find_all("tr")[1:max_filings + 1]:
        try:
            cells = row.find_all("td")
            date_published = cells[3].text.strip()
            link = "https://www.sec.gov" + cells[1].find("a")["href"]
        except (IndexError, KeyError, TypeError):
            # Row has too few cells or no link in the second cell: skip it.
            continue
        news_data.append({"date": date_published, "headline": f"SEC Filing for {ticker}", "url": link})

    return news_data


In [None]:
# Output locations for the two training sets
finbert_csv_file = "finbert_training_data.csv"
deepseek_jsonl_file = "deepseek_training_data.jsonl"

# Column order of the FinBERT CSV
fieldnames = ["ticker", "date", "text", "label", "open", "high", "low", "close", "volume", "price_change"]

# The CSV (FinBERT) stays open for the whole run; the header is written
# before the JSONL (Deepseek-r1) file is opened.
with open(finbert_csv_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    with open(deepseek_jsonl_file, "w", encoding="utf-8") as jsonlfile:

        for ticker in STOCK_TICKERS:
            print(f"Fetching news & stock data for {ticker}...")

            # Query the four news sources in a fixed order, then merge them
            yahoo_news = scrape_yahoo_news(ticker)
            rss_news = fetch_yahoo_rss_news(ticker)
            google_news = scrape_google_news(ticker)
            sec_news = scrape_sec_filings(ticker)
            all_news = [*yahoo_news, *rss_news, *google_news, *sec_news]

            # Daily price history used to annotate each headline
            stock_data = yf.Ticker(ticker).history(period=f"{STOCK_YEARS_BACK}y")

            for item in all_news:
                date_published = item["date"]
                news_text = item["headline"]
                sentiment = "neutral"  # Placeholder for sentiment analysis

                # Headlines whose date has no price row are dropped
                if date_published not in stock_data.index:
                    continue

                stock_info = stock_data.loc[date_published]
                open_price, close_price = stock_info["Open"], stock_info["Close"]
                # Intraday move, in percent of the opening price
                price_change = round(((close_price - open_price) / open_price) * 100, 2)

                # Price fields shared by both output formats
                market_fields = {
                    "open": open_price,
                    "high": stock_info["High"],
                    "low": stock_info["Low"],
                    "close": close_price,
                    "volume": stock_info["Volume"],
                    "price_change": price_change,
                }

                # FinBERT CSV row
                writer.writerow({
                    "ticker": ticker,
                    "date": date_published,
                    "text": news_text,
                    "label": sentiment,
                    **market_fields,
                })

                # Deepseek-r1 JSONL record (title and summary are both the headline)
                json_data = {
                    "ticker": ticker,
                    "date": date_published,
                    "title": item["headline"],
                    "summary": news_text,
                    "sentiment": sentiment,
                    **market_fields,
                }
                jsonlfile.write(json.dumps(json_data) + "\n")

            # Pause between tickers before hitting the sources again
            time.sleep(3)

print(f"FinBERT training data saved to {finbert_csv_file}")
print(f"Deepseek-r1 training data saved to {deepseek_jsonl_file}")