This script fetches data from the Alpha Vantage API. The collected data contains:

For 50 of the most traded tickers/companies:

 - news articles (from 2019 onwards, where available)
 - earnings call transcripts (Q1–Q4 2024)
 - daily stock prices (the "compact" output, i.e. the latest 100 daily data points)

News articles in specific topics:
 - ‘ipo’, ‘earnings’, ‘mergers_and_acquisitions’, ‘economy_macro’, ‘economy_fiscal’, ‘economy_monetary’, ‘technology’, ‘finance’

Economic indicators (I think it's only the most recent data; not verified):
 - GDP, unemployment, inflation, interest rates

The data is saved in raw JSON format and still needs extraction and reorganization before it can be used as model input.

In [5]:
import requests
import json
from datetime import datetime, timedelta
import time

# Symbols for which per-ticker news, prices and earnings-call transcripts are fetched.
# The three index ETFs come first, followed by the individual companies.
tickers = [
    "SPY", "QQQ", "DIA",
    "AAPL", "MSFT", "NVDA", "GOOGL", "GOOG", "SAP", "META", "PLTR", "AMD", "INTC",
    "TSLA", "AMZN", "WMT", "COST", "JPM", "GS", "MS", "HOOD", "UNH", "PFE",
    "TEM", "ACN", "IBM", "XOM", "SHEL", "CVX", "BRK-A", "MSTR", "NFLX", "V",
    "CSCO", "LLY", "TME", "NVO", "NOW", "SNOW", "HD", "COR", "HMC", "PANW",
    "MRK", "ORCL", "BAC", "MA", "RDFN", "VNET", "AVGO",
]

# Topic names used for the topic-based news queries.
topics = [
    "ipo",
    "earnings",
    "mergers_and_acquisitions",
    "economy_macro",
    "economy_fiscal",
    "economy_monetary",
    "technology",
    "finance",
]

# Alpha Vantage API key. DO NOT replace it with your own key: per the original
# author, only this key allows unlimited calls per day (75 API calls / minute).
# NOTE(review): this is a credential stored in source; consider loading it from
# the environment if the file is ever shared outside the team.
ALPHA_VANTAGE_API_KEY = "83OBSO8TRSM5ASNS"

def get_news_from_alphavantage(ticker, api_key, timeout=60):
    """
    Get financial news (including sentiment scores) for one stock ticker/company.

    Args:
        ticker: Symbol to query. "SPY", "QQQ" and "DIA" are skipped and no
            request is made for them.
        api_key: Alpha Vantage API key.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body of the NEWS_SENTIMENT response when the HTTP
        status is 200; an empty list for skipped tickers or any other status.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failure or timeout.
    """
    # The index ETFs are excluded from the per-ticker news query.
    if ticker in ("SPY", "QQQ", "DIA"):
        return []

    base_url = "https://www.alphavantage.co/query"
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "time_from": "20190101T0000",
        "limit": 1000,
        "apikey": api_key,
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        return []
    # NOTE(review): the body is returned as-is; it is not checked for an
    # API-level error payload -- confirm whether callers need that.
    return response.json()


def get_news_by_topic_from_alphavantage(topic, api_key, timeout=60):
    """
    Get financial news (including sentiment scores) for one topic from Alpha Vantage.

    Args:
        topic: Topic name sent as the ``topics`` query parameter.
        api_key: Alpha Vantage API key.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a
            stalled connection cannot hang the whole collection run.

    Returns:
        The decoded JSON body of the NEWS_SENTIMENT response when the HTTP
        status is 200; an empty list for any other status.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failure or timeout.
    """
    base_url = "https://www.alphavantage.co/query"
    params = {
        "function": "NEWS_SENTIMENT",
        "topics": topic,
        "time_from": "20190101T0000",
        "limit": 1000,
        "apikey": api_key,
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        return []
    return response.json()


def get_stock_prices_from_alphavantage(ticker, api_key, timeout=60):
    """
    Get daily historical stock prices for a company from Alpha Vantage.

    The request uses the TIME_SERIES_DAILY function with the "compact" output size.

    Args:
        ticker: Symbol to query.
        api_key: Alpha Vantage API key.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a
            stalled connection cannot hang the whole collection run.

    Returns:
        A list of dicts with keys "date", "open", "high", "low", "close"
        (floats) and "volume" (int), one per entry of the response's
        "Time Series (Daily)" object. The list is empty when the HTTP status is
        not 200 or when that object is missing from the body.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failure or timeout.
    """
    base_url = "https://www.alphavantage.co/query"
    params = {
        "function": "TIME_SERIES_DAILY",
        "symbol": ticker,
        "outputsize": "compact",
        "apikey": api_key,
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        return []

    # A 200 response without the series key falls back to {} and so yields [].
    daily_series = response.json().get("Time Series (Daily)", {})
    return [
        {
            "date": date,
            "open": float(values["1. open"]),
            "high": float(values["2. high"]),
            "low": float(values["3. low"]),
            "close": float(values["4. close"]),
            "volume": int(values["5. volume"]),
        }
        for date, values in daily_series.items()
    ]


def get_earnings_from_alphavantage(ticker, api_key, timeout=60):
    """
    Get the earnings call transcripts for all four 2024 quarters of a company.

    One EARNINGS_CALL_TRANSCRIPT request is made per quarter, from 2024Q4 down
    to 2024Q1.

    Args:
        ticker: Symbol to query. "SPY", "QQQ" and "DIA" are skipped and no
            request is made for them.
        api_key: Alpha Vantage API key.
        timeout: Value passed to each ``requests.get`` call as ``timeout``
            (seconds), so a stalled connection cannot hang the whole run.

    Returns:
        A list with the decoded JSON body of every request that returned HTTP
        status 200, in request order. Quarters whose request returned another
        status are omitted; skipped tickers yield an empty list.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failure or timeout.
    """
    # The index ETFs are excluded from the transcript query.
    if ticker in ("SPY", "QQQ", "DIA"):
        return []

    quarters = ['2024Q4', '2024Q3', '2024Q2', '2024Q1']
    base_url = "https://www.alphavantage.co/query"
    responses = []
    for q in quarters:
        params = {
            "function": "EARNINGS_CALL_TRANSCRIPT",
            "symbol": ticker,
            "quarter": q,
            "apikey": api_key,
        }
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            responses.append(response.json())
    return responses


def get_economic_indicators(api_key, timeout=60):
    """
    Get US economic indicators from Alpha Vantage.

    Queries the REAL_GDP, FEDERAL_FUNDS_RATE, INFLATION and UNEMPLOYMENT
    functions, one request each.

    Args:
        api_key: Alpha Vantage API key.
        timeout: Value passed to each ``requests.get`` call as ``timeout``
            (seconds), so a stalled connection cannot hang the whole run.

    Returns:
        A dict mapping each indicator name to the decoded JSON body of its
        response. Indicators whose request did not return HTTP status 200 are
        left out of the dict.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failure or timeout.
    """
    indicators = ["REAL_GDP", "FEDERAL_FUNDS_RATE", "INFLATION", "UNEMPLOYMENT"]
    # The endpoint is the same for every indicator, so it is set once up front.
    base_url = "https://www.alphavantage.co/query"
    data = {}
    for indicator in indicators:
        params = {"function": indicator, "apikey": api_key}
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data[indicator] = response.json()
    return data

# Step 1: per-ticker data. Nothing is written to disk until every request below
# has finished; the combined result is dumped to one JSON file at the end.
final_data = []
for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    # Request order per ticker: news, then prices, then earnings transcripts.
    news = get_news_from_alphavantage(ticker, ALPHA_VANTAGE_API_KEY)
    prices = get_stock_prices_from_alphavantage(ticker, ALPHA_VANTAGE_API_KEY)
    earnings_transcript = get_earnings_from_alphavantage(ticker, ALPHA_VANTAGE_API_KEY)
    stock_entry = dict(
        ticker=ticker,
        news=news,
        earnings_call_transcript=earnings_transcript,
        prices=prices,
    )
    final_data.append(stock_entry)
    # Pause before moving on to the next ticker.
    time.sleep(15)

# Step 2: topic-based news.
topic_news = []
for topic in topics:
    print(f"Fetching news data for topic {topic}...")
    news = get_news_by_topic_from_alphavantage(topic, ALPHA_VANTAGE_API_KEY)
    topic_news.append(dict(topic=topic, news=news))
    # Pause before moving on to the next topic.
    time.sleep(10)

# Step 3: economic indicators.
economic_indicators = get_economic_indicators(ALPHA_VANTAGE_API_KEY)

# Step 4: write everything into a single JSON document.
collected = {
    "stocks": final_data,
    "topic_news": topic_news,
    "economic_indicators": economic_indicators,
}
with open("news_earnings_stock_data.json", "w") as f:
    json.dump(collected, f, indent=4)

print("Data collection complete! JSON file saved.")


Fetching data for SPY...
Fetching data for QQQ...
Fetching data for DIA...
Fetching data for AAPL...
Fetching data for MSFT...
Fetching data for NVDA...
Fetching data for GOOGL...
Fetching data for GOOG...
Fetching data for SAP...
Fetching data for META...
Fetching data for PLTR...
Fetching data for AMD...
Fetching data for INTC...
Fetching data for TSLA...
Fetching data for AMZN...
Fetching data for WMT...
Fetching data for COST...
Fetching data for JPM...
Fetching data for GS...
Fetching data for MS...
Fetching data for HOOD...
Fetching data for UNH...
Fetching data for PFE...
Fetching data for TEM...
Fetching data for ACN...
Fetching data for IBM...
Fetching data for XOM...
Fetching data for SHEL...
Fetching data for CVX...
Fetching data for BRK-A...
Fetching data for MSTR...
Fetching data for NFLX...
Fetching data for V...
Fetching data for CSCO...
Fetching data for LLY...
Fetching data for TME...
Fetching data for NVO...
Fetching data for NOW...
Fetching data for SNOW...
Fetching 