**Fetches news articles from the past two weeks, together with their sentiment data, for the top 50 S&P 500 stocks by market capitalization**

In [None]:
# Get top 50 S&P 500 stocks
import yfinance as yf
import pandas as pd
from tqdm import tqdm

# Table 0 of the Wikipedia page is taken to be the constituents table; it must
# contain a 'Symbol' column.
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
tickers = sp500['Symbol'].tolist()

# [symbol, market_cap] pairs for every ticker whose market cap could be read.
market_caps = []

for ticker in tickers:
    # Wikipedia writes share classes with a dot (e.g. BRK.B) whereas Yahoo
    # Finance uses a dash (BRK-B). Without this translation the lookup yields
    # no 'marketCap' and the company is silently dropped from the ranking.
    yahoo_symbol = ticker.replace('.', '-')

    # One failing lookup must not abort the whole scan of ~500 tickers.
    try:
        info = yf.Ticker(yahoo_symbol).info
    except Exception as exc:
        print(f"Skipping {ticker}: could not fetch info ({exc})")
        continue

    market_cap = info.get('marketCap') if info else None
    if market_cap:
        # Keep the symbol exactly as listed on Wikipedia so the format of the
        # resulting list is unchanged for every ticker that already worked.
        market_caps.append([ticker, market_cap])

# Largest market capitalization first.
market_caps.sort(key=lambda x: x[1], reverse=True)

top_50_tickers = [stock[0] for stock in market_caps[:50]]

print(top_50_tickers)


['AAPL', 'MSFT', 'NVDA', 'GOOG', 'GOOGL', 'AMZN', 'META', 'AVGO', 'TSLA', 'WMT', 'LLY', 'JPM', 'V', 'UNH', 'MA', 'XOM', 'COST', 'NFLX', 'PG', 'ORCL', 'JNJ', 'HD', 'ABBV', 'KO', 'TMUS', 'BAC', 'PM', 'CRM', 'CVX', 'PLTR', 'CSCO', 'MCD', 'IBM', 'ABT', 'LIN', 'WFC', 'GE', 'T', 'MRK', 'PEP', 'VZ', 'AXP', 'ACN', 'MS', 'ISRG', 'RTX', 'NOW', 'TMO', 'INTU', 'BX']


In [None]:
import requests
import pandas as pd
import datetime
import os
from datetime import datetime, timedelta
from google.colab import userdata

# Alpha Vantage API key, looked up via google.colab's `userdata` under the
# name 'VANTAGE_API_KEY'. It is sent as the `apikey` query parameter of the
# NEWS_SENTIMENT request built in this cell.
ALPHA_VANTAGE_API_KEY = userdata.get('VANTAGE_API_KEY')

# Fetches relevant articles with sentiment information from the past 2 weeks
def fetch_news_sentiment_articles(ticker, limit=1000, time_from=None):
    """Fetch news articles with sentiment data for one ticker from Alpha Vantage.

    Calls the ``NEWS_SENTIMENT`` function of the Alpha Vantage query endpoint
    and converts the ``"feed"`` entry of the JSON response into a DataFrame.

    Args:
        ticker: Symbol passed as the ``tickers`` query parameter.
        limit: Value passed as the ``limit`` query parameter.
        time_from: Value passed as the ``time_from`` query parameter, formatted
            as ``YYYYMMDDTHHMM``. Defaults to midnight 14 days before the
            current local time.

    Returns:
        A DataFrame holding whichever of the columns ``title``, ``summary``,
        ``source``, ``time_published``, ``topics``,
        ``overall_sentiment_label`` and ``overall_sentiment_score`` are
        present in the feed, plus a ``ticker`` column. An empty DataFrame is
        returned on every failure path (network error, non-200 status,
        non-JSON body, missing ``"feed"``, or none of the expected columns),
        so callers can always test ``.empty``.
    """
    if time_from is None:
        days_back = 14
        time_from = (datetime.now() - timedelta(days=days_back)).strftime("%Y%m%dT0000")

    # Let requests build and escape the query string instead of formatting it
    # by hand; the parameter order matches the original URL.
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "limit": limit,
        "time_from": time_from,
        "apikey": ALPHA_VANTAGE_API_KEY,
    }

    # A timeout keeps one stalled connection from hanging the whole batch.
    try:
        response = requests.get(
            "https://www.alphavantage.co/query", params=params, timeout=30
        )
    except requests.RequestException as exc:
        print("Error fetching data:", exc)
        return pd.DataFrame()

    if response.status_code != 200:
        print("Error fetching data:", response.status_code)
        return pd.DataFrame()

    try:
        data = response.json()
    except ValueError as exc:
        # The body was not valid JSON.
        print("Error fetching data or no data available:", exc)
        return pd.DataFrame()

    if "feed" not in data:
        # Without a "feed" key there are no articles to return; the payload
        # is printed so the reason can be inspected.
        print("Error fetching data or no data available:", data)
        return pd.DataFrame()

    df = pd.DataFrame(data["feed"])
    useful_columns = [
        "title",
        "summary",
        "source",
        "time_published",
        "topics",
        "overall_sentiment_label",
        "overall_sentiment_score",
    ]
    available_cols = [col for col in useful_columns if col in df.columns]
    if not available_cols:
        print(f"No expected columns found for {ticker}")
        return pd.DataFrame()

    # Copy the column subset so adding "ticker" does not write into a view of
    # the full frame (avoids pandas' SettingWithCopyWarning).
    df = df[available_cols].copy()
    df["ticker"] = ticker
    return df

output_folder = "news_data"
os.makedirs(output_folder, exist_ok=True)

# Collect articles from the past 2 weeks for one batch of the top 50 S&P 500
# stocks: list positions 11 through 36 (26 tickers). Slicing instead of
# indexing means a ranking shorter than 37 entries yields a smaller batch
# rather than an IndexError.
for ticker in top_50_tickers[11:37]:
    df_news = fetch_news_sentiment_articles(ticker)

    # Skip anything that is not a populated DataFrame. This also covers a
    # fetcher that returns None on failure, which would otherwise raise
    # AttributeError on `.empty`.
    if not isinstance(df_news, pd.DataFrame) or df_news.empty:
        continue

    path = os.path.join(output_folder, f"{ticker}_news.csv")
    df_news.to_csv(path, index=False)

