In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import yfinance as yf
import time
import os

# Yahoo Finance listing pages to scrape, keyed by the category label that
# ends up in the "Category" column of the summary.
_YAHOO_BASE = "https://finance.yahoo.com"
category_urls = {
    "Top Gainers": f"{_YAHOO_BASE}/markets/stocks/gainers",
    "Most Active": f"{_YAHOO_BASE}/most-active",
    "Trending Now": f"{_YAHOO_BASE}/trending-tickers",
}

# Request headers sent with every scrape: a desktop-browser User-Agent plus
# a contact address in the "From" header.
_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0 Safari/537.36",
    )
)
headers = {"User-Agent": _USER_AGENT, "From": "hrahman@ucdavis.edu"}


# Directory that receives one CSV of price history per symbol.
os.makedirs("./historical-data/stocks", exist_ok=True)

# Scrape symbol-name pairs from Yahoo Finance
def extract_table_rows(url, paginated=False, max_rows=100, timeout=10):
    """Collect (symbol, name) pairs from the HTML table rows of a page.

    The first two cells of every ``table tbody tr`` row are taken as the
    symbol and the name. Requests are sent with the module-level ``headers``.

    Args:
        url: Listing page URL.
        paginated: When true, the page is requested in chunks of 100 rows
            via the ``start`` and ``count`` query parameters.
        max_rows: Maximum number of pairs to return; a value <= 0 yields
            an empty list without any network access.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of ``(symbol, name)`` tuples, at most ``max_rows`` long. If a
        request fails or returns an HTTP error status, a message is printed
        and the pairs collected so far are returned.
    """
    if max_rows <= 0:
        return []

    page_size = 100
    results = []
    # One page more than max_rows // page_size, so a short page can still be
    # topped up from the next offset.
    pages = (max_rows // page_size + 1) if paginated else 1
    for page in range(pages):
        # Let requests build the query string so URLs that already carry
        # parameters are handled correctly.
        params = (
            {"start": page * page_size, "count": page_size} if paginated else None
        )
        try:
            response = requests.get(
                url, params=params, headers=headers, timeout=timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"-> Exception fetching {url}: {e}")
            break

        rows = BeautifulSoup(response.text, "html.parser").select("table tbody tr")
        if not rows:
            # Nothing on this page, so later offsets will be empty as well.
            break

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 2:
                continue
            symbol = cols[0].text.strip()
            if not symbol:
                # A blank symbol cannot be looked up or used as a file name.
                continue
            results.append((symbol, cols[1].text.strip()))
            if len(results) >= max_rows:
                return results
    return results

# Fetch % change over time period
def get_change_pct(ticker, period):
    """Return the percent change in closing price over ``period``.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        period: Period string passed to ``Ticker.history`` (e.g. ``"5d"``).

    Returns:
        The change from the first to the last non-missing close, as a
        percentage rounded to two decimals. ``None`` is returned when fewer
        than two closes are available, when the first close is zero, or when
        fetching/processing the history raises.
    """
    try:
        hist = yf.Ticker(ticker).history(period=period)
        if hist.empty:
            return None
        # Missing closes at either end would turn the whole result into NaN.
        closes = hist["Close"].dropna()
        if len(closes) < 2:
            return None
        first = closes.iloc[0]
        last = closes.iloc[-1]
        if first == 0:
            # A percentage relative to a zero base is undefined.
            return None
        return round(float(((last - first) / first) * 100), 2)
    except Exception as e:
        # Exception rather than a bare except, so Ctrl-C still interrupts
        # the long-running scrape.
        print(f"-> Exception fetching {ticker} ({period}): {e}")
        return None

# Fetch all change periods for summary
def get_all_changes(symbol):
    """Build one summary row of percent changes for ``symbol``.

    Returns a dict whose first key is ``"Symbol"``, followed by one
    ``"<horizon> Change %"`` entry per look-back window, each filled by
    ``get_change_pct`` (which may return ``None``).
    """
    # (column label, history period) in the order the columns are emitted.
    windows = (
        ("1W Change %", "5d"),
        ("1M Change %", "1mo"),
        ("3M Change %", "3mo"),
        ("6M Change %", "6mo"),
        ("1Y Change %", "1y"),
        ("5Y Change %", "5y"),
    )
    row = {"Symbol": symbol}
    for column, period in windows:
        row[column] = get_change_pct(symbol, period)
    return row

# Save full historical price series
def save_historical_data(symbol, period="2y"):
    """Write the daily price history of ``symbol`` to a per-symbol CSV.

    The history for ``period`` is requested at a one-day interval and saved
    as ``historical-data/stocks/<symbol>.csv``. An empty history or any
    exception is reported with a printed message; nothing is returned.
    """
    try:
        history = yf.Ticker(symbol).history(period=period, interval="1d")
        if not history.empty:
            history.to_csv(f"historical-data/stocks/{symbol}.csv")
            print(f"-> Saved historical data for {symbol}")
        else:
            print(f"-> No historical data found for {symbol}")
    except Exception as err:
        print(f"-> Exception fetching {symbol}: {err}")

# Master loop
all_data = []

# Create the output directory before scraping, so the final to_csv cannot
# fail on a missing folder after all the network work has been done.
os.makedirs("market-data", exist_ok=True)

# Categories whose listing page is fetched with start/count pagination.
paginated_categories = {"Top Gainers"}

# Change figures already fetched in this run, keyed by symbol. A symbol that
# shows up in several categories is fetched and saved only once.
changes_by_symbol = {}

for category, url in category_urls.items():
    print(f"-----Scraping: {category}")
    is_paginated = category in paginated_categories
    symbol_name_pairs = extract_table_rows(url, paginated=is_paginated, max_rows=100)

    for symbol, name in symbol_name_pairs:
        print(f"→ {symbol} | {name}")
        if symbol not in changes_by_symbol:
            changes_by_symbol[symbol] = get_all_changes(symbol)

            # Save full historical series
            save_historical_data(symbol, period="2y")
            # Pause between symbols to avoid hammering the data source.
            time.sleep(1)

        # Copy the cached figures so each category gets its own row.
        data = dict(changes_by_symbol[symbol])
        data["Name"] = name
        data["Category"] = category
        all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv("market-data/stock-market-data.csv", index=False)
print("-> Saved to market-data/stock-market-data.csv")
display(df.head())

-----Scraping: Top Gainers
→ MRUS | Merus N.V.
-> Saved historical data for MRUS
→ RGC | Regencell Bioscience Holdings Limited
-> Saved historical data for RGC
→ UEC | Uranium Energy Corp.
-> Saved historical data for UEC
→ OKLO | Oklo Inc.
-> Saved historical data for OKLO
→ X | United States Steel Corporation
-> Saved historical data for X
→ SMR | NuScale Power Corporation
-> Saved historical data for SMR
→ INFA | Informatica Inc.
-> Saved historical data for INFA
→ NXE | NexGen Energy Ltd.
-> Saved historical data for NXE
→ CCJ | Cameco Corporation
-> Saved historical data for CCJ
→ BWXT | BWX Technologies, Inc.
-> Saved historical data for BWXT
→ OTLY | Oatly Group AB
-> Saved historical data for OTLY
→ TNXP | Tonix Pharmaceuticals Holding Corp.
-> Saved historical data for TNXP
→ FLR | Fluor Corporation
-> Saved historical data for FLR
→ INTU | Intuit Inc.
-> Saved historical data for INTU
→ GME | GameStop Corp.
-> Saved historical data for GME
→ FRO | Frontline plc
-> Saved histo

Unnamed: 0,Symbol,1W Change %,1M Change %,3M Change %,6M Change %,1Y Change %,5Y Change %,Name,Category
0,MRUS,33.32,24.3,17.95,21.48,-8.08,258.98,Merus N.V.,Top Gainers
1,RGC,59.08,1803.47,12127.07,9383.49,11077.64,5233.33,Regencell Bioscience Holdings Limited,Top Gainers
2,UEC,25.0,22.39,15.18,-20.17,-11.16,508.49,Uranium Energy Corp.,Top Gainers
3,OKLO,23.01,100.95,43.4,113.31,381.48,389.68,Oklo Inc.,Top Gainers
4,X,24.96,23.22,38.16,35.15,45.55,569.67,United States Steel Corporation,Top Gainers
