In [1]:
import json
import os
import shutil
from datetime import datetime, timedelta, timezone
from urllib.parse import urlencode

import feedparser
import pandas as pd
import yfinance as yf

In [2]:
# Ticker symbol and date window (YYYY-MM-DD) passed to both the price download
# and the news-headline collection cells below.
# NOTE(review): the news loop includes end_date, while yfinance documents its
# `end` argument as exclusive -- confirm the two windows are meant to differ.
share_symbol = "NVDA"
start_date = "2025-07-15"
end_date = "2025-07-25"

## Step 1: Collect stock price info 

In [3]:
def save_price_data(ticker="NVDA", start_date="2020-01-01", end_date="2023-12-31", output_dir="data/price/preprocessed"):
    """Download daily prices for ``ticker`` and save them normalised by the previous close.

    One tab-separated line is written per trading day to
    ``<output_dir>/<ticker>.txt``::

        date  close_norm  open_norm  high_norm  low_norm  close_change  volume

    where every ``*_norm`` value is the day's price divided by the previous
    day's close and ``close_change`` is the relative change of the close.
    The first downloaded day is dropped because it has no previous close.

    Args:
        ticker: Symbol passed to ``yf.download``; also used as the file name.
        start_date: First day to download (``YYYY-MM-DD``).
        end_date: End of the window (``YYYY-MM-DD``), passed straight through
            as yfinance's ``end`` argument (documented by yfinance as exclusive).
        output_dir: Directory that receives ``<ticker>.txt``; created if missing.

    Raises:
        ValueError: If the download yields fewer than two complete rows, in
            which case no previous-close ratio can be computed. Previously
            this silently produced an empty file.
    """
    raw = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)
    if raw is None or raw.empty:
        raise ValueError(
            f"No price data returned for {ticker} between {start_date} and {end_date}"
        )

    # Some yfinance releases return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level so each column is a plain Series.
    if isinstance(raw.columns, pd.MultiIndex):
        raw = raw.copy()
        raw.columns = raw.columns.get_level_values(0)

    prices = raw[["Open", "High", "Low", "Close", "Volume"]].dropna()
    if len(prices) < 2:
        raise ValueError(
            f"Need at least two trading days for {ticker} to normalise by the "
            f"previous close; got {len(prices)}"
        )

    prev_close = prices["Close"].shift(1)
    features = prices.assign(
        close_norm=prices["Close"] / prev_close,
        open_norm=prices["Open"] / prev_close,
        high_norm=prices["High"] / prev_close,
        low_norm=prices["Low"] / prev_close,
        close_change=(prices["Close"] - prev_close) / prev_close,
    ).iloc[1:]  # the first row has no previous close

    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{ticker}.txt")

    with open(output_file, "w", encoding="utf-8") as f:
        for row in features.itertuples():
            f.write(
                f"{row.Index.strftime('%Y-%m-%d')}\t"
                f"{row.close_norm:.6f}\t"
                f"{row.open_norm:.6f}\t"
                f"{row.high_norm:.6f}\t"
                f"{row.low_norm:.6f}\t"
                f"{row.close_change:.6f}\t"
                # Volume has always been written as a float (e.g. "2000.0").
                f"{float(row.Volume)}\n"
            )

    print(f"Saved {ticker} price data to {output_file}")


In [4]:
# Download and preprocess prices for the configured ticker and date window.
save_price_data(
    ticker=share_symbol,
    start_date=start_date,
    end_date=end_date,
)

[*********************100%***********************]  1 of 1 completed

Saved NVDA price data to data/price/preprocessed/NVDA.txt





## Step 2: Collect news headlines (used as "tweet" input)

In [5]:
def scrape_google_news_rss(ticker, date_str=None, max_headlines=50, output_dir="data/news/raw"):
    """Fetch Google News RSS headlines mentioning ``ticker`` and save them as JSON lines.

    Writes ``<output_dir>/<ticker>/<day>.json`` with one ``{"text": ...}``
    object per line. Only headlines whose title contains the ticker
    (case-insensitive) are kept, up to ``max_headlines``.

    Args:
        ticker: Search term; also the name of the output sub-directory.
        date_str: Day to collect (``YYYY-MM-DD``). When given, the search is
            restricted to that day so each file holds that day's headlines
            instead of whatever the feed returns at scrape time. When omitted,
            today's UTC date names the file and the search is unrestricted.
        max_headlines: Maximum number of headlines written.
        output_dir: Base directory for the per-ticker output folder.
    """
    search_terms = ticker
    if date_str:
        day = date_str
        try:
            day_start = datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            # Not an ISO date: keep using it as a file label only, unfiltered.
            day_start = None
        if day_start is not None:
            day_end = day_start + timedelta(days=1)
            # NOTE(review): relies on Google's `after:` / `before:` search
            # operators being honoured by the RSS endpoint; boundary
            # inclusivity and timezone are decided by Google -- confirm the
            # results on a known date.
            search_terms = f"{ticker} after:{day_start:%Y-%m-%d} before:{day_end:%Y-%m-%d}"
    else:
        day = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    out_dir = os.path.join(output_dir, ticker)
    os.makedirs(out_dir, exist_ok=True)
    outfile = os.path.join(out_dir, f"{day}.json")

    # urlencode keeps the URL valid for terms containing spaces, '&', ':' etc.
    query = "https://news.google.com/rss/search?" + urlencode({"q": search_terms})
    feed = feedparser.parse(query)

    needle = ticker.lower()
    saved = 0

    with open(outfile, "w", encoding="utf-8") as f:
        for entry in feed.entries or []:
            if saved >= max_headlines:
                break
            # An entry without a title is skipped instead of raising.
            title = (getattr(entry, "title", None) or "").strip()
            if needle in title.lower():
                # json.dumps escapes quotes, backslashes and control characters,
                # so every line is valid JSON.
                f.write(json.dumps({"text": title}, ensure_ascii=False) + "\n")
                saved += 1

    print(f"[GoogleRSS] Saved {saved} headlines for {ticker} on {day}")

def scrape_google_news_range(ticker, start_date="2025-07-20", end_date="2025-07-28", max_headlines=50, output_dir="data/news/raw"):
    """Call ``scrape_google_news_rss`` once per calendar day in a date window.

    Both ``start_date`` and ``end_date`` are ``YYYY-MM-DD`` strings and both
    ends are included. Nothing is scraped when ``end_date`` precedes
    ``start_date``.
    """
    date_format = "%Y-%m-%d"
    current_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    one_day = timedelta(days=1)

    while current_day <= last_day:
        scrape_google_news_rss(
            ticker=ticker,
            date_str=current_day.strftime(date_format),
            max_headlines=max_headlines,
            output_dir=output_dir,
        )
        current_day += one_day

def move_news_to_sep(ticker, raw_base="data/news/raw", sep_base="data/tweet/raw"):
    """Copy a ticker's per-day news files into the SEP tweet directory.

    Every ``*.json`` file in ``<raw_base>/<ticker>`` is copied to
    ``<sep_base>/<ticker>`` under the same name with the trailing ``.json``
    removed. Source files are left in place (this copies, it does not move).

    Args:
        ticker: Name of the per-ticker sub-directory under both base paths.
        raw_base: Base directory holding the scraped ``.json`` files.
        sep_base: Base directory that receives the extension-less copies.

    Raises:
        FileNotFoundError: If ``<raw_base>/<ticker>`` does not exist.
    """
    src_dir = os.path.join(raw_base, ticker)
    dst_dir = os.path.join(sep_base, ticker)

    # Fail before creating the destination so a wrong raw_base is obvious.
    if not os.path.isdir(src_dir):
        raise FileNotFoundError(f"No raw news directory for {ticker}: {src_dir}")
    os.makedirs(dst_dir, exist_ok=True)

    suffix = ".json"
    for fname in sorted(os.listdir(src_dir)):
        if not fname.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # any ".json" occurring earlier in the name.
        stem = fname[: -len(suffix)]  # SEP expects no extension
        if not stem:
            continue
        shutil.copy(os.path.join(src_dir, fname), os.path.join(dst_dir, stem))

    print(f"Successfully integrated {ticker} news files into SEP tweet format {dst_dir}")

In [None]:
# Fetch news headlines from the Google News RSS feed into the raw news directory.
# The scrape target and the copy source below must be the same directory;
# previously the scrape wrote to data/tweet/raw while the copy read data/news/raw.
scrape_google_news_range(share_symbol, start_date, end_date, max_headlines=20, output_dir="data/news/raw")

# Copy the collected files into the SEP tweet layout used for training/inference.
move_news_to_sep(share_symbol, raw_base="data/news/raw", sep_base="data/tweet/raw")


[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-15
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-16
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-17
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-18
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-19
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-20
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-21
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-22
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-23
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-24
[GoogleRSS] Saved 20 headlines for NVDA on 2025-07-25
Successfully integrated NVDA news files into SEP tweet format data/tweet/raw/NVDA
