# Dependencies

In [23]:
! pip install GoogleNews pandas numpy sentence-transformers



In [24]:
from GoogleNews import GoogleNews
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from datetime import datetime, timedelta, timezone
import re


# Hyperparameters

In [25]:
# Search term sent to the news client; its lower-cased form also names the output file.
QUERY = "Meta"

# Start and end of the date window handed to the news client.
START_DATE, END_DATE = "01/01/2024", "01/01/2025"

# Destination CSV for the articles and their embeddings.
OUT_CSV = "{}_news_embeddings_1yr.csv".format(QUERY.lower())

# Data Fetching

In [26]:
print(f"Fetching '{QUERY}' news from {START_DATE} to {END_DATE}...")

# Number of result pages to collect in total (page 1 comes from search()).
MAX_PAGES = 5

googlenews = GoogleNews(lang='en', start=START_DATE, end=END_DATE)
# search() already loads the first page into the client's internal result list.
googlenews.search(QUERY)

# getpage() appends each further page to that same internal list, so only the
# remaining pages are requested and the accumulated list is read once at the
# end. Reading result() inside the loop would re-append every earlier page on
# each iteration, and requesting page 1 again would duplicate it.
for page in range(2, MAX_PAGES + 1):
    googlenews.getpage(page)
results = googlenews.result()

df = pd.DataFrame(results)
if df.empty:
    print("⚠️ No articles found. Try expanding date range or query.")
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception just stops this cell (and "Run All").
    raise RuntimeError("No articles found; stopping before the embedding step.")

# Titles are the embedding input, so drop repeats and rows without a title.
df = df.drop_duplicates(subset=["title"]).dropna(subset=["title"])
print(f"✅ Collected {len(df)} raw articles.")

Fetching 'Meta' news from 01/01/2024 to 01/01/2025...
✅ Collected 44 raw articles.


# Analysis

In [27]:
print("Embedding article titles using SentenceTransformer...")

# Load the sentence-embedding model and encode every article title.
model = SentenceTransformer("all-MiniLM-L6-v2")
title_list = df["title"].tolist()
embeddings = np.array(model.encode(title_list, show_progress_bar=True))

# One DataFrame column per embedding dimension: emb_0, emb_1, ...
n_dims = embeddings.shape[1]
emb_cols = ["emb_" + str(i) for i in range(n_dims)]
emb_df = pd.DataFrame(embeddings, columns=emb_cols)

Embedding article titles using SentenceTransformer...


Batches: 100%|██████████| 2/2 [00:00<00:00, 13.46it/s]


# Output

In [28]:
def clean_datestr(s: str) -> datetime | None:
    if not isinstance(s, str):
        return None
    s = s.strip()
    now = datetime.now(timezone.utc)
    if "hour" in s:
        m = re.search(r"(\d+)\s*hour", s)
        if m:
            return now - timedelta(hours=int(m.group(1)))
    if "day" in s:
        m = re.search(r"(\d+)\s*day", s)
        if m:
            return now - timedelta(days=int(m.group(1)))
    if "week" in s:
        m = re.search(r"(\d+)\s*week", s)
        if m:
            return now - timedelta(weeks=int(m.group(1)))
    if "month" in s:
        m = re.search(r"(\d+)\s*month", s)
        if m:
            return now - timedelta(days=30 * int(m.group(1)))
    try:
        return pd.to_datetime(s)
    except Exception:
        return None

In [29]:
# Embeddings were computed from df's titles in row order; if the two frames
# disagree in length (e.g. the fetch cell was re-run without re-embedding),
# the positional concat below would silently produce NaN rows.
assert len(emb_df) == len(df), "emb_df is out of sync with df; re-run the embedding cell"

df["parsed_date"] = df["date"].apply(clean_datestr)
# df's index has gaps after de-duplication, so reset it to line up with emb_df.
merged = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

# Articles whose date could not be parsed are excluded from the output.
merged = merged.dropna(subset=["parsed_date"]).copy()
# utc=True normalises the column to UTC even when it holds a mix of tz-aware
# and tz-naive values, which would otherwise make the conversion fail.
merged["date"] = pd.to_datetime(merged["parsed_date"], utc=True).dt.date
# Previously hard-coded to "GOOG", which mislabelled every row of the Meta
# query. Derived from QUERY ("Meta" -> "META"); set it explicitly if the
# search term is not the ticker symbol.
merged["ticker"] = QUERY.upper()

cols_to_keep = ["ticker", "date", "title", "link", "media"] + emb_cols
merged = merged[cols_to_keep].sort_values("date").reset_index(drop=True)

merged.to_csv(OUT_CSV, index=False)
print(f"✅ Saved {len(merged)} articles with embeddings to {OUT_CSV}")

✅ Saved 44 articles with embeddings to meta_news_embeddings_1yr.csv
