In [1]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://finance.yahoo.com"

def get_top_news(n=3):
    url = f"{BASE_URL}/news/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    }
    res = requests.get(url, headers=headers)

    if res.status_code != 200:
        print("Error al cargar la página:", res.status_code)
        return []

    soup = BeautifulSoup(res.content, "lxml")

    # 🧪 Inspección rápida del contenido
    with open("yahoo_preview.html", "w", encoding="utf-8") as f:
        f.write(soup.prettify())

    headlines = []
    seen = set()

    for tag in soup.select("a[href*='/news/']"):
        title = tag.get_text(strip=True)
        href = tag.get("href")

        if title and len(title) > 40 and href and href not in seen:
            full_url = href if href.startswith("http") else BASE_URL + href
            headlines.append((title, full_url))
            seen.add(href)

        if len(headlines) >= n:
            break

    return headlines

def extract_article_text(url, max_paragraphs=3):
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, "lxml")

    # Buscar todos los párrafos <p>
    paragraphs = soup.find_all("p")

    # Extraer texto de los primeros párrafos válidos
    contenido = []
    for p in paragraphs:
        text = p.get_text(strip=True)
        if len(text) > 40:  # filtramos textos muy cortos
            contenido.append(text)
        if len(contenido) >= max_paragraphs:
            break

    if not contenido:
        return "No se encontró contenido útil en los párrafos."

    return "\n\n".join(contenido)




In [2]:
# Prueba
noticias = get_top_news()

if not noticias:
    print("❗ No se encontraron titulares. Revisa el archivo yahoo_preview.html para inspeccionar la estructura.")
else:
    for i, (title, url) in enumerate(noticias, 1):
        print(f"\n📰 {i}. {title}")
        print(f"🔗 {url}")
        print("📄 Contenido:")
        print(extract_article_text(url)[:1000], "...")



📰 1. Big changes to Social Security have some people worried. What to expect.Controversial changes at the Social Security Administration ramped up worries from Yahoo Finance readers. Here are some of your questions and comments.
🔗 https://finance.yahoo.com/news/big-changes-to-social-security-have-some-people-worried-what-to-expect-090027268.html
📄 Contenido:


Big changes are happening with Social Security, and you have questions.

My recent columns drew thousands of comments, largely centered on three big themes: fear of delayed checks and crumbling customer service at the Social Security Administration, Roth conversions on tax-deferred retirement accounts, and heart-wrenching explanations of why filing for benefits at 62 is not always a matter of choice.

The Trump administration has made considerable andcontroversial changes, including massive staff and field office cuts, limiting phone service. and significantly ramping up the amount it garnishes per monthly check from Social Security beneficiaries who receive overpayments. These changes are impacting the nearly74 million retired senior citizensand disabled workers who currently receive benefits and the millions more on the cusp of doing so. ...

📰 2. 13 jobs that don't require a college degree -- and won't be replaced by AIThese 13 positions don't require a college degree, offer good pa

If you think most Americans finish college, think again.

Going to college is an American rite of passage. But not everyone goes to college, and many students never make it to graduation. Among Americans ages 25 and over, only 38% are college graduates, according to theEducation Data Initiative.

Anew reportfrom the resume-writing service Resume Now identifies 13 careers that offer good pay and long-term stability, and that don’t require a college degree. Better still, none of the jobs is likely to be replaced by AI. ...

📰 3. Chubb Limited (CB) Hikes Dividend as It Continues Dividend Aristocrat TraditionChubb Limited (NYSE:CB) announced a hike in its quarterly dividend, staying true to its shareholder-friendly track record. On May 15, Chubb Limited (NYSE:CB) declared a 6.6% hike in its quarterly dividend to $0.97 per share. Through this increase, the Zurich-based property-and-casualty insurance giant stretched its dividend growth streak to 32 years. The stock will trade […]
🔗 https://

Chubb Limited (NYSE:CB) announced a hike in its quarterly dividend, staying true to its shareholder-friendly track record.

On May 15, Chubb Limited (NYSE:CB) declared a 6.6% hike in its quarterly dividend to $0.97 per share. Through this increase, the Zurich-based property-and-casualty insurance giant stretched its dividend growth streak to 32 years. The stock will trade ex-dividend on June 13. As of May 16, CB has a dividend yield of 1.32%.

Chubb Limited (NYSE:CB) has consistently maintained a strong dividend policy over the years. As a Dividend Aristocrat, it has achieved a nearly 4% dividend growth rate over the past five years—an impressive figure when considering its overall shareholder returns. The stock has surged by 8% since the start of 2025, and its 12-month return came in at over 11%. ...


In [3]:
import json
import os

# Guardar las noticias extraídas en un archivo JSON
noticias = get_top_news(n=3)

# Guardamos cada noticia con título, URL y contenido
noticias_extraidas = []

for title, url in noticias:
    contenido = extract_article_text(url, max_paragraphs=3)
    noticia = {
        "titulo": title,
        "url": url,
        "contenido": contenido
    }
    noticias_extraidas.append(noticia)

# Define ubicación actual
#src_path = "./src"
#os.chdir(src_path)

# Guardar en un archivo JSON
output_path = "../data/noticias_yahoo.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(noticias_extraidas, f, indent=2, ensure_ascii=False)

print(f"✅ Noticias guardadas en: {output_path}")


✅ Noticias guardadas en: ../data/noticias_yahoo.json
