In [None]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [None]:
# Load variables from a local .env file; override=True lets values in that
# file replace variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # The client below is created without an explicit key, so it depends on
    # this environment variable. Say so up front instead of leaving the reader
    # with a less obvious client error further down the notebook.
    print("OPENAI_API_KEY is not set - add it to your .env file or environment before running the cells below.")

# Shared chat client and the model name used by summarize_article().
openai = OpenAI()
MODEL = 'gpt-4o'

In [None]:
# Request headers sent with every scrape request. The User-Agent is a
# desktop Chrome string, split across lines only for readability.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

In [None]:
# Landing pages to scrape; each one is handed to NewsScraper in turn.
AI_NEWS_SITES = [
    "https://the-decoder.com/",
    "https://venturebeat.com/category/ai/",
    "https://huggingface.co/blog",
]

In [None]:
class NewsScraper:
    """Scrape article text from the pages linked on a news site's landing page.

    Creating an instance immediately runs :meth:`scrape`, which fills
    ``self.articles`` with one dict per successfully fetched page, each with
    the keys ``"title"``, ``"url"`` and ``"content"``.

    Args:
        base_url: Landing page whose links are followed.
        max_links: Maximum number of distinct links fetched from that page.
        max_chars: Maximum number of characters of paragraph text kept per
            article, so the text later fits into the model prompt.
        timeout: Timeout in seconds passed to every HTTP request.
    """

    def __init__(self, base_url, max_links=10, max_chars=4000, timeout=5):
        self.base_url = base_url
        self.max_links = max_links
        self.max_chars = max_chars
        self.timeout = timeout
        self.articles = []
        self.scrape()

    def scrape(self):
        """Fetch the landing page and append every reachable article to ``self.articles``.

        Errors never propagate: a failure on the landing page is printed and
        ends the scrape; a failure on a single article is printed and only
        that article is skipped.
        """
        try:
            # A timeout keeps one unresponsive site from hanging the notebook.
            res = requests.get(self.base_url, headers=headers, timeout=self.timeout)
            res.raise_for_status()
            # The parser receives the raw bytes, so the response's text
            # encoding attribute plays no part and is left untouched.
            soup = BeautifulSoup(res.content, "html.parser")
            links = self._collect_links(soup)
        except Exception as e:
            print(f"Error scraping {self.base_url}: {e}")
            return

        for full_link in links:
            try:
                self.articles.append(self._fetch_article(full_link))
            except Exception as e:
                # Exception (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the loop.
                print(f"Skipping {full_link}: {e}")

    def _collect_links(self, soup):
        """Return up to ``max_links`` distinct absolute URLs in page order."""
        hrefs = (a["href"] for a in soup.find_all("a", href=True))
        # urljoin resolves "/path" against the site root and "//host/path"
        # against the base scheme; absolute URLs pass through unchanged.
        absolute = (
            urljoin(self.base_url, href)
            for href in hrefs
            if href.startswith(("http", "/"))
        )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # same links are selected on every run.
        return list(dict.fromkeys(absolute))[: self.max_links]

    def _fetch_article(self, url):
        """Download one page and return its title/url/content dict."""
        article_res = requests.get(url, headers=headers, timeout=self.timeout)
        article_res.raise_for_status()
        article_soup = BeautifulSoup(article_res.content, "html.parser")

        title_tag = article_soup.title
        # .string may be None even when a <title> tag exists; fall back then too.
        title = (title_tag.string if title_tag else None) or "(No title)"

        text = " ".join(p.get_text() for p in article_soup.find_all("p"))
        # Round-trip through UTF-8 so characters that cannot be encoded are
        # replaced instead of raising later on.
        text = text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')

        return {
            "title": title.strip(),
            "url": url,
            "content": text.strip()[: self.max_chars],
        }

In [None]:
def summarize_article(article, stream=False):
    """Summarize one scraped article with the chat model.

    Args:
        article: Dict with ``"title"``, ``"url"`` and ``"content"`` keys.
        stream: When true, request a streamed completion and render it
            incrementally through an updatable Markdown display. Otherwise
            make a single blocking request and display nothing.

    Returns:
        Markdown text: a heading linking to the article followed by the
        summary. Both modes return this text. If the request fails, the error
        is printed and an empty string is returned.
    """
    prompt = f"""
    You are an assistant summarizing breaking news in AI. 
    Give a 3-sentence summary of the following article. Focus on what happened, who was involved, and why it matters.
    End it with a recommendation of cool projects I can build, if applicable.
    Be snarky and sarcastic!

    Title: {article['title']}
    Content: {article['content']}
    """

    # Built once and shared by both request modes.
    messages = [
        {"role": "system", "content": "You summarize recent AI news."},
        {"role": "user", "content": prompt}
    ]
    heading = f"### [{article['title']}]({article['url']})\n"

    try:
        if stream:
            chat_stream = openai.chat.completions.create(
                model=MODEL,
                messages=messages,
                stream=True
            )
            response = heading
            display_handle = display(Markdown(""), display_id=True)
            for chunk in chat_stream:
                # A streamed chunk may carry no choices; skip it rather than
                # fail on the [0] index.
                if not chunk.choices:
                    continue
                response += chunk.choices[0].delta.content or ''
                update_display(Markdown(response), display_id=display_handle.display_id)
            return response

        completion = openai.chat.completions.create(
            model=MODEL,
            messages=messages
        )
        summary = completion.choices[0].message.content.strip()
        return f"{heading}{summary}\n"
    except Exception as e:
        # One failed article must not abort a caller looping over many.
        print(f"Error summarizing article: {e}")
        return ""

In [None]:
# Scrape each configured site and pool the results into a single list.
all_articles = []
for site in AI_NEWS_SITES:
    scraper = NewsScraper(site)
    all_articles += scraper.articles

# Stream a summary for the first five pooled articles only.
for article in all_articles[:5]:
    summarize_article(article, stream=True)