In [1]:
import feedparser

def get_rss_updates(rss_url):
    feed = feedparser.parse(rss_url)
    updates = []
    for entry in feed.entries:
        updates.append({
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'published': entry.get('published', ''),
            'summary': entry.get('summary', '')
        })
    return updates

# Example usage:
# rss_url = 'https://rss.arxiv.org/rss/cs'
rss_url= 'https://www.science.org/action/showFeed?type=etoc&feed=rss&jc=science'
updates = get_rss_updates(rss_url)
print(updates[0].keys())

dict_keys(['title', 'link', 'published', 'summary'])


In [None]:
import os
import json
from typing import List, Tuple, Dict
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser

from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1")
TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", 0.5))
MAX_TOKENS = int(os.getenv("MODEL_MAX_TOKENS", 150))
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE") or None

_llm_kwargs = {"model_name": MODEL_NAME, "temperature": TEMPERATURE}
if MAX_TOKENS:
    _llm_kwargs["max_tokens"] = MAX_TOKENS
if OPENAI_API_BASE:
    _llm_kwargs["openai_api_base"] = OPENAI_API_BASE

LLM = ChatOpenAI(**_llm_kwargs)


class SummarizationResult(BaseModel):
    summary: str = Field(..., description="Concise summary of the article")
    recipients: List[str] = Field(default_factory=list, description="Usernames to send the summary to")

parser = PydanticOutputParser(pydantic_object=SummarizationResult)

def summarize_articles(
    items: List[Tuple[str, str, str, str]], users: List[Dict[str, List[str]]]
) -> List[SummarizationResult]:
    """
    Summarize multiple articles (title, link, published, feed_summary) and
    select recipients based on user interests. Returns structured results.
    """

    # Format user interests
    user_info = "\n".join(
        f"- {u['username']}: {', '.join(u['interests'])}" for u in users
    )

    # Format articles
    article_lines = []
    for title, link, published, feed_summary in items:
        article_lines.append(
            f"Title: {title}\nLink: {link}\nPublished: {published}\nFeed Summary: {feed_summary}\n"
        )

    # Instructions for format
    system_prompt = (
        "You are an assistant that summarizes news articles and recommends them to users by matching topics of interest.\n"
    )

    full_prompt = (
        f"Users and their interests:\n{user_info}\n\n"
        f"Articles to summarize:\n{''.join(article_lines)}"
    )
    llm = LLM.with_structured_output(SummarizationResult)
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=full_prompt)
    ]

    response = llm.invoke(messages)

    # Try parsing using the parser
    try:
        
        return [response.__dict__]  # Because it's a single result
    except Exception as e:
        raise ValueError(f"Model returned invalid structured output:\n{response}\n\nError: {e}")



In [3]:
# Example usage

summary = summarize_articles(
    items=[
        ("Sample Title", "http://example.com/article", "2023-10-01", "This is a sample summary of the article."),
    ],
    users=[
        {"username": "user1", "interests": ["science", "technology"]},
        {"username": "user2", "interests": ["health", "environment"]},
    ]
)

In [6]:
summary[0].dict().get('summary', 'No summary provided')

/var/folders/zj/4134lvvs5wjcfkw_mzqz721jy4h4ck/T/ipykernel_78416/3905111262.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  summary[0].dict().get('summary', 'No summary provided')


'This is a sample summary of the article.'

In [1]:
import os
os.environ["DATABASE_URL"] = "postgresql://rss_user:rss_password@localhost:7980/rss_db"

In [2]:
import sys 
sys.path.append('/Users/jt35560/WorkSpace/RSS_llm/backend')
from app.api.views import trigger_fetch, trigger_dispatch
from backend.app.core import _initial_fetch,load_config

import os
import logging
import asyncio
import pkgutil
import importlib

import feedparser
import yaml

from datetime import datetime
from sqlalchemy.orm import Session

from app.db import SessionLocal
from app.models.article import Article, ArticleStatus
import json
import requests
from app.services.summarize import summarize_article, summarize_articles
from app.services.dispatcher import dispatch_summary
from app.models.feed import Feed
from app.models.user import User

print(load_config())
def fetch_and_store(session: Session, feed: dict):
    print(f"Fetching feed: {feed['name']} from {feed['url']}")
    parsed = feedparser.parse(feed["url"])
    print(parsed)
    for entry in parsed.entries:
        entry_id = entry.get("id") or entry.get("link")
        if not session.query(Article).filter_by(feed_name=feed["name"], entry_id=entry_id).first():
            published = None
            if entry.get("published_parsed"):
                published = datetime(*entry.published_parsed[:6])
            article = Article(
                feed_name=feed["name"], entry_id=entry_id,
                title=entry.get("title"), link=entry.get("link"),
                published=published, summary=entry.get("summary"),
                status=ArticleStatus.new)
            session.add(article)
    session.commit()
    


([{'name': 'arxiv_cs', 'url': 'https://rss.arxiv.org/rss/cs'}], 300)


In [3]:
from app.db import SessionLocal
db = SessionLocal()
def _initial_fetch() -> None:
    session = SessionLocal()
    try:
        feeds, _ = load_config()
        for feed in feeds:
            fetch_and_store(session, feed)
    finally:
        session.close()
        
res = _initial_fetch()

Fetching feed: arxiv_cs from https://rss.arxiv.org/rss/cs
{'bozo': False, 'entries': [], 'feed': {'title': 'cs updates on arXiv.org', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://rss.arxiv.org/rss/cs', 'value': 'cs updates on arXiv.org'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'http://rss.arxiv.org/rss/cs'}, {'href': 'http://rss.arxiv.org/rss/cs', 'rel': 'self', 'type': 'application/rss+xml'}], 'link': 'http://rss.arxiv.org/rss/cs', 'subtitle': 'cs updates on the arXiv.org e-print archive.', 'subtitle_detail': {'type': 'text/html', 'language': None, 'base': 'https://rss.arxiv.org/rss/cs', 'value': 'cs updates on the arXiv.org e-print archive.'}, 'docs': 'http://www.rssboard.org/rss-specification', 'language': 'en-us', 'updated': 'Sat, 28 Jun 2025 04:00:01 +0000', 'updated_parsed': time.struct_time(tm_year=2025, tm_mon=6, tm_mday=28, tm_hour=4, tm_min=0, tm_sec=1, tm_wday=5, tm_yday=179, tm_isdst=0), 'authors': [{'email': 'rss-help@arxiv

In [9]:
session = SessionLocal()
feed = {'name':'arxiv cs','url':'https://www.nature.com/nature.rss'}
fetch_and_store(session, feed)

Fetching feed: arxiv cs from https://www.nature.com/nature.rss
{'bozo': False, 'entries': [{'id': 'https://www.nature.com/articles/d41586-025-02024-9', 'title': 'Exclusive: NIH still screens grants in process a judge ruled illegal', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://www.nature.com/nature.rss', 'value': 'Exclusive: NIH still screens grants in process a judge ruled illegal'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://www.nature.com/articles/d41586-025-02024-9'}], 'link': 'https://www.nature.com/articles/d41586-025-02024-9', 'content': [{'type': 'text/html', 'language': None, 'base': 'https://www.nature.com/nature.rss', 'value': '<p>Nature, Published online: 27 June 2025; <a href="https://www.nature.com/articles/d41586-025-02024-9">doi:10.1038/d41586-025-02024-9</a></p>Directives by the Trump administration are still being applied to grant materials despite court order.'}], 'summary': '<p>Nature, Published online: 27 June 