In [None]:
!pip install feedparser html2text 

In [1]:
from agents.basic_agents import BaseAgent
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

from feedparser.util import FeedParserDict

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
class RSSDigestAgent(BaseAgent):
    """Agent that asks the LLM structural questions about a single RSS feed entry.

    Both methods rely on ``self.llm`` and ``self.langchain_config``, which are
    expected to be provided by ``BaseAgent`` (defined outside this notebook).
    """

    def find_useful_keys_in_dict(self, entry: FeedParserDict):
        """Ask the LLM which keys of ``entry`` help a reader skim the feed.

        Args:
            entry: One parsed feed entry; it is interpolated into the prompt
                through its string representation.

        Returns:
            Whatever ``self.llm.invoke`` returns (cells in this notebook read
            its ``.content`` attribute).
        """
        prompt = f"""This is an RSS feed entry. Determine which keys in the json will be useful in evaluating the entry for usefulness for a reader to skim. Feed entry: {entry}"""

        return self.llm.invoke(prompt, config=self.langchain_config)

    def find_content_links(self, entry: FeedParserDict):
        """Ask the LLM how the links to referenced content can be obtained from ``entry``.

        Args:
            entry: One parsed feed entry; it is interpolated into the prompt
                through its string representation.

        Returns:
            Whatever ``self.llm.invoke`` returns.
        """
        prompt = """This is an RSS feed entry. Determine how to get the links to referenced content."""
        # The leading space keeps the instruction sentence and the entry apart,
        # matching the "... Feed entry: {entry}" shape used by the sibling method.
        prompt += f' Feed entry: {entry}'

        return self.llm.invoke(prompt, config=self.langchain_config)

In [4]:
import feedparser
from feedparser.util import FeedParserDict

import ssl
# SECURITY NOTE: this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS clients that rely on that default stop
# validating server certificates (see PEP 476). The change is process-wide and
# stays in effect for the rest of the kernel session.
# NOTE(review): presumably a workaround for local certificate errors when
# fetching the feed -- confirm, and prefer fixing the CA bundle instead.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Define the RSS feed URL.
# NOTE(review): judging by the query string this is presumably "newest items
# with >= 300 points, at most 100 entries" -- confirm against the feed's docs.
feed_url = "https://hnrss.org/newest?points=300&count=100"

# Fetch and parse the RSS feed; later cells index into `feed.entries`.
feed: FeedParserDict = feedparser.parse(feed_url)


In [5]:
# Agent used for the key-selection question in the next cell. The meaning of
# config='stdout' is defined by BaseAgent, which is not shown in this notebook.
agent = RSSDigestAgent(config='stdout')

In [6]:
# Only the first entry is sent to the LLM as a representative sample; this
# fails if the feed came back with no entries.
response = agent.find_useful_keys_in_dict(feed.entries[0])

In [7]:
# Show the model's answer as plain text.
# NOTE(review): the key list passed to strip_rss_entries further down
# presumably comes from reading this answer by hand -- confirm.
print(response.content)

The following keys in the JSON object are useful in evaluating the entry for usefulness for a reader to skim:

* `title`: This gives a quick overview of the topic of the post.
* `summary`: This provides a brief description of the post. In this case, it includes the article URL, comments URL, points, and number of comments.
* `published`: This indicates when the post was published.
* `authors` or `author`: These keys provide the name of the author(s), which can help the reader determine if they are interested in the post.
* `comments`: This link can help the reader quickly access the comments section, if they want to see other readers' reactions or engage in discussion.

The `links` key could also be useful if the reader wants to explore the topic further, as it provides an alternate link to the post. However, the `link` key already includes the direct link to the post, so `links` may not be necessary for skimming purposes.

The `title_detail`, `summary_detail`, `published_parsed`, and 

In [3]:
from typing import List

def strip_rss_entries(entries: List[FeedParserDict], useful_keys):
    """Reduce every feed entry to the subset of keys listed in ``useful_keys``.

    Args:
        entries: Feed entries; each one must expose ``items()``.
        useful_keys: Container of key names to retain; membership is checked
            with ``in``.

    Returns:
        A new list of plain dicts, one per entry and in the same order, each
        holding only the retained key/value pairs. A key that an entry does
        not have is simply absent from that entry's dict.
    """
    def _keep_useful(entry):
        # Build a fresh dict so the original entry is left untouched.
        return {key: value for key, value in entry.items() if key in useful_keys}

    return [_keep_useful(entry) for entry in entries]


In [None]:
# Keep only the keys used downstream. 'links' must be retained because
# RSSTechAgent.find_interesting_tech_from_feed reads entry['links'][0].
stripped_feed = strip_rss_entries(
    feed.entries, ['title', 'summary', 'published', 'links', 'comments'])

# Preview a few entries instead of dumping the whole list into the cell output.
stripped_feed[:3]

In [4]:
from typing import Optional
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

@tool
def search_online(urls: str, content_limit: Optional[int] = 4000):
    """Look up things online."""
    # The docstring above is intentionally left verbatim: LangChain's @tool
    # decorator uses it as the tool description, so it is runtime text.
    #
    # Steps: download the page(s) at `urls`, convert the HTML to plain text,
    # then cap each document's text at `content_limit` characters (appending
    # '...' to anything that was cut). A falsy `content_limit` (None or 0)
    # disables the cap. Documents are truncated in place and returned.
    fetched = AsyncHtmlLoader(urls).load()
    docs = Html2TextTransformer().transform_documents(fetched)

    if not content_limit:
        return docs

    for doc in docs:
        text = doc.page_content
        if len(text) > content_limit:
            doc.page_content = text[:content_limit] + '...'
    return docs


In [5]:
from agents.basic_agents import BaseAgent

import ssl
# SECURITY NOTE: this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS clients that rely on that default stop
# validating server certificates (see PEP 476). The change is process-wide.
# Guarded with hasattr so it mirrors the identical patch in the feed-loading
# cell and is a no-op on interpreters that lack the private helper.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

class RSSTechAgent(BaseAgent):
    """Agent that asks the LLM to list new programming technologies found in a feed entry.

    Relies on ``self.llm`` being provided by ``BaseAgent`` (defined outside
    this notebook) and on the ``search_online`` tool defined in an earlier cell.
    """

    def find_interesting_tech_from_feed(self, entry: FeedParserDict):
        """Fetch the entry's first linked page and ask the LLM for notable tech in it.

        Args:
            entry: A feed entry (mapping). If it has a non-empty ``'links'``
                list whose first item has an ``'href'``, that page is fetched
                with ``search_online`` and stored on the entry under
                ``'linked_article'`` (the entry is modified in place).

        Returns:
            The result of ``self.llm.invoke`` for the assembled prompt, or
            ``None`` when fetching the linked page raised an exception.
        """
        # Entries with no 'links' key (or an empty list) are still sent to the
        # LLM, just without linked article text, instead of raising
        # KeyError/IndexError on entry['links'][0].
        links = entry.get('links') or []
        article = links[0].get('href') if links else None
        if article:
            try:
                articles = search_online.invoke(article)
                entry['linked_article'] = articles
            except Exception as e:
                # Best effort: report the failure and skip this entry; callers
                # receive None and must handle it.
                print(f'unable to get article: {e}')
                return None

        # NOTE(review): the first example in the prompt ends with "],\n}" (a
        # trailing comma), which strict JSON parsers reject. The prompt text is
        # left unchanged here so model behaviour is not altered.
        prompt = """Read an entry in an RSS feed and determine which if it contains useful new programming technologies or tools.
Your response should only be about the intended output. Ignore anything irrelevant to the output. Your response must follow the format of the examples given.
Examples:
Two useful techs: {
    "technology": [
        {
            "name": "Kaniko",
            "summary": "Kaniko is an open-source tool that allows you to build container images in Kubernetes without needing to run a Docker daemon inside the cluster.",
            "link": ""
        },
        {
            "name": "needle-in-a-needlestack",
            "summary": "Needle in a Needlestack is a new benchmark to measure how well LLMs pay attention to the information in their context window. NIAN creates a prompt that includes thousands of limericks and the prompt asks a question about one limerick at a specific location.",
            "link": "https://github.com/llmonpy/needle-in-a-needlestack"
        }
    ],
}

No useful techs:
{"technology": []}


"""

        # The whole entry (including any fetched article) is appended via its
        # string representation.
        prompt += f"RSS feed entry:{entry}"

        return self.llm.invoke(prompt,response_format={"type": "json_object"})
        

In [6]:
# Unlike the RSSDigestAgent instances, this agent is created without a
# `config` argument, so whatever default BaseAgent defines applies.
agent2 = RSSTechAgent()

In [None]:
from utilities import print_long_text

# One slot per stripped entry, in feed order.
responses = []

for i in stripped_feed:
    # Fetches the entry's linked article and queries the LLM; the method
    # returns None when the article could not be fetched.
    response = agent2.find_interesting_tech_from_feed(i)
    if response:
        print_long_text(response.content)
        print('\n\n')

    # Appended even when it is None, so `responses` stays aligned with
    # `stripped_feed`; later cells must be prepared for None entries.
    responses.append(response)
    

In [99]:
import json

# Print every response that reports at least one technology. Anything that is
# not valid JSON, or is JSON but not an object, is printed raw for inspection.
for i in responses:
    # find_interesting_tech_from_feed returns None when the article fetch
    # failed and those None values are appended to `responses`; skip them
    # instead of crashing on `None.content`.
    if i is None:
        continue
    try:
        content = json.loads(i.content)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass; TypeError covers
        # content that is not str/bytes. Anything else (e.g. KeyboardInterrupt)
        # is allowed to propagate rather than being swallowed by a bare except.
        print(i.content)
        continue
    if not content:
        continue
    if not isinstance(content, dict):
        # Valid JSON but not an object (e.g. a list): show the raw text.
        print(i.content)
        continue
    if content.get('technology'):
        print(content)

{'technology': [{'name': 'coq-of-rust', 'summary': "coq-of-rust is a tool for translating Rust code to the formal proof system Coq, which is used for formal verification of Rust programs. It has been used to translate the core and alloc crates of Rust, resulting in a 'best effort' translation of these large code bases.", 'link': 'https://formal.land/blog/2024/04/26/translation-core-alloc-crates'}], 'summary': "Introduction and explanation of the translation of the Rust's core and alloc crates using the coq-of-rust tool for formal verification."}
{'technology': [{'name': 'Gemini Flash', 'summary': "Google DeepMind's Gemini Flash is a lightweight, fast, and cost-efficient AI model featuring multimodal reasoning and a breakthrough long context window of up to one million tokens.", 'link': 'https://deepmind.google/technologies/gemini/flash/'}]}
{'technology': [{'name': 'Veo', 'summary': "Veo is Google DeepMind's most capable generative video model, capable of producing high-quality, 1080p 

In [8]:
import feedparser

# Second feed used to try the same pipeline on a different source.
# NOTE(review): "reearch" is a typo for "research"; the name is kept as-is in
# case other cells refer to it.
reearch_journal_feed_url = "https://feeds.feedburner.com/blogspot/gJZg"

# Fetch and parse the RSS feed; later cells index into `research_feed.entries`.
research_feed: FeedParserDict = feedparser.parse(reearch_journal_feed_url)

In [9]:
# A second RSSDigestAgent, constructed with the same argument as `agent` in
# the earlier cell.
rss_digest_agent = RSSDigestAgent(config='stdout')

# Repeat the key-selection question for the first entry of the research feed;
# this fails if the feed came back with no entries.
result = rss_digest_agent.find_useful_keys_in_dict(research_feed.entries[0])

In [12]:
# Show the model's answer for the research feed as plain text.
print(result.content)

The following keys in the JSON may be useful in evaluating the entry for usefulness for a reader to skim:

1. 'title': This key provides the title of the post, which can give a quick overview of the content.
2. 'published': This key gives the date and time the post was published, which can help the reader determine if the post is relevant to current events.
3. 'tags': This key provides a list of tags related to the post, which can give an idea of the topics covered in the post.
4. 'summary': This key provides a summary of the post, which can give a more detailed overview of the content.
5. 'links': This key provides links to related resources, such as comments or the full post, which can provide more information if the reader is interested.

These keys can help a reader quickly determine if a post is relevant and interesting to them, and if they want to read more.


In [15]:
# Same stripping step as for the first feed, but with a key list that includes
# 'tags' and omits 'comments'. 'links' is kept because
# RSSTechAgent.find_interesting_tech_from_feed reads entry['links'][0].
stripped_research_feed = strip_rss_entries(research_feed.entries,
                                           ['title','published','tags','summary','links'])
# Display a single entry as a sanity check rather than the whole list.
stripped_research_feed[0]

{'published': '2024-03-29T11:03:00.000-07:00',
 'tags': [{'term': 'Climate',
   'scheme': 'http://www.blogger.com/atom/ns#',
   'label': None},
  {'term': 'Machine Learning',
   'scheme': 'http://www.blogger.com/atom/ns#',
   'label': None},
  {'term': 'Weather',
   'scheme': 'http://www.blogger.com/atom/ns#',
   'label': None}],
 'title': 'Generative AI to quantify uncertainty in weather forecasting',
 'summary': '<span class="byline-author">Posted by Lizao (Larry) Li, Software Engineer, and Rob Carver, Research Scientist, Google Research</span>\n\n<img src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEglI5U51vvhkA4cAuVvMLn0TbbL5pdlFL-LO1sNnqLyUieA6A88I5HrhJlszxR1GKQqSK5wsdlATDKSy6EC1BsNF7tzS6oVlFLtau13mVFLk954nFu85HDMP3PrQboG4eXExEtUjEuDRFpcrMqE_F0ikSwXiWBECAfJiLbjr6h6523DROJkbC284xX35zC7/s1000/image3.gif" style="display: none;" />\n\n<p>\nAccurate weather forecasts can have a direct impact on people’s lives, from helping make routine decisions, like what to pack for a 

In [101]:
# Same loop as for the first feed, now over the research feed.
# NOTE: this appends to the `responses` list created in the earlier cell, so
# results from both feeds accumulate in one list and re-running this cell adds
# the same entries again. It also relies on `print_long_text`, which is
# imported in a cell that carries no execution count.
for i in stripped_research_feed:
    # Returns None when the linked article could not be fetched.
    response = agent2.find_interesting_tech_from_feed(i)
    if response:
        print_long_text(response.content)
        print('\n\n')

    # Appended even when it is None.
    responses.append(response)

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.76it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.10it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.20it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.91it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.24it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.83it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.96it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.06it/s]
  k = self.parse_starttag(i)


{"technology": []}





Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.16it/s]
  k = self.parse_starttag(i)
