In [2]:
!pip install feedparser html2text

In [13]:
from datetime import datetime

# Snapshot the wall-clock time once so this run of the notebook can be dated.
now = datetime.now()

# Render the snapshot as "YYYY-MM-DD HH:MM:SS" and echo it.
timestamp = f"{now:%Y-%m-%d %H:%M:%S}"
print(timestamp)

2024-05-15 14:35:14


In [1]:
from agents.basic_agents import BasicAgent
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

from feedparser.util import FeedParserDict

class RSSDigestAgent(BasicAgent):
    """Agent that asks the LLM which fields of an RSS entry are worth keeping.

    Uses ``self.llm`` and ``self.langchain_config``, neither of which is
    defined in this class (presumably supplied by ``BasicAgent`` -- confirm).
    """

    # System instruction for the model. Written as a plain literal so that no
    # source indentation or trailing newline leaks into the prompt; the earlier
    # ``.replace('\t', '')`` could not remove it because the code is
    # space-indented.
    _SYSTEM_PROMPT = (
        "This is an RSS feed entry from hackernews. Determine which keys in "
        "the json will be useful in evaluating the entry for usefulness for "
        "a reader to skim."
    )

    def find_useful_keys_in_dict(self, entry: FeedParserDict):
        """Ask the LLM which keys of ``entry`` help a reader skim the feed.

        Args:
            entry: One parsed feed entry. Its string form is embedded in the
                human message that is sent to the model.

        Returns:
            The result of invoking the ``prompt | self.llm`` chain.
        """
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self._SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="question"),
            ]
        )
        chain = prompt | self.llm

        # The entry travels as a ready-made message through the placeholder.
        request = HumanMessage(content=f"Feed entry: {entry}")

        return chain.invoke({'question': [request]}, config=self.langchain_config)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import feedparser
from feedparser.util import FeedParserDict

import ssl

# Define the RSS feed URL
feed_url = "https://hnrss.org/newest?count=100"
# feed_url = "https://hnrss.org/newcomments"

# Parse the RSS feed.
# Certificate verification is switched off only while this one fetch runs and
# is restored afterwards, rather than staying disabled for every later HTTPS
# request that goes through the ssl default context.
# NOTE(review): skipping verification is insecure -- remove the override
# altogether if the feed's certificate validates in your environment.
_original_https_context = ssl._create_default_https_context
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context
try:
    feed: FeedParserDict = feedparser.parse(feed_url)
finally:
    ssl._create_default_https_context = _original_https_context

# feedparser flags fetch/parse problems via `bozo` instead of raising, so an
# unreachable feed would otherwise just look like an empty one.
if feed.get("bozo"):
    print(f"Warning: feed was not parsed cleanly: {feed.get('bozo_exception')!r}")

print(len(feed.entries))

# Loop through each entry in the feed
for entry in feed.entries:
    # Print title, link, and summary (if available)
    print(f"Title: {entry.title}")
    print(f"Link: {entry.link}")
    if hasattr(entry, "summary"):
        print(f"Summary: {entry.summary}")
    print("-"*20)

# Additional parsing (optional)
# You can access other elements of the entry object like:
# - entry.published (date published)
# - entry.author (author name, if available)
# - entry.tags (list of tags)

100
Title: Tech Lost Gen Z
Link: https://fortune.com/2024/05/13/how-tech-lost-gen-z/
Summary: <p>Article URL: <a href="https://fortune.com/2024/05/13/how-tech-lost-gen-z/">https://fortune.com/2024/05/13/how-tech-lost-gen-z/</a></p>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=40364479">https://news.ycombinator.com/item?id=40364479</a></p>
<p>Points: 1</p>
<p># Comments: 0</p>
--------------------
Title: State of HTML 2023 Survey Features Results
Link: https://2023.stateofhtml.com/en-US/features/
Summary: <p>Article URL: <a href="https://2023.stateofhtml.com/en-US/features/">https://2023.stateofhtml.com/en-US/features/</a></p>
<p>Comments URL: <a href="https://news.ycombinator.com/item?id=40364472">https://news.ycombinator.com/item?id=40364472</a></p>
<p>Points: 1</p>
<p># Comments: 0</p>
--------------------
Title: Show HN: Test Free SEO Keyword Tool from GG Rewriter
Link: https://ggrewriter.com/search-keyword-research
Summary: <p>Hello, HN community! I've developed a

In [3]:
# Build the digest agent. The meaning of config='stdout' is defined by the
# agent's base class (not shown here) -- presumably it sends chain tracing to stdout.
agent = RSSDigestAgent(config='stdout')

In [68]:
# Use the first feed entry as the sample when asking the LLM which keys matter.
response = agent.find_useful_keys_in_dict(feed.entries[0])



[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new ChatPromptTemplate chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [71]:
# Show the text content of the LLM's answer.
print(response.content)

To evaluate the RSS feed entry for usefulness for a reader to skim, the following keys in the JSON would be useful:

1. `title`: This gives a quick summary of the linked article.
2. `summary`: This provides more context about the article, including the article URL, comments URL, and the number of points and comments.
3. `published`: This indicates when the post was published, which can help the reader determine if it's a new or old post.
4. `links` or `link`: This contains the URL of the linked article, which the reader can click to access the full article.
5. `comments`: This gives the URL of the Hacker News comments thread for the post. The reader can click this link to view other users' thoughts and reactions to the post.

Therefore, the useful keys to consider for this purpose are `title`, `summary`, `published`, `links/link`, and `comments`. The other keys, such as `title_detail`, `summary_detail`, `published_parsed`, `authors`, `author_detail`, and `id` may not be as useful for s

In [4]:
from typing import Any, Dict, Iterable, List, Mapping, Optional


# Entry fields kept for the digest (chosen from the LLM's suggestion).
useful_keys = ['title', 'summary','published','links','comments']

def strip_rss_entries(
    entries: Iterable[Mapping[str, Any]],
    keys: Optional[Iterable[str]] = None,
) -> List[Dict[str, Any]]:
    """Reduce each feed entry to a plain dict containing only selected keys.

    Args:
        entries: Feed entries (any mapping type, e.g. ``FeedParserDict``).
        keys: Names of the fields to keep. When omitted, the module-level
            ``useful_keys`` list is used, read at call time.

    Returns:
        One new ``dict`` per entry, in input order. Keys keep the order in
        which they appear in the entry; keys absent from an entry are skipped.
        The input entries are not modified.
    """
    wanted = frozenset(useful_keys if keys is None else keys)
    return [
        {key: value for key, value in entry.items() if key in wanted}
        for entry in entries
    ]

# Reduce every parsed entry to the fields selected by strip_rss_entries.
stripped_feed = strip_rss_entries(feed.entries)

In [5]:
# Preview a few entries only; dumping the whole list floods the notebook output.
stripped_feed[:3]

[{'title': 'Tech Lost Gen Z',
  'summary': '<p>Article URL: <a href="https://fortune.com/2024/05/13/how-tech-lost-gen-z/">https://fortune.com/2024/05/13/how-tech-lost-gen-z/</a></p>\n<p>Comments URL: <a href="https://news.ycombinator.com/item?id=40364479">https://news.ycombinator.com/item?id=40364479</a></p>\n<p>Points: 1</p>\n<p># Comments: 0</p>',
  'published': 'Wed, 15 May 2024 08:32:46 +0000',
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://fortune.com/2024/05/13/how-tech-lost-gen-z/'}],
  'comments': 'https://news.ycombinator.com/item?id=40364479'},
 {'title': 'State of HTML 2023 Survey Features Results',
  'summary': '<p>Article URL: <a href="https://2023.stateofhtml.com/en-US/features/">https://2023.stateofhtml.com/en-US/features/</a></p>\n<p>Comments URL: <a href="https://news.ycombinator.com/item?id=40364472">https://news.ycombinator.com/item?id=40364472</a></p>\n<p>Points: 1</p>\n<p># Comments: 0</p>',
  'published': 'Wed, 15 May 2024 08:31:30 

In [105]:
import warnings
import contextlib

import requests
from urllib3.exceptions import InsecureRequestWarning

# Original method, captured once so the context manager can restore it.
old_merge_environment_settings = requests.Session.merge_environment_settings


@contextlib.contextmanager
def no_ssl_verification():
    """Temporarily force ``verify=False`` on all ``requests`` sessions.

    While the ``with`` block is active, ``requests.Session.merge_environment_settings``
    is replaced on the class, so every session in the process is affected and
    ``InsecureRequestWarning`` is suppressed. On exit the original method is
    restored and every adapter that was used inside the block is closed.

    NOTE(review): this disables TLS certificate checks and should be limited
    to trusted, throwaway use. The only visible call site is commented out.
    """
    opened_adapters = set()

    def merge_environment_settings(self, url, proxies, stream, verify, cert):
        # Verification happens only once per connection so we need to close
        # all the opened adapters once we're done. Otherwise, the effects of
        # verify=False persist beyond the end of this context manager.
        opened_adapters.add(self.get_adapter(url))

        settings = old_merge_environment_settings(
            self, url, proxies, stream, verify, cert)
        settings['verify'] = False

        return settings

    requests.Session.merge_environment_settings = merge_environment_settings

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', InsecureRequestWarning)
            yield
    finally:
        requests.Session.merge_environment_settings = old_merge_environment_settings

        for adapter in opened_adapters:
            try:
                adapter.close()
            except Exception:
                # Best-effort cleanup: one adapter failing to close must not
                # stop the others. Unlike a bare ``except``, this still lets
                # KeyboardInterrupt / SystemExit propagate.
                pass

In [6]:
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

@tool
def search_online(urls: str):
    """Look up things online."""
    # The docstring above is left untouched on purpose: it sits under @tool.
    # Steps: fetch the HTML for `urls`, then convert the fetched documents to
    # text and return the transformed documents.
    # (An earlier variant wrapped the fetch in no_ssl_verification(); that
    # wrapper is currently not applied.)
    fetched_pages = AsyncHtmlLoader(urls).load()
    return Html2TextTransformer().transform_documents(fetched_pages)

In [30]:
from agents.basic_agents import BaseAgent

import ssl
# Replaces the ssl module's default-HTTPS-context factory with the unverified
# one for the rest of the kernel session (certificate checks are skipped
# wherever that factory is used).
# NOTE(review): the fetch log of the later loop still shows
# CERTIFICATE_VERIFY_FAILED errors, so this override does not appear to reach
# the HTML loader -- confirm whether it is needed at all.
ssl._create_default_https_context = ssl._create_unverified_context

class RSSTechAgent(BaseAgent):
    """Agent that asks the LLM whether a feed entry describes useful new tech.

    Uses ``self.llm``, which is not defined in this class (presumably
    supplied by ``BaseAgent`` -- confirm).
    """

    def find_interesting_tech_from_feed(self, entry: FeedParserDict):
        """Fetch the entry's linked article and ask the LLM to assess it.

        Args:
            entry: A feed entry mapping. The ``href`` of its first ``links``
                item, when present, is fetched with ``search_online``.

        Returns:
            The result of ``self.llm.invoke`` on the assembled prompt.

        Side effects:
            On a successful fetch the article is stored on ``entry`` under
            ``'linked_article'``, so the caller's mapping is modified and the
            article text becomes part of the prompt.
        """
        # Tolerate entries without links instead of raising KeyError/IndexError.
        links = entry.get('links') or []
        article_url = links[0].get('href') if links else None
        if article_url:
            try:
                # Attach to the entry passed in. This used to write to `i`,
                # a name that only existed as the caller's global loop variable.
                entry['linked_article'] = search_online.invoke(article_url)
            except ssl.SSLCertVerificationError:
                # Best effort: a bad certificate just means the entry is
                # judged without the linked article.
                pass

        prompt = f"""Read the following RSS feed and determine which if it contains useful new technologies with a strong grounding. Your response should only be about the intended output. Ignore anything irrelevant to the output. RSS feed:{entry}"""

        return self.llm.invoke(prompt)
        

In [31]:
# Instantiate the tech-assessment agent with its default configuration.
agent2 = RSSTechAgent()

In [32]:
# One LLM response per stripped entry, in feed order.
responses = []

# Each call fetches the entry's linked page and then queries the LLM, so this
# loop is slow for a full feed.
for i in stripped_feed:
    responses.append(agent2.find_interesting_tech_from_feed(i))
    

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.11it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.04it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  4.34it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.99it/s]
Error fetching https://thinkingeek.com/2017/01/14/gfortran-array-descriptor/ with attempt 1/3: Cannot connect to host thinkingeek.com:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)')]. Retrying...
Error fetching https://thinkingeek.com/2017/01/14/gfortran-array-descriptor/ with attempt 2/3: Cannot connect to host thinkingeek.com:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)')]. Retrying...

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.75it/s]
Fetching pages:   0%|          | 0/1 [00:10<?, ?it/s]
Error fetching https://community.openai.com/t/gpt-4

KeyboardInterrupt: 

In [34]:
from utilities import print_long_text


# Print each LLM answer via print_long_text, followed by blank lines as a separator.
for response in responses:
    print_long_text(response.content)
    print('\n\n')

The RSS feed contains an article from Fortune titled "Millennials wanted a dream tech job. Gen Z wants stability." The
article discusses a shift in preferences among recent graduates, with Gen Z being less interested in tech jobs and more
interested in government jobs, which they perceive as more stable. This shift is attributed to various factors,
including layoffs in the tech industry, a change in Silicon Valley culture, and economic volatility. The article also
notes that Gen Z tends to prioritize job stability and is concerned about paying down student loans and affording basic
necessities. The article provides insights into the career preferences of Gen Z and their attitudes towards the tech
industry and job stability. However, it does not contain any information about new technologies or their grounding.



The RSS feed contains information about the "State of HTML 2023 Survey Features Results" article. The article provides
statistics and insights about various HTML features and 

In [None]:
# TODO: link analyser -> result summariser
