In [None]:
from typing import Optional, List

from datetime import datetime, timedelta

from pydantic import BaseModel, Field
from langchain_google_vertexai import ChatVertexAI

from updater.collect import new_articles

In [None]:
# Structured-output schema for the planning call: the model answers with the
# IDs of the articles it wants to use. Documented with `#` comments rather than
# a docstring so the schema handed to the model stays exactly as written.
class ArticlePlan(BaseModel):
    # Compared against `str(article.id)` when the selection is resolved later on.
    ids: List[str] = Field(description='List of IDs of articles to use')

# One entry in the `sections` list of a Format specification.
class Section(BaseModel):
    # Heading of the section.
    title: str
    # Optional free-text guidance on what the section should contain.
    description: Optional[str] = None
    # `literaly` (sic): when False, Format.to_xml() appends an instruction telling
    # the model to treat this entry as a guideline for creating multiple sections.
    literaly: bool = True

# Specification of the document to generate (topic, audience, overall
# description and the list of sections), renderable as XML-like prompt text.
class Format(BaseModel):
    topic: str
    audience: str
    description: str
    sections: List[Section]

    def to_xml(self) -> str:
        """Render this specification as an XML-like block for an LLM prompt.

        The output is a ``<FORMAT>`` element containing ``<TOPIC>``,
        ``<AUDIENCE>``, ``<DESCRIPTION>`` and a ``<SECTIONS>`` list with one
        ``<SECTION>`` per entry in ``self.sections``, one element per line.
        Field values are interpolated verbatim; no XML escaping is applied.

        Returns:
            The rendered specification as a single string.
        """
        lines = [
            f'<TOPIC>{self.topic}</TOPIC>',
            f'<AUDIENCE>{self.audience}</AUDIENCE>',
            f'<DESCRIPTION>{self.description}</DESCRIPTION>',
            '<SECTIONS>',
        ]

        for section in self.sections:
            section_xml = f'<SECTION><TITLE>{section.title}</TITLE>'
            # Skip the element entirely instead of rendering the text "None".
            if section.description is not None:
                section_xml += f'<DESCRIPTION>{section.description}</DESCRIPTION>'
            if not section.literaly:
                section_xml += '<INSTRUCTION>Use this as a guidline to create multiple sections as described</INSTRUCTION>'
            # Every section is closed so consecutive sections do not nest.
            section_xml += '</SECTION>'
            lines.append(section_xml)

        lines.append('</SECTIONS>')

        return '<FORMAT>\n' + '\n'.join(lines) + '\n</FORMAT>'

In [None]:
# Structured-output schema for the per-article summarization call. The Field
# descriptions are part of the schema bound to the model, so they double as
# prompt text. `#` comments are used here instead of a docstring to leave that
# schema unchanged.
class Summary(BaseModel):
    title: str = Field(description='Title of the article')
    # One-paragraph teaser intended to be displayed next to the title.
    short_summary: str = Field(description='Short summary of the article, one paragraph long. This summary will be shown togather with title. Be direct, avoid statements like This post ...')
    # Longer HTML-formatted summary, capped at roughly half a page.
    long_summary: str = Field(description='Long summary, maximum 1/2 of the page. Use HTML to format the output')
    # URLs the article refers to.
    links: List[str] = Field(description='If article refer to another interesting informations, list of urls')

In [None]:
# Newsletter specification; rendered with to_xml() and embedded in the planning
# prompt further down.
newsletter = Format(
    topic='Space Exploration',
    audience='technical, mostly sofware engineers and data scientists',
    description='Newsletter of latest developments in the field',
    sections=[
        # Fixed section, used as given.
        Section(title='TL;DR', description='summary of main development'),
        # Template entry (literaly=False): to_xml() marks it as a guideline for
        # creating multiple sections.
        Section(title='Name of the article', description='one section for each article within selected topic, select 5 articles most relevant to topic', literaly=False)
    ]
)

In [None]:
# Feed URLs handed to new_articles(); one example per kind of source.
DEFAULT_FEEDS = [
    'https://huyenchip.com/feed.xml',  # Regular RSS
    'https://decodingml.substack.com/feed',  # Substack
    'https://www.youtube.com/feeds/videos.xml?channel_id=UCeMcDx6-rOq_RlKSPehk2tQ'  # Youtube
]

In [None]:
# Naive datetime for 1900-01-01, passed to new_articles() as its second argument.
starting_point = datetime(1900, 1, 1)

In [None]:
# Collect articles from the configured feeds.
# NOTE(review): presumably returns only items newer than `starting_point` —
# confirm against updater.collect.new_articles.
articles = new_articles(DEFAULT_FEEDS, starting_point)

In [None]:
# Wrap the XML rendering of every collected article in one <ARTICLES> envelope.
prompt_articles = ''.join(['<ARTICLES>', *(article.to_xml() for article in articles), '</ARTICLES>'])

In [None]:
# Planning prompt: the newsletter specification (XML) followed by the candidate
# articles (XML).
# NOTE(review): the closing instruction asks for a list of sections with plans,
# while the model this prompt is sent to is bound to ArticlePlan (a list of
# article IDs) — confirm the wording matches the expected output.
prompt = f"""Create outline of document with following specifications: {newsletter.to_xml()}

{prompt_articles}

Respond with list of sections, each section with plan how to create content for it.
"""

In [None]:
# Planning model, bound to the ArticlePlan schema using JSON-mode structured output.
llm = ChatVertexAI(model='gemini-1.5-pro-002').with_structured_output(ArticlePlan, method='json_mode')

In [None]:
# Run the planning prompt; the following cell reads `response.ids`.
# NOTE(review): presumably an ArticlePlan instance — confirm how parse failures surface.
response = llm.invoke(prompt)

In [None]:
# Positions (within `articles`) of the articles whose IDs the planner returned.
# Kept under its own name because `s` is rebound to the summarizer model in the
# next cell, which would otherwise discard this selection before it can be used.
selected_ids = set(response.ids)  # set membership instead of scanning the list per article
selected_indices = [idx for idx, article in enumerate(articles) if str(article.id) in selected_ids]

In [None]:
# Summarizer model, bound to the Summary schema using JSON-mode structured output.
s = ChatVertexAI(model='gemini-1.5-flash-002').with_structured_output(Summary, method='json_mode')

In [None]:
# Display the URL of the last collected article.
articles[-1].url

In [None]:
# Summarize one article: a single user message made of a text instruction plus
# a media part that points at the article URL with MIME type text/html.
# NOTE(review): this always uses articles[0]; the IDs returned by the planning
# call are not consulted here — confirm that is intended.
a = s.invoke([
        ("user", [{"type": "text", "text": "Analyze the article"}, 
              {"type": "media", "mime_type": "text/html", "file_uri": articles[0].url}])
])

In [None]:
# Display the `title` field of the summarizer result.
a.title

In [None]:
# Display the `links` field of the summarizer result.
a.links