In [33]:
import json
import os

import requests
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field

# Load settings from a local .env file (presumably where the API credentials
# used further down are kept -- confirm against your environment).
load_dotenv()

True

In [13]:
def fetch_text_from_url(url: str, timeout: float = 60.0) -> str:
    """Fetch a page through the ``r.jina.ai`` proxy and return the response body.

    The target ``url`` is appended to ``https://r.jina.ai/`` and requested with
    headers asking for a JSON response in markdown return format.

    Args:
        url: Address of the page to fetch.
        timeout: Client-side timeout in seconds handed to ``requests.get``.
            The default is longer than the ``X-Timeout`` header value
            (presumably the service-side limit -- confirm) so the service can
            answer first.

    Returns:
        The response body as text. Callers in this notebook decode it with
        ``json.loads``.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {
        "Accept": "application/json",
        "X-Return-Format": "markdown",
        "X-Timeout": "30",
    }

    # Credentials must not live in the notebook source. Read the key from the
    # environment (e.g. a JINA_API_KEY entry in the .env file) and only send the
    # Authorization header when a key is actually configured.
    api_key = os.environ.get("JINA_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    response = requests.get(
        f"https://r.jina.ai/{url}", headers=headers, timeout=timeout
    )
    # Fail loudly here rather than passing an error body on to the JSON parsing
    # step, where it would surface as a confusing KeyError/JSONDecodeError.
    response.raise_for_status()

    return response.text


In [None]:
# The question being explored and the pages that will be mined for concepts.
QUERY = "How to build a digital brain?"
DOCS = [
    "https://fortelabs.com/blog/basboverview/",
    "https://zettelkasten.de/overview/",
    "https://blog.alexanderfyoung.com/the-perfect-mindmap-4-step-framework/",
]

# Step 1: download the raw response text for every page.
fetched_docs = list(map(fetch_text_from_url, DOCS))
# Step 2: decode each response as JSON and keep only its ["data"]["content"] field.
parsed_docs = [json.loads(raw_body)["data"]["content"] for raw_body in fetched_docs]

In [38]:
# Shared OpenAI client used by get_document_concepts and get_document_summary.
# Constructed with no explicit arguments (presumably credentials come from the
# environment -- confirm).
llm = OpenAI()


# One high-level concept extracted from the document set; it is the element
# type of DocumentSetConcepts.concepts.
# Documentation is kept in `#` comments rather than a class docstring so the
# schema generated from this model is left untouched.
class Concept(BaseModel):
    # Short label for the concept.
    name: str
    description: str = Field(
        ..., description="A short one sentence description of the concept."
    )
    # Indices of the input documents in which the concept appears.
    exists_in_documents: list[int] = Field(
        ..., description="The indices of the documents in which this concept exists."
    )


# Top-level structured-output schema passed as `response_format` in
# get_document_concepts: a list of Concept entries.
class DocumentSetConcepts(BaseModel):
    # All concepts returned for the document set.
    concepts: list[Concept]


# System message used by get_document_concepts. It is sent verbatim, so any
# edit to this text changes what the model is asked to do.
SYSTEM_PROMPT = """Given a user QUERY and a set of DOCUMENTS - you must parse RELEVANT high level concepts from the documents,
which would be useful in answering and/or exploring the user's QUERY.
There should be overlapping concepts between the documents which help tie them together.
"""


def get_document_concepts(query, docs, model="gpt-4o-2024-08-06"):
    """Ask the LLM for the high-level concepts that tie ``docs`` together.

    Args:
        query: The user's question the concepts should help answer.
        docs: Iterable of document texts. Each one is labelled with its
            zero-based position in the prompt so that the
            ``exists_in_documents`` indices have something to refer to.
        model: Name of the chat model to call.

    Returns:
        The ``parsed`` attribute of the first choice's message, requested in
        the ``DocumentSetConcepts`` format.
        NOTE(review): this may be ``None`` if the model does not return a
        parsable answer -- confirm against the SDK documentation.
    """
    # Build the document section outside the f-string: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12, and plain
    # concatenation gives the model no way to tell where one document ends and
    # the next begins.
    labelled_docs = "\n\n".join(
        f"DOCUMENT {index}:\n{doc}" for index, doc in enumerate(docs)
    )
    user_message = f"The QUERY: {query}\n The DOCUMENTS:\n{labelled_docs}"

    completion = llm.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ],
        response_format=DocumentSetConcepts,
    )

    return completion.choices[0].message.parsed


# Structured-output schema passed as `response_format` in get_document_summary.
class DocumentSummary(BaseModel):
    # The generated summary text.
    summary: str


def get_document_summary(query, doc, model="gpt-4o-2024-08-06"):
    """Ask the LLM for a short, query-focused summary of a single document.

    Args:
        query: The user's question the summary should be conditioned on.
        doc: Text of the document to summarise.
        model: Name of the chat model to call.

    Returns:
        The ``parsed`` attribute of the first choice's message, requested in
        the ``DocumentSummary`` format.
        NOTE(review): this may be ``None`` if the model does not return a
        parsable answer -- confirm against the SDK documentation.
    """
    system_message = {
        "role": "system",
        "content": "Generate a short summary of the provided DOCUMENT which is conditioned on the user's QUERY.",
    }
    user_message = {
        "role": "user",
        "content": f"The QUERY:\n{query}\nThe DOCUMENT:\n{doc}",
    }

    completion = llm.beta.chat.completions.parse(
        model=model,
        messages=[system_message, user_message],
        response_format=DocumentSummary,
    )

    return completion.choices[0].message.parsed


In [39]:
# Extract the concepts shared across all fetched documents (one LLM call).
concepts = get_document_concepts(QUERY, parsed_docs)

In [41]:
# Summarise each document separately, conditioned on QUERY (one LLM call per document).
summaries = [get_document_summary(QUERY, d) for d in parsed_docs]

In [45]:
# Display the summary produced for the first document in DOCS.
summaries[0]

DocumentSummary(summary='To build a digital or "Second Brain," the method involves four key steps encapsulated in the CODE acronym: Capture, Organize, Distill, and Express. This process allows you to store and manage critical information outside of your biological brain, enhancing productivity and creativity.\n\n1. **Capture**: Collect significant ideas and information that resonate with you through tools like digital notebooks or apps, ensuring they are stored in a centralized digital location for easy access.\n\n2. **Organize**: Use the PARA Method (Projects, Areas, Resources, Archive) to categorize and manage information based on its relevance to current projects or future reference, minimizing information overload.\n\n3. **Distill**: Summarize notes down to their essential points to make them actionable and easy to retrieve in the future, using methods like progressive summarization.\n\n4. **Express**: Utilize the information to create new work or projects, leveraging the curated k