In [1]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

from permchain.connection_inmemory import InMemoryPubSubConnection
from permchain.pubsub import PubSub
from permchain.topic import Topic

## Content Fetcher

First, we are going to define our content fetcher. This is responsible for taking a search query and fetching the relevant web pages.

In [2]:
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

In [3]:
from duckduckgo_search import DDGS

# Shared DuckDuckGo search client; retrieve_documents() calls ddgs.text() on it.
ddgs = DDGS()

In [4]:
def retrieve_documents(query, max_urls=4):
    """Search the web for ``query`` and load the top result pages as documents.

    Args:
        query: Search string. Surrounding whitespace and double quotes are
            stripped before the search is issued.
        max_urls: Maximum number of result URLs to download. Defaults to 4.
            Values below 1 return an empty list without searching.

    Returns:
        A list holding the output of ``Html2TextTransformer.transform_documents``
        for the downloaded pages, or an empty list when no search result
        carried an ``"href"``.
    """
    if max_urls < 1:
        return []

    query = query.strip().strip('"')

    # Collect result links, skipping results without an "href", and stop
    # consuming search results as soon as enough links have been gathered.
    urls_to_look = []
    for res in ddgs.text(query):
        href = res.get("href")
        if not href:
            continue
        urls_to_look.append(href)
        if len(urls_to_look) >= max_urls:
            break

    if not urls_to_look:
        return []

    # Download the pages, then run them through the HTML-to-text transformer.
    docs = AsyncHtmlLoader(urls_to_look).load()
    return list(Html2TextTransformer().transform_documents(docs))

In [5]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop. NOTE(review): presumably needed by AsyncHtmlLoader.load()
# in retrieve_documents — confirm.
nest_asyncio.apply()

In [6]:
# docs = retrieve_documents("langchain")

## Summarizer
We will now define an actor that summarizes the search results for a given query.

In [7]:
# Summarizer prompt: the question and the fetched search results are passed
# to the model inside XML-style tags.
prompt = ChatPromptTemplate.from_template(
    "Answer the user's question given the search results\n\n"
    "<question>{question}</question>"
    "<search_results>{search_results}</search_results>"
)

In [8]:
# Primary model has client-side retries disabled (max_retries=0); the
# alternates below are registered as its fallbacks.
summarizer_llm = ChatOpenAI(max_retries=0)
summarizer_fallbacks = [
    ChatOpenAI(model="gpt-3.5-turbo-16k"),
    ChatAnthropic(model="claude-2"),
]

# prompt -> model (with fallbacks) -> plain string answer
summarizer_chain = (
    prompt
    | summarizer_llm.with_fallbacks(summarizer_fallbacks)
    | StrOutputParser()
)

## All together now!

In [9]:
# Topic that search_actor publishes to and summ_actor subscribes to.
summarizer_inbox = Topic("summarizer")

In [10]:
# For each input received on Topic.IN, build a dict holding the fetched pages
# ("search_results") and the value of Topic.IN.current() ("question" —
# presumably the original query; confirm against permchain), then publish
# that dict to the summarizer's inbox.
search_actor = (
    Topic.IN.subscribe()
    | {
        "search_results": retrieve_documents,
        "question": Topic.IN.current(),
    }
    | summarizer_inbox.publish()
)

In [11]:
# Runs summarizer_chain on each message from summarizer_inbox, places the
# result under the "answer" key, and publishes it to Topic.OUT.
summ_actor = (
    summarizer_inbox.subscribe() | {"answer": summarizer_chain} | Topic.OUT.publish()
)

In [12]:
# Wire the search and summarizer actors together over an in-memory
# connection; the resulting runnable is given the run name "WebResearcher".
web_researcher = PubSub(
    processes=(search_actor, summ_actor),
    connection=InMemoryPubSubConnection(),
).with_config(run_name="WebResearcher")

In [13]:
# Single query: the recorded output is a list with one {'answer': ...} dict.
web_researcher.invoke("What is langsmith?")

Fetching pages: 100%|###################################################| 4/4 [00:01<00:00,  3.58it/s]


[{'answer': 'LangSmith is a platform developed by LangChain that helps developers trace, debug, test, evaluate, and monitor their language model applications and intelligent agents. It aims to bridge the gap between prototyping and production by providing tools and features to improve the performance and reliability of LLM applications. LangSmith offers features such as full visibility into model inputs and outputs, the ability to modify prompts and re-run them in a playground, visualization of the sequence of events in complicated chains and agents, tracking of latency and token usage, collaborative debugging, dataset creation and management, testing and evaluation using datasets and evaluators, application monitoring, and exporting datasets for use in other contexts.'}]

In [14]:
# Several queries at once: the recorded output holds one result list per input.
web_researcher.batch(["what is langsmith", "what is llama"])

Fetching pages:   0%|                                                           | 0/4 [00:00<?, ?it/s]
Fetching pages: 100%|###################################################| 4/4 [00:00<00:00,  4.87it/s][A
Fetching pages: 100%|###################################################| 4/4 [00:01<00:00,  2.03it/s]


[[{'answer': 'LangSmith is a platform developed by LangChain that helps trace and evaluate language model applications and intelligent agents. It assists in moving from prototype to production by providing tools for debugging, testing, evaluating, and monitoring LLM applications. LangSmith offers features such as full visibility into model inputs and outputs, dataset creation for testing, integration with evaluation modules, and system-level performance tracking. It aims to simplify the process of developing LLM-powered applications and bridging the gap between prototypes and production.'}],
 [{'answer': 'According to the search results, a llama is a domesticated livestock species that is a descendant of the guanaco and is a member of the camel family. Llamas are primarily used as pack animals and a source of food, wool, hides, tallow for candles, and dried dung for fuel. They are found primarily in Bolivia, Peru, Colombia, Ecuador, Chile, and Argentina. Llamas are herbivores and graze

## Trying to use it as a sub component

In [15]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [16]:
# Prompt asking the model to break a question into search-engine sub-questions.
# Cell-specific names are used so this cell does not rebind the notebook-level
# `prompt` that `summarizer_chain` is built from (re-running that cell after
# this one would otherwise silently pick up the wrong prompt).
subquestion_template = """Write between 2 and 5 sub questions that serve as google search queries to search online that form an objective opinion from the following: {question}"""

# OpenAI function schema used to force structured output.
functions = [
    {
        "name": "sub_questions",
        "description": "List of sub questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "description": "List of sub questions to ask.",
                    "items": {"type": "string"},
                },
            },
            # The output parser below looks up the "questions" key, so the
            # schema must tell the model the key is mandatory.
            "required": ["questions"],
        },
    },
]

subquestion_prompt = ChatPromptTemplate.from_template(subquestion_template)

# prompt -> model forced to call `sub_questions` -> value of the "questions" key
question_chain = (
    subquestion_prompt
    | ChatOpenAI(temperature=0).bind(
        functions=functions, function_call={"name": "sub_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [17]:
# Quick check: the recorded output is a plain list of sub-question strings.
question_chain.invoke({"question": "what is langsmith?"})

['What is the purpose of Langsmith?',
 'Who developed Langsmith?',
 'What are the features of Langsmith?',
 'How does Langsmith work?',
 'Are there any alternatives to Langsmith?']

In [18]:
# Prompt for the final report. It takes the original question and the
# collected research. Cell-specific names are used so this cell does not
# rebind the notebook-level `template` / `prompt` that earlier chains were
# built from, which keeps those earlier cells safe to re-run.
report_template = """You are tasked with writing a research report to answer the following question:

<question>
{question}
</question>

In order to do that, you first came up with several sub questions and researched those. please find those below:

<research>
{research}
</research>

Now, write your final report answering the original question!"""
report_prompt = ChatPromptTemplate.from_template(report_template)

# prompt -> chat model -> plain string report
report_chain = report_prompt | ChatOpenAI() | StrOutputParser()

In [19]:
# Inbox for sub-questions waiting to be researched (read by research_actor).
research_inbox = Topic("researcher_inbox")
# Inbox where research results are collected for write_actor.
writer_inbox = Topic("writer_inbox")

In [20]:
# Actor 1: turns the incoming question into sub-questions.
subquestion_actor = (
    # Listens to the pipeline's inputs
    Topic.IN.subscribe()
    | question_chain
    # Each subquestion generated by question_chain is pushed to the researcher's inbox
    | research_inbox.publish_each()
)
# Actor 2: answers each sub-question with the web_researcher pipeline.
research_actor = (
    research_inbox.subscribe()
    | web_researcher
    # The results of each research task are published to writer inbox
    | writer_inbox.publish()
)
# Actor 3: combines all research results into the final report.
write_actor = (
    # The writer waits until all research results arrive
    writer_inbox.join()
    | {
        # The joined research results are passed through unchanged
        "research": RunnablePassthrough(),
        # The "question" entry of Topic.IN.current() — presumably the original
        # input dict; confirm against permchain
        "question": Topic.IN.current() | itemgetter("question"),
    }
    | report_chain
    | Topic.OUT.publish()
)

In [21]:
# Wire the three actors together over their own in-memory connection;
# research_actor embeds web_researcher as one of its pipeline steps.
longer_researcher = PubSub(
    processes=(subquestion_actor, research_actor, write_actor),
    connection=InMemoryPubSubConnection(),
).with_config(run_name="LongResearcher")

In [None]:
# The input is a dict with a "question" key: question_chain's prompt fills
# {question} from it and write_actor reads it via itemgetter("question").
longer_researcher.invoke({"question": "what is langsmith?"})

Fetching pages:   0%|                                                           | 0/4 [00:00<?, ?it/s]
Fetching pages:   0%|                                                           | 0/4 [00:00<?, ?it/s][A

Fetching pages:   0%|                                                           | 0/4 [00:00<?, ?it/s][A[A


Fetching pages:   0%|                                                           | 0/4 [00:00<?, ?it/s][A[A[A



Fetching pages: 100%|###################################################| 4/4 [00:01<00:00,  3.70it/s][A[A[A[A




Fetching pages:  75%|######################################2            | 3/4 [00:01<00:00,  2.66it/s][A[A[A[A
Fetching pages: 100%|###################################################| 4/4 [00:01<00:00,  3.11it/s][A



Fetching pages:  50%|#########################5                         | 2/4 [00:01<00:01,  1.68it/s][A[A[A

Fetching pages: 100%|###################################################| 4/4 [00:01<00:00,  3.03it/s][A[A