# Storm Research Assistant

Reference (LangGraph STORM example notebook):
https://github.com/langchain-ai/langgraph/blob/main/examples/storm/storm.ipynb


In [1]:
## Prereqs

# %pip install -U langchain_community langchain_openai langgraph wikipedia  scikit-learn  langchain_fireworks
# We use one or the other search engine below
# %pip install -U tavily-python
# %pip install -U duckduckgo-search
# ! apt-get install graphviz graphviz-dev
# %pip install pygraphviz



In [2]:
from storm import *
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# `fast_llm` is the model handed to every chain built in this notebook;
# `long_context_llm` is only stored on the interview configuration here.
fast_llm = ChatOpenAI(model="gpt-3.5-turbo")
long_context_llm = ChatOpenAI(model="gpt-3.5-turbo-0125")  # alternative: "gpt-4-turbo-preview"

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma vector store persisted under a local directory, embedding with the model above.
vectorstore_dir = "./data/storm/vectorstore/"
vectorstore = Chroma(
    persist_directory=vectorstore_dir,
    embedding_function=embeddings,
)

# Single configuration object read by the interview nodes and the graph.
interview_config = InterviewConfig(
    long_llm=long_context_llm,
    fast_llm=fast_llm,
    max_conversations=5,
    max_reference_length=10000,
    tags_to_extract=["p", "h1", "h2", "h3"],
    embeddings=embeddings,
    vectorstore=vectorstore,
    vectorstore_dir=vectorstore_dir,
    runnable_config=RunnableConfig(),
)

In [3]:
# Topic-driven chains: each is invoked with {"topic": "<subject>"}.
outline = get_chain_outline(interview_config.fast_llm)
expand_chain = get_chain_expand_related_topics(fast_llm)

# Persona generator: invoked further down with "examples" and "topic" keys.
gen_perspectives_chain = get_chain_perspective_generator(fast_llm)

# Dialogue-driven chains - Need messages as input - {"messages": []}
gen_queries_chain = get_chain_queries(fast_llm)
gen_answer_chain = get_chain_answer(fast_llm)

# Shared example input reused by the demo cells that follow.
example_topic = "Covid-19 impact in 2024"
t1 = dict(topic=example_topic)

In [4]:
# Draft an outline for the example topic; the bare name on the last line displays the result.
o1 = outline.invoke(t1)
o1

Outline(page_title='Covid-19 Impact in 2024', sections=[Section(section_title='Introduction', description='Overview of the global impact of Covid-19 in the year 2024.', subsections=None), Section(section_title='Health Impact', description='Details on the health implications and advancements in handling Covid-19 in 2024.', subsections=[Subsection(subsection_title='Vaccine Progress', description='Information on vaccine development and distribution progress in 2024.'), Subsection(subsection_title='New Variants', description='Discussion on any new variants of the virus that emerged in 2024 and their impact.')]), Section(section_title='Economic Impact', description='Analysis of the economic repercussions of the ongoing pandemic in 2024.', subsections=[Subsection(subsection_title='Global Markets', description='Effects of the pandemic on global economies and financial markets.'), Subsection(subsection_title='Job Market', description='Assessment of the job market and employment trends amidst C

In [5]:
# Ask the model for subjects related to the example topic; `r1.topics` is reused in the next cell.
r1 = expand_chain.invoke(t1)
r1

RelatedSubjects(topics=['COVID-19 pandemic', 'Impact of COVID-19 on economy', 'Global health crisis', 'Public health response to pandemics', 'Future predictions for COVID-19', 'Social implications of COVID-19', 'Technological innovations in response to pandemics', 'Vaccine development and distribution'])

In [6]:
# Generate editor personas for the example topic, passing the related subjects as the "examples" input.
# `p1.editors` is reused by the interview run later in the notebook.
p1 = gen_perspectives_chain.invoke({"examples": r1.topics, "topic": example_topic})
p1

Perspectives(editors=[Editor(affiliation='World Health Organization (WHO)', name='Dr. Maria Silva', role='Public Health Expert', description='Dr. Silva will focus on the global health crisis aspect of the topic, including public health responses to pandemics and the current status of the COVID-19 pandemic.'), Editor(affiliation='International Monetary Fund (IMF)', name='John Smith', role='Economic Analyst', description='John will provide insights into the impact of COVID-19 on the economy, discussing economic trends, forecasts, and recovery strategies post-pandemic.'), Editor(affiliation='Center for Disease Control and Prevention (CDC)', name='Dr. Sarah Johnson', role='Epidemiologist', description='Dr. Johnson will focus on future predictions for COVID-19, analyzing trends, mutations, and potential scenarios for the evolution of the virus.'), Editor(affiliation='Social Science Research Institute', name='Dr. Emily Chen', role='Social Scientist', description='Dr. Chen will explore the so

In [7]:
# Smoke-test the question generator chain on its own.
c = get_chain_question_generator(fast_llm)
# NOTE(review): the persona is an empty string here, so the chain receives no editor description — confirm this is intended.
t2 = {"persona": ""}

q1 = c.invoke(t2)
q1

AIMessage(content='What are some common misconceptions about the topic that are important to address in the Wikipedia page?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 138, 'total_tokens': 157}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39ffa8', 'finish_reason': 'stop', 'logprobs': None})

In [8]:
# Wrap the generated question as a HumanMessage (with a hard-coded speaker name) and
# ask the query chain to turn the dialogue into search-engine queries.
q_in = {"messages": [HumanMessage(content=q1.content, name="JohnSmith")]}
a1 = await gen_queries_chain.ainvoke(q_in)
a1

Queries(queries=['Common misconceptions about the topic', 'Important points to address in the Wikipedia page'])

In [9]:
# Nodes

In [10]:
# Question node 
@as_runnable
async def node_generate_question(state: InterviewState) -> dict[str, Any]:
    """
    Generates the editor's next interview question and appends it to the dialogue.

    The editor's name is normalized with ``cleanup_name`` and written back onto
    the editor, the question-generator chain is invoked with the editor's
    persona, and the model's reply is re-wrapped as a ``HumanMessage`` before
    being appended to ``state.messages``.

    Args:
        state (InterviewState): The interview state. It is mutated in place:
            ``state.editor.name`` is overwritten with the normalized name and
            one message is appended to ``state.messages``.

    Returns:
        dict[str, Any]: The updated interview state, as produced by ``state.as_dict()``.
    """
    editor: Editor = state.editor
    fast_llm = state.interview_config.fast_llm

    # Normalize name (written back so later reads of the editor see the cleaned form)
    name = cleanup_name(editor.name)
    editor.name = name

    logger.info(f'Generating question for {name}')
    question_chain = get_chain_question_generator(fast_llm)
    # NOTE(review): only the persona is passed to the chain; the dialogue so far
    # (state.messages) is not — confirm the chain is meant to work without it.
    chain_input = {"persona": editor.persona}

    ai_response = await question_chain.ainvoke(chain_input)

    # Convert AI response to HumanMessage to simulate human conversation
    tag_with_name(ai_response, name)
    message = HumanMessage(**ai_response.dict(exclude={"type"}))

    state.messages.append(message)

    logger.info(f'Generated question for {name}: {message.content}')
    return state.as_dict()


# Smoke test: run the question node once against a hand-built state.
# This `state` object is reused by the answer-node test further down, and both nodes
# append to `state.messages` in place.

state = InterviewState(
    interview_config=interview_config,
    editor=Editor(affiliation="Example University", name="John Doe", role="Lead Editor", description="Experienced in the field of biology."),
    messages=[],
    references={},
    summary=""
)

await node_generate_question.ainvoke(state)

2024-04-02 17:56:39,182 [MainThread  ] [INFO ]  Generating question for JohnDoe


2024-04-02 17:56:39,847 [MainThread  ] [INFO ]  Generated question for JohnDoe: What are some recent advancements in the field of biology that have the potential to significantly impact our understanding of genetics and evolutionary biology?


{'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278c33d0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1278f1f90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1278f37f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', e

In [11]:
# Answer node

@as_runnable
async def node_generate_answer(state: InterviewState) -> dict[str, Any]:
    """
    Generates an answer for the editor's question in the interview.

    The dialogue so far is turned into search-engine queries, the queries are
    run through ``search_engine``, and the serialized results (capped at
    ``interview_config.max_reference_length`` characters) are handed to the
    answer chain. Search results whose URL the answer cites are merged into
    ``state.references``, and the answer is appended to ``state.messages``.

    Args:
        state (InterviewState): The interview state. It is mutated in place:
            three messages are appended (the queries, the search results and
            the final answer) and ``state.references`` is replaced.

    Returns:
        dict[str, Any]: The updated interview state, as produced by ``state.as_dict()``.
    """

    editor: Editor = state.editor
    config = state.interview_config
    fast_llm = config.fast_llm

    # Chain definitions
    gen_answer_chain = get_chain_answer(fast_llm)
    queries_chain = get_chain_queries(fast_llm)

    # Normalize name (used for logging and tagging only; the editor is not modified here)
    name = cleanup_name(editor.name)

    logger.info(f'START - Generate answers for [{name}]')

    # Generate search engine queries from the dialogue so far
    q_in = {"messages": state.messages}
    queries: Queries = await queries_chain.ainvoke(q_in)
    logger.info(f"Got {len(queries.queries)} search engine queries for [{name}] -\n\t {queries.queries}")

    # Run search engine on all generated queries using tool.
    # return_exceptions=True: a failed query comes back as an exception object instead of raising.
    query_results = await search_engine.abatch(queries.queries, config.runnable_config, return_exceptions=True)
    successful_results = [res for res in query_results if not isinstance(res, Exception)]
    # Flatten to url -> content; a URL returned by several queries keeps the last content seen.
    all_query_results = {res["url"]: res["content"] for results in successful_results for res in results}

    # Dump search engine results to string and truncate to max reference length.
    # The cap applies only to the text shown to the model; `all_query_results`
    # stays complete so cited references can still be looked up below.
    dumped_successful_results = json.dumps(all_query_results)
    max_reference_length = config.max_reference_length
    if max_reference_length is not None and int(max_reference_length) > 0:
        dumped_successful_results = dumped_successful_results[:int(max_reference_length)]

    logger.info(f"Got {len(successful_results)} search engine results for [{name}] - \n\t {all_query_results}")
    logger.info(f"Dumped {len(dumped_successful_results)} characters for [{name}] - \n\t {dumped_successful_results}")

    # Append the generated queries and the search engine results to the dialogue
    ai_message_for_queries: AIMessage = get_ai_message(json.dumps(queries.as_dict()))
    tool_results_message = generate_human_message(dumped_successful_results)

    logger.debug(f"QUERY_AI_MSG: {ai_message_for_queries} for [{name}]")
    logger.debug(f"RESULTS_H_MSG: {tool_results_message} for [{name}]")
    # NOTE: these two intermediate messages go into state.messages itself, so they
    # remain in the dialogue history returned by this node, ahead of the final answer.
    state.messages.append(ai_message_for_queries)
    state.messages.append(tool_results_message)

    # Best effort: a failure in the answer chain is logged and replaced by an empty answer
    # so the interview can continue.
    try:
        generated: AnswerWithCitations = await gen_answer_chain.ainvoke(state)
        logger.info(f"Genreted final answer {generated} for [{name}] - \n\t {generated.as_str}")

    except Exception as e:
        logger.error(f"Error generating answer for [{name}] - {e}")
        generated = AnswerWithCitations(answer="", cited_urls=[])

    cited_urls = set(generated.cited_urls)

    # Update references with cited references - save the retrieved information to the shared state for future reference
    cited_references = {k: v for k, v in all_query_results.items() if k in cited_urls}
    state.references = update_references(state.references, cited_references)

    # Add the final answer to the shared state, tagged with the editor's normalized name
    formatted_message = AIMessage(name=name, content=generated.as_str)
    state.messages.append(formatted_message)

    logger.info(f'END - generate answer for [{name}]')
    return state.as_dict()

# NOTE: `state` is passed as-is (it is not cloned), so the messages the answer node appends persist on it.
ax1 = await node_generate_answer.ainvoke(state)
ax1

2024-04-02 17:56:39,880 [MainThread  ] [INFO ]  START - Generate answers for [JohnDoe]
2024-04-02 17:56:40,698 [MainThread  ] [INFO ]  Got 2 search engine queries for [JohnDoe] -
	 ['Recent advancements in biology impacting genetics', 'Recent advancements in biology impacting evolutionary biology']


Searching DuckDuckGo for [Recent advancements in biology impacting genetics]
Got search engine results: 5 for [Recent advancements in biology impacting genetics]
Searching DuckDuckGo for [Recent advancements in biology impacting evolutionary biology]


2024-04-02 17:56:42,613 [MainThread  ] [INFO ]  Got 2 search engine results for [JohnDoe] - 
	 {'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10376292/': 'Next-generation sequencing (NGS) is a powerful tool used in genomics research. NGS can sequence millions of DNA fragments at once, providing detailed information about the structure of genomes, genetic variations, gene activity, and changes in gene behavior. Recent advancements have focused on faster and more accurate sequencing, reduced costs ...', 'https://www.quantamagazine.org/the-biggest-discoveries-in-biology-in-2023-20231219/': 'The Year in Biology. In a year packed with fascinating discoveries, biologists pushed the limits of synthetic life, probed how organisms keep time, and refined theories about consciousness and emotional health. Video: During 2023, Quanta turned a spotlight on important research progress into the nature of consciousness, the origins of our ...', 'https://www.nature.com/subjects/genetics': 'RSS Feed. Gen

Got search engine results: 5 for [Recent advancements in biology impacting evolutionary biology]
Updating references: 
	{} 
with new references: 
	{}


{'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278c33d0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1278f1f90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1278f37f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', e

In [24]:

def route_messages(state_dict: dict):
    """
    Decides whether the interview continues after an answer has been produced.

    Args:
        state_dict (dict): The interview state in dictionary form, as accepted
            by ``InterviewState.from_dict``.

    Returns:
        ``END`` when the editor has already received ``max_conversations``
        answers, or when the editor's most recent question ends with the
        sign-off phrase; otherwise the node name ``"ask_question"``.
    """

    print(f'Routing messages: {state_dict}')

    state = InterviewState.from_dict(state_dict)

    editor = state.editor
    config = state.interview_config
    name = cleanup_name(editor.name)

    print(f'Routing messages for [{name}]')

    messages = state.messages
    # Answers are the AI messages tagged with the editor's normalized name.
    num_responses = sum(
        1 for m in messages if isinstance(m, AIMessage) and m.name == name
    )

    if num_responses >= config.max_conversations:
        return END

    # The answer node appends its search queries and the raw search results before the
    # final answer, so the question does not sit at a fixed offset from the end of the
    # list. Look for the most recent human message carrying the editor's name instead.
    # (assumes tag_with_name stores the name on the message's `name` field — TODO confirm)
    last_question = next(
        (m for m in reversed(messages) if isinstance(m, HumanMessage) and m.name == name),
        None,
    )
    if last_question is None and len(messages) >= 2:
        # No named question found: fall back to the positional lookup.
        last_question = messages[-2]

    last_question_content = ""
    if last_question is not None and last_question.content:
        last_question_content = str(last_question.content)
    if last_question_content.endswith("Thank you so much for your help!"):
        return END

    print(f'Continue asking question for [{name}] as this is not the last end of the conversation')
    return "ask_question"

In [25]:
class StormInterviewGraph1:
    """
    Interview loop built from the question and answer nodes.

    The compiled graph starts at "ask_question", always proceeds to
    "answer_question", and then lets ``route_messages`` decide whether to ask
    another question or stop.
    """

    def __init__(self, interview_config: InterviewConfig):
        """Stores the configuration, creates an empty ``Interviews`` container and compiles the graph."""
        self.interview_config = interview_config
        self.interviews = Interviews(interview_config=interview_config, perspectives=None, conversations={})
        self.graph = self.build_graph()

    def build_graph(self):
        """Builds and compiles the two-node interview graph over ``InterviewState``."""
        builder = StateGraph(InterviewState)

        builder.add_node("ask_question", node_generate_question)
        builder.add_node("answer_question", node_generate_answer)
        builder.add_conditional_edges("answer_question", route_messages)
        builder.add_edge("ask_question", "answer_question")

        builder.set_entry_point("ask_question")
        return builder.compile().with_config(run_name="Conduct Interviews")

    def initialize_conversations(self):
        """
        Creates one empty ``InterviewState`` per editor in ``self.interviews.perspectives``.

        Does nothing when no perspectives (or no editors) have been set.
        """
        interviews = self.interviews
        if interviews is None or interviews.perspectives is None:
            return
        if len(interviews.perspectives.editors) == 0:
            return

        for editor in interviews.perspectives.editors:
            convo = InterviewState(interview_config=self.interview_config, editor=editor, messages=[], references={})
            # NOTE(review): editors are used as dict keys here while the question node
            # rewrites `editor.name`; if Editor hashes by field values, later lookups
            # by editor could miss — confirm.
            interviews.conversations[editor] = convo

    async def run_single_interview(self, state: InterviewState):
        """Runs the graph to completion for one interview state and returns the graph's result."""
        return await self.graph.ainvoke(state)

    async def run(self):
        """
        Runs the graph for every initialized conversation in one batch.

        With ``return_exceptions=True`` a failed interview is returned as an
        exception object in the result list instead of being raised.
        """
        initial_states = list(self.interviews.conversations.values())
        return await self.graph.abatch(initial_states, return_exceptions=True)

    async def stream_and_return_results(self, state):
        """
        Streams the graph step by step, printing a short trace of each step.

        Returns:
            The first value of the step keyed by ``END`` when the stream
            produces one; otherwise the first value of the last step seen.
            ``None`` if the stream yields no steps at all.
        """
        final_step = None
        last_step = None
        async for step in self.graph.astream(state):
            name = next(iter(step))
            print(name)
            print(f"Processing step: {name}")
            print("-- ", str(step[name]["messages"])[:300])
            last_step = step
            if END in step:
                final_step = step

        # If no step was keyed by END, fall back to the last step that was streamed
        # rather than failing on an unassigned local.
        if final_step is None:
            final_step = last_step
        if final_step is None:
            return None

        final_state = next(iter(final_step.values()))
        return final_state

In [26]:
@as_runnable
async def survey_subjects(topic: str) -> Perspectives:
    """
    Builds editor perspectives for a topic from related Wikipedia material.

    Related subjects are generated for the topic, each subject is looked up
    through ``wikipedia_retriever``, and the formatted documents are passed to
    the perspective generator as its "examples" input.

    Args:
        topic (str): The subject to survey.

    Returns:
        Perspectives: The result of the perspective-generator chain.
    """
    print(f"Survey Subjects for Topic: {topic}")
    subjects = await expand_chain.ainvoke({"topic": topic})
    batches = await wikipedia_retriever.abatch(subjects.topics, return_exceptions=True)

    # Failed lookups come back as exception objects (return_exceptions=True);
    # skip those and flatten the remaining per-subject lists into one.
    all_docs = [
        doc
        for batch in batches
        if not isinstance(batch, BaseException)
        for doc in batch
    ]
    print(f"Retrieved {len(all_docs)} docs for Topic: {topic}")

    generator_input = {"examples": format_docs(all_docs), "topic": topic}
    return await gen_perspectives_chain.ainvoke(generator_input)

In [27]:

# Build the interview graph around the shared configuration.
interview_graph = StormInterviewGraph1(interview_config=interview_config)

# Generate perspectives for the topic to be interviewed about
perspectives = await survey_subjects.ainvoke("Machine Learning")

# Attach the perspectives, create one empty conversation per editor, and log the editors used as keys
interview_graph.interviews.perspectives = perspectives
interview_graph.initialize_conversations()
logger.info(interview_graph.interviews.conversations.keys())



Survey Subjects for Topic: Machine Learning


Retrieved 15 docs for Topic: Machine Learning


2024-04-02 18:02:04,958 [MainThread  ] [INFO ]  dict_keys([Editor(affiliation='Academic Research Institute', name='Alice', role='Artificial Intelligence Researcher', description='Specializes in advanced AI algorithms and their applications in real-world scenarios.'), Editor(affiliation='Tech Company', name='Bob', role='Machine Learning Engineer', description='Focuses on implementing deep learning models for computer vision tasks.'), Editor(affiliation='Healthcare Organization', name='Charlie', role='Data Scientist', description='Utilizes data science techniques to extract insights from medical datasets.'), Editor(affiliation='Neural Network Startup', name='David', role='Neural Network Architect', description='Designs custom neural network architectures for specific business needs.'), Editor(affiliation='Online Education Platform', name='Eve', role='AI Ethics Researcher', description='Investigates ethical implications of AI technology and advocates for responsible AI development.')])


In [40]:
# Run interview

final_step = None
# await interview_graph.stream_and_return_results(
#     {"editor": perspectives.editors[0], "messages": []}
# )

initial_state : InterviewState = InterviewState(interview_config=interview_config, editor=p1.editors[0], messages=[], references={})
# {
#     "editor": p1.editors[0],
#     "messages": [
#         AIMessage(
#             content=f"So you said you were writing an article on {example_topic}?",
#             name="SubjectMatterExpert",
#         )
#     ],
# }
async for step in interview_graph.graph.astream(initial_state.as_dict()):
    name = next(iter(step))
    print(name)
    print(f"Processing step: {name}")
    print("-- ", str(step[name]["messages"])[:300])
    if END in step:
        final_step = step
        
final_state = next(iter(final_step.values()))

2024-04-02 18:12:42,090 [MainThread  ] [INFO ]  Generating question for DrMariaSilva


2024-04-02 18:12:43,139 [MainThread  ] [INFO ]  Generated question for DrMariaSilva: What specific public health measures have been most effective in combating the spread of COVID-19 globally?
2024-04-02 18:12:43,157 [MainThread  ] [INFO ]  START - Generate answers for [DrMariaSilva]


ask_question
Processing step: ask_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39


2024-04-02 18:12:44,691 [MainThread  ] [INFO ]  Got 5 search engine queries for [DrMariaSilva] -
	 ['Specific public health measures effective in combating COVID-19 spread globally', 'Impact of lockdowns on COVID-19 transmission rates worldwide', 'Effectiveness of mask-wearing in reducing COVID-19 cases', 'Role of social distancing in controlling the spread of COVID-19', 'Importance of widespread testing and contact tracing in managing COVID-19 outbreaks']


Searching DuckDuckGo for [Specific public health measures effective in combating COVID-19 spread globally]
Got search engine results: 5 for [Specific public health measures effective in combating COVID-19 spread globally]
Searching DuckDuckGo for [Impact of lockdowns on COVID-19 transmission rates worldwide]
Got search engine results: 5 for [Impact of lockdowns on COVID-19 transmission rates worldwide]
Searching DuckDuckGo for [Effectiveness of mask-wearing in reducing COVID-19 cases]
Got search engine results: 5 for [Effectiveness of mask-wearing in reducing COVID-19 cases]
Searching DuckDuckGo for [Role of social distancing in controlling the spread of COVID-19]
Got search engine results: 5 for [Role of social distancing in controlling the spread of COVID-19]
Searching DuckDuckGo for [Importance of widespread testing and contact tracing in managing COVID-19 outbreaks]


2024-04-02 18:12:52,950 [MainThread  ] [INFO ]  Got 5 search engine results for [DrMariaSilva] - 
	 {'https://www.nature.com/articles/s41467-023-40940-4': 'In the early stage of epidemics, it is critical to implement precise and effective public-health measures to control the spread and contain community-level transmission in a timely manner, with ...', 'https://www.nature.com/articles/s41598-023-31709-2': 'Effects of COVID-19 and policy responses on mobility. Figure 1 gives a global long-term temporal view of the spread of the virus (blue) and the associated mortality rate (red) as well as the ...', 'https://www.ecdc.europa.eu/en/infectious-disease-topics/z-disease-list/covid-19/facts/public-health-control-measures-covid-19': 'Public health authorities can take several measures to mitigate the negative effects of COVID-19 at individual and community or population level. Key public health measures and their main aims are: Vaccination to reduce the risk of severe COVID-19 disease. Surve

Got search engine results: 5 for [Importance of widespread testing and contact tracing in managing COVID-19 outbreaks]
Updating references: 
	{} 
with new references: 
	{}
answer_question
Processing step: answer_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39
Routing messages: {'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.complet

2024-04-02 18:12:53,462 [MainThread  ] [INFO ]  Generated question for DrMariaSilva: What are some key public health responses that have been implemented globally to address the COVID-19 pandemic?
2024-04-02 18:12:53,484 [MainThread  ] [INFO ]  START - Generate answers for [DrMariaSilva]


ask_question
Processing step: ask_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39


2024-04-02 18:12:54,936 [MainThread  ] [INFO ]  Got 5 search engine queries for [DrMariaSilva] -
	 ['Key public health responses to address the COVID-19 pandemic globally', 'Global implementation of public health measures to combat COVID-19', 'Role of contact tracing in controlling the spread of COVID-19 worldwide', 'Effectiveness of digital contact tracing in managing COVID-19 outbreaks', 'Global strategies for timely reporting and case investigation of COVID-19 cases']


Searching DuckDuckGo for [Key public health responses to address the COVID-19 pandemic globally]
Got search engine results: 5 for [Key public health responses to address the COVID-19 pandemic globally]
Searching DuckDuckGo for [Global implementation of public health measures to combat COVID-19]
Got search engine results: 5 for [Global implementation of public health measures to combat COVID-19]
Searching DuckDuckGo for [Role of contact tracing in controlling the spread of COVID-19 worldwide]
Got search engine results: 5 for [Role of contact tracing in controlling the spread of COVID-19 worldwide]
Searching DuckDuckGo for [Effectiveness of digital contact tracing in managing COVID-19 outbreaks]
Got search engine results: 5 for [Effectiveness of digital contact tracing in managing COVID-19 outbreaks]
Searching DuckDuckGo for [Global strategies for timely reporting and case investigation of COVID-19 cases]


2024-04-02 18:13:02,530 [MainThread  ] [INFO ]  Got 5 search engine results for [DrMariaSilva] - 
	 {'https://www.who.int/publications-detail-redirect/9789240074644': "In December 2022, the World Health Organization was responding to 53 health emergencies, including 13 grade 3, reaching millions of people. The annual report on WHO's response to health emergencies in 2022 outlines the increasingly critical role of WHO at global, regional and country levels, and across the key elements of effective emergency ...", 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10716458/': '1. Introduction. On January 30, 2020, the World Health Organization (WHO) declared that the SARS CoV-2 outbreak, which causes the COVID-19 disease, was a global health emergency, and on March 11, 2020, it was declared a pandemic ().As of July 2023, there were over 767 million cases reported globally, with 35.9% in Europe (EURO), 26.6% in the Western Pacific (WPRO), 25.2% in the Americas (PAHO ...', 'https://www.nature.c

Got search engine results: 5 for [Global strategies for timely reporting and case investigation of COVID-19 cases]
Updating references: 
	{} 
with new references: 
	{}
answer_question
Processing step: answer_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39
Routing messages: {'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completions

2024-04-02 18:13:03,223 [MainThread  ] [INFO ]  Generated question for DrMariaSilva: What are some key public health measures that have been effective in controlling the spread of pandemics, particularly during the current COVID-19 pandemic?
2024-04-02 18:13:03,248 [MainThread  ] [INFO ]  START - Generate answers for [DrMariaSilva]


ask_question
Processing step: ask_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39


2024-04-02 18:13:05,085 [MainThread  ] [INFO ]  Got 5 search engine queries for [DrMariaSilva] -
	 ['Key public health measures effective in controlling pandemic spread', 'Successful public health strategies during the COVID-19 pandemic', 'Role of vaccination in pandemic control and prevention', 'Importance of surveillance and contact tracing in managing pandemics', 'Global efforts in implementing non-pharmaceutical interventions during pandemics']


Searching DuckDuckGo for [Key public health measures effective in controlling pandemic spread]
Got search engine results: 5 for [Key public health measures effective in controlling pandemic spread]
Searching DuckDuckGo for [Successful public health strategies during the COVID-19 pandemic]
Got search engine results: 5 for [Successful public health strategies during the COVID-19 pandemic]
Searching DuckDuckGo for [Role of vaccination in pandemic control and prevention]
Got search engine results: 5 for [Role of vaccination in pandemic control and prevention]
Searching DuckDuckGo for [Importance of surveillance and contact tracing in managing pandemics]
Got search engine results: 5 for [Importance of surveillance and contact tracing in managing pandemics]
Searching DuckDuckGo for [Global efforts in implementing non-pharmaceutical interventions during pandemics]


2024-04-02 18:13:13,440 [MainThread  ] [INFO ]  Got 5 search engine results for [DrMariaSilva] - 
	 {'https://www.nature.com/articles/s41467-023-40940-4': 'In the early stage of epidemics, it is critical to implement precise and effective public-health measures to control the spread and contain community-level transmission in a timely manner, with ...', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10469790/': 'The COVID-19 pandemic public health response: a case study in constant change ... Stemming from changing contexts, evidence, and perceptions, the effectiveness of community public health measures are dynamic. Acknowledging dynamic impact implies that ongoing adaptation of public health policy recommendations as new data emerge and resource ...', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10530798/': 'The COVID-19 pandemic highlighted the scale of global unpreparedness to deal with the fast-arising needs of global health threats. This problem was coupled with a crisis of gover

Got search engine results: 5 for [Global efforts in implementing non-pharmaceutical interventions during pandemics]
Updating references: 
	{} 
with new references: 
	{}
answer_question
Processing step: answer_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39
Routing messages: {'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completion

2024-04-02 18:13:14,216 [MainThread  ] [INFO ]  Generated question for DrMariaSilva: What are some key public health strategies that have been effective in controlling the spread of pandemics like COVID-19?
2024-04-02 18:13:14,236 [MainThread  ] [INFO ]  START - Generate answers for [DrMariaSilva]


ask_question
Processing step: ask_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39


2024-04-02 18:13:15,617 [MainThread  ] [INFO ]  Got 5 search engine queries for [DrMariaSilva] -
	 ['Key public health strategies effective in controlling pandemic spread like COVID-19', 'Successful public health measures during COVID-19 pandemic', 'Role of vaccination in pandemic control and prevention', 'Importance of surveillance and contact tracing in managing pandemics', 'Global efforts in implementing non-pharmaceutical interventions during pandemics']


Searching DuckDuckGo for [Key public health strategies effective in controlling pandemic spread like COVID-19]
Got search engine results: 5 for [Key public health strategies effective in controlling pandemic spread like COVID-19]
Searching DuckDuckGo for [Successful public health measures during COVID-19 pandemic]
Got search engine results: 5 for [Successful public health measures during COVID-19 pandemic]
Searching DuckDuckGo for [Role of vaccination in pandemic control and prevention]
Got search engine results: 5 for [Role of vaccination in pandemic control and prevention]
Searching DuckDuckGo for [Importance of surveillance and contact tracing in managing pandemics]
Got search engine results: 5 for [Importance of surveillance and contact tracing in managing pandemics]
Searching DuckDuckGo for [Global efforts in implementing non-pharmaceutical interventions during pandemics]


2024-04-02 18:13:23,627 [MainThread  ] [INFO ]  Got 5 search engine results for [DrMariaSilva] - 
	 {'https://www.nature.com/articles/s41541-023-00773-0': 'The COVID-19 pandemic was met with rapid, unprecedented global collaboration and action. Even still, the public health, societal, and economic impact may be felt for years to come. The risk of ...', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10530798/': '1. Introduction. The COVID-19 pandemic highlighted gaps in both national healthcare systems and wider preparedness policies. The post-pandemic Global Health Security (GHS) index report confirmed that no country was fully prepared to tackle an emerging public health emergency threat [].The difference between the GHS index ranking and the real-world performance of countries, as determined via ...', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10472824/': 'Contact tracing is a cornerstone in public health practice, providing an effective response to infectious disease outbreaks. Be

Got search engine results: 5 for [Global efforts in implementing non-pharmaceutical interventions during pandemics]
Updating references: 
	{} 
with new references: 
	{}
answer_question
Processing step: answer_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39
Routing messages: {'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completion

2024-04-02 18:13:24,647 [MainThread  ] [INFO ]  Generated question for DrMariaSilva: What are some key public health strategies that have been effective in addressing the COVID-19 pandemic globally?
2024-04-02 18:13:24,677 [MainThread  ] [INFO ]  START - Generate answers for [DrMariaSilva]


ask_question
Processing step: ask_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39


2024-04-02 18:13:27,191 [MainThread  ] [INFO ]  Got 5 search engine queries for [DrMariaSilva] -
	 ['Key public health strategies effective in addressing the COVID-19 pandemic globally', 'Successful public health measures during the COVID-19 pandemic', 'Role of vaccination in pandemic control and prevention', 'Importance of surveillance and contact tracing in managing COVID-19', 'Global efforts in implementing non-pharmaceutical interventions during the COVID-19 pandemic']


Searching DuckDuckGo for [Key public health strategies effective in addressing the COVID-19 pandemic globally]
Got search engine results: 5 for [Key public health strategies effective in addressing the COVID-19 pandemic globally]
Searching DuckDuckGo for [Successful public health measures during the COVID-19 pandemic]
Got search engine results: 5 for [Successful public health measures during the COVID-19 pandemic]
Searching DuckDuckGo for [Role of vaccination in pandemic control and prevention]
Got search engine results: 5 for [Role of vaccination in pandemic control and prevention]
Searching DuckDuckGo for [Importance of surveillance and contact tracing in managing COVID-19]
Got search engine results: 5 for [Importance of surveillance and contact tracing in managing COVID-19]
Searching DuckDuckGo for [Global efforts in implementing non-pharmaceutical interventions during the COVID-19 pandemic]


2024-04-02 18:13:35,337 [MainThread  ] [INFO ]  Got 5 search engine results for [DrMariaSilva] - 
	 {'https://www.nature.com/articles/s41392-023-01724-w': 'The spread of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) has resulted in significant casualties and put immense strain on public health systems worldwide, leading to economic ...', 'https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(23)00485-1/fulltext': 'The need for safe, high-quality care and resilient health systems in all countries, with patient safety as an essential component, is already highlighted in the resolution WHA72.6 on global action on patient safety adopted during the World Health Assembly in 2019. The COVID-19 pandemic has shown the urgency of preventing harm to patients and ...', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10530798/': 'The COVID-19 pandemic highlighted the scale of global unpreparedness to deal with the fast-arising needs of global health threats. This problem was 

Got search engine results: 5 for [Global efforts in implementing non-pharmaceutical interventions during the COVID-19 pandemic]
Updating references: 
	{} 
with new references: 
	{}
answer_question
Processing step: answer_question
--  [HumanMessage(content='What specific public health measures have been most effective in combating the spread of COVID-19 globally?', response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 195, 'total_tokens': 214}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39
Routing messages: {'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.cha

In [41]:

                    

state2 = InterviewState.from_dict(final_state)
final_state

{'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278f0910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278f1d80>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1278c09d0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1278c33d0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1278f1f90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1278f37f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', e

In [42]:
state2.trim_messages(max_characters=1000)

Truncated message 2/20 to 1000 characters for msgName:DrMariaSilva
Truncated message 6/20 to 1000 characters for msgName:DrMariaSilva
Truncated message 10/20 to 1000 characters for msgName:DrMariaSilva
Truncated message 14/20 to 1000 characters for msgName:DrMariaSilva
Truncated message 18/20 to 1000 characters for msgName:DrMariaSilva


In [None]:

## Generate Initial Outline

from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List, Optional
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Prompt for the first-pass outline: a fixed system instruction, then a user
# turn that carries the topic followed by the output-format instructions.
direct_gen_outline_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a Wikipedia writer. Write an outline for a Wikipedia page about a user-provided topic. Be comprehensive and specific.",
        ),
        # Both placeholders must be supplied (or bound via .partial) before invoking.
        ("user", "{topic}\n{format_instructions}"),
    ]
)


class Subsection(BaseModel):
    # One subsection of an outline: a title and a description of its content.
    subsection_title: str = Field(..., title="Title of the subsection")
    description: str = Field(..., title="Content of the subsection")

    @property
    def as_str(self) -> str:
        """Render as a level-3 Markdown heading followed by the description."""
        heading = f"### {self.subsection_title}"
        rendered = f"{heading}\n\n{self.description}"
        return rendered.strip()


class Section(BaseModel):
    # One outline section: title, description, and optional subsections.
    section_title: str = Field(..., title="Title of the section")
    description: str = Field(..., title="Content of the section")
    subsections: Optional[List[Subsection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )

    @property
    def as_str(self) -> str:
        """Render as a level-2 Markdown heading, the description, then each subsection."""
        rendered_subsections = []
        # A missing subsection list is treated the same as an empty one.
        for sub in self.subsections or []:
            rendered_subsections.append(
                f"### {sub.subsection_title}\n\n{sub.description}"
            )
        body = "\n\n".join(rendered_subsections)
        return f"## {self.section_title}\n\n{self.description}\n\n{body}".strip()


class Outline(BaseModel):
    # Whole-page outline: the page title plus its ordered sections.
    page_title: str = Field(..., title="Title of the Wikipedia page")
    sections: List[Section] = Field(
        default_factory=list,
        title="Titles and descriptions for each section of the Wikipedia page.",
    )

    @property
    def as_str(self) -> str:
        """Render as a level-1 Markdown title followed by every section's own rendering."""
        rendered_sections = [section.as_str for section in self.sections]
        body = "\n\n".join(rendered_sections)
        return f"# {self.page_title}\n\n{body}".strip()


# Parser that turns the model's reply into an `Outline` instance.
outline_parser = PydanticOutputParser(pydantic_object=Outline)

# Chain: prompt (format instructions pre-bound) -> fast_llm -> Outline. Input: {"topic": ...}.
generate_outline_direct = direct_gen_outline_prompt.partial(format_instructions=outline_parser.get_format_instructions()) | fast_llm | outline_parser


In [None]:

# Topic used by the example cells from here on (rebinds `example_topic`).
example_topic = "Impact of million-plus token context window language models on RAG"

# Produce the first-pass outline directly from the topic.
initial_outline = generate_outline_direct.invoke({"topic": example_topic})

print(initial_outline.as_str)

In [None]:
## Expand Topics



In [None]:
# Prompt asking the model for related Wikipedia subjects to use as background.
# Placeholders: {topic} and {format_instructions}.
# NOTE(review): the prompt text reads "list the as many subjects and urls" —
# likely a typo, and it asks for URLs although RelatedSubjects only has a
# `topics` field. Left unchanged here because the text is sent to the model.
gen_related_topics_prompt = ChatPromptTemplate.from_template(
    """I'm writing a Wikipedia page for a topic mentioned below. Please identify and recommend some Wikipedia pages on closely related subjects. I'm looking for examples that provide insights into interesting aspects commonly associated with this topic, or examples that help me understand the typical content and structure included in Wikipedia pages for similar topics.

Please list the as many subjects and urls as you can.

Topic of interest: {topic}
{format_instructions}
"""
)


class RelatedSubjects(BaseModel):
    # Structured output of the related-topics chain: a flat list of subject names.
    topics: List[str] = Field(
        description="Comprehensive list of related subjects as background research.",
    )


# Parser that turns the model's reply into a `RelatedSubjects` instance.
related_topics_parser = PydanticOutputParser(pydantic_object=RelatedSubjects)

# Chain: prompt (format instructions pre-bound) -> fast_llm -> RelatedSubjects. Input: {"topic": ...}.
expand_chain = gen_related_topics_prompt.partial(format_instructions=related_topics_parser.get_format_instructions()) | fast_llm | related_topics_parser


In [None]:
related_subjects = await expand_chain.ainvoke({"topic": example_topic})
related_subjects

## Generate Perspectives

From these related subjects, we can select representative Wikipedia editors as "subject matter experts" with distinct backgrounds and affiliations. These will help distribute the search process to encourage a more well-rounded final report.


In [None]:
class Editor(BaseModel):
    # One simulated Wikipedia editor; `persona` is the text injected into prompts.
    affiliation: str = Field(
        description="Primary affiliation of the editor.",
    )
    name: str = Field(
        description="Name of the editor.",
    )
    role: str = Field(
        description="Role of the editor in the context of the topic.",
    )
    description: str = Field(
        description="Description of the editor's focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        """Return the editor as four newline-terminated 'Label: value' lines."""
        lines = (
            f"Name: {self.name}",
            f"Role: {self.role}",
            f"Affiliation: {self.affiliation}",
            f"Description: {self.description}",
        )
        return "\n".join(lines) + "\n"


class Perspectives(BaseModel):
    # Structured output of the perspective generator: the list of editors to interview with.
    editors: List[Editor] = Field(
        description="Comprehensive list of editors with their roles and affiliations.",
        # Add a pydantic validation/restriction to be at most M editors
    )

# Prompt that asks for a diverse set of editors. Placeholders: {examples}
# (formatted related-page summaries), {topic}, and {format_instructions}.
# The trailing backslash inside the system text joins it with the next line.
gen_perspectives_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You need to select a diverse (and distinct) group of Wikipedia editors who will work together to create a comprehensive article on the topic. Each of them represents a different perspective, role, or affiliation related to this topic.\
    You can use other Wikipedia pages of related topics for inspiration. For each editor, add a description of what they will focus on.

    Wiki page outlines of related topics for inspiration:
    {examples}""",
        ),
        ("user", "Topic of interest: {topic}\n\n{format_instructions}"),
    ]
)

# Parser that turns the model's reply into a `Perspectives` instance.
perspectives_parser = PydanticOutputParser(pydantic_object=Perspectives)

# Chain: prompt (format instructions pre-bound) -> fast_llm -> Perspectives.
# Input: {"examples": ..., "topic": ...}.
gen_perspectives_chain = gen_perspectives_prompt.partial(format_instructions=perspectives_parser.get_format_instructions()) | fast_llm | perspectives_parser


In [None]:
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.runnables import RunnableLambda, chain as as_runnable

# Wikipedia lookup configured for one result per query with all metadata loaded
# (format_doc reads the "title" and "categories" metadata keys).
wikipedia_retriever = WikipediaRetriever(load_all_available_meta=True, top_k_results=1)


def format_doc(doc, max_length=1000)-> str:
    """Render one retrieved document as a short Markdown snippet.

    The snippet holds the page title, its summary text, and a bullet list of
    related categories, and is cut to ``max_length`` characters.

    Args:
        doc: Object with a ``metadata`` mapping (must contain ``"title"``; may
            contain ``"categories"``) and a ``page_content`` string.
        max_length: Maximum number of characters to return.

    Returns:
        The formatted, possibly truncated, snippet.
    """
    # A page without category metadata simply gets an empty "Related" list.
    categories = doc.metadata.get("categories") or []
    # One "- item" bullet per line; joining with "- " alone ran the items together.
    related = "\n".join(f"- {category}" for category in categories)
    text = f"### {doc.metadata['title']}\n\nSummary: {doc.page_content}\n\nRelated\n{related}"
    return text[:max_length]


def format_docs(docs):
    """Format every document with ``format_doc`` and separate them with blank lines."""
    rendered = [format_doc(doc) for doc in docs]
    return "\n\n".join(rendered)


@as_runnable
async def survey_subjects(topic: str)-> Perspectives:
    """Generate editor perspectives for a topic.

    Expands the topic into related subjects, looks each one up with the
    Wikipedia retriever, and passes the formatted pages to the perspective
    generator as inspiration.
    """
    print(f"Survey Subjects for Topic: {topic}")
    related_subjects = await expand_chain.ainvoke({"topic": topic})
    batch_results = await wikipedia_retriever.abatch(
        related_subjects.topics, return_exceptions=True
    )
    # Failed lookups come back as exception objects; keep only the documents
    # from the lookups that succeeded.
    all_docs = [
        doc
        for docs in batch_results
        if not isinstance(docs, BaseException)
        for doc in docs
    ]
    print(f"Retrieved {len(all_docs)} docs for Topic: {topic}")

    examples = format_docs(all_docs)
    return await gen_perspectives_chain.ainvoke({"examples": examples, "topic": topic})

In [None]:
perspectives = await survey_subjects.ainvoke(example_topic)


In [None]:

perspectives.dict()


## Expert Dialog

Each Wikipedia writer is primed to role-play using the perspectives presented above. It will ask a series of questions of a second "domain expert" with access to a search engine. This generates the content used to produce a refined outline as well as an updated index of reference documents.

### Interview State

The conversation is cyclic, so we will construct it within its own graph. The State will contain messages, the reference docs, and the editor (with its own "persona") to make it easy to parallelize these conversations.


In [None]:
from langgraph.graph import StateGraph, END
from typing_extensions import TypedDict
from langchain_core.messages import AnyMessage
from typing import Annotated, Sequence


def add_messages(left, right):
    """Concatenate two message values, wrapping any non-list value in a list first."""
    left_items = left if isinstance(left, list) else [left]
    right_items = right if isinstance(right, list) else [right]
    return left_items + right_items


def update_references(references, new_references):
    """Merge new references into the existing ones and return the result.

    Args:
        references: Current url -> content mapping, or a falsy value when
            nothing has been collected yet.
        new_references: Mapping of additional references; may be falsy/None.

    Returns:
        A new dict with ``new_references`` layered over ``references``. Neither
        argument is modified.
    """
    # Copy instead of updating in place so the caller's dict is left intact.
    merged = dict(references) if references else {}
    # An update without references (None / empty) is a no-op rather than an error.
    if new_references:
        merged.update(new_references)
    return merged


def update_editor(editor, new_editor):
    """Keep the existing editor if one is set; otherwise adopt the new one."""
    # Can only set at the outset
    return editor if editor else new_editor


class InterviewState(TypedDict):
    """State for one editor/expert interview.

    Each field is annotated with the function that merges an update into the
    current value (presumably consumed as a reducer by the state graph —
    confirm against the langgraph version in use).

    NOTE(review): an earlier cell calls ``InterviewState.from_dict(...)``; this
    TypedDict defines no such method, so running this cell rebinds the name to
    a type without it.
    """
    # Conversation so far; updates are concatenated by add_messages.
    messages: Annotated[List[AnyMessage], add_messages]
    # Reference mapping accumulated across turns; merged by update_references.
    references: Annotated[Optional[dict], update_references]
    # The interviewing editor; update_editor keeps the first value that is set.
    editor: Annotated[Optional[Editor], update_editor]

# Dialog Roles

The graph will have two participants: the Wikipedia editor (`generate_question`), who asks questions based on its assigned role, and a domain expert (`gen_answer_chain`), who uses a search engine to answer the questions as accurately as possible.


In [None]:
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage


# Prompt for the interviewing editor. The system text sets the role and the
# conversation-ending phrase; {persona} is filled per editor, and the running
# conversation is injected through the optional "messages" placeholder.
# A trailing backslash inside the text joins that line with the next one.
gen_qn_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an experienced Wikipedia writer and want to edit a specific page. \
Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
Now, you are chatting with an expert to get information. Ask good questions to get more useful information.

When you have no more questions to ask, say "Thank you so much for your help!" to end the conversation.\
Please only ask one question at a time and don't ask what you have asked before.\
Your questions should be related to the topic you want to write.
Be comprehensive and curious, gaining as much unique insight from the expert as possible.\

Stay true to your specific perspective:

{persona}""",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)


def tag_with_name(ai_message: AIMessage, name: str) -> AIMessage:
    """Set ``name`` on the given message in place and return that same message."""
    setattr(ai_message, "name", name)
    return ai_message


def swap_roles(state: InterviewState, name: str) -> InterviewState:
    """Return the conversation as seen by the speaker called ``name``.

    AI messages written under any other name are rebuilt as human messages;
    everything else is passed through unchanged. Only the ``"messages"`` key is
    returned.
    """
    # Normalize name
    name = cleanup_name(name)

    print(f'Swapping roles for {name}')

    def _from_speaker_view(message):
        # Another participant's AI turn becomes a human turn for this speaker.
        if isinstance(message, AIMessage) and message.name != name:
            return HumanMessage(**message.dict(exclude={"type"}))
        return message

    converted = [_from_speaker_view(message) for message in state["messages"]]

    print(f'Converted messages for {name} while swapping roles: {len(converted)} messages')

    return {"messages": converted}


@as_runnable
async def generate_question(state: InterviewState) -> InterviewState:
    """Have the state's editor ask its next question.

    Returns a state update holding the single new message, tagged with the
    editor's cleaned-up name.
    """
    editor = state["editor"]
    name = cleanup_name(editor.name)

    print(f'Generating question for {name}')

    # Pipeline stages, named individually for readability.
    to_editor_view = RunnableLambda(swap_roles).bind(name=name)
    persona_prompt = gen_qn_prompt.partial(persona=editor.persona)
    label_reply = RunnableLambda(tag_with_name).bind(name=name)

    gn_chain = to_editor_view | persona_prompt | fast_llm | label_reply
    result:AIMessage = await gn_chain.ainvoke(state)

    print(f'Generated question for {name}')
    return {"messages": [result]}

In [None]:
# Seed the conversation with an opening line, then ask the first editor for a question.
messages = [
    HumanMessage(f"So you said you were writing an article on {example_topic}?")
]
question = await generate_question.ainvoke(
    {
        "editor": perspectives.editors[0],
        "messages": messages,
    }
)

# Display the generated question message.
question["messages"][0]

### Answer questions

The `gen_answer_chain` first generates queries (query expansion) to answer the editor's question, then responds with citations.


In [None]:
class Queries(BaseModel):
    # Structured output of the query-expansion chain: the search strings to run.
    queries: List[str] = Field(
        description="Comprehensive list of search engine queries to answer the user's questions.",
    )


# Prompt that asks the model to produce search queries for the conversation
# supplied through the optional "messages" placeholder.
gen_queries_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful research assistant. Query the search engine to answer the user's questions.\n{format_instructions}",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)

# Parser that turns the model's reply into a `Queries` instance.
queries_parser = PydanticOutputParser(pydantic_object=Queries)

# Chain: prompt (format instructions pre-bound) -> fast_llm -> Queries. Input: {"messages": [...]}.
gen_queries_chain = gen_queries_prompt.partial(format_instructions=queries_parser.get_format_instructions()) | fast_llm | queries_parser

In [None]:

# Expand the example question into search queries; the question text is
# re-wrapped as a human message before being passed in.
queries = await gen_queries_chain.ainvoke(
    {"messages": [HumanMessage(content=question["messages"][0].content)]}
)

queries

In [None]:

class AnswerWithCitations(BaseModel):
    # Structured output of the answer chain: the answer text and the URLs it cites.
    answer: str = Field(
        description="Comprehensive answer to the user's question with citations.",
    )
    cited_urls: List[str] = Field(
        description="List of urls cited in the answer.",
    )

    @property
    def as_str(self) -> str:
        """Return the answer followed by a 1-based, numbered list of cited URLs."""
        footnotes = [
            f"[{number}]: {url}"
            for number, url in enumerate(self.cited_urls, start=1)
        ]
        return f"{self.answer}\n\nCitations:\n\n" + "\n".join(footnotes)


# Prompt for the domain expert: answer from the gathered information with
# citations. The conversation (including the search results appended by
# gen_answer) arrives through the optional "messages" placeholder.
# The trailing backslash inside the text joins that line with the next one.
gen_answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert who can use information effectively. You are chatting with a Wikipedia writer who wants\
 to write a Wikipedia page on the topic you know. You have gathered the related information and will now use the information to form a response.

Make your response as informative as possible and make sure every sentence is supported by the gathered information.
Each response must be backed up by a citation from a reliable source, formatted as a footnote, reproducing the URLS after your response.
{format_instructions}""",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)

# Parser that turns the model's reply into an `AnswerWithCitations` instance.
ac_parser = PydanticOutputParser(pydantic_object=AnswerWithCitations)

# Chain: prompt (format instructions pre-bound) -> fast_llm -> AnswerWithCitations.
gen_answer_chain = gen_answer_prompt.partial(format_instructions=ac_parser.get_format_instructions()) | fast_llm | ac_parser 

# Disabled alternative (structured-output binding), kept for reference:
# .with_structured_output(
#     AnswerWithCitations, include_raw=True
# ).with_config(run_name="GenerateAnswer")
                                             

In [None]:
from langchain_community.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper
from langchain_core.tools import tool

# DDG
# Shared wrapper instance. It has its own name so that the `search_engine`
# tool defined below does not immediately overwrite it, and so the tool can
# reuse it instead of building a new wrapper on every call.
ddg_search = DuckDuckGoSearchAPIWrapper()

@tool
async def search_engine(query: str):
    """Search engine to the internet."""
    # The docstring above is deliberately left as-is.

    print(f"Searching DuckDuckGo for [{query}]")

    # NOTE(review): `_ddgs_text` is a private method of the wrapper and is
    # called synchronously inside this coroutine.
    results = ddg_search._ddgs_text(query)

    print(f"Got search engine results: {len(results)} for [{query}]")

    # Normalize each hit to the {"content", "url"} shape that gen_answer reads.
    return [{"content": r["body"], "url": r["href"]} for r in results]

In [None]:
from langchain_core.runnables import RunnableConfig
import json, re


async def gen_answer(
    state: InterviewState,
    config: Optional[RunnableConfig] = None,
    name: str = "SubjectMatterExpert",
    max_str_len: int = 15000,
):
    """Answer the editor's latest question using search results.

    The conversation is viewed from the expert's side, expanded into search
    queries, and the (truncated) search results are handed to the answer chain.

    Args:
        state: Interview state; only ``state["messages"]`` is read here.
        config: Runnable config forwarded to the batched search call.
        name: Speaker name for the expert; normalized with ``cleanup_name``.
        max_str_len: Character cap applied to the JSON-serialized search results.

    Returns:
        A state update with one ``AIMessage`` (the formatted answer) under
        ``"messages"`` and the url -> content mapping of the cited results under
        ``"references"``. If the answer chain raises, an empty answer is used.
    """
    name = cleanup_name(name)

    print(f'Generating answers for [{name}]')

    swapped_state = swap_roles(state, name)  # Convert all other AI messages

    queries: Queries = await gen_queries_chain.ainvoke(swapped_state)

    print(f"Got {len(queries.queries)} search engine queries for [{name}]")

    query_results = await search_engine.abatch(
        queries.queries, config, return_exceptions=True
    )
    # With return_exceptions=True failed searches come back as exception objects.
    successful_results = [
        res for res in query_results if not isinstance(res, Exception)
    ]

    print(f"Got {len(successful_results)} search engine results for [{name}]")

    # Flatten to url -> content; a URL returned by several queries keeps its last content.
    all_query_results = {
        res["url"]: res["content"] for results in successful_results for res in results
    }

    # We could be more precise about handling max token length if we wanted to here
    dumped = json.dumps(all_query_results)[:max_str_len]

    # Record the generated queries as this speaker's own AI turn. Previously a
    # bare string was appended here despite the AIMessage annotation.
    ai_message = AIMessage(name=name, content=str(queries))

    # The search results are passed to the model as a plain human message.
    tool_message = HumanMessage(content=dumped)

    # swap_roles returned a fresh list, so this does not touch the shared state.
    swapped_state["messages"].extend([ai_message, tool_message])

    # Only update the shared state with the final answer to avoid
    # polluting the dialogue history with intermediate messages
    try:
        generated: AnswerWithCitations = await gen_answer_chain.ainvoke(swapped_state)
    except Exception as e:
        # Best effort: keep the interview going with an empty answer.
        print(f"Error generating answer for [{name}] - {e}")
        generated = AnswerWithCitations(answer="", cited_urls=[])

    cited_urls = set(generated.cited_urls)

    # Save the retrieved information to the shared state for future reference
    cited_references = {k: v for k, v in all_query_results.items() if k in cited_urls}

    formatted_message = AIMessage(name=name, content=generated.as_str)

    print(f'Finished generating answer for [{name}]')
    return {"messages": [formatted_message], "references": cited_references}
    

In [None]:

# Run the expert once on the example question; the state holds only that
# question, re-wrapped as a human message.
example_answer = await gen_answer(
    {"messages": [HumanMessage(content=question["messages"][0].content)]}
)
# Show the text of the answer message.
example_answer["messages"][-1].content

# Construct the Interview Graph

Now that we've defined the editor and domain expert, we can compose them in a graph.


In [None]:
# NOTE(review): `max_num_turns` is not referenced anywhere in this section, and
# `route_messages` is not defined in the visible cells — presumably both relate
# to a router provided by `from storm import *`; confirm.
max_num_turns = 5




# Two-node interview loop: the editor asks, the expert answers, and
# `route_messages` decides where to go after each answer.
builder = StateGraph(InterviewState)

builder.add_node("ask_question", generate_question)
builder.add_node("answer_question", gen_answer)
builder.add_conditional_edges("answer_question", route_messages)
builder.add_edge("ask_question", "answer_question")

builder.set_entry_point("ask_question")
interview_graph = builder.compile().with_config(run_name="Conduct Interviews")

In [None]:
from IPython.display import Image

# comment out if you have not installed pygraphviz
# Image(interview_graph.get_graph().draw_png())

In [None]:

final_step = None

initial_state = {
    "editor": perspectives.editors[0],
    "messages": [
        AIMessage(
            content=f"So you said you were writing an article on {example_topic}?",
            name="SubjectMatterExpert",
        )
    ],
}
async for step in interview_graph.astream(initial_state):
    name = next(iter(step))
    print(name)
    print(f"Processing step: {name}")
    print("-- ", str(step[name]["messages"])[:300])
    if END in step:
        final_step = step
        
final_state = next(iter(final_step.values()))

In [None]:
final_state

## Refine Outline

At this point in STORM, we've conducted a large amount of research from different perspectives. It's time to refine the original outline based on these investigations. Below, create a chain using the LLM with a long context window to update the original outline.


In [None]:
# Prompt for refining the outline. Placeholders: {topic}, {old_outline},
# {conversations}, and {format_instructions}. A trailing backslash inside the
# system text joins that line with the next one.
refine_outline_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a Wikipedia writer. You have gathered information from experts and search engines. Now, you are refining the outline of the Wikipedia page. \
You need to make sure that the outline is comprehensive and specific. \
Topic you are writing about: {topic} 

Old outline:

{old_outline}
""",
        ),
        (
            "user",
            "Refine the outline based on your conversations with subject-matter experts:\n\nConversations:\n\n{conversations}\n\n{format_instructions}\n\nWrite the refined Wikipedia outline:",
        ),
    ]
)


# Uses long_context_llm because the conversation transcript can get quite long.
# The result is parsed back into an `Outline` by the same outline_parser.
refine_outline_chain = refine_outline_prompt.partial(format_instructions=outline_parser.get_format_instructions()) | long_context_llm | outline_parser

In [None]:
# Refine the initial outline using the interview transcript; each message is
# rendered as a "### <speaker name>" heading followed by its content.
refined_outline = refine_outline_chain.invoke(
    {
        "topic": example_topic,
        "old_outline": initial_outline.as_str,
        "conversations": "\n\n".join(
            f"### {m.name}\n\n{m.content}" for m in final_state["messages"]
        ),
    }
)

In [None]:
print(refined_outline.as_str)

In [None]:
## Generate Article

In [None]:
from langchain_core.documents import Document

from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# One Document per cited reference: the text is the page content and the URL is
# stored under metadata["source"] (read back by `retrieve`).
reference_docs = [
    Document(page_content=v, metadata={"source": k})
    for k, v in final_state["references"].items()
]

print(f"Number of references: {len(reference_docs)}")

# This really doesn't need to be a vectorstore for this size of data.
# It could just be a numpy matrix. Or you could store documents
# across requests if you want.
vectorstore = SKLearnVectorStore.from_documents(
    reference_docs,
    embedding=embeddings,
)
# The number of results is configured through `search_kwargs`; a bare `k=10`
# keyword on as_retriever() does not set it. The value is capped at the number
# of indexed documents, since a nearest-neighbour index may reject a k larger
# than its corpus (assumption — confirm for the backend in use).
retriever_k = max(1, min(10, len(reference_docs)))
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_k})

In [None]:
retriever.invoke("What's a long context LLM anyway?")

#### Generate Sections

Now you can generate the sections using the indexed docs.


In [None]:
class SubSection(BaseModel):
    # A written subsection: title plus full text (with inline [#] citations).
    subsection_title: str = Field(..., title="Title of the subsection")
    content: str = Field(
        ...,
        title="Full content of the subsection. Include [#] citations to the cited sources where relevant.",
    )

    @property
    def as_str(self) -> str:
        """Render as a level-3 Markdown heading followed by the content."""
        heading = f"### {self.subsection_title}"
        rendered = f"{heading}\n\n{self.content}"
        return rendered.strip()


class WikiSection(BaseModel):
    # A fully written article section: title, body text, optional subsections,
    # and the list of sources it cites.
    section_title: str = Field(..., title="Title of the section")
    content: str = Field(..., title="Full content of the section")
    # NOTE(review): this references the outline's `Subsection` model (title +
    # description), not the `SubSection` model (title + content) — confirm which
    # one is intended before changing the schema.
    subsections: Optional[List[Subsection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )
    citations: List[str] = Field(default_factory=list)

    @property
    def as_str(self) -> str:
        """Render the section as Markdown, with the citation list as its own paragraph.

        Returns:
            The heading, content, and subsections; when citations exist they
            follow after a blank line, one ``[index] source`` entry per line.
        """
        subsections = "\n\n".join(
            subsection.as_str for subsection in self.subsections or []
        )
        body = f"## {self.section_title}\n\n{self.content}\n\n{subsections}".strip()
        if not self.citations:
            return body
        citations = "\n".join(f"[{i}] {cit}" for i, cit in enumerate(self.citations))
        # Keep the blank line between text and citations; stripping the
        # "\n\n"-prefixed block used to glue the first citation onto the text.
        return f"{body}\n\n{citations}"


# Prompt for writing one section. Placeholders: {outline}, {docs} (the
# retrieved reference documents), {section}, and {format_instructions}.
section_writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia writer. Complete your assigned WikiSection from the following outline:\n\n"
            # The reference block is closed with </Documents>; it used to end
            # with a second opening tag.
            "{outline}\n\nCite your sources, using the following references:\n\n<Documents>\n{docs}\n</Documents>",
        ),
        ("user", "Write the full WikiSection for the {section} section.\n{format_instructions}"),
    ]
)


async def retrieve(inputs: dict):
    """Fetch reference documents for a section and add them to the inputs.

    Args:
        inputs: Mapping with at least ``"topic"`` and ``"section"`` strings.

    Returns:
        A new dict holding every key of ``inputs`` plus ``"docs"``: the
        retrieved documents, each wrapped in a ``<Document href="...">`` element
        and joined by newlines. A ``"docs"`` key already in ``inputs`` wins.
    """
    docs = await retriever.ainvoke(inputs["topic"] + ": " + inputs["section"])
    # The opening tag is a normal (not self-closing) tag so that it pairs with
    # the </Document> that follows the page content.
    formatted = "\n".join(
        f'<Document href="{doc.metadata["source"]}">\n{doc.page_content}\n</Document>'
        for doc in docs
    )
    return {"docs": formatted, **inputs}

# Parser that turns the model's reply into a `WikiSection` instance.
wiki_parser = PydanticOutputParser(pydantic_object=WikiSection)

# Chain: retrieve references -> prompt (format instructions pre-bound) ->
# long_context_llm -> WikiSection.
# Input: {"outline": ..., "section": ..., "topic": ...}.
section_writer = (
    retrieve
    | section_writer_prompt.partial(format_instructions=wiki_parser.get_format_instructions())
    | long_context_llm
    | wiki_parser
)

In [None]:
# Write one example section: the second entry (index 1) of the refined outline.
# NOTE(review): this raises IndexError if the refined outline has fewer than two sections.
section = await section_writer.ainvoke(
    {
        "outline": refined_outline.as_str,
        "section": refined_outline.sections[1].section_title,
        "topic": example_topic,
    }
)
print(section.as_str)

#### Generate final article

Now we can rewrite the draft to appropriately group all the citations and maintain a consistent voice.


In [None]:
from langchain_core.output_parsers import StrOutputParser

# Prompt that turns the section drafts into the final article.
# Template variables: {topic} and {draft}.
writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia author. Write the complete wiki article on {topic} using the following section drafts:\n\n"
            "{draft}\n\nStrictly follow Wikipedia format guidelines.",
        ),
        (
            "user",
            # The original text contained a stray `""` after the footnote
            # example (a broken string concatenation); it is removed here.
            'Write the complete Wiki article using markdown format. Organize citations using footnotes like "[1]",'
            " avoiding duplicates in the footer. Include URLs in the footer.",
        ),
    ]
)

# prompt -> model -> plain string output
writer = writer_prompt | long_context_llm | StrOutputParser()

In [None]:
# Stream the article for the single drafted section, printing chunks as they arrive.
writer_inputs = {"topic": example_topic, "draft": section.as_str}
for tok in writer.stream(writer_inputs):
    print(tok, end="")

## Final Flow

Now it's time to string everything together. We will have 6 main stages in sequence:

1. Generate the initial outline + perspectives
2. Batch converse with each perspective to expand the content for the article
3. Refine the outline based on the conversations
4. Index the reference docs from the conversations
5. Write the individual sections of the article
6. Write the final wiki

The state tracks the outputs of each stage.


In [None]:
class ResearchState(TypedDict):
    """State passed between the stages of the research graph.

    Each stage function in this notebook receives the state and returns a copy
    of it with its own output key(s) filled in.
    """

    # Subject of the article; the only key supplied when the graph is started.
    topic: str
    # Set by initialize_research, then replaced by refine_outline.
    outline: Outline
    # Set by initialize_research from the survey_subjects result.
    editors: List[Editor]
    # One interview sub-graph result per editor, set by conduct_interviews.
    interview_results: List[InterviewState]
    # The final sections output (set by write_sections)
    sections: List[WikiSection]
    # Final article text, set by write_article.
    article: str

In [None]:
import asyncio


async def initialize_research(state: ResearchState):
    """Create the initial outline and the editor perspectives concurrently.

    Returns a copy of ``state`` with ``"outline"`` and ``"editors"`` set.
    """
    subject = state["topic"]
    # Both calls depend only on the topic, so await them together.
    draft_outline, perspectives = await asyncio.gather(
        generate_outline_direct.ainvoke({"topic": subject}),
        survey_subjects.ainvoke(subject),
    )
    return {**state, "outline": draft_outline, "editors": perspectives.editors}


async def conduct_interviews(state: ResearchState):
    """Run one interview per editor through the interview sub-graph.

    Returns a copy of ``state`` with ``"interview_results"`` set to the batch
    output of the sub-graph.
    """
    subject = state["topic"]
    initial_states = []
    for editor in state["editors"]:
        # Every interview is seeded with the same opening line from the expert.
        opener = AIMessage(
            content=f"So you said you were writing an article on {subject}?",
            name="SubjectMatterExpert",
        )
        initial_states.append({"editor": editor, "messages": [opener]})

    # Batch the sub-graph calls so the interviews run in parallel.
    interview_results = await interview_graph.abatch(initial_states)
    return {**state, "interview_results": interview_results}


def format_conversation(interview_state):
    """Render one interview as text: a header naming the editor, then one
    ``name: content`` line per message."""
    lines = [f"{msg.name}: {msg.content}" for msg in interview_state["messages"]]
    transcript = "\n".join(lines)
    header = f'Conversation with {interview_state["editor"].name}\n\n'
    return header + transcript


async def refine_outline(state: ResearchState):
    """Revise the outline using the transcripts of all interviews.

    Returns a copy of ``state`` whose ``"outline"`` is the refined outline.
    """
    transcripts = [
        format_conversation(interview) for interview in state["interview_results"]
    ]
    request = {
        "topic": state["topic"],
        "old_outline": state["outline"].as_str,
        # Blank line between the individual conversations.
        "conversations": "\n\n".join(transcripts),
    }
    updated_outline = await refine_outline_chain.ainvoke(request)
    return {**state, "outline": updated_outline}


async def index_references(state: ResearchState):
    """Add the references gathered during the interviews to the vector store.

    Each entry of an interview's ``"references"`` mapping becomes one
    ``Document``: the key is stored as the ``"source"`` metadata and the value
    as the page content.

    Returns:
        ``state`` unchanged; the only effect is on ``vectorstore``.
    """
    all_docs = []
    for interview_state in state["interview_results"]:
        # NOTE(review): presumably an interview can end without collecting any
        # references, leaving the key absent or None - tolerate both.
        references = interview_state.get("references") or {}
        all_docs.extend(
            Document(page_content=content, metadata={"source": source})
            for source, content in references.items()
        )
    # Skip the store call when there is nothing to index; some vector store
    # backends reject an empty batch.
    if all_docs:
        await vectorstore.aadd_documents(all_docs)
    return state


async def write_sections(state: ResearchState):
    """Draft every section of the outline held in ``state``.

    One request per outline section is sent to ``section_writer`` as a batch.

    Returns:
        A copy of ``state`` with ``"sections"`` set to the batch results.
    """
    outline = state["outline"]
    # Use the outline carried in the graph state. The previous version read the
    # notebook-level `refined_outline` variable, which belongs to an earlier
    # demo cell and may describe a different topic (or not exist at all).
    outline_text = outline.as_str
    requests = [
        {
            "outline": outline_text,
            "section": section.section_title,
            "topic": state["topic"],
        }
        for section in outline.sections
    ]
    sections = await section_writer.abatch(requests)
    return {
        **state,
        "sections": sections,
    }


async def write_article(state: ResearchState):
    """Combine the drafted sections into the final article.

    Returns a copy of ``state`` with ``"article"`` set to the writer's output.
    """
    # Sections are joined with a blank line to form the draft handed to the writer.
    draft = "\n\n".join(drafted.as_str for drafted in state["sections"])
    request = {"topic": state["topic"], "draft": draft}
    article = await writer.ainvoke(request)
    return {**state, "article": article}

#### Create the graph


In [None]:
builder_of_storm = StateGraph(ResearchState)

# Stages in execution order; each one is wired to the stage listed before it.
nodes = [
    ("init_research", initialize_research),
    ("conduct_interviews", conduct_interviews),
    ("refine_outline", refine_outline),
    ("index_references", index_references),
    ("write_sections", write_sections),
    ("write_article", write_article),
]

previous_name = None
for name, node in nodes:
    builder_of_storm.add_node(name, node)
    # Chain this stage after the previous one (the first stage has no predecessor).
    if previous_name is not None:
        builder_of_storm.add_edge(previous_name, name)
    previous_name = name

builder_of_storm.set_entry_point(nodes[0][0])
builder_of_storm.set_finish_point(nodes[-1][0])
storm = builder_of_storm.compile()

In [None]:
async for step in storm.astream(
    {
        "topic": "Building better slack bots using LLMs",
    }
):
    name = next(iter(step))
    print(name)
    print("-- ", str(step[name])[:300])
    if END in step:
        results = step

In [None]:
# Pull the finished article out of the END entry of the captured step.
final_state = results[END]
article = final_state["article"]

## Render the Wiki

Now we can render the final wiki page!


In [None]:
from IPython.display import Markdown

# Demote every heading that follows a newline by one level so the article's
# headings do not compete with this notebook's own headings.
article_downheadered = article.replace("\n#", "\n##")
Markdown(article_downheadered)