# Storm Research Assistant

Reference
https://github.com/langchain-ai/langgraph/blob/main/examples/storm/storm.ipynb


In [1]:
# ## Prereqs

# %pip install -U langchain_community langchain_openai langgraph wikipedia  scikit-learn  langchain_fireworks
# # We use one or the other search engine below
# %pip install -U tavily-python
# %pip install -U duckduckgo-search
# # ! apt-get install graphviz graphviz-dev
# # %pip install pygraphviz



In [2]:
from storm import *
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic


# Two chat models are configured: `fast_llm` is handed to every chain built in this
# notebook, and both models are passed to InterviewConfig below.
fast_llm = ChatOpenAI(model="gpt-3.5-turbo")
# long_context_llm = ChatOpenAI(model="gpt-4-turbo-preview")
long_context_llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# haiku model
# (disabled alternative: run both roles on Anthropic's Claude 3 Haiku)
# haiku_model_name = "claude-3-haiku-20240307"
# fast_llm = ChatAnthropic(model_name=haiku_model_name)
# long_context_llm = ChatAnthropic(model_name=haiku_model_name)


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# NOTE(review): `Chroma`, `InterviewConfig` and `RunnableConfig` are not imported in this
# cell; presumably they arrive through `from storm import *` — verify.
vectorstore_dir = "./data/storm/vectorstore/"
vectorstore = Chroma(persist_directory=vectorstore_dir, embedding_function=embeddings)

# Single configuration object shared by the chains, nodes and graphs used below.
interview_config = InterviewConfig(long_llm=long_context_llm, 
                                   fast_llm=fast_llm, 
                                   max_conversations=3, 
                                   max_reference_length=10000,
                                   tags_to_extract=[ "p", "h1", "h2", "h3"],
                                   embeddings=embeddings,
                                   vectorstore=vectorstore,
                                   vectorstore_dir=vectorstore_dir,
                                   runnable_config=RunnableConfig()
                                   )

In [3]:
# Build the individual chains from the helper factories (presumably provided by `storm`).
# `interview_config.fast_llm` and `fast_llm` are the same object (see the config cell).

# Needs topic as input - {"topic": ""}
outline = get_chain_outline(interview_config.fast_llm)

# Needs topic as input - {"topic": ""}
expand_chain = get_chain_expand_related_topics(fast_llm)


# Invoked later with {"examples": ..., "topic": ...}
gen_perspectives_chain = get_chain_perspective_generator(fast_llm)

# Need messages as input - {"messages": []}
gen_queries_chain = get_chain_queries(fast_llm)
gen_answer_chain = get_chain_answer(fast_llm)

# Topic used by the graph run and the scratch cells below.
example_topic = "Increase development productivity by using Docker compose and local docker labs"

In [4]:
# End-to-end run: seed an Interviews state for the topic and push it through the STORM graph.
i = Interviews(topic=example_topic, interview_config=interview_config)

g = StormGraph(interview_config=interview_config, topic=example_topic)
# The graph is invoked with the dict form of the state; `g1` holds whatever the graph returns.
g1 = await g.graph.ainvoke(i.as_dict())


-- Survey Subjects for Topic: [Increase development productivity by using Docker compose and local docker labs] --

Related Subjects: ['Docker (software)', 'Containerization', 'DevOps', 'Continuous Integration', 'Software Development', 'Virtualization', 'Microservices']
Retrieved 7 wiki batches for Topic: Increase development productivity by using Docker compose and local docker labs:

	Docker (software) - https://en.wikipedia.org/wiki/Docker_(software)
	Docker, Inc. - https://en.wikipedia.org/wiki/Docker,_Inc.
	Containerization - https://en.wikipedia.org/wiki/Containerization
	Containerization (computing) - https://en.wikipedia.org/wiki/Containerization_(computing)
	DevOps - https://en.wikipedia.org/wiki/DevOps
	Azure DevOps Server - https://en.wikipedia.org/wiki/Azure_DevOps_Server
	Continuous integration - https://en.wikipedia.org/wiki/Continuous_integration
	Comparison of continuous integration software - https://en.wikipedia.org/wiki/Comparison_of_continuous_integration_software


2024-04-21 17:32:09,864 [MainThread  ] [INFO ]  Generating question for Alice


Generated 5 perspectives for Topic: [Increase development productivity by using Docker compose and local docker labs]

>> Generated perspective for: Alice 
Affiliation: - Software Development Company
Persona: - Name: Alice
Role: Software Engineer
Affiliation: Software Development Company
Description: Alice will focus on practical implementation examples of using Docker compose and local Docker labs to increase development productivity. She will provide insights into best practices, common challenges, and how to optimize development workflows using these tools.

Topic: - Increase development productivity by using Docker compose and local docker labs

>> Generated perspective for: Bob 
Affiliation: - Technology Consulting Firm
Persona: - Name: Bob
Role: DevOps Specialist
Affiliation: Technology Consulting Firm
Description: Bob specializes in DevOps practices and will provide guidance on integrating Docker compose and local Docker labs into the DevOps pipeline. He will focus on automation

2024-04-21 17:32:10,403 [MainThread  ] [INFO ]  Generated question for Alice: What are some common challenges that developers face when using Docker compose in their development workflow?
2024-04-21 17:32:10,425 [MainThread  ] [INFO ]  START - Generate answers for [Alice]
2024-04-21 17:32:10,946 [MainThread  ] [INFO ]  Got 1 search engine queries for [Alice] -
	 ['Common challenges when using Docker compose in development workflow']


Searching DuckDuckGo for [Common challenges when using Docker compose in development workflow]


2024-04-21 17:32:12,356 [MainThread  ] [INFO ]  Got 1 search engine results for [Alice] - 
	 {'https://medium.com/@dtadmin/docking-your-workflow-a-hands-on-guide-to-docker-compose-installation-and-examples-898fd814e179': 'docker-compose build =Look for all services containing the build. docker-compose run=Run a one-time command against a service. docker-compose up=Command used to start all the services of the ...', 'https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/': 'Five Challenges with Developing Locally Using Docker Compose. After the popularization of containers, a lot of the development workflow started leaning on Docker Compose. Developers would have a Docker Compose file which defined how to build the container images for all their services, what ports to expose, and have volumes attached to their ...', 'https://harsh05.medium.com/mastering-docker-a-guide-to-common-practical-challenges-5275c1dbfe3b': 'Expose port 80 to 1234 . Go to the wo

Got search engine results: 5 for [Common challenges when using Docker compose in development workflow]


2024-04-21 17:32:15,733 [MainThread  ] [INFO ]  Genreted final answer answer='Some common challenges that developers face when using Docker compose in their development workflow include defining how to build container images for services, managing exposed ports, attaching volumes, connecting services like databases to applications, creating and removing containers, and dealing with the verbosity and complexity of Docker run commands. Docker Compose simplifies the management of multi-container applications by using a declarative approach to configuration, allowing developers to specify the desired application environment.' cited_urls=['https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/', 'https://bomberbot.com/docker/mastering-docker-based-development-enabling-live-reload-for-lightning-fast-iterations/', 'https://harsh05.medium.com/mastering-docker-a-guide-to-common-practical-challenges-5275c1dbfe3b', 'https://medium.com/@dtadmin/docking-your-workf

InterviewState.from_dict: data is an instance of InterviewState
Routing messages for [Alice]
Continue asking question for [Alice] as this is not the last end of the conversation - ResponseCount: 1 of 3


2024-04-21 17:32:16,241 [MainThread  ] [INFO ]  Generated question for Alice: What are some common challenges that developers face when using Docker compose in their development workflow?
2024-04-21 17:32:16,261 [MainThread  ] [INFO ]  START - Generate answers for [Alice]
2024-04-21 17:32:16,759 [MainThread  ] [INFO ]  Got 1 search engine queries for [Alice] -
	 ['Common challenges when using Docker compose in development workflow']


Searching DuckDuckGo for [Common challenges when using Docker compose in development workflow]


2024-04-21 17:32:18,089 [MainThread  ] [INFO ]  Got 1 search engine results for [Alice] - 
	 {'https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/': 'Five Challenges with Developing Locally Using Docker Compose. After the popularization of containers, a lot of the development workflow started leaning on Docker Compose. Developers would have a Docker Compose file which defined how to build the container images for all their services, what ports to expose, and have volumes attached to their ...', 'https://medium.com/@dtadmin/docking-your-workflow-a-hands-on-guide-to-docker-compose-installation-and-examples-898fd814e179': 'docker-compose build =Look for all services containing the build. docker-compose run=Run a one-time command against a service. docker-compose up=Command used to start all the services of the ...', 'https://blog.devops.dev/docker-compose-tips-tricks-you-should-know-32859b6a9bee': "The software development world would be a fundamental

Got search engine results: 5 for [Common challenges when using Docker compose in development workflow]


2024-04-21 17:32:20,996 [MainThread  ] [INFO ]  Genreted final answer answer='Some common challenges that developers face when using Docker compose in their development workflow include defining how to build container images for services, managing exposed ports, and attaching volumes. Docker Compose simplifies the management of multiple Docker elements by allowing developers to define containers, networks, and services as one cohesive system. Additionally, Docker Compose uses a declarative approach to configuration, enabling developers to specify the desired application environment without scripting each step.' cited_urls=['https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/', 'https://medium.com/@dtadmin/docking-your-workflow-a-hands-on-guide-to-docker-compose-installation-and-examples-898fd814e179', 'https://blog.devops.dev/docker-compose-tips-tricks-you-should-know-32859b6a9bee', 'https://www.linkedin.com/pulse/empowering-development-workflows-d

InterviewState.from_dict: data is an instance of InterviewState
Routing messages for [Alice]
Continue asking question for [Alice] as this is not the last end of the conversation - ResponseCount: 2 of 3


2024-04-21 17:32:21,556 [MainThread  ] [INFO ]  Generated question for Alice: What are some common challenges that developers face when using Docker compose for local development environments?
2024-04-21 17:32:21,577 [MainThread  ] [INFO ]  START - Generate answers for [Alice]
2024-04-21 17:32:22,000 [MainThread  ] [INFO ]  Got 1 search engine queries for [Alice] -
	 ['Common challenges when using Docker compose for local development environments']


Searching DuckDuckGo for [Common challenges when using Docker compose for local development environments]


2024-04-21 17:32:23,324 [MainThread  ] [INFO ]  Got 1 search engine results for [Alice] - 
	 {'https://medium.com/simform-engineering/setting-up-a-local-development-environment-using-docker-compose-551efb4ec0ee': 'May 4, 2023. 2. Setting up a local development environment using Docker Compose. Docker Compose is a handy tool that helps you create and run applications that use multiple containers. It makes ...', 'https://reintech.io/blog/leveraging-docker-compose-local-dev-environments': 'Advantages of Using Docker Compose in Development. Consistency: Docker Compose ensures that developers are working in an environment that matches production. Ease of use: With just a few commands, new team members can get started without having to set up complex environments. Isolation: Each service runs in its own container, which minimizes ...', 'https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/': 'Consequently, developing locally with Docker Compose means worki

Got search engine results: 5 for [Common challenges when using Docker compose for local development environments]


2024-04-21 17:32:30,581 [MainThread  ] [INFO ]  Genreted final answer answer='Some common challenges that developers face when using Docker Compose for local development environments include working in an environment that deviates from production, which can lead to bugs that are challenging to identify, such as misspelled environment variables. To ensure stability and avoid surprises during deployment, it is crucial to address these discrepancies. Docker Compose offers advantages such as consistency in development environments, ease of use for new team members, isolation of services in their containers to minimize conflicts, and the ability to define the local development environment with environment variables, accessible ports, and mounted volumes.' cited_urls=['https://www.okteto.com/blog/five-challenges-with-developing-locally-using-docker-compose/', 'https://reintech.io/blog/leveraging-docker-compose-local-dev-environments', 'https://dev.to/theramoliya/docker-utilize-docker-compose

InterviewState.from_dict: data is an instance of InterviewState
Routing messages for [Alice]
Reached max number of responses for [Alice] - ResponseCount: 3
InterviewState.from_dict: data is an instance of dict
InterviewState(interview_config=InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x11c149490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x11c17c690>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x11ad34e50>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x11abfc990>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=3, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x11c17d7d0>, async_client=<openai.resources

2024-04-21 17:32:31,297 [MainThread  ] [INFO ]  Generated question for Bob: What are some best practices for integrating Docker compose and local Docker labs into the DevOps pipeline to enhance automation and scalability in software development?
2024-04-21 17:32:31,317 [MainThread  ] [INFO ]  START - Generate answers for [Bob]
2024-04-21 17:32:32,387 [MainThread  ] [INFO ]  Got 4 search engine queries for [Bob] -
	 ['best practices for integrating Docker compose in DevOps pipeline', 'best practices for integrating local Docker labs in DevOps pipeline', 'how to enhance automation in software development using Docker compose', 'how to enhance scalability in software development using Docker compose']


Searching DuckDuckGo for [best practices for integrating Docker compose in DevOps pipeline]
Got search engine results: 5 for [best practices for integrating Docker compose in DevOps pipeline]
Searching DuckDuckGo for [best practices for integrating local Docker labs in DevOps pipeline]
Got search engine results: 5 for [best practices for integrating local Docker labs in DevOps pipeline]
Searching DuckDuckGo for [how to enhance automation in software development using Docker compose]
Got search engine results: 5 for [how to enhance automation in software development using Docker compose]
Searching DuckDuckGo for [how to enhance scalability in software development using Docker compose]


2024-04-21 17:32:38,674 [MainThread  ] [INFO ]  Got 4 search engine results for [Bob] - 
	 {'https://blog.devops.dev/a-deep-dive-into-docker-compose-for-devops-engineers-5b640bc47715': 'Docker Compose is a tool that makes it easier to manage the complexities of multi-container applications. Instead of managing each Docker container individually, Docker Compose allows you to define multiple containers, their configurations, networks, and volumes in a single docker-compose.yml file. With a single command, you can spin up or tear ...', 'https://reintech.io/blog/continuous-integration-workflows-docker-compose': 'In the world of software development, Continuous Integration (CI) has become an indispensable practice, allowing teams to merge their code changes into a central repository, where builds and tests are run automatically. This approach minimizes integration issues and leads to more reliable software. Docker Compose is a tool that can significantly simplify the CI process by defining 

Got search engine results: 5 for [how to enhance scalability in software development using Docker compose]


2024-04-21 17:32:45,411 [MainThread  ] [INFO ]  Genreted final answer answer='Integrating Docker Compose into the DevOps pipeline can greatly enhance automation and scalability in software development. Docker Compose simplifies the management of multi-container applications by allowing you to define multiple containers, configurations, networks, and volumes in a single docker-compose.yml file. This streamlines the process of spinning up or tearing down complex application environments with a single command, making it easier to manage and replicate development and testing setups. Additionally, using Docker Compose for environment configuration can help in defining and managing multi-container Docker applications efficiently. By leveraging Docker Compose, teams can enhance the Continuous Integration (CI) process, making it easier to build, test, and deploy applications automatically. This integration provides a robust environment for automating the CI/CD pipeline, leading to more efficie

InterviewState.from_dict: data is an instance of InterviewState
Routing messages for [Bob]
Continue asking question for [Bob] as this is not the last end of the conversation - ResponseCount: 1 of 3


2024-04-21 17:32:46,034 [MainThread  ] [INFO ]  Generated question for Bob: What are some best practices for integrating Docker compose and local Docker labs into the DevOps pipeline to ensure automation and scalability in the software development lifecycle?
2024-04-21 17:32:46,055 [MainThread  ] [INFO ]  START - Generate answers for [Bob]
2024-04-21 17:32:46,987 [MainThread  ] [INFO ]  Got 4 search engine queries for [Bob] -
	 ['best practices for integrating Docker compose in DevOps pipeline', 'best practices for integrating local Docker labs in DevOps pipeline', 'how to ensure automation in software development using Docker compose', 'how to ensure scalability in software development using Docker compose']


Searching DuckDuckGo for [best practices for integrating Docker compose in DevOps pipeline]
Got search engine results: 5 for [best practices for integrating Docker compose in DevOps pipeline]
Searching DuckDuckGo for [best practices for integrating local Docker labs in DevOps pipeline]
Got search engine results: 5 for [best practices for integrating local Docker labs in DevOps pipeline]
Searching DuckDuckGo for [how to ensure automation in software development using Docker compose]
Got search engine results: 5 for [how to ensure automation in software development using Docker compose]
Searching DuckDuckGo for [how to ensure scalability in software development using Docker compose]


2024-04-21 17:32:52,766 [MainThread  ] [INFO ]  Got 4 search engine results for [Bob] - 
	 {'https://blog.devops.dev/a-deep-dive-into-docker-compose-for-devops-engineers-5b640bc47715': 'Docker Compose is a tool that makes it easier to manage the complexities of multi-container applications. Instead of managing each Docker container individually, Docker Compose allows you to define multiple containers, their configurations, networks, and volumes in a single docker-compose.yml file. With a single command, you can spin up or tear ...', 'https://blog.devops.dev/devops-zero-to-hero-5-docker-compose-to-run-multi-container-docker-applications-f8e51db47f22': 'mkdir docker-compose cd docker-compose touch docker-compose.yml requirements.txt app.py Dockerfile mkdir -p static/css touch static/css/style.css mkdir templates touch templates/index.html. Dockerfile — This will be used to build the web application image. app.py — This will contain the Flask code for the web application we will build in 

Got search engine results: 5 for [how to ensure scalability in software development using Docker compose]


In [None]:
# NOTE(review): `x` is not defined in any cell above, so this cell raises NameError (see output).
# Presumably a deliberate tripwire that stops "Run All" before the scratch cells — confirm or delete.
x

NameError: name 'x' is not defined

In [None]:
# NOTE(review): bare name `f` is not defined in any visible cell; looks like leftover scratch — confirm or delete.
f 

In [None]:
# The outline chain takes a {"topic": ...} mapping. Build it explicitly from
# `example_topic` rather than relying on `t1`, which no cell in this notebook defines.
o1 = outline.invoke({"topic": example_topic})
o1

In [None]:
# The related-topics chain takes a {"topic": ...} mapping. Build it explicitly from
# `example_topic` rather than relying on `t1`, which no cell in this notebook defines.
r1 = expand_chain.invoke({"topic": example_topic})
r1

In [None]:
# Generate editor perspectives, using the related topics from `r1` as the "examples" input.
p1 = gen_perspectives_chain.invoke({"examples": r1.topics, "topic": example_topic})
p1

In [None]:
# Smoke-test the question generator with an empty persona.
c = get_chain_question_generator(fast_llm)
t2 = {"persona": ""}

q1 = c.invoke(t2)
q1

In [None]:
# Feed the generated question back in as a human message to exercise the query-expansion chain.
q_in = {"messages": [HumanMessage(content=q1.content, name="JohnSmith")]}
a1 = await gen_queries_chain.ainvoke(q_in)
a1

# Nodes

In [None]:

# Question node 

# Test 

# Hand-built interview state with a dummy editor and no prior messages or references.
state = InterviewState(
    interview_config=interview_config,
    editor=Editor(affiliation="Example University", name="John Doe", role="Lead Editor", description="Experienced in the field of biology."),
    messages=[],
    references={}
)

# Run the node on its own, then normalise its output back into an InterviewState.
q2 = await node_generate_question.ainvoke(state)
q2x = InterviewState.from_dict(q2)
q2x

In [None]:
# Answer node: consumes the state produced by the question node above.
q3 = await node_generate_answer.ainvoke(q2x)
q3x = InterviewState.from_dict(q3)
q3x

In [None]:
# Router check. NOTE(review): this passes the raw node output `q3`, not the normalised `q3x` — confirm intended.
node_route_messages(q3)

# Main Graph

In [None]:
# Run only the survey-subjects node on a fresh Interviews state.
i = Interviews(topic=example_topic, interview_config=interview_config)
i1 = await node_survey_subjects.ainvoke(i)

# Normalise the node output back into an Interviews object for inspection.
i1x = Interviews.from_dict(i1)


In [None]:
# Show the surveyed Interviews state.
print(i1x)

In [None]:
# Full graph run; same statements as the run cell near the top, but the result is displayed
# as the cell output instead of being stored.
i = Interviews(topic=example_topic, interview_config=interview_config)

g = StormGraph(interview_config=interview_config, topic=example_topic)
await g.graph.ainvoke(i.as_dict())

In [None]:

# Build the per-editor interview graph.
interview_graph = StormInterviewGraph1(interview_config=interview_config)

# Generate perspectives
# NOTE(review): `survey_subjects` is only defined further down in this notebook; on a fresh
# kernel this relies on a same-named object from `from storm import *` — verify.
perspectives = await survey_subjects.ainvoke(example_topic)

# Set perspectives
interview_graph.interviews.perspectives = perspectives
interview_graph.initialize_conversations()
logger.info(interview_graph.interviews.conversations.keys())



In [None]:
# Run interview

final_step = None
# await interview_graph.stream_and_return_results(
#     {"editor": perspectives.editors[0], "messages": []}
# )

initial_state : InterviewState = InterviewState(interview_config=interview_config, editor=p1.editors[0], messages=[], references={})
# {
#     "editor": p1.editors[0],
#     "messages": [
#         AIMessage(
#             content=f"So you said you were writing an article on {example_topic}?",
#             name="SubjectMatterExpert",
#         )
#     ],
# }
async for step in interview_graph.graph.astream(initial_state.as_dict()):
    name = next(iter(step))
    print(name)
    print(f"Processing step: {name}")
    print("-- ", str(step[name]["messages"])[:300])
    if END in step:
        final_step = step
        
final_state = next(iter(final_step.values()))

In [None]:
# final_state = next(iter(step.values()))


In [None]:
# Rehydrate the streamed final state into an InterviewState for inspection.
state2 = InterviewState.from_dict(final_state)
state2

In [None]:
# state2.trim_messages(max_characters=1000)
# final_state

In [None]:

## Generate Initial Outline

from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List, Optional
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Prompt for drafting an outline straight from the topic (no research yet).
# Template variables: {topic} and {format_instructions}.
direct_gen_outline_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a Wikipedia writer. Write an outline for a Wikipedia page about a user-provided topic. Be comprehensive and specific.",
        ),
        ("user", "{topic}\n{format_instructions}"),
    ]
)


# One subsection of an outline: a title plus its descriptive text.
# (Documented with comments rather than a class docstring so the generated
# pydantic schema, and therefore the parser's format instructions, stay unchanged.)
class Subsection(BaseModel):
    subsection_title: str = Field(..., title="Title of the subsection")
    description: str = Field(..., title="Content of the subsection")

    @property
    def as_str(self) -> str:
        """Render the subsection as a level-3 markdown heading followed by its description."""
        heading = f"### {self.subsection_title}"
        return f"{heading}\n\n{self.description}".strip()


# One top-level section of an outline, optionally carrying subsections.
# (Documented with comments rather than a class docstring so the generated
# pydantic schema, and therefore the parser's format instructions, stay unchanged.)
class Section(BaseModel):
    section_title: str = Field(..., title="Title of the section")
    description: str = Field(..., title="Content of the section")
    subsections: Optional[List[Subsection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )

    @property
    def as_str(self) -> str:
        """Render the section as a level-2 markdown heading, its description, then its subsections."""
        # `subsections` may be None; treat that the same as an empty list.
        rendered = [
            f"### {sub.subsection_title}\n\n{sub.description}"
            for sub in (self.subsections or [])
        ]
        body = "\n\n".join(rendered)
        return f"## {self.section_title}\n\n{self.description}\n\n{body}".strip()


# Whole-page outline: a page title plus an ordered list of sections.
# (Documented with comments rather than a class docstring so the generated
# pydantic schema, and therefore the parser's format instructions, stay unchanged.)
class Outline(BaseModel):
    page_title: str = Field(..., title="Title of the Wikipedia page")
    sections: List[Section] = Field(
        default_factory=list,
        title="Titles and descriptions for each section of the Wikipedia page.",
    )

    @property
    def as_str(self) -> str:
        """Render the outline as markdown: a level-1 title followed by each section's text."""
        rendered_sections = [section.as_str for section in self.sections]
        body = "\n\n".join(rendered_sections)
        return f"# {self.page_title}\n\n{body}".strip()


# Parser that turns the model's reply into an Outline instance.
outline_parser = PydanticOutputParser(pydantic_object=Outline)

# prompt (with the parser's format instructions pre-filled) -> fast_llm -> Outline
generate_outline_direct = (
    direct_gen_outline_prompt.partial(
        format_instructions=outline_parser.get_format_instructions()
    )
    | fast_llm
    | outline_parser
)


In [None]:

# NOTE(review): this rebinds `example_topic`; every later cell now works on this topic
# instead of the Docker topic set near the top of the notebook.
example_topic = "Impact of million-plus token context window language models on RAG"

initial_outline = generate_outline_direct.invoke({"topic": example_topic})

print(initial_outline.as_str)

In [None]:
## Expand Topics



In [None]:
# Prompt asking the model for related Wikipedia subjects.
# Template variables: {topic} and {format_instructions}.
# NOTE(review): the prompt text reads "list the as many subjects" — likely a typo; left as-is
# because changing it alters what is sent to the model.
gen_related_topics_prompt = ChatPromptTemplate.from_template(
    """I'm writing a Wikipedia page for a topic mentioned below. Please identify and recommend some Wikipedia pages on closely related subjects. I'm looking for examples that provide insights into interesting aspects commonly associated with this topic, or examples that help me understand the typical content and structure included in Wikipedia pages for similar topics.

Please list the as many subjects and urls as you can.

Topic of interest: {topic}
{format_instructions}
"""
)


# Structured output for the related-topics chain: a flat list of subject names.
# `topics` is later passed to the Wikipedia retriever as the batch of queries.
class RelatedSubjects(BaseModel):
    topics: List[str] = Field(
        description="Comprehensive list of related subjects as background research.",
    )


# Parser that turns the model's reply into a RelatedSubjects instance.
related_topics_parser = PydanticOutputParser(pydantic_object=RelatedSubjects)

# prompt (with format instructions pre-filled) -> fast_llm -> RelatedSubjects.
# This rebinds the `expand_chain` name created in the chain-setup cell near the top.
expand_chain = (
    gen_related_topics_prompt.partial(
        format_instructions=related_topics_parser.get_format_instructions()
    )
    | fast_llm
    | related_topics_parser
)


In [None]:
# Ask for subjects related to the current topic and display the parsed result.
related_subjects = await expand_chain.ainvoke({"topic": example_topic})
related_subjects

## Generate Perspectives

From these related subjects, we can select representative Wikipedia editors as "subject matter experts" with distinct backgrounds and affiliations. These will help distribute the search process to encourage a more well-rounded final report.


In [None]:
# A simulated Wikipedia editor whose identity steers the questions asked in an interview.
# (Documented with comments rather than a class docstring so the generated
# pydantic schema, and therefore the parser's format instructions, stay unchanged.)
class Editor(BaseModel):
    affiliation: str = Field(description="Primary affiliation of the editor.")
    name: str = Field(description="Name of the editor.")
    role: str = Field(description="Role of the editor in the context of the topic.")
    description: str = Field(
        description="Description of the editor's focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        """Return the editor as a newline-terminated "Key: value" block for use in prompts."""
        lines = (
            f"Name: {self.name}",
            f"Role: {self.role}",
            f"Affiliation: {self.affiliation}",
            f"Description: {self.description}",
        )
        return "\n".join(lines) + "\n"


# Structured output for the perspectives chain: the list of generated editors.
class Perspectives(BaseModel):
    editors: List[Editor] = Field(
        description="Comprehensive list of editors with their roles and affiliations.",
        # TODO: Add a pydantic validation/restriction to be at most M editors
    )

# Prompt asking the model to pick a diverse group of editors for the topic.
# Template variables: {examples} (system message), {topic} and {format_instructions} (user message).
gen_perspectives_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You need to select a diverse (and distinct) group of Wikipedia editors who will work together to create a comprehensive article on the topic. Each of them represents a different perspective, role, or affiliation related to this topic.\
    You can use other Wikipedia pages of related topics for inspiration. For each editor, add a description of what they will focus on.

    Wiki page outlines of related topics for inspiration:
    {examples}""",
        ),
        ("user", "Topic of interest: {topic}\n\n{format_instructions}"),
    ]
)

# Parser that turns the model's reply into a Perspectives instance.
perspectives_parser = PydanticOutputParser(pydantic_object=Perspectives)

# prompt (with format instructions pre-filled) -> fast_llm -> Perspectives.
# This rebinds the `gen_perspectives_chain` name created in the chain-setup cell near the top.
gen_perspectives_chain = (
    gen_perspectives_prompt.partial(
        format_instructions=perspectives_parser.get_format_instructions()
    )
    | fast_llm
    | perspectives_parser
)


In [None]:
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.runnables import RunnableLambda, chain as as_runnable

# One Wikipedia result per query, with all available metadata loaded
# (format_doc below reads the "title" and "categories" metadata entries).
wikipedia_retriever = WikipediaRetriever(load_all_available_meta=True, top_k_results=1)


def format_doc(doc, max_length=1000)-> str:
    """Render one retrieved document as a short markdown snippet.

    Args:
        doc: Object exposing ``metadata`` (a mapping with a ``"title"`` entry and,
            optionally, a ``"categories"`` list) and ``page_content`` (the text).
        max_length: Maximum number of characters to keep; the rendered text is
            truncated to this length.

    Returns:
        A string with the title as a level-3 heading, the page content as the
        summary, and the categories as a bulleted "Related" list.

    Raises:
        KeyError: If ``doc.metadata`` has no ``"title"`` entry.
    """
    # A missing or empty category list simply yields an empty "Related" section.
    categories = doc.metadata.get("categories") or []
    # One "- item" bullet per line; joining with "- " glued the dash onto the
    # previous item ("A- B- C") instead of producing a list.
    related = "\n".join(f"- {category}" for category in categories)
    text = f"### {doc.metadata['title']}\n\nSummary: {doc.page_content}\n\nRelated\n{related}"
    return text[:max_length]


def format_docs(docs):
    """Format every document with ``format_doc`` and join the results with blank lines."""
    snippets = [format_doc(entry) for entry in docs]
    return "\n\n".join(snippets)


@as_runnable
async def survey_subjects(topic: str)-> Perspectives:
    """Generate editor perspectives for ``topic``.

    Expands the topic into related subjects, looks each one up on Wikipedia,
    formats the retrieved pages, and passes them as examples to the
    perspectives chain.

    Args:
        topic: The topic to research.

    Returns:
        The result of ``gen_perspectives_chain`` for this topic.
    """
    print(f"Survey Subjects for Topic: {topic}")
    subjects = await expand_chain.ainvoke({"topic": topic})
    batches = await wikipedia_retriever.abatch(subjects.topics, return_exceptions=True)

    # With return_exceptions=True a failed lookup comes back as an exception
    # object in place of a document list; those entries are skipped.
    all_docs = [
        doc
        for batch in batches
        if not isinstance(batch, BaseException)
        for doc in batch
    ]
    print(f"Retrieved {len(all_docs)} docs for Topic: {topic}")

    examples = format_docs(all_docs)
    return await gen_perspectives_chain.ainvoke({"examples": examples, "topic": topic})

In [None]:
# Generate editor perspectives for the current topic.
perspectives = await survey_subjects.ainvoke(example_topic)


In [None]:

# Display the generated perspectives as a plain dict.
perspectives.dict()


## Expert Dialog

Each Wikipedia writer is primed to role-play using the perspectives presented above. It will ask a series of questions of a second "domain expert" with access to a search engine. This generates content used to produce a refined outline as well as an updated index of reference documents.

### Interview State

The conversation is cyclic, so we will construct it within its own graph. The State will contain messages, the reference docs, and the editor (with its own "persona") to make it easy to parallelize these conversations.


In [None]:
from langgraph.graph import StateGraph, END
from typing_extensions import TypedDict
from langchain_core.messages import AnyMessage
from typing import Annotated, Sequence


def add_messages(left, right):
    """Concatenate two message collections into a new list.

    Either argument may be a list or a single item; anything that is not a
    list is wrapped in a one-element list first.
    """
    head = left if isinstance(left, list) else [left]
    tail = right if isinstance(right, list) else [right]
    return head + tail


def update_references(references, new_references):
    """Merge ``new_references`` into ``references`` and return the result.

    Args:
        references: Existing reference mapping, or a falsy value (``None``/``{}``)
            when there is none yet.
        new_references: Mapping of additional references; entries here override
            existing ones with the same key. ``None`` or empty means "nothing to add".

    Returns:
        A new dict containing the merged references. Neither argument is modified.
    """
    # Copy first so the caller's dict is never mutated as a side effect of merging.
    merged = dict(references) if references else {}
    # dict.update(None) raises TypeError, so only merge when there is something to add.
    if new_references:
        merged.update(new_references)
    return merged


def update_editor(editor, new_editor):
    """Keep the existing editor if one is set; otherwise adopt ``new_editor``."""
    # Can only set at the outset
    return editor if editor else new_editor


# State carried through one interview. Each field is annotated with the function
# that combines the current value with a new one: messages are concatenated
# (add_messages), references are merged (update_references), and the editor is
# only taken when none is set yet (update_editor).
# NOTE(review): earlier cells use an `InterviewState` that has `from_dict`/`as_dict` and
# keyword construction (presumably from `storm`); running this cell rebinds the name.
class InterviewState(TypedDict):
    messages: Annotated[List[AnyMessage], add_messages]
    references: Annotated[Optional[dict], update_references]
    editor: Annotated[Optional[Editor], update_editor]

# Dialog Roles

The graph will have two participants: the Wikipedia editor (`generate_question`), who asks questions based on its assigned role, and a domain expert (`gen_answer_chain`), who uses a search engine to answer the questions as accurately as possible.


In [None]:
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage


# Prompt for the editor side of the interview: a system message carrying the editor's
# {persona}, followed by the (optional) conversation so far under "messages".
gen_qn_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an experienced Wikipedia writer and want to edit a specific page. \
Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
Now, you are chatting with an expert to get information. Ask good questions to get more useful information.

When you have no more questions to ask, say "Thank you so much for your help!" to end the conversation.\
Please only ask one question at a time and don't ask what you have asked before.\
Your questions should be related to the topic you want to write.
Be comprehensive and curious, gaining as much unique insight from the expert as possible.\

Stay true to your specific perspective:

{persona}""",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)


def tag_with_name(ai_message: AIMessage, name: str) -> AIMessage:
    """Stamp ``name`` onto the message and hand the same object back.

    The message is modified in place; no copy is made.
    """
    setattr(ai_message, "name", name)
    return ai_message


def swap_roles(state: InterviewState, name: str) -> InterviewState:
    """Re-express the conversation from the point of view of speaker ``name``.

    Every ``AIMessage`` whose ``name`` differs from the (normalized) speaker
    name is rebuilt as a ``HumanMessage`` with the same fields, minus
    ``type``. All other messages are passed through unchanged.

    Args:
        state: Interview state; only ``state["messages"]`` is read.
        name: Speaker name; normalized with ``cleanup_name`` before comparing.

    Returns:
        A dict with a single ``"messages"`` key holding the new list.
    """
    # Normalize name
    name = cleanup_name(name)
    print(f'Swapping roles for {name}')

    def _from_speakers_view(msg):
        spoken_by_someone_else = isinstance(msg, AIMessage) and msg.name != name
        if spoken_by_someone_else:
            return HumanMessage(**msg.dict(exclude={"type"}))
        return msg

    swapped = [_from_speakers_view(msg) for msg in state["messages"]]
    print(f'Converted messages for {name} while swapping roles: {len(swapped)} messages')

    return {"messages": swapped}


@as_runnable
async def generate_question(state: InterviewState) -> InterviewState:
    """Have the interviewing editor produce its next message.

    Builds a chain that swaps roles for the editor, fills the editor's
    persona into ``gen_qn_prompt``, calls ``fast_llm``, and tags the reply
    with the editor's (normalized) name.

    Args:
        state: Interview state; ``state["editor"]`` supplies ``name`` and
            ``persona``, and the whole state is passed to the chain.

    Returns:
        A dict with a ``"messages"`` list containing the single new message.
    """
    editor = state["editor"]
    name = cleanup_name(editor.name)
    print(f'Generating question for {name}')

    role_swapper = RunnableLambda(swap_roles).bind(name=name)
    persona_prompt = gen_qn_prompt.partial(persona=editor.persona)
    name_tagger = RunnableLambda(tag_with_name).bind(name=name)
    gn_chain = role_swapper | persona_prompt | fast_llm | name_tagger

    result: AIMessage = await gn_chain.ainvoke(state)
    print(f'Generated question for {name}')

    return {"messages": [result]}

In [None]:
# Try the question generator: seed the conversation with one opening message
# and run it for the first editor in `perspectives`.
messages = [
    HumanMessage(f"So you said you were writing an article on {example_topic}?")
]
question = await generate_question.ainvoke(
    {
        "editor": perspectives.editors[0],
        "messages": messages,
    }
)

# Show the generated message.
question["messages"][0]

### Answer questions

The `gen_answer_chain` first generates queries (query expansion) to answer the editor's question, then responds with citations.


In [None]:
class Queries(BaseModel):
    # Structured output for query expansion: the list of search queries to
    # run for the current question. Parsed by `queries_parser` below.
    queries: List[str] = Field(
        description="Comprehensive list of search engine queries to answer the user's questions.",
    )


# Prompt asking the model for search queries; {format_instructions} is filled
# in from the parser below and the conversation goes under "messages".
gen_queries_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful research assistant. Query the search engine to answer the user's questions.\n{format_instructions}",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)

# Parses the model's reply into a Queries instance.
queries_parser = PydanticOutputParser(pydantic_object=Queries)

# prompt (with format instructions pre-filled) -> fast_llm -> Queries
gen_queries_chain = gen_queries_prompt.partial(format_instructions=queries_parser.get_format_instructions()) | fast_llm | queries_parser

In [None]:

# Try query expansion on the question generated above, passed in as a
# single human message.
queries = await gen_queries_chain.ainvoke(
    {"messages": [HumanMessage(content=question["messages"][0].content)]}
)

# Show the parsed queries.
queries

In [None]:

class AnswerWithCitations(BaseModel):
    # Structured expert reply: the answer text plus the URLs it cites.
    answer: str = Field(
        description="Comprehensive answer to the user's question with citations.",
    )
    cited_urls: List[str] = Field(
        description="List of urls cited in the answer.",
    )

    @property
    def as_str(self) -> str:
        """Render the answer followed by a numbered (1-based) citation list."""
        footnotes = [
            f"[{position}]: {url}"
            for position, url in enumerate(self.cited_urls, start=1)
        ]
        return f"{self.answer}\n\nCitations:\n\n" + "\n".join(footnotes)


# Prompt for the expert's answer: a system message with the role description
# and {format_instructions}, followed by the conversation under "messages".
gen_answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert who can use information effectively. You are chatting with a Wikipedia writer who wants\
 to write a Wikipedia page on the topic you know. You have gathered the related information and will now use the information to form a response.

Make your response as informative as possible and make sure every sentence is supported by the gathered information.
Each response must be backed up by a citation from a reliable source, formatted as a footnote, reproducing the URLS after your response.
{format_instructions}""",
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)

# Parses the model's reply into an AnswerWithCitations instance.
ac_parser = PydanticOutputParser(pydantic_object=AnswerWithCitations)

# prompt (with format instructions pre-filled) -> fast_llm -> AnswerWithCitations
gen_answer_chain = gen_answer_prompt.partial(format_instructions=ac_parser.get_format_instructions()) | fast_llm | ac_parser 

# Alternative left disabled: structured output instead of the parser.
# .with_structured_output(
#     AnswerWithCitations, include_raw=True
# ).with_config(run_name="GenerateAnswer")
                                             

In [None]:
from langchain_community.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper
from langchain_core.tools import tool

# DDG
# One wrapper instance shared by every tool call. It has its own name so it
# is not shadowed by the `search_engine` tool defined below.
_ddg_search_wrapper = DuckDuckGoSearchAPIWrapper()


@tool
async def search_engine(query: str):
    """Search engine to the internet."""
    # Local import keeps this cell independent of later cells' imports.
    import asyncio

    print(f"Searching DuckDuckGo for [{query}]")

    # `_ddgs_text` is a synchronous call; run it in a worker thread so that
    # batched queries can overlap instead of blocking the event loop.
    results = await asyncio.to_thread(_ddg_search_wrapper._ddgs_text, query)

    print(f"Got search engine results: {len(results)} for [{query}]")

    # Keep only the snippet text and the URL of each hit.
    return [{"content": r["body"], "url": r["href"]} for r in results]

In [None]:
from langchain_core.runnables import RunnableConfig
import json, re


async def gen_answer(
    state: InterviewState,
    config: Optional[RunnableConfig] = None,
    name: str = "SubjectMatterExpert",
    max_str_len: int = 15000,
):
    """Answer the latest question as the expert, backed by web search.

    Steps: swap roles so the expert is the speaker, generate search queries,
    run them through ``search_engine``, append the queries and the (truncated)
    search results to a private copy of the conversation, and ask
    ``gen_answer_chain`` for an answer with citations.

    Args:
        state: Interview state; its messages are read via ``swap_roles``.
        config: Runnable config forwarded to the search batch call.
        name: Speaker name for the expert; normalized with ``cleanup_name``.
        max_str_len: Maximum number of characters of the JSON-dumped search
            results passed to the model.

    Returns:
        A dict with ``"messages"`` (one ``AIMessage`` holding the formatted
        answer) and ``"references"`` (url -> content for the cited URLs that
        appeared in the search results).
    """
    name = cleanup_name(name)

    print(f'Generating answers for [{name}]')

    swapped_state = swap_roles(state, name)  # Convert all other AI messages

    queries: Queries = await gen_queries_chain.ainvoke(swapped_state)

    print(f"Got {len(queries.queries)} search engine queries for [{name}]")

    # Failed searches come back as exception objects and are dropped below.
    query_results = await search_engine.abatch(
        queries.queries, config, return_exceptions=True
    )
    successful_results = [
        res for res in query_results if not isinstance(res, Exception)
    ]

    print(f"Got {len(successful_results)} search engine results for [{name}]")

    # Flatten all hits into url -> content; a URL seen twice keeps the last hit.
    all_query_results = {
        res["url"]: res["content"] for results in successful_results for res in results
    }

    # We could be more precise about handling max token length if we wanted to here
    dumped = json.dumps(all_query_results)[:max_str_len]

    # Record the generated queries as the expert's own turn. This used to be a
    # bare string despite being annotated as an AIMessage.
    ai_message = AIMessage(name=name, content=str(queries))
    # The search results are fed back as the next human turn.
    tool_message = HumanMessage(content=dumped)

    swapped_state["messages"].extend([ai_message, tool_message])

    # Only update the shared state with the final answer to avoid
    # polluting the dialogue history with intermediate messages
    try:
        generated: AnswerWithCitations = await gen_answer_chain.ainvoke(swapped_state)
    except Exception as e:
        # Best effort: fall back to an empty answer so the interview continues.
        print(f"Error generating answer for [{name}] - {e}")
        generated = AnswerWithCitations(answer="", cited_urls=[])

    cited_urls = set(generated.cited_urls)

    # Save the retrieved information to the shared state for future reference
    cited_references = {k: v for k, v in all_query_results.items() if k in cited_urls}

    formatted_message = AIMessage(name=name, content=generated.as_str)

    print(f'Finished generating answer for [{name}]')
    return {"messages": [formatted_message], "references": cited_references}

In [None]:

# Try the expert on the question generated earlier, passed in as a single
# human message.
example_answer = await gen_answer(
    {"messages": [HumanMessage(content=question["messages"][0].content)]}
)
# Show the text of the last returned message.
example_answer["messages"][-1].content

# Construct the Interview Graph

Now that we've defined the editor and domain expert, we can compose them in a graph.


In [None]:
# Build the interview graph over InterviewState.
max_num_turns = 5
# NOTE(review): `route_messages` is used below but not defined in this cell;
# presumably it comes from `from storm import *`. Confirm, and confirm
# whether `max_num_turns` is still read by it.




builder = StateGraph(InterviewState)

# Two nodes: the editor asks, the expert answers.
builder.add_node("ask_question", generate_question)
builder.add_node("answer_question", gen_answer)
# After each answer, route_messages decides where the graph goes next.
builder.add_conditional_edges("answer_question", route_messages)
builder.add_edge("ask_question", "answer_question")

builder.set_entry_point("ask_question")
interview_graph = builder.compile().with_config(run_name="Conduct Interviews")

In [None]:
from IPython.display import Image

# comment out if you have not installed pygraphviz
# Image(interview_graph.get_graph().draw_png())

In [None]:

# Run one interview for the first editor and stream the graph's steps.
final_step = None

# The conversation opens with a message attributed to the expert.
initial_state = {
    "editor": perspectives.editors[0],
    "messages": [
        AIMessage(
            content=f"So you said you were writing an article on {example_topic}?",
            name="SubjectMatterExpert",
        )
    ],
}
async for step in interview_graph.astream(initial_state):
    # Each streamed step is a mapping; report its first key and a preview.
    name = next(iter(step))
    print(name)
    print(f"Processing step: {name}")
    print("-- ", str(step[name]["messages"])[:300])
    # NOTE(review): `final_step` stays None unless some step contains END, in
    # which case the line after the loop fails. Confirm that the installed
    # langgraph version emits an END step.
    if END in step:
        final_step = step

# Take the first value of the captured final step as the final state.
final_state = next(iter(final_step.values()))

In [None]:
# Display the final interview state captured above.
final_state

## Refine Outline

At this point in STORM, we've conducted a large amount of research from different perspectives. It's time to refine the original outline based on these investigations. Below, create a chain using the LLM with a long context window to update the original outline.


In [None]:
# Prompt for revising the outline: the system message carries the {topic} and
# {old_outline}; the user message carries the interview {conversations} and
# the parser's {format_instructions}.
refine_outline_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a Wikipedia writer. You have gathered information from experts and search engines. Now, you are refining the outline of the Wikipedia page. \
You need to make sure that the outline is comprehensive and specific. \
Topic you are writing about: {topic} 

Old outline:

{old_outline}
""",
        ),
        (
            "user",
            "Refine the outline based on your conversations with subject-matter experts:\n\nConversations:\n\n{conversations}\n\n{format_instructions}\n\nWrite the refined Wikipedia outline:",
        ),
    ]
)


# Uses long_context_llm since the conversation transcript can get quite long.
# NOTE(review): `outline_parser` is not defined in this cell; presumably it
# comes from an earlier cell or `from storm import *` - confirm.
refine_outline_chain = refine_outline_prompt.partial(format_instructions=outline_parser.get_format_instructions()) | long_context_llm | outline_parser

In [None]:
# Refine the initial outline using the example interview. Each message is
# rendered as a "### <name>" heading followed by its content.
refined_outline = refine_outline_chain.invoke(
    {
        "topic": example_topic,
        "old_outline": initial_outline.as_str,
        "conversations": "\n\n".join(
            f"### {m.name}\n\n{m.content}" for m in final_state["messages"]
        ),
    }
)

In [None]:
# Print the refined outline in its text form.
print(refined_outline.as_str)

In [None]:
## Generate Article

In [None]:
from langchain_core.documents import Document

from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# One document per reference collected in the interview: the content is the
# page text and the source URL goes into the metadata.
reference_docs = [
    Document(page_content=v, metadata={"source": k})
    for k, v in final_state["references"].items()
]

print(f"Number of references: {len(reference_docs)}")

# This really doesn't need to be a vectorstore for this size of data.
# It could just be a numpy matrix. Or you could store documents
# across requests if you want.
vectorstore = SKLearnVectorStore.from_documents(
    reference_docs,
    embedding=embeddings,
)

# The number of results must be passed through `search_kwargs`; a bare `k=10`
# keyword on `as_retriever` does not set it. The value is capped at the number
# of indexed documents, since a nearest-neighbour lookup cannot ask for more
# neighbours than there are documents.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": max(1, min(10, len(reference_docs)))}
)

In [None]:
# Try the retriever with a sample query.
retriever.invoke("What's a long context LLM anyway?")

#### Generate Sections

Now you can generate the sections using the indexed docs.


In [None]:
class SubSection(BaseModel):
    # One subsection of a wiki section: a title plus its full text.
    subsection_title: str = Field(..., title="Title of the subsection")
    content: str = Field(
        ...,
        title="Full content of the subsection. Include [#] citations to the cited sources where relevant.",
    )

    @property
    def as_str(self) -> str:
        """Render as a level-3 markdown heading followed by the content."""
        heading = f"### {self.subsection_title}"
        rendered = f"{heading}\n\n{self.content}"
        return rendered.strip()


class WikiSection(BaseModel):
    # One top-level section of the generated wiki page: title, body text,
    # optional subsections and the list of cited sources.
    section_title: str = Field(..., title="Title of the section")
    content: str = Field(..., title="Full content of the section")
    # The model defined in this cell is `SubSection`; the annotation used to
    # reference the undefined name `Subsection`.
    subsections: Optional[List[SubSection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )
    citations: List[str] = Field(default_factory=list)

    @property
    def as_str(self) -> str:
        """Render the section as markdown.

        Output is a level-2 heading, the content, each subsection, and, when
        there are citations, a blank line followed by one " [i] source" line
        per citation (numbered from 0).
        """
        subsections = "\n\n".join(
            subsection.as_str for subsection in self.subsections or []
        )
        body = f"## {self.section_title}\n\n{self.content}\n\n{subsections}".strip()
        if not self.citations:
            return body
        citations = "\n".join(f" [{i}] {cit}" for i, cit in enumerate(self.citations))
        # Keep a blank line between the body and the citation list; stripping
        # the separator used to glue the first citation onto the body text.
        return f"{body}\n\n{citations}"


# Prompt for drafting one section: the system message carries the {outline}
# and the retrieved {docs}; the user message names the {section} to write and
# carries the parser's {format_instructions}.
# The reference block is closed with </Documents>; it previously ended with a
# second opening <Documents> tag.
section_writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia writer. Complete your assigned WikiSection from the following outline:\n\n"
            "{outline}\n\nCite your sources, using the following references:\n\n<Documents>\n{docs}\n</Documents>",
        ),
        ("user", "Write the full WikiSection for the {section} section.\n{format_instructions}"),
    ]
)


async def retrieve(inputs: dict):
    """Fetch reference documents for a section and add them to the inputs.

    Args:
        inputs: Must contain ``"topic"`` and ``"section"``; the retriever is
            queried with ``"<topic>: <section>"``.

    Returns:
        A dict with ``"docs"`` (the retrieved documents rendered as
        ``<Document href="...">`` blocks, one per document) followed by all
        entries of ``inputs``. A ``"docs"`` key in ``inputs`` takes precedence.
    """
    docs = await retriever.ainvoke(inputs["topic"] + ": " + inputs["section"])
    # Open each element with a normal start tag; the old self-closing
    # `<Document .../>` left the content and `</Document>` unmatched.
    formatted = "\n".join(
        f'<Document href="{doc.metadata["source"]}">\n{doc.page_content}\n</Document>'
        for doc in docs
    )
    return {"docs": formatted, **inputs}

# Parses the model's reply into a WikiSection instance.
wiki_parser = PydanticOutputParser(pydantic_object=WikiSection)

# retrieve docs -> prompt (with format instructions pre-filled)
# -> long_context_llm -> WikiSection
section_writer = (
    retrieve
    | section_writer_prompt.partial(format_instructions=wiki_parser.get_format_instructions())
    | long_context_llm
    | wiki_parser
)

In [None]:
# Try the section writer on the section at index 1 of the refined outline.
section = await section_writer.ainvoke(
    {
        "outline": refined_outline.as_str,
        "section": refined_outline.sections[1].section_title,
        "topic": example_topic,
    }
)
print(section.as_str)

#### Generate final article

Now we can rewrite the draft to appropriately group all the citations and maintain a consistent voice.


In [None]:
from langchain_core.output_parsers import StrOutputParser

# Prompt for the final article: the system message carries the {topic} and
# the section {draft}; the user message asks for markdown with footnote
# citations. A stray `,""` after the "[1]" example has been removed from the
# user message.
writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia author. Write the complete wiki article on {topic} using the following section drafts:\n\n"
            "{draft}\n\nStrictly follow Wikipedia format guidelines.",
        ),
        (
            "user",
            'Write the complete Wiki article using markdown format. Organize citations using footnotes like "[1]", avoiding duplicates in the footer. Include URLs in the footer.',
        ),
    ]
)

# prompt -> long_context_llm -> plain string
writer = writer_prompt | long_context_llm | StrOutputParser()

In [None]:
# Stream the article for the single example section, printing chunks as they
# arrive without extra newlines.
for tok in writer.stream({"topic": example_topic, "draft": section.as_str}):
    print(tok, end="")

## Final Flow

Now it's time to string everything together. We will have 6 main stages in sequence:

1. Generate the initial outline + perspectives
2. Batch converse with each perspective to expand the content for the article
3. Refine the outline based on the conversations
4. Index the reference docs from the conversations
5. Write the individual sections of the article
6. Write the final wiki

The state tracks the outputs of each stage.


In [None]:
class ResearchState(TypedDict):
    """State passed between the stages of the full research pipeline."""

    # Topic of the article to write.
    topic: str
    # Current outline of the article.
    outline: Outline
    # Editors whose perspectives drive the interviews.
    editors: List[Editor]
    # One interview state per conducted interview.
    interview_results: List[InterviewState]
    # The final sections output
    sections: List[WikiSection]
    # Text of the finished article.
    article: str

In [None]:
import asyncio


async def initialize_research(state: ResearchState):
    """Produce the initial outline and the set of editors for the topic.

    Runs ``generate_outline_direct`` and ``survey_subjects`` concurrently and
    returns the state extended with ``"outline"`` and ``"editors"``.
    """
    topic = state["topic"]
    drafted_outline, surveyed = await asyncio.gather(
        generate_outline_direct.ainvoke({"topic": topic}),
        survey_subjects.ainvoke(topic),
    )

    next_state = dict(state)
    next_state["outline"] = drafted_outline
    next_state["editors"] = surveyed.editors
    return next_state


async def conduct_interviews(state: ResearchState):
    """Run one interview per editor and store the results in the state.

    Every interview starts from the same opening message, attributed to
    "SubjectMatterExpert", that mentions the topic.
    """
    topic = state["topic"]

    def _opening_state(editor):
        greeting = AIMessage(
            content=f"So you said you were writing an article on {topic}?",
            name="SubjectMatterExpert",
        )
        return {"editor": editor, "messages": [greeting]}

    initial_states = [_opening_state(editor) for editor in state["editors"]]

    # We call in to the sub-graph here to parallelize the interviews
    interview_results = await interview_graph.abatch(initial_states)

    return {**state, "interview_results": interview_results}


def format_conversation(interview_state):
    """Render one interview as plain text.

    The result is a "Conversation with <editor name>" header, a blank line,
    then one "<name>: <content>" line per message.
    """
    turns = [f"{m.name}: {m.content}" for m in interview_state["messages"]]
    header = f'Conversation with {interview_state["editor"].name}\n\n'
    return header + "\n".join(turns)


async def refine_outline(state: ResearchState):
    """Update the outline using the transcripts of all interviews.

    Each interview is rendered with ``format_conversation``; the transcripts
    are joined by blank lines and passed to ``refine_outline_chain`` together
    with the topic and the current outline text.
    """
    transcripts = [
        format_conversation(interview_state)
        for interview_state in state["interview_results"]
    ]
    chain_input = {
        "topic": state["topic"],
        "old_outline": state["outline"].as_str,
        "conversations": "\n\n".join(transcripts),
    }

    updated_outline = await refine_outline_chain.ainvoke(chain_input)
    return {**state, "outline": updated_outline}


async def index_references(state: ResearchState):
    """Add the references gathered in every interview to the vector store.

    Each reference becomes one ``Document`` whose content is the reference
    text and whose ``source`` metadata is its key (the URL).

    Returns:
        The state, unchanged.
    """
    all_docs = [
        Document(page_content=content, metadata={"source": source})
        for interview_state in state["interview_results"]
        # An interview that cited nothing may have no references (missing
        # key or None); treat that as an empty mapping instead of crashing.
        for source, content in (interview_state.get("references") or {}).items()
    ]
    # Skip the store call entirely when there is nothing to index.
    if all_docs:
        await vectorstore.aadd_documents(all_docs)
    return state


async def write_sections(state: ResearchState):
    """Draft every section of the outline held in the state.

    One ``section_writer`` request is issued per outline section, all in a
    single batch.

    Returns:
        The state extended with ``"sections"``.
    """
    outline = state["outline"]
    # Use the outline from the pipeline state. The notebook-level
    # `refined_outline` variable belongs to the earlier example run and may
    # be for a different topic (or not exist at all).
    outline_text = outline.as_str
    sections = await section_writer.abatch(
        [
            {
                "outline": outline_text,
                "section": section.section_title,
                "topic": state["topic"],
            }
            for section in outline.sections
        ]
    )
    return {
        **state,
        "sections": sections,
    }


async def write_article(state: ResearchState):
    """Combine the section drafts into the final article.

    The drafts' text forms are joined by blank lines and handed to ``writer``
    together with the topic; the result is stored under ``"article"``.
    """
    topic, sections = state["topic"], state["sections"]
    draft = "\n\n".join(section.as_str for section in sections)
    article = await writer.ainvoke({"topic": topic, "draft": draft})
    return {**state, "article": article}

#### Create the graph


In [None]:
builder_of_storm = StateGraph(ResearchState)

# Pipeline stages, listed in the order they run.
nodes = [
    ("init_research", initialize_research),
    ("conduct_interviews", conduct_interviews),
    ("refine_outline", refine_outline),
    ("index_references", index_references),
    ("write_sections", write_sections),
    ("write_article", write_article),
]

# Register each stage and chain it to the stage registered just before it.
previous_name = None
for name, node in nodes:
    builder_of_storm.add_node(name, node)
    if previous_name is not None:
        builder_of_storm.add_edge(previous_name, name)
    previous_name = name

# The first stage is the entry point and the last one the finish point.
first_stage, last_stage = nodes[0][0], nodes[-1][0]
builder_of_storm.set_entry_point(first_stage)
builder_of_storm.set_finish_point(last_stage)
storm = builder_of_storm.compile()

In [None]:
# Run the full pipeline on one topic and print a preview of each streamed step.
async for step in storm.astream(
    {
        "topic": "Building better slack bots using LLMs",
    }
):
    name = next(iter(step))
    print(name)
    print("-- ", str(step[name])[:300])
    # NOTE(review): `results` is only bound if some step contains END. If none
    # does, any later use of `results` raises NameError. Confirm that the
    # installed langgraph version emits an END step.
    if END in step:
        results = step

In [None]:
# Pull the finished article text out of the END step captured above.
article = results[END]["article"]

## Render the Wiki

Now we can render the final wiki page!


In [None]:
from IPython.display import Markdown

# We will down-header the sections to create less confusion in this notebook:
# every heading that starts a line (after a newline) gets one extra "#".
Markdown(article.replace("\n#", "\n##"))