# Storm Research Assistant

Reference
https://github.com/langchain-ai/langgraph/blob/main/examples/storm/storm.ipynb


In [1]:
## Prereqs

# %pip install -U langchain_community langchain_openai langgraph wikipedia  scikit-learn  langchain_fireworks
# We use one or the other search engine below
# %pip install -U tavily-python
# %pip install -U duckduckgo-search
# ! apt-get install graphviz graphviz-dev
# %pip install pygraphviz



In [2]:
from storm import *
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Two chat models are configured: a "fast" one and a "long context" one.
# Both are handed to InterviewConfig below.
fast_llm = ChatOpenAI(model="gpt-3.5-turbo")
# long_context_llm = ChatOpenAI(model="gpt-4-turbo-preview")
long_context_llm = ChatOpenAI(model="gpt-3.5-turbo-0125")


# Embedding model shared by the vector store and the interview configuration.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store configured with a persist directory and the embeddings above.
# NOTE(review): `Chroma`, `InterviewConfig` and `RunnableConfig` are not imported
# explicitly in this cell; presumably they arrive via `from storm import *` - confirm.
vectorstore_dir = "./data/storm/vectorstore/"
vectorstore = Chroma(persist_directory=vectorstore_dir, embedding_function=embeddings)

# Single configuration object passed to the interview/research states in later cells.
interview_config = InterviewConfig(long_llm=long_context_llm, 
                                   fast_llm=fast_llm, 
                                   max_conversations=5, 
                                   max_reference_length=10000,
                                   tags_to_extract=[ "p", "h1", "h2", "h3"],
                                   embeddings=embeddings,
                                   vectorstore=vectorstore,
                                   vectorstore_dir=vectorstore_dir,
                                   runnable_config=RunnableConfig()
                                   )

In [3]:
# Research topic reused by later cells (ResearchState construction, section writer and article writer inputs).
example_topic = "Covid-19 impact in 2024"


In [4]:


# Test generate question
#
# Builds an InterviewState by hand (one hard-coded Editor, no messages, no references,
# empty summary) and runs the question-generation node on it.
# NOTE: the top-level `await` below only works in IPython/Jupyter, not in a plain script.

state = InterviewState(
    interview_config=interview_config,
    editor=Editor(affiliation="Example University", name="John Doe", role="Lead Editor", description="Experienced in the field of biology."),
    messages=[],
    references={},
    summary=""
)

await node_generate_question.ainvoke(state)

2024-04-03 09:47:57,512 [MainThread  ] [INFO ]  Generating question for JohnDoe
2024-04-03 09:47:59,566 [MainThread  ] [INFO ]  Generated question for JohnDoe: What are some recent advancements in the field of molecular biology that have had a significant impact on our understanding of genetic regulation?


{'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118ba1de0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba3520>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118b5b430>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba08e0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x118ba3730>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x118c010f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', e

In [5]:

# Test generate answer
#
# Runs the answer-generation node on the `state` object built in the previous cell.
# NOTE(review): `state` was created with messages=[]; this cell presumably relies on the
# question node having updated that object in place - confirm.

ax1 = await node_generate_answer.ainvoke(state)
# Bare last expression so the notebook displays the node's return value.
ax1

2024-04-03 09:47:59,588 [MainThread  ] [INFO ]  START - Generate answers for [JohnDoe]
2024-04-03 09:48:00,538 [MainThread  ] [INFO ]  Got 2 search engine queries for [JohnDoe] -
	 ['Recent advancements in molecular biology impacting genetic regulation', 'Significant discoveries in genetic regulation in molecular biology']


Searching DuckDuckGo for [Recent advancements in molecular biology impacting genetic regulation]
Got search engine results: 5 for [Recent advancements in molecular biology impacting genetic regulation]
Searching DuckDuckGo for [Significant discoveries in genetic regulation in molecular biology]


2024-04-03 09:48:02,894 [MainThread  ] [INFO ]  Got 2 search engine results for [JohnDoe] - 
	 {'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10376292/': 'Next-generation sequencing (NGS) is a powerful tool used in genomics research. NGS can sequence millions of DNA fragments at once, providing detailed information about the structure of genomes, genetic variations, gene activity, and changes in gene behavior. Recent advancements have focused on faster and more accurate sequencing, reduced costs ...', 'https://www.nature.com/articles/s41576-024-00709-x': 'Advancements in genetic research from 1977 to 2023 have enabled high-resolution variant identification, large-scale DNA sequencing, cell-type-specific regulation understanding and breakthroughs in ...', 'https://www.nature.com/subjects/gene-regulation/nrg': 'In this Review, Preissl, Gaulton and Ren discuss single-cell epigenomic methods and data analysis tools, their readiness for profiling cis -regulatory elements in human tissues an

Got search engine results: 5 for [Significant discoveries in genetic regulation in molecular biology]


2024-04-03 09:48:10,304 [MainThread  ] [INFO ]  Genreted final answer answer='Next-generation sequencing (NGS) has revolutionized genomics research by enabling detailed sequencing of DNA fragments, providing insights into genetic variations, gene activity, and changes in gene behavior. Advancements in genetic research from 1977 to 2023 have led to high-resolution variant identification, large-scale DNA sequencing, and understanding cell-type-specific regulation. Recent research in molecular biology has focused on single-cell epigenomic methods, profiling cis-regulatory elements in human tissues, and understanding gene regulation mechanisms through histone modifications and chromatin alterations. Alternative splicing has been identified as an important gene regulation mechanism in response to environmental factors. Studies have also shown the significance of gene replication in driving cell cycle transcriptional dynamics. Additionally, research has integrated large-scale data to explore

{'interview_config': InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118ba1de0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba3520>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118b5b430>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba08e0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x118ba3730>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x118c010f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', e

In [6]:

# interview_graph = StormInterviewGraph1(interview_config=interview_config)

# # Generate perspectives
# perspectives = await survey_subjects.ainvoke("Machine Learning")

# # Set perspectives
# interview_graph.interviews.perspectives = perspectives
# interview_graph.initialize_conversations()
# logger.info(interview_graph.interviews.conversations.keys())



In [7]:
# # Run interview

# final_step = None

# initial_state : InterviewState = InterviewState(interview_config=interview_config, editor=p1.editors[0], messages=[], references={})

# async for step in interview_graph.graph.astream(initial_state.as_dict()):
#     name = next(iter(step))
#     print(name)
#     print(f"Processing step: {name}")
#     print("-- ", str(step[name]["messages"])[:300])
#     if END in step:
#         final_step = step
        
# final_state = next(iter(final_step.values()))
# final_state

In [8]:
# state2 = InterviewState.from_dict(final_state)
# state2

# Full Graph


In [9]:
# # Test generate outline
# i = ResearchState(topic=example_topic, interview_config=interview_config)
# o = await node_generate_outline.ainvoke(i)

# i = ResearchState.from_dict(o)
# i

ResearchState(topic='Covid-19 impact in 2024', interview_config=InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118ba1de0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba3520>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118b5b430>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba08e0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x118ba3730>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x118c010f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=N

In [11]:
# # Test generate perspectives
# o = await node_generate_perspectives.ainvoke(i)

# i = ResearchState.from_dict(o)
# i

2024-04-03 09:48:15,324 [MainThread  ] [INFO ]  Related Subjects for [Covid-19 impact in 2024]: topics=['COVID-19 pandemic', 'Impact of COVID-19 on economy', 'Health effects of COVID-19', 'COVID-19 vaccines', 'Global response to COVID-19', 'COVID-19 misinformation']
2024-04-03 09:48:26,074 [MainThread  ] [INFO ]  Generated 5 perspectives for topic: [Covid-19 impact in 2024] 
	- ['DrSarahEpidemiologist', 'DrJamesNeurologist', 'DrEmilyVaccinologist', 'MsSofiaHumanitarian', 'MrAlexFact-Checker']
2024-04-03 09:48:26,076 [MainThread  ] [INFO ]  Initialized 5 conversations.


ResearchState(topic='Covid-19 impact in 2024', interview_config=InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118ba1de0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba3520>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118b5b430>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba08e0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x118ba3730>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x118c010f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=N

In [13]:
# # Test
# o = await node_refine_outline.ainvoke(i)

# i = ResearchState.from_dict(o)
# i

ResearchState(topic='Covid-19 impact in 2024', interview_config=InterviewConfig(long_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118ba1de0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba3520>, model_name='gpt-3.5-turbo-0125', openai_api_key=SecretStr('**********'), openai_proxy=''), fast_llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x118b5b430>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x118ba08e0>, openai_api_key=SecretStr('**********'), openai_proxy=''), max_conversations=5, max_reference_length=10000, tags_to_extract=['p', 'h1', 'h2', 'h3'], embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x118ba3730>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x118c010f0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=N

Failed to batch ingest runs: TypeError('keys must be str, int, float, bool or None, not Editor')


In [14]:
# Run the full research graph for `example_topic`.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'StormResearchGraph' is not defined". The class is not defined in this
# notebook and apparently not exported by `from storm import *`; confirm the right name/import.
i = ResearchState(topic=example_topic, interview_config=interview_config)
storm1 = StormResearchGraph(interview_config=interview_config, topic=example_topic)

o = storm1.stream_and_return_results(i)


NameError: name 'StormResearchGraph' is not defined

## Generate Perspectives

From these related subjects, we can select representative Wikipedia editors as "subject matter experts" with distinct backgrounds and affiliations. These will help distribute the search process to encourage a more well-rounded final report.


## Expert Dialog

Each Wikipedia writer is primed to role-play using the perspectives presented above. It will ask a series of questions of a second "domain expert" with access to a search engine. This generates the content used to produce a refined outline as well as an updated index of reference documents.

### Interview State

The conversation is cyclic, so we will construct it within its own graph. The State will contain messages, the reference docs, and the editor (with its own "persona") to make it easy to parallelize these conversations.


# Dialog Roles

The graph will have two participants: the Wikipedia editor (`generate_question`), who asks questions based on its assigned role, and a domain expert (`gen_answer_chain`), who uses a search engine to answer the questions as accurately as possible.


### Answer questions

The `gen_answer_chain` first generates queries (query expansion) to answer the editor's question, then responds with citations.


# Construct the Interview Graph

Now that we've defined the editor and domain expert, we can compose them in a graph.


## Refine Outline

At this point in STORM, we've conducted a large amount of research from different perspectives. It's time to refine the original outline based on these investigations. Below, create a chain using the LLM with a long context window to update the original outline.


In [None]:
## Generate Article

In [None]:
from langchain_core.documents import Document

from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings

# Build an in-memory index over the references collected during an interview and expose
# it as a retriever for the section writer.
#
# NOTE(review): `final_state` is only assigned in an earlier cell whose code is currently
# commented out, so this cell fails on a fresh kernel until that cell is restored.
# NOTE(review): `embeddings` and `vectorstore` rebind the names created in the setup cell
# (where `vectorstore` was a persisted Chroma store) - confirm this shadowing is intended.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# One Document per reference: the mapping key becomes the "source" metadata and the
# mapping value becomes the page content.
reference_docs = [
    Document(page_content=v, metadata={"source": k})
    for k, v in final_state["references"].items()
]

print(f"Number of references: {len(reference_docs)}")

# This really doesn't need to be a vectorstore for this size of data.
# It could just be a numpy matrix. Or you could store documents
# across requests if you want.
vectorstore = SKLearnVectorStore.from_documents(
    reference_docs,
    embedding=embeddings,
)

# The number of hits is a search parameter and has to be passed through `search_kwargs`;
# a bare `k=10` keyword is not a retriever option and does not change the result count.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [None]:
# Smoke test: query the retriever built in the previous cell and display whatever it returns.
retriever.invoke("What's a long context LLM anyway?")

#### Generate Sections

Now you can generate the sections using the indexed docs.


In [None]:
class SubSection(BaseModel):
    # One subsection of a generated wiki section: a title plus its body text.
    # Documented with comments rather than a class docstring: if BaseModel is pydantic's,
    # a class docstring would be emitted as the schema description.
    subsection_title: str = Field(..., title="Title of the subsection")
    content: str = Field(
        ...,
        title="Full content of the subsection. Include [#] citations to the cited sources where relevant.",
    )

    @property
    def as_str(self) -> str:
        """Render the subsection as markdown: a level-3 heading followed by the content."""
        rendered = f"### {self.subsection_title}\n\n{self.content}"
        return rendered.strip()


class WikiSection(BaseModel):
    # One generated section of the wiki article: title, body, optional subsections and
    # the list of cited sources.
    # Documented with comments rather than a class docstring: if BaseModel is pydantic's,
    # a class docstring would be emitted as the schema description.
    section_title: str = Field(..., title="Title of the section")
    content: str = Field(..., title="Full content of the section")
    # Refers to the SubSection model defined directly above. The previous spelling
    # `Subsection` is not a name defined anywhere in this notebook.
    subsections: Optional[List[SubSection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )
    citations: List[str] = Field(default_factory=list)

    @property
    def as_str(self) -> str:
        """Render the section as markdown.

        Layout: a level-2 heading, the section content, each subsection's own
        ``as_str`` separated by blank lines, and finally one `` [i] citation``
        line per citation, separated from the body by a blank line.
        """
        subsections = "\n\n".join(
            subsection.as_str for subsection in self.subsections or []
        )
        citations = "\n".join(f" [{i}] {cit}" for i, cit in enumerate(self.citations))
        body = f"## {self.section_title}\n\n{self.content}\n\n{subsections}".strip()
        if not citations:
            return body
        # Append the separator unstripped; stripping the citation chunk on its own
        # removed the blank line and glued the first citation onto the body text.
        return f"{body}\n\n{citations}"


# Prompt for writing a single section.
# Template variables: {outline}, {docs}, {section}, {format_instructions}.
# The reference block is wrapped in <Documents> ... </Documents>; the closing tag was
# previously written as a second opening tag, leaving the block unterminated.
section_writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia writer. Complete your assigned WikiSection from the following outline:\n\n"
            "{outline}\n\nCite your sources, using the following references:\n\n<Documents>\n{docs}\n</Documents>",
        ),
        ("user", "Write the full WikiSection for the {section} section.\n{format_instructions}"),
    ]
)


async def retrieve(inputs: dict):
    """Fetch reference documents for a section and add them to the chain inputs.

    Args:
        inputs: mapping that must contain "topic" and "section" (both strings);
            any other keys are passed through unchanged.

    Returns:
        A new dict with a "docs" entry (the retrieved documents rendered as
        ``<Document href="...">`` blocks joined by newlines) followed by every
        key of ``inputs``.

    Raises:
        KeyError: if "topic" or "section" is missing from ``inputs``.
    """
    query = inputs["topic"] + ": " + inputs["section"]
    docs = await retriever.ainvoke(query)
    # Each document gets a proper opening tag. The earlier self-closing
    # `<Document .../>` followed by `</Document>` was malformed markup.
    formatted = "\n".join(
        f'<Document href="{doc.metadata["source"]}">\n{doc.page_content}\n</Document>'
        for doc in docs
    )
    # `inputs` is unpacked last, so a caller-supplied "docs" key takes precedence.
    return {"docs": formatted, **inputs}

# Output parser targeting the WikiSection model; its format instructions are injected
# into the prompt below.
wiki_parser = PydanticOutputParser(pydantic_object=WikiSection)

# Section-writing pipeline, left to right:
#   retrieve (adds "docs") -> prompt with format instructions pre-filled
#   -> long-context LLM -> parse the reply with wiki_parser.
section_writer = (
    retrieve
    | section_writer_prompt.partial(format_instructions=wiki_parser.get_format_instructions())
    | long_context_llm
    | wiki_parser
)

In [None]:
# Write one section (the second entry of the outline) as a smoke test of the pipeline.
# NOTE(review): `refined_outline` is not assigned in any visible cell of this notebook;
# presumably it comes from `from storm import *` or an earlier session - confirm.
# NOTE: the top-level `await` only works in IPython/Jupyter.
section = await section_writer.ainvoke(
    {
        "outline": refined_outline.as_str,
        "section": refined_outline.sections[1].section_title,
        "topic": example_topic,
    }
)
print(section.as_str)

#### Generate final article

Now we can rewrite the draft to appropriately group all the citations and maintain a consistent voice.


In [None]:
from langchain_core.output_parsers import StrOutputParser

# Prompt for assembling the final article from the section drafts.
# Template variables: {topic}, {draft}.
# The citation instruction previously contained a garbled fragment ('"[1]",""');
# it now reads as a plain sentence.
writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert Wikipedia author. Write the complete wiki article on {topic} using the following section drafts:\n\n"
            "{draft}\n\nStrictly follow Wikipedia format guidelines.",
        ),
        (
            "user",
            'Write the complete Wiki article using markdown format. Organize citations using footnotes like "[1]", avoiding duplicates in the footer. Include URLs in the footer.',
        ),
    ]
)

# Article pipeline: prompt -> long-context LLM -> plain string output.
writer = writer_prompt | long_context_llm | StrOutputParser()

In [None]:
# Stream the article for the single section drafted above, printing each chunk as it
# arrives; end="" keeps the chunks on one continuous output stream.
for tok in writer.stream({"topic": example_topic, "draft": section.as_str}):
    print(tok, end="")

## Final Flow

Now it's time to string everything together. We will have 6 main stages in sequence:

1. Generate the initial outline + perspectives
2. Batch converse with each perspective to expand the content for the article
3. Refine the outline based on the conversations
4. Index the reference docs from the conversations
5. Write the individual sections of the article
6. Write the final wiki

The state tracks the outputs of each stage.


In [None]:
# State carried through the research graph, one key per pipeline stage output.
# NOTE(review): this rebinds the name `ResearchState`, which earlier cells call with
# `interview_config=...`; presumably a different class from `from storm import *` - confirm.
ResearchState = TypedDict(
    "ResearchState",
    {
        "topic": str,
        "outline": Outline,
        "editors": List[Editor],
        "interview_results": List[InterviewState],
        # The final sections output
        "sections": List[WikiSection],
        "article": str,
    },
)

In [None]:
import asyncio


async def initialize_research(state: ResearchState):
    """Produce the initial outline and the editor perspectives for the topic.

    Both chains are awaited concurrently via ``asyncio.gather``. Returns a copy of
    ``state`` with "outline" set to the first chain's result and "editors" set to
    the ``editors`` attribute of the second chain's result.
    """
    subject = state["topic"]
    outline, perspectives = await asyncio.gather(
        generate_outline_direct.ainvoke({"topic": subject}),
        survey_subjects.ainvoke(subject),
    )
    updated = dict(state)
    updated["outline"] = outline
    updated["editors"] = perspectives.editors
    return updated


async def conduct_interviews(state: ResearchState):
    """Run one interview per editor and store the results under "interview_results".

    Every interview starts from a state holding the editor and a single opening
    message attributed to "SubjectMatterExpert". Returns a copy of ``state`` with
    the batch results added.
    """
    topic = state["topic"]

    def _opening_state(editor):
        # Seed message that kicks off the conversation for this editor.
        greeting = AIMessage(
            content=f"So you said you were writing an article on {topic}?",
            name="SubjectMatterExpert",
        )
        return {"editor": editor, "messages": [greeting]}

    initial_states = list(map(_opening_state, state["editors"]))
    # We call in to the sub-graph here to parallelize the interviews
    interview_results = await interview_graph.abatch(initial_states)
    return {**state, "interview_results": interview_results}


def format_conversation(interview_state):
    """Render an interview as text: a header naming the editor, then one "name: content" line per message."""
    lines = []
    for message in interview_state["messages"]:
        lines.append(f"{message.name}: {message.content}")
    header = f'Conversation with {interview_state["editor"].name}\n\n'
    return header + "\n".join(lines)


async def refine_outline(state: ResearchState):
    """Update the outline using the transcripts of all interviews.

    Formats every entry of ``state["interview_results"]`` with
    ``format_conversation``, joins them with blank lines, and passes them to
    ``refine_outline_chain`` together with the topic and the current outline text.
    Returns a copy of ``state`` whose "outline" is the chain's result.
    """
    transcripts = [
        format_conversation(interview) for interview in state["interview_results"]
    ]
    payload = {
        "topic": state["topic"],
        "old_outline": state["outline"].as_str,
        "conversations": "\n\n".join(transcripts),
    }
    updated_outline = await refine_outline_chain.ainvoke(payload)
    return {**state, "outline": updated_outline}


async def index_references(state: ResearchState):
    """Add every reference gathered in the interviews to the module-level ``vectorstore``.

    Each (key, value) pair of an interview's "references" mapping becomes a
    Document whose page content is the value and whose "source" metadata is the
    key. ``state`` is returned unchanged.
    """
    all_docs = [
        Document(page_content=text, metadata={"source": source})
        for interview_state in state["interview_results"]
        for source, text in interview_state["references"].items()
    ]
    await vectorstore.aadd_documents(all_docs)
    return state


async def write_sections(state: ResearchState):
    """Write every section of the outline with ``section_writer``.

    One request is built per entry of ``state["outline"].sections`` and the whole
    list is sent through ``section_writer.abatch``. Returns a copy of ``state``
    with the results stored under "sections".
    """
    outline = state["outline"]
    # Use the outline carried in the graph state. The earlier code read a notebook
    # global (`refined_outline`) here, which is stale or undefined when the graph
    # runs on its own.
    outline_text = outline.as_str
    sections = await section_writer.abatch(
        [
            {
                "outline": outline_text,
                "section": section.section_title,
                "topic": state["topic"],
            }
            for section in outline.sections
        ]
    )
    return {
        **state,
        "sections": sections,
    }


async def write_article(state: ResearchState):
    """Assemble the final article from the written sections.

    Joins each section's ``as_str`` with blank lines into one draft, passes it to
    ``writer`` together with the topic, and returns a copy of ``state`` with the
    result stored under "article".
    """
    topic = state["topic"]
    section_texts = [section.as_str for section in state["sections"]]
    article = await writer.ainvoke({"topic": topic, "draft": "\n\n".join(section_texts)})
    result = dict(state)
    result["article"] = article
    return result

#### Create the graph


In [None]:
# Assemble the research pipeline as a linear graph: each stage is a node, and every
# node is connected to the one registered just before it.
builder_of_storm = StateGraph(ResearchState)

# (node name, coroutine) pairs in execution order.
nodes = [
    ("init_research", initialize_research),
    ("conduct_interviews", conduct_interviews),
    ("refine_outline", refine_outline),
    ("index_references", index_references),
    ("write_sections", write_sections),
    ("write_article", write_article),
]

# Iterate over the pairs directly instead of indexing with `i`. Besides being more
# idiomatic, this avoids clobbering the notebook-level variable `i` used by earlier cells.
previous_name = None
for name, node in nodes:
    builder_of_storm.add_node(name, node)
    if previous_name is not None:
        builder_of_storm.add_edge(previous_name, name)
    previous_name = name

# The first stage is the entry point and the last stage is the finish point.
builder_of_storm.set_entry_point(nodes[0][0])
builder_of_storm.set_finish_point(nodes[-1][0])
storm = builder_of_storm.compile()

In [None]:
# Stream the compiled graph for a sample topic. For every step, print the first key of
# the step and the first 300 characters of that key's value.
# `results` is only bound when a step contains the END key.
# NOTE(review): `END` is not imported in any visible cell; presumably it comes from
# `from storm import *` - confirm.
# NOTE: a top-level `async for` only works in IPython/Jupyter.
async for step in storm.astream(
    {
        "topic": "Building better slack bots using LLMs",
    }
):
    name = next(iter(step))
    print(name)
    print("-- ", str(step[name])[:300])
    if END in step:
        results = step

In [None]:
# Pull the final article text out of the END step captured by the streaming cell.
# NOTE(review): `results` is only assigned there if a streamed step contained END;
# otherwise this line raises NameError - confirm the stream emits an END step.
article = results[END]["article"]

## Render the Wiki

Now we can render the final wiki page!


In [None]:
from IPython.display import Markdown

# We will down-header the sections to create less confusion in this notebook.
# A newline is prepended before the replacement and sliced off afterwards so that a
# header on the very first line of the article is matched by the "\n#" pattern too.
# Without it the leading title kept its level while every other header was demoted.
Markdown(("\n" + article).replace("\n#", "\n##")[1:])