In [3]:
from llama_index import SimpleDirectoryReader, ServiceContext, get_response_synthesizer
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import OpenAI
import nest_asyncio
# Patch asyncio so nested event loops are allowed; presumably needed because
# Jupyter already runs a loop and this notebook uses use_async=True.
nest_asyncio.apply()

In [5]:

# Load one text file per city from data/ and tag the first loaded document
# of each file with the city name as its doc_id.
wiki_titles = ["Houston", "Toronto", "New York City", "San Francisco"]
city_docs = []
for wiki_title in wiki_titles:
    source_path = f"data/{wiki_title}.txt"
    docs = SimpleDirectoryReader(input_files=[source_path]).load_data()
    first_doc = docs[0]
    first_doc.doc_id = wiki_title
    city_docs += docs

# LLM (gpt-3.5-turbo at temperature 0) and a service context with chunk size 1024.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm)

In [8]:
# Synthesizer using the "tree_summarize" response mode with async enabled.
response_synthesizer = get_response_synthesizer(
    use_async=True,
    response_mode="tree_summarize",
)

# Build the document summary index over the city documents with progress bars.
# In the recorded run, summarizing the four documents took roughly 15 minutes.
index_build_options = dict(
    service_context=service_context,
    response_synthesizer=response_synthesizer,
    show_progress=True,
)
doc_summary_index = DocumentSummaryIndex.from_documents(
    city_docs, **index_build_options
)

  from .autonotebook import tqdm as notebook_tqdm


Parsing documents into nodes: 100%|██████████| 4/4 [00:00<00:00, 16.95it/s]
Summarizing documents:   0%|          | 0/4 [00:00<?, ?it/s]

current doc id: Houston


Summarizing documents:  25%|██▌       | 1/4 [00:57<02:53, 57.71s/it]

current doc id: Toronto


Summarizing documents:  50%|█████     | 2/4 [02:32<02:38, 79.26s/it]

current doc id: New York City


Summarizing documents:  75%|███████▌  | 3/4 [14:27<06:09, 369.90s/it]

current doc id: San Francisco


Summarizing documents: 100%|██████████| 4/4 [15:36<00:00, 234.03s/it]
Generating embeddings: 100%|██████████| 4/4 [00:00<00:00, 12.12it/s]


In [10]:
# Display the summary stored for the document whose doc_id is "Houston".
doc_summary_index.get_document_summary("Houston")

"The provided text is about the city of Houston, Texas, and covers various aspects of the city such as its population, location, history, economy, cultural institutions, geographic features, environmental challenges, demographics, religion, architecture, climate, sports, government, crime, education, media, healthcare, infrastructure, transportation, and the Houston Airport System.\n\nSome questions that this text can answer include:\n- What is the population of Houston?\n- Where is Houston located?\n- What is the history of Houston?\n- What is the economy of Houston based on?\n- What cultural institutions are located in Houston?\n- What geographic features are present in Houston?\n- How has Houston been affected by hurricanes and flooding?\n- What are some of the environmental challenges faced by Houston?\n- What is the demographic breakdown of Houston's population?\n- What is the religious makeup of Houston?\n- What is the architecture like in Houston?\n- What is the climate of Houst

In [11]:
# Persist the index's storage context under the "index" path so it can be reloaded later.
doc_summary_index.storage_context.persist("index")

In [12]:
from llama_index.indices.loading import load_index_from_storage
from llama_index import StorageContext

# Reload the index persisted under "index" so the documents do not have to be
# re-processed. Per the LlamaIndex storage docs, a custom ServiceContext is not
# restored on load, so the same service_context used at build time (LLM and
# chunk size) is passed again to keep the reloaded index consistent.
storage_context = StorageContext.from_defaults(persist_dir="index")
doc_summary_index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)


In [13]:
# Query engine built directly from the index, using the "tree_summarize"
# response mode with async enabled.
query_engine = doc_summary_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)

In [14]:
# This query took about 10 minutes to answer; relying on the LLM for
# retrieval is impractically slow.
response = query_engine.query("What are the sports teams in Toronto?")

In [15]:
# Show the answer returned by the query engine.
print(response)

The sports teams in Toronto include the Toronto Maple Leafs (NHL), Toronto Raptors (NBA), Toronto Blue Jays (MLB), Toronto FC (MLS), and Toronto Argonauts (CFL).


Embedding-based Retrieval

In [16]:
from llama_index.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever

In [50]:
# Embedding-based retrieval takes about 0.2 seconds. This step only fetches
# the relevant nodes; it does not produce the final answer.
sports_question = "What are the sports teams in Toronto?"
retriever = DocumentSummaryIndexEmbeddingRetriever(doc_summary_index, similarity_top_k=2)
retrieved_nodes = retriever.retrieve(sports_question)

In [51]:
# Count the retrieved nodes (the recorded run returned 57).
len(retrieved_nodes)

57

In [56]:
# Inspect the raw text of one retrieved node (index 21).
print(retrieved_nodes[21].node.get_text())

Toronto's traffic congestion is one of the highest in North America, and is the second highest in Canada after Vancouver.</p>
<h2><span id="Sister_cities">Sister cities</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<h3><span id="Partnership_cities">Partnership cities</span></h3>

<h3><span id="Friendship_cities">Friendship cities</span></h3>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1147244281">
<h2><span id="Notable_people">Notable people</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<h2><span id="See_also">See also</span></h2>
<ul><li>Outline of Toronto (extensive topic list)</li>
<li>Great Lakes megalopolis</li>
<li>Largest cities in the Americas</li>
<li>List of metropolitan areas in the Americas</li></ul><h2><span id="Notes">Notes</span></h2>

<h2><span id="References">References</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:Template

In [57]:
# Wrap the embedding retriever in a query engine.
from llama_index.query_engine import RetrieverQueryEngine

# Response synthesizer using the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Combine the retriever and the synthesizer into a single query engine.
engine_components = {
    "retriever": retriever,
    "response_synthesizer": response_synthesizer,
}
query_engine = RetrieverQueryEngine(**engine_components)

In [21]:
# With embedding-based retrieval the answer took about 44 seconds, which is
# much faster than retrieving with the LLM.
response = query_engine.query("What are the sports teams in Toronto?")
print(response)

The sports teams in Toronto include the Toronto Maple Leafs (NHL), Toronto Raptors (NBA), Toronto Blue Jays (MLB), Toronto FC (MLS), and Toronto Argonauts (CFL).


In [None]:
# Repeat of the previous cell's query; this cell has no execution count, so it
# was not run in the saved notebook.
response = query_engine.query("What are the sports teams in Toronto?")
print(response)

In [59]:
# Ask a question that spans two cities; in the recorded run the engine
# reported that it could not determine the answer.
response = query_engine.query("Compare the population of Toronto and New York")
print(response)

The population of Toronto and New York cannot be determined based on the given information.


In [58]:
# Check how many nodes the retriever returns for the comparison question
# (57 in the recorded run).
retrieved_nodes = retriever.retrieve("Compare the population of Toronto and New York")
print(len(retrieved_nodes))

57


In [47]:
# Inspect the text of the retrieved node at index 21.
# NOTE(review): this cell's execution count (47) is lower than the retrieval
# cell's (58), so the recorded output may come from an earlier retrieval.
print(retrieved_nodes[21].node.get_text())

Toronto's traffic congestion is one of the highest in North America, and is the second highest in Canada after Vancouver.</p>
<h2><span id="Sister_cities">Sister cities</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<h3><span id="Partnership_cities">Partnership cities</span></h3>

<h3><span id="Friendship_cities">Friendship cities</span></h3>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1147244281">
<h2><span id="Notable_people">Notable people</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<h2><span id="See_also">See also</span></h2>
<ul><li>Outline of Toronto (extensive topic list)</li>
<li>Great Lakes megalopolis</li>
<li>Largest cities in the Americas</li>
<li>List of metropolitan areas in the Americas</li></ul><h2><span id="Notes">Notes</span></h2>

<h2><span id="References">References</span></h2>
<link rel="mw-deduplicated-inline-style" href="mw-data:Template