In [47]:
import requests
from pathlib import Path
from llama_index import SimpleDirectoryReader, ServiceContext, LangchainEmbedding, StorageContext, VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [48]:
# wiki_titles = ["Toronto", "New York City", "Houston", "San Francisco"]
wiki_titles = ["Toronto"]
for title in wiki_titles:
    response = requests.get(f"https://en.wikipedia.org/w/api.php", 
                            params={
                                "action": "query",
                                "format": "json",
                                "titles": title,
                                "prop": "extracts",
                            }).json()
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']
    data_path = Path('data')
    if not data_path.exists():
        Path.mkdir(data_path)

    with open(data_path / f"{title}.txt", 'w', encoding="utf-8") as fp:
        fp.write(wiki_text)

In [49]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.7)
embedding_model = "BAAI/bge-large-en-v1.5"
embeddings = LangchainEmbedding(HuggingFaceBgeEmbeddings(model_name=embedding_model))
parser = SimpleNodeParser.from_defaults()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embeddings, node_parser=parser)

In [51]:
docs = SimpleDirectoryReader('./data').load_data()
nodes = parser.get_nodes_from_documents(docs)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [52]:
index = VectorStoreIndex(nodes, service_context=service_context, storage_context=storage_context)

In [53]:
storage_context.persist(persist_dir="./storage")

In [54]:
query_engine = index.as_query_engine()

In [63]:
response = query_engine.query("what are the population of Toronto?")
print(f"Here is the response: \n {response}")

Here is the response: 
 The population of Toronto is 2,794,356 as of 2021.


In [64]:
response = query_engine.query("what are the population of New York?")
print(f"Here is the response: \n {response}")

Here is the response: 
 The population of New York City in 2020 was 8,804,190.


In [61]:
response = query_engine.query("compare the population of Toronto and New York City")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

Here is the response: 
 The population of Toronto is provided in the context information as 2,794,356 in 2021. However, there is no information about the population of New York City in the given context. Therefore, without additional information, it is not possible to compare the population of Toronto and New York City.
 Here is the source nodes:
 > Source (Doc id: df7d8e2f-115d-4c42-9b4d-f476fb7091c8): <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<p class="mw-...

> Source (Doc id: fa564c25-784e-406d-b08b-e8fd50e928b3): Toronto's traffic congestion is one of the highest in North America, and is the second highest in...


As stupid as it sounds, we just tested and it could answer both population of Toronto and New York City. Now it says it is impossible to compare.
Why is that? It has something to do with the query engine.

In [71]:
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import get_response_synthesizer

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = get_response_synthesizer(verbose=True)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)


In [72]:
response = query_engine.query("compare the population of Toronto and Houston")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

Here is the response: 
 Toronto has a recorded population of 2,794,356 in 2021, while Houston has a population of 2,302,878 in 2022. Therefore, Toronto has a higher population than Houston.
 Here is the source nodes:
 > Source (Doc id: df7d8e2f-115d-4c42-9b4d-f476fb7091c8): <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<p class="mw-...

> Source (Doc id: 90fd2e84-b1bb-4e74-a8f2-b4c845cb16cd): <p class="mw-empty-elt">

</p>
<p><b>Houston</b> (<span> <span></span></span>; <i title="English ...

> Source (Doc id: fa564c25-784e-406d-b08b-e8fd50e928b3): Toronto's traffic congestion is one of the highest in North America, and is the second highest in...

> Source (Doc id: 65f7e9e7-9921-452f-abd4-01804b16a568): The public transit system passed into public ownership in 1921 as the Toronto Transportation Comm...


As you can see, when we increase the top-k to 4, we expand the search result and thus, there is space to include the second city in question such as New york or Houston.
Now, let's get some maths involed in our query and see how it performs.

In [74]:
response = query_engine.query("Compare the population of Toronto and Houston. What is the percentage difference between two populations?")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

> Refine context: now known as Allen's Landing) and incorporated ...
Here is the response: 
 The population of Toronto and Houston are not provided in the given context. Therefore, it is not possible to calculate the percentage difference between the two populations.
 Here is the source nodes:
 > Source (Doc id: df7d8e2f-115d-4c42-9b4d-f476fb7091c8): <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<p class="mw-...

> Source (Doc id: deedec6d-4f8b-4adb-9c33-d596ffe23559): The Houston area is home to the largest African American community west of the Mississippi River....

> Source (Doc id: d5cff623-4203-4fd5-9a10-3b1181783a74): In 2009, Houston became the first U.S. city with a population over 1 million citizens to elect a ...

> Source (Doc id: 90fd2e84-b1bb-4e74-a8f2-b4c845cb16cd): <p class="mw-empty-elt">

</p>
<p><b>Houston</b> (<span> <span></span></span>; <i title="English ...


In [75]:
response = query_engine.query("Toronto has a recorded population of 2,794,356 in 2021, while Houston has a population of 2,302,878 in 2022. What is the percentage difference between two populations?")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

> Refine context: by total area whose government is not consolida...
Here is the response: 
 The percentage difference between the population of Toronto in 2021 and the population of Houston in 2022 cannot be determined based on the given context information.
 Here is the source nodes:
 > Source (Doc id: df7d8e2f-115d-4c42-9b4d-f476fb7091c8): <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<p class="mw-...

> Source (Doc id: edfbb42f-5725-4437-af56-ee09c72e5070): With a land area of 631.1 km<sup>2</sup> (243.7 sq mi), it had a population density of <span data...

> Source (Doc id: 97841949-af0b-4440-8266-b219936ea4e7): The Houston City Council passed this regulation in 2018 with a vote of 9–7. Had these floodplain ...

> Source (Doc id: 90fd2e84-b1bb-4e74-a8f2-b4c845cb16cd): <p class="mw-empty-elt">

</p>
<p><b>Houston</b> (<span> <span></span></span>; <i title="English ...


Ok, we can see that the retrieval engine might have problems of understanding "percentage difference", that's why it tries to retrieve documents that have something to do with percentage of areas, lands. This might be caused by the embedding quality of HuggingFace Model. It will be interesting to see the result when we use OpenAI Embeddings.