In [1]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from operator import itemgetter
from langchain.callbacks import get_openai_callback
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

In [2]:
with open('data/Toronto.txt', 'r') as f:
    html_toronto = f.read()
with open('data/New_York_City.txt', 'r') as f:
    html_new_york = f.read()

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
toronto_header_splits = html_splitter.split_text(html_toronto)
new_york_header_splits = html_splitter.split_text(html_new_york)

In [3]:
toronto_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}
print(toronto_header_splits[0].metadata)

{'Header2': 'Overview', 'Header 3': 'population'}


In [None]:
print(toronto_header_splits[0])

In [None]:
len(new_york_header_splits)

In [4]:
new_york_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}

In [20]:
print(new_york_header_splits[0].metadata)

{'Header2': 'Overview', 'Header 3': 'population'}


In [5]:
chunk_size = 500
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
toronto_docs = text_splitter.split_documents(toronto_header_splits)
new_york_docs = text_splitter.split_documents(new_york_header_splits)

In [6]:
total_docs = toronto_header_splits + new_york_header_splits
len(total_docs)

95

In [7]:
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
vector_store = FAISS.from_documents(total_docs, embedding=OpenAIEmbeddings())
retriever = vector_store.as_retriever(search_kwargs={"k": 4})
template = """given the below context:
{context} 
please answer the question: {question}
"""
prompt = PromptTemplate.from_template(template=template)

chain = {"context": itemgetter("question") | retriever, "question": itemgetter("question")}| prompt | model | StrOutputParser()


In [8]:
with get_openai_callback() as callback:
    response = chain.invoke({"question": "what is the population of Toronto?"})
    print(response)
    print(f"\nHere is the cost breakdown for this call:\n{callback}")

The population of Toronto is 2,794,356 as of 2021.

Here is the cost breakdown for this call:
Tokens Used: 2355
	Prompt Tokens: 2338
	Completion Tokens: 17
Successful Requests: 1
Total Cost (USD): $0.0035410000000000003


In [9]:
with get_openai_callback() as callback:
    response = chain.invoke({"question": "please compare the population between New York City and Toronto, what is the percentage difference between two populations?"})
    print(response)
    print(f"\nHere is the cost breakdown for this call:\n{callback}")

To compare the population between New York City and Toronto, we need to consider the most recent data available. According to the given context, the population of Toronto in 2021 was 2,794,356. On the other hand, the population of New York City in 2020 was not explicitly mentioned, but it states that the city gained 629,000 residents between 2010 and 2020. Therefore, we can estimate the population of New York City in 2020 as 8,804,190 + 629,000 = 9,433,190.

To calculate the percentage difference between the two populations, we can use the formula:

Percentage Difference = ((New Population - Old Population) / Old Population) * 100

Percentage Difference = ((9,433,190 - 2,794,356) / 2,794,356) * 100

Percentage Difference = (6,638,834 / 2,794,356) * 100

Percentage Difference ≈ 237.4%

Therefore, the percentage difference between the populations of New York City and Toronto is approximately 237.4%.

Here is the cost breakdown for this call:
Tokens Used: 3439
	Prompt Tokens: 3206
	Comple

In [None]:
relevant_docs = retriever.get_relevant_documents("pleasse compare the population between New York City and Toronto, what is the percentage difference between two populations?")
for doc in relevant_docs:
    print(f"{doc} \n")