In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from dotenv import load_dotenv
load_dotenv()
# Helper for displaying a list of documents in a readable form
def pretty_print_docs(docs):
    """Print the ``page_content`` of every document in ``docs``.

    Each document is introduced by a 1-based ``Document N:`` heading, and
    consecutive documents are separated by a line of 100 dashes.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))
# Chat model configured with temperature 0.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Read the two saved city pages. The encoding is passed explicitly so the text
# is decoded the same way regardless of the platform's default encoding.
# NOTE(review): assumes the files were saved as UTF-8 — confirm.
with open('./custom_embedding/data/Toronto.txt', 'r', encoding='utf-8') as f:
    html_toronto = f.read()
with open('./custom_embedding/data/San_Francisco.txt', 'r', encoding='utf-8') as f:
    html_sf = f.read()

# Split each page on its h1/h2/h3 headers; the second element of every pair is
# the label handed to the splitter for that header level.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
toronto_header_splits = html_splitter.split_text(html_toronto)
sf_header_splits = html_splitter.split_text(html_sf)

# Overwrite the metadata of the first section of each page with a hand-written label.
# NOTE(review): the key "Header2" has no space, unlike "Header 2" / "Header 3"
# used above — confirm whether that is intentional before relying on it.
toronto_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}
sf_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}

# Combine the sections of both cities; the cell displays how many there are.
total_headers = toronto_header_splits + sf_header_splits
len(total_headers)

75

In [2]:
# Re-split the header-level sections into smaller, overlapping character chunks
# and display how many chunks result.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=80,
)
texts = text_splitter.split_documents(total_headers)
len(texts)

253

In [5]:
# Embed every chunk into a FAISS store, then expose the store as a retriever
# configured with k=1.
faiss_index = FAISS.from_documents(texts, OpenAIEmbeddings())
retriever = faiss_index.as_retriever(search_kwargs={"k": 1})

In [6]:
# Sanity-check the base retriever on a single-city question and print each hit.
# No per-call `k` is passed: the retriever was already built with
# search_kwargs={"k": 1}, so repeating it here was redundant.
relevant_docs = retriever.get_relevant_documents("population of Toronto")
for doc in relevant_docs:
    print(f"{doc} \n")

page_content='Toronto is the most populous city in Canada and the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the fourth-most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. Toronto is an international centre of business, finance, arts, sports and culture, and is recognized as one of the most multicultural and cosmopolitan cities in the world.' metadata={'Header2': 'Overview', 'Header 3': 'population'} 



In [4]:
# Chat model passed to the extractor and the answer chain; temperature is 0.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

In [14]:
from langchain.callbacks import get_openai_callback
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain.schema import StrOutputParser

# Wrap the base retriever with an LLM-based extractor acting as its compressor.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

# Prompt text, written as explicit pieces so the trailing space after
# {context} is visible and cannot be lost to whitespace stripping.
template = (
    "given the below context:\n"
    "{context} \n"
    "please answer the question: {question}\n"
)
prompt = PromptTemplate.from_template(template=template)

# The incoming dict's "question" value is used twice: piped through the
# compression retriever to fill {context}, and passed as-is to fill {question}.
chain_inputs = {
    "context": itemgetter("question") | compression_retriever,
    "question": itemgetter("question"),
}
chain = chain_inputs | prompt | llm | StrOutputParser()


In [8]:
from langchain.globals import set_verbose

In [None]:

# Switch on LangChain's global verbose flag, then run the chain inside the
# OpenAI callback context so the callback object can be printed afterwards.
set_verbose(True)
with get_openai_callback() as callback:
    response = chain.invoke(
        {"question": "compare the population of Toronto and San Francisco"}
    )
    print(response)
    print(f"\nHere is the cost breakdown for this call:\n{callback}")

In [16]:

# Query the compression retriever directly and print the documents it returns.
comparison_query = "compare the population of Toronto to San Francisco."
compressed_docs = compression_retriever.get_relevant_documents(comparison_query)
pretty_print_docs(compressed_docs)





[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: compare the population of Toronto to San Francisco.
> Context:
>>>
The city's foreign-born persons made up 47 per cent of the population, compared to 49.9 per cent in 2006. According to the United Nations Development Programme, Toronto has the second-highest percentage of constant foreign-born population among world cities, after Miami, Florida. While Miami's foreign-born population has traditionally consisted primarily of Cubans and other Latin Americans, no single nationality or culture dominates Toronto's immigrant population, placing it among the most diverse cities in the world. In 2010, it was estimated over 100,000 immigrants arrive in the Greater Toron




[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: compare the population of Toronto to San Francisco.
> Context:
>>>
In the decades after World War II, refugees from war-torn Europe and Chinese job-seekers arrived, as well as construction labourers, particularly from Italy and Portugal. Toronto's population grew to more than one million in 1951 when large-scale suburbanization began and doubled to two million by 1971. Following the elimination of racially based immigration policies by the late 1960s, Toronto became a destination for immigrants from all parts of the world. By the 1980s, Toronto had surpassed Montreal as Canada's most populous city and chief economic hub. During this 




[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: compare the population of Toronto to San Francisco.
> Context:
>>>
The 2020 United States census showed San Francisco's population to be 873,965, an increase of 8.5% from the 2010 census. With roughly one-quarter the population density of Manhattan, San Francisco is the second-most densely populated large American city, behind only New York City among cities greater than 200,000 population, and the fifth-most densely populated U.S. county, following only four of the five New York City boroughs.  
San Francisco is part of the five-county San Francisco–Oakland–Hayward, CA Metropolitan Statistical Area, a region of 4.7 million people (1




[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: compare the population of Toronto to San Francisco.
> Context:
>>>
The historically low crime rate in Toronto has resulted in the city having a reputation as one of the safest major cities in North America. For instance, in 2007, the homicide rate for Toronto was 3.43 per 100,000 people, compared with Atlanta (19.7), Boston (10.3), Los Angeles (10.0), New York City (6.3), Vancouver (3.1), and Montreal (2.6). Toronto's robbery rate also ranks low, with 207.1 robberies per 100,000 people, compared with Los Angeles (348.5), Vancouver (266.2), New York City (265.9), and Montreal (235.3). Toronto has a comparable rate of car theft to vari




[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: compare the population of Toronto to San Francisco.
> Context:
>>>
The city continues to grow and attract immigrants. A 2019 study by Toronto Metropolitan University (then known as Ryerson University) showed that Toronto was the fastest-growing city in North America. The city added 77,435 people between July 2017 and July 2018. The Toronto metropolitan area was the second-fastest-growing metropolitan area in North America, adding 125,298 persons, compared with 131,767 in the Dallas–Fort Worth–Arlington metroplex in Texas. The large growth in the Toronto metropolitan area is attributed to international migration to Toronto.  
The COVI




[1m> Finished chain.[0m
Document 1:

The city's foreign-born persons made up 47 per cent of the population, compared to 49.9 per cent in 2006. According to the United Nations Development Programme, Toronto has the second-highest percentage of constant foreign-born population among world cities, after Miami, Florida. In 2010, it was estimated over 100,000 immigrants arrive in the Greater Toronto Area each year.
----------------------------------------------------------------------------------------------------
Document 2:

Toronto is the most populous city in Canada and the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the fourth-most populous city in North America.
----------------------------------------------------------------------------------------------------
Document 3:

Toronto's population grew to more than one million in 1951 when large-scale suburbanization began and doubled to two million by 1971. By the 1980s, To

The LLMChainFilter does not work well in this case: it returns 3 out of the 4 retrieved documents.
The EmbeddingsFilter also returned 4 documents in this case, each of decent length.