In [12]:
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

In [13]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Entries are labelled "Document 1", "Document 2", ... in input order and
    are separated by a line of 100 dashes. Every item in ``docs`` must expose
    a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

# Load the saved page markup for each city.
# The encoding is given explicitly so the read does not depend on the platform's
# default codec; the pages contain non-ASCII characters (e.g. en dashes).
# NOTE(review): assumes the files were saved as UTF-8 — confirm.
with open('./custom_embedding/data/Toronto.txt', 'r', encoding='utf-8') as f:
    html_toronto = f.read()
with open('./custom_embedding/data/San_Francisco.txt', 'r', encoding='utf-8') as f:
    html_sf = f.read()
# (tag, label) pairs handed to HTMLHeaderTextSplitter: h1 -> "Header 1", etc.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Run the same splitter over both pages, Toronto first and then San Francisco.
toronto_header_splits, sf_header_splits = (
    html_splitter.split_text(page) for page in (html_toronto, html_sf)
)

# Replace the metadata of the first split of each page with hand-written labels.
# Keys follow the "Header 2" / "Header 3" spelling declared in headers_to_split_on;
# the earlier "Header2" key matched neither that list nor the "Header 3" key beside it.
for first_split in (toronto_header_splits[0], sf_header_splits[0]):
    # A fresh dict per document so the two splits never share one metadata object.
    first_split.metadata = {"Header 2": "Overview", "Header 3": "population"}

# Combined list of splits from both pages, Toronto first.
total_headers = toronto_header_splits + sf_header_splits

In [14]:
# Break the header-level splits into chunks (chunk_size=1000, no overlap),
# then show how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(total_headers)
len(texts)

258

In [None]:
# Embed every chunk with OpenAI embeddings and index them in a local Qdrant
# collection persisted under ./local_qdrant_storage.
embeddings = OpenAIEmbeddings()
qdrant = Qdrant.from_documents(
    texts,
    embeddings,
    path="./local_qdrant_storage",  # Local mode with on-disk storage
    collection_name="toronto_sf",
    # Rebuild the collection on every run. Without this, an existing on-disk
    # collection is reused and re-running the cell appends another copy of each
    # chunk, which shows up as duplicate search results.
    force_recreate=True,
)

In [16]:
# Retriever view over the Qdrant store, configured with k=4.
retriever = qdrant.as_retriever(search_kwargs=dict(k=4))

In [17]:
from langchain.callbacks import get_openai_callback
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain.schema import StrOutputParser
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chat_models import ChatOpenAI

# Chat model used both to compress retrieved documents and to answer the question.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Wrap the base retriever so every retrieved document is passed through the
# LLM-based extractor before it is returned.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Prompt text; note the single space that follows {context} on its line.
template = (
    "given the below context:\n"
    "{context} \n"
    "please answer the question: {question}\n"
)
prompt = PromptTemplate.from_template(template=template)

# Build the prompt variables from the incoming {"question": ...} payload:
# "context" is the retriever output for the question, "question" is passed through.
chain_inputs = {
    "context": itemgetter("question") | compression_retriever,
    "question": itemgetter("question"),
}
chain = chain_inputs | prompt | llm | StrOutputParser()

def pretty_print_docs(docs):
    """Print every document's ``page_content`` with a "Document N:" heading.

    Numbering starts at 1 and consecutive entries are separated by a rule of
    100 dashes. This has the same behaviour as the helper of the same name
    defined earlier in the notebook.
    """
    rule = f"\n{'-' * 100}\n"
    numbered = (
        f"Document {number}:\n\n" + document.page_content
        for number, document in enumerate(docs, start=1)
    )
    print(rule.join(numbered))

In [10]:
# Run the chain inside the OpenAI callback context so token usage and cost for
# the calls made during this invoke are collected on `callback`.
toronto_question = {"question": "what is the population of Toronto?"}
with get_openai_callback() as callback:
    response = chain.invoke(toronto_question)

# The callback object keeps its totals after the context exits.
print(response)
print(f"\nHere is the cost breakdown for this call:\n{callback}")



The population of Toronto is 2,794,356.

Here is the cost breakdown for this call:
Tokens Used: 1776
	Prompt Tokens: 1564
	Completion Tokens: 212
Successful Requests: 5
Total Cost (USD): $0.0027700000000000003


In [9]:
# Plain similarity search straight against the vector store (no compression), k=6.
query = "compare the population of San Francisco to Toronto"
found_docs = qdrant.similarity_search(query=query, k=6)
pretty_print_docs(found_docs)

Document 1:

The 2020 United States census showed San Francisco's population to be 873,965, an increase of 8.5% from the 2010 census. With roughly one-quarter the population density of Manhattan, San Francisco is the second-most densely populated large American city, behind only New York City among cities greater than 200,000 population, and the fifth-most densely populated U.S. county, following only four of the five New York City boroughs.  
San Francisco is part of the five-county San Francisco–Oakland–Hayward, CA Metropolitan Statistical Area, a region of 4.7 million people (13th most populous in the U.S.), and has served as its traditional demographic focal point. It is also part of the greater 14-county San Jose-San Francisco-Oakland, CA Combined Statistical Area, whose population is over 9.6 million, making it the fifth-largest in the United States as of 2018.
----------------------------------------------------------------------------------------------------
Document 2:

The 20

In [None]:
# Ask the comparison question through the compression retriever and print
# whatever documents it returns.
comparison_question = "compare the population of Toronto to San Francisco."
compressed_docs = compression_retriever.get_relevant_documents(comparison_question)
pretty_print_docs(compressed_docs)

In [20]:
# Max-marginal-relevance search with k=6 and fetch_k=10.
# Per the original note, this one doesn't help either.
query = "compare the population of Toronto and San Francisco"
found_docs = qdrant.max_marginal_relevance_search(query=query, k=6, fetch_k=10)
pretty_print_docs(found_docs)

Document 1:

The city's foreign-born persons made up 47 per cent of the population, compared to 49.9 per cent in 2006. According to the United Nations Development Programme, Toronto has the second-highest percentage of constant foreign-born population among world cities, after Miami, Florida. While Miami's foreign-born population has traditionally consisted primarily of Cubans and other Latin Americans, no single nationality or culture dominates Toronto's immigrant population, placing it among the most diverse cities in the world. In 2010, it was estimated over 100,000 immigrants arrive in the Greater Toronto Area each year.
----------------------------------------------------------------------------------------------------
Document 2:

Toronto is a large hub of the Canadian and global technology industry, generating $52 billion in revenues annually. In 2017, Toronto tech firms offered almost 30,000 jobs which is higher than the combination of San Francisco Bay area, Seattle and Washin