In [1]:
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.callbacks import get_openai_callback
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain.schema import StrOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [3]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

with open('./custom_embedding/data/Toronto.txt', 'r') as f:
    html_toronto = f.read()
with open('./custom_embedding/data/San_Francisco.txt', 'r') as f:
    html_sf = f.read()
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
toronto_header_splits = html_splitter.split_text(html_toronto)
sf_header_splits = html_splitter.split_text(html_sf)

toronto_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}
sf_header_splits[0].metadata = {"Header2": "Overview", "Header 3": "population"}
total_headers = toronto_header_splits + sf_header_splits

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(total_headers)


In [4]:
embeddings = OpenAIEmbeddings()
qdrant = Qdrant.from_documents(
    path="./local_qdrant_storage",
)

TypeError: VectorStore.from_documents() missing 2 required positional arguments: 'documents' and 'embedding'

In [2]:
print(Qdrant.get_collection(path="./local_qdrant_storage"))

AttributeError: type object 'Qdrant' has no attribute 'get_collection'

In [25]:
functions = [
    {
        "name": "question_decomposer",
        "description": "decompose a question into 2 parts: non-searchable and searchable",
        "parameters": {
            "type": "object",
            "properties": {
                "non-searchable": {
                    "type": "string",
                    "description": "non-searchable are verbs like compare, contrast which does not need to be searched in our vector database"
                },
                "searchable": {
                    "type": "string",
                    "description": "searchable are nouns like Toronto, San Francisco which are information need to be searched and retrieved from our vector database"
                }
            },
            "required":["non-searchable", "searchable"]
        }
    }, 
    {
        "name": "search_decomposer",
        "description": "decompose a search term into individual items. For example: a population of Toronto and San Francisco will be decomposed into 2 items: the population of Toronto and the population of San Francisco",
        "parameters": {
            "type": "object",
            "properties": {
                "list_of_items": {
                    "type": "array",
                    "items": {
                        "type": "string",
                        "description": "individual item that needs to be searched"
                    }
                }
            },
            "required":["list_of_items"]
        }
    }
]

In [21]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

template_1 = PromptTemplate.from_template("Please decompose this quesstion: {question}")

chain_1 = {"question": itemgetter("question")} | template_1 | llm.bind(function_call={"name": "question_decomposer"},functions=functions) | JsonOutputFunctionsParser()


In [37]:
with get_openai_callback() as callback:
    question_desomposed = chain_1.invoke({"question": "compare the population of Toronto and San Francisco"})
    print(f"\nHere is the cost breakdown for this call:\n{callback}")


Here is the cost breakdown for this call:
Tokens Used: 154
	Prompt Tokens: 130
	Completion Tokens: 24
Successful Requests: 1
Total Cost (USD): $0.00024300000000000002


In [26]:
template_2 = PromptTemplate.from_template("Please decompose this search term: {search_term}")
chain_2 = {"search_term": itemgetter("search_term")} | template_2 | llm.bind(function_call={"name": "search_decomposer"},functions=functions) | JsonOutputFunctionsParser()

In [38]:
with get_openai_callback() as callback:
    search = chain_2.invoke({"search_term": question_desomposed["searchable"]})
    print(search)
    print(f"\nHere is the cost breakdown for this call:\n{callback}")

{'list_of_items': ['population of Toronto', 'population of San Francisco']}

Here is the cost breakdown for this call:
Tokens Used: 204
	Prompt Tokens: 185
	Completion Tokens: 19
Successful Requests: 1
Total Cost (USD): $0.0003155


In [43]:
relevant_docs = ""
for i in search["list_of_items"]:
    docs = qdrant.similarity_search(i, k=1)
    for j in docs:
        relevant_docs += j.page_content + "\n\n"
print(relevant_docs)

Toronto is the most populous city in Canada and the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the fourth-most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. Toronto is an international centre of business, finance, arts, sports and culture, and is recognized as one of the most multicultural and cosmopolitan cities in the world.

The 2020 United States census showed San Francisco's population to be 873,965, an increase of 8.5% from the 2010 census. With roughly one-quarter the population density of Manhattan, San Francisco is the second-most densely populated large American city, behind only New York City among cities greater than 200,000 population, and the fifth-most densely populated U.S. county, following only four of

In [35]:

template = """given the below context:
{context} 
please answer the question: {question}
"""
prompt = PromptTemplate.from_template(template=template)

chain = {"context": itemgetter("context"), "question": itemgetter("question")}| prompt | llm | StrOutputParser()

def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [44]:
with get_openai_callback() as callback:
    response = chain.invoke({"context": relevant_docs ,"question": "compare the population of Toronto and San Francisco"})
    print(response)
    print(f"\nHere is the cost breakdown for this call:\n{callback}")

The population of Toronto in 2021 was 2,794,356, while the population of San Francisco in 2020 was 873,965. Therefore, Toronto has a significantly larger population than San Francisco.

Here is the cost breakdown for this call:
Tokens Used: 411
	Prompt Tokens: 367
	Completion Tokens: 44
Successful Requests: 1
Total Cost (USD): $0.0006385


In [46]:
docs = qdrant.similarity_search("population of Toronto", k=1)
pretty_print_docs(docs)

Document 1:

Toronto is the most populous city in Canada and the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the fourth-most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. Toronto is an international centre of business, finance, arts, sports and culture, and is recognized as one of the most multicultural and cosmopolitan cities in the world.
