Load and vectorize html documents.  Post questions to check search capability.

In [1]:
import os
import re
import glob
import time
import openai

# langchain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# authentication
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

Load local html files here, local files have to be loaded one by one, apparently

In [2]:
urls = glob.glob('data/text/*html')
urls

['data/text/Julia_Harpman_equilar.html',
 'data/text/Robert_King_pitchbook.html',
 'data/text/Julia_Harpman_zoominfo.html',
 'data/text/Robert_King_google.html',
 'data/text/Robert_King_equilar.html',
 'data/text/Julia_Harpman_relsci.html',
 'data/text/Julia_Harpman_wealthx.html',
 'data/text/Robert_King_linkedin.html',
 'data/text/Julia_Harpman_pitchbook.html',
 'data/text/Robert_King_relsci.html',
 'data/text/Julia_Harpman_linkedin.html',
 'data/text/Robert_King_zoominfo.html',
 'data/text/Robert_King_wealthx.html',
 'data/text/Julia_Harpman_google.html']

In [3]:
docs = []
for url in urls:
    doc = UnstructuredHTMLLoader(url).load()
    # infer client name and add to metadata
    root = url.split('/')[-1]
    toks = root.split('_')
    client_name = toks[:-1]
    doc_type = toks[-1][:-5]
    # manually edit metadata
    doc[0].metadata['client_name'] = ' '.join(client_name)
    doc[0].metadata['doc_type'] = doc_type
    #print(doc[0].metadata['client_name'], doc_type)
    docs.extend(doc)

docs

[Document(page_content='Stock sold - Equity Transactions (Last 36 Months): Not applicable\n\nNew Equity Grants - Equity Transactions (Last 36 Months): Not applicable\n\nOptions Exercised - Equity Transactions (Last 36 Months): Not applicable\n\nEquity Holdings - Equity Transactions: Not applicable\n\nAnnual Compensation: Not applicable\n\nStock Sold:', metadata={'source': 'data/text/Julia_Harpman_equilar.html', 'client_name': 'Julia Harpman', 'doc_type': 'equilar'}),
 Document(page_content='Lead partner on deals:\n\nCompany: Tech Startup XYZ\n\nDeal Date: May 15, 2023\n\nDeal Type: Series B\n\nDeal Size: $30M\n\nDeal Status: Completed\n\nLocation: San Francisco, CA\n\nRepresenting: Sequoia Capital\n\nInvestor bio: Robert King is a seasoned investor with a focus on technology startups. He has led several successful deals, including the recent Series B funding round for Tech Startup XYZ.', metadata={'source': 'data/text/Robert_King_pitchbook.html', 'client_name': 'Robert King', 'doc_type

Vectorstore

In [4]:
# optionally persist the vector db to disk
persist_directory = 'data/chroma/'
override = False

if os.path.exists(persist_directory) and override:  # clean out existing data
    for f in glob.glob(persist_directory+'index/*'):
        os.remove(f)
    
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(),
    #persist_directory=persist_directory  # uncomment to persist on disk
)

In [5]:
# retrieval based on natural language question (no metadata filter applied)
q1 = 'Tell me about Robert King\'s family assets, board memberships, and venture capital deals'
max_k=7

### retrieval based on cosine similarity
for d in vectordb.similarity_search(q1, k=max_k):  # 6/8 results are for Robert King
    print(d.metadata)

{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'zoominfo', 'source': 'data/text/Robert_King_zoominfo.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'pitchbook', 'source': 'data/text/Julia_Harpman_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'relsci', 'source': 'data/text/Julia_Harpman_relsci.html'}


In [6]:
### retrieval based on maximal marginal relevance
### (cosine similarity plus an additional filter to promote diversity)
### https://python.langchain.com/docs/modules/model_io/prompts/example_selectors/mmr
for d in vectordb.max_marginal_relevance_search(q1, k=max_k):  # only 4/8 results pertain to Robert King - BAD
    print(d.metadata)

Number of requested results 20 is greater than number of elements in index 14, updating n_results = 14


{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'zoominfo', 'source': 'data/text/Robert_King_zoominfo.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'pitchbook', 'source': 'data/text/Julia_Harpman_pitchbook.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'relsci', 'source': 'data/text/Julia_Harpman_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'wealthx', 'source': 'data/text/Robert_King_wealthx.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'equilar', 'source': 'data/text/Julia_Harpman_equilar.html'}


In [17]:
# retrieval with metadata filter: only one metadata field can be specified at a time
q2 = 'Tell me about Robert King\'s equity holdings, stocks sold, transactions, and annual compensation'

print('#### Similarity search ####')
for d in vectordb.similarity_search(q2, k=max_k, filter={'client_name': 'Robert King'}):
    print(d.metadata)

print('\n\n#### MMR search ####')
for d in vectordb.max_marginal_relevance_search(q2, k=max_k, filter={'client_name': 'Robert King'}):
    print(d.metadata)

### equilar is ranking highest in both cases, but it took several attempts to make
### the question specific enough so it placed high
### documents appear ranked in the same order

#### Similarity search ####
{'client_name': 'Robert King', 'doc_type': 'equilar', 'source': 'data/text/Robert_King_equilar.html'}
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'zoominfo', 'source': 'data/text/Robert_King_zoominfo.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'wealthx', 'source': 'data/text/Robert_King_wealthx.html'}


#### MMR search ####


Number of requested results 20 is greater than number of elements in index 14, updating n_results = 14


{'client_name': 'Robert King', 'doc_type': 'equilar', 'source': 'data/text/Robert_King_equilar.html'}
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'zoominfo', 'source': 'data/text/Robert_King_zoominfo.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'wealthx', 'source': 'data/text/Robert_King_wealthx.html'}


Self-query meta data filter

In [8]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [9]:
metadata_field_info = [
    AttributeInfo(
        name="client_name",
        description="The name of the client associated with the document.",
        type="string",
    ),
    
    AttributeInfo(
        name="doc_type",
        description="The lecture the chunk is from, should be one of `equilar`, \
            `google`, `linkedin`, `pitchbook`, `relsci`, `wealthx`, or `zoominfo`",
        type="string",
    ),
]

In [10]:
document_content_description = "Client documents"

llm = OpenAI(temperature=0)

sq_retriever = SelfQueryRetriever.from_llm(  # sq_retriever = self-query retriever
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True
)

In [11]:
print(f'### Question: {q1} ###\n')
docs = sq_retriever.get_relevant_documents(q1)
for d in docs:
    print(d.metadata)

### self-query only returns 4 documents, not sure why
### was hoping wealthx would be included

### Question: Tell me about Robert King's family assets, board memberships, and venture capital deals ###





query='Robert King family assets board memberships venture capital deals' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}


In [18]:
print(f'### Question: {q2} ###\n')
docs = sq_retriever.get_relevant_documents(q2)
for d in docs:
    print(d.metadata)

### Question: Tell me about Robert King's equity holdings, stocks sold, transactions, and annual compensation ###

query='Robert King equity holdings stocks sold transactions annual compensation' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None
{'client_name': 'Robert King', 'doc_type': 'equilar', 'source': 'data/text/Robert_King_equilar.html'}
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}


In [19]:
### test query on doc_type metadata with limit
q = "Find the linkedin data for Robert King"
docs = sq_retriever.get_relevant_documents(q)
for d in docs:
    print(d.metadata)

### linkedin filter correct but multiple clients returned

query='Robert King' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='linkedin') limit=None
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Julia Harpman', 'doc_type': 'linkedin', 'source': 'data/text/Julia_Harpman_linkedin.html'}


In [20]:
### test query on doc_type metadata with limit
q = "Find the linkedin data for client Robert King"
docs = sq_retriever.get_relevant_documents(q)
for d in docs:
    print(d.metadata)

### note: "client Robert King" also sets filter on the client name
### could also say "the linkedin profile" to imply there is one result

query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='linkedin')]) limit=None
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}


In [22]:
q = "Find the equilar and relsci documents for client Robert King"
docs = sq_retriever.get_relevant_documents(q)
for d in docs:
    print(d.metadata)

### This demonstrates simultaneous filter on doc type and client name

query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='equilar'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='relsci')]), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King')]) limit=None
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'equilar', 'source': 'data/text/Robert_King_equilar.html'}


Recommendations:
 - Vector DB searches (similarity, maximal marginal relevance) in Chroma are prone to mixing results for separate clients.
 - Metadata filters clients for the correct client or document type. Not sure if metadata filters can include multiple filters (e.g., client_name and doc_type, or doc_type has one of several values) at once.
 - Self-query has the potential to automatically construct the filter for client and document type. It unfortunately can still mix results for multiple clients, unless one is careful about asking the question.  Also, with Chroma, the output seems fixed on just four documents.

Todo:
 - Repeat this experiment with other vectorstores to see if one works better.
 - Implement the final call to the LLM with the retrieved documents as context.

### Q&A with retrieval

In [29]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQA

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [33]:
### generation without retrieval
q3 = 'Who is Robert King?'
llm.call_as_llm(q3)

'There are several individuals named Robert King, so it is unclear which specific person you are referring to. Some notable individuals named Robert King include:\n\n1. Robert King (composer): An English composer known for his choral and orchestral works.\n2. Robert King (journalist): An American journalist and author who has written extensively on criminal justice issues.\n3. Robert King (screenwriter): A British screenwriter known for his work on the television series "The Good Karma Hospital."\n4. Robert King (politician): An American politician who served as the Governor of Indiana from 1997 to 2003.\n5. Robert King (music producer): An American music producer and executive who has worked with various artists in the music industry.\n\nWithout more specific information, it is difficult to determine which Robert King you are referring to.'

In [35]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q3})
print(f'### Question: {q3} ###\n')
result['result']



query='Robert King' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None


'Robert King is an experienced finance professional with a strong background in hedge funds. He currently serves as the CEO of Hedge Fund A in San Francisco. Prior to this role, he worked as a Senior Portfolio Manager at Hedge Fund B. Robert is also actively involved in the local community and serves as a Board Director for the San Francisco Chamber of Commerce. He has also made a generous donation of $1 million to a local charity. Additionally, he has led successful investment deals, including the recent Series B funding round for Tech Startup XYZ.'

In [36]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q2})
print(f'### Question: {q2} ###\n')
result['result']

query='Robert King equity holdings stocks sold transactions annual compensation' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None
### Question: Tell me about Robert King's equity holdings, stocks sold, transactions, and annual compensation ###



"Robert King's equity holdings are valued at $20 million. He has sold $10 million worth of stock in equity transactions over the last 36 months. Additionally, he has received new equity grants totaling $5 million and has exercised options worth $2 million during the same period. His annual compensation is $5 million."

In [37]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q1})
print(f'### Question: {q1} ###\n')
result['result']

query='Robert King family assets board memberships venture capital deals' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None
### Question: Tell me about Robert King's family assets, board memberships, and venture capital deals ###



"I'm sorry, but I don't have any information about Robert King's family assets or venture capital deals beyond what has been provided. However, I can tell you about his board memberships. Robert King currently serves as a Board Director for Tech Company X and as a Trustee for the San Francisco Museum of Modern Art. In the past, he has also served as a Board Director for Financial Institution Y and as a Trustee for Local Charity Z."