Load and vectorize HTML documents, then query the vector store to check its search capability.

In [1]:
import glob
import os
import re
import shutil
import time

import openai

# langchain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# authentication
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load the variables it defines into the environment.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError right away if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

Load the local HTML files here. Apparently, local files have to be loaded one at a time.

In [2]:
# Collect the HTML files under data/text/. The pattern includes the dot so
# that only names ending in ".html" match; '*html' would also pick up names
# such as 'notes_html', which the file-name parsing further down does not
# expect.
urls = glob.glob('data/text/*.html')
urls

['data/text/Julia_Harpman_equilar.html',
 'data/text/Robert_King_pitchbook.html',
 'data/text/Julia_Harpman_zoominfo.html',
 'data/text/Robert_King_google.html',
 'data/text/Robert_King_equilar.html',
 'data/text/Julia_Harpman_relsci.html',
 'data/text/Julia_Harpman_wealthx.html',
 'data/text/Robert_King_linkedin.html',
 'data/text/Julia_Harpman_pitchbook.html',
 'data/text/Robert_King_relsci.html',
 'data/text/Julia_Harpman_linkedin.html',
 'data/text/Robert_King_zoominfo.html',
 'data/text/Robert_King_wealthx.html',
 'data/text/Julia_Harpman_google.html']

In [3]:
# Load each HTML file and tag the resulting Documents with the client they
# belong to. File names follow '<First>_<Last>_<source>.html', so the client
# name is everything before the final underscore.
docs = []
for url in urls:
    loaded = UnstructuredHTMLLoader(url).load()
    # os.path copes with the platform's path separator and strips the
    # extension without assuming it is exactly five characters long.
    stem = os.path.splitext(os.path.basename(url))[0]
    # doc_type (e.g. 'equilar') is parsed but not stored in the metadata yet.
    name_part, _sep, doc_type = stem.rpartition('_')
    client_name = name_part.replace('_', ' ')
    # Tag every Document the loader returned rather than only the first one;
    # this also avoids an IndexError when a file yields no Documents.
    for document in loaded:
        document.metadata['client_name'] = client_name
    docs.extend(loaded)

docs

[Document(page_content='Stock sold - Equity Transactions (Last 36 Months): Not applicable\n\nNew Equity Grants - Equity Transactions (Last 36 Months): Not applicable\n\nOptions Exercised - Equity Transactions (Last 36 Months): Not applicable\n\nEquity Holdings - Equity Transactions: Not applicable\n\nAnnual Compensation: Not applicable\n\nStock Sold:', metadata={'source': 'data/text/Julia_Harpman_equilar.html', 'client_name': 'Julia Harpman'}),
 Document(page_content='Lead partner on deals:\n\nCompany: Tech Startup XYZ\n\nDeal Date: May 15, 2023\n\nDeal Type: Series B\n\nDeal Size: $30M\n\nDeal Status: Completed\n\nLocation: San Francisco, CA\n\nRepresenting: Sequoia Capital\n\nInvestor bio: Robert King is a seasoned investor with a focus on technology startups. He has led several successful deals, including the recent Series B funding round for Tech Startup XYZ.', metadata={'source': 'data/text/Robert_King_pitchbook.html', 'client_name': 'Robert King'}),
 Document(page_content='Person

Vectorstore

In [4]:
# Embedding model; handed to Chroma.from_documents when the vector store is built.
embedding = OpenAIEmbeddings()

In [5]:
# Directory handed to Chroma as its persist_directory.
persist_directory = 'data/chroma/'
# When True, existing data in the persist directory is cleaned out before the store is built.
override = True

In [6]:
# Start from an empty store when `override` is set. The whole directory is
# removed instead of only the files directly under index/: os.remove() fails
# on sub-directories, and the glob relied on persist_directory ending in '/'.
# NOTE(review): presumably Chroma also keeps data outside index/ that would
# otherwise survive the cleanup - confirm for the installed version.
if override and os.path.isdir(persist_directory):
    shutil.rmtree(persist_directory)

# Embed the loaded documents and build the Chroma store, passing
# persist_directory so the store is tied to that location.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=persist_directory
)

In [9]:
# NOTE(review): the question names Julia Harpman while the metadata filter
# restricts results to Robert King; the recorded output contains only Robert
# King documents. Confirm the mismatch is an intentional test of the filter.
query = 'list docs with personal email for Julia Harpman'
# Keep the results under their own name: rebinding `docs` here would replace
# the loaded corpus, so re-running the vector store cell afterwards would
# index these search hits instead of the full document set.
hits = vectordb.similarity_search(query, k=3, filter={"client_name": "Robert King"})
hits

[Document(page_content='Personal Email: robertking@email.com', metadata={'client_name': 'Robert King', 'source': 'data/text/Robert_King_zoominfo.html'}),
 Document(page_content='Boards & Committees (Corporate): Tech Company X, Board Director\n\nBoards & Committees (Nonprofit): San Francisco Museum of Modern Art, Trustee\n\nFormer/Prior Boards & Committees (Corporate): Financial Institution Y, Board Director\n\nFormer/Prior Boards & Committees (Nonprofit): Local Charity Z, Trustee', metadata={'client_name': 'Robert King', 'source': 'data/text/Robert_King_relsci.html'}),
 Document(page_content='Lead partner on deals:\n\nCompany: Tech Startup XYZ\n\nDeal Date: May 15, 2023\n\nDeal Type: Series B\n\nDeal Size: $30M\n\nDeal Status: Completed\n\nLocation: San Francisco, CA\n\nRepresenting: Sequoia Capital\n\nInvestor bio: Robert King is a seasoned investor with a focus on technology startups. He has led several successful deals, including the recent Series B funding round for Tech Startup XY

**TODO**

- Connect OpenAI and get an LLM summary.
- Metadata filters: work out what they are useful for.