Load and vectorize html documents.  Post questions to check search capability.

In [18]:
import os
import re
import glob
import time
import openai

# langchain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# authentication
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

Load local html files here, local files have to be loaded one by one, apparently

In [2]:
urls = glob.glob('data/text/*html')
urls

['data/text/Robert_King_pitchbook.html',
 'data/text/Robert_King_google.html',
 'data/text/Robert_King_equilar.html',
 'data/text/Robert_King_linkedin.html',
 'data/text/Robert_King_relsci.html',
 'data/text/Robert_King_zoominfo.html',
 'data/text/Robert_King_wealthx.html']

In [14]:
docs = []
for url in urls:
    doc = UnstructuredHTMLLoader(url).load()
    docs.extend(doc)

docs

[Document(page_content="Lead partner on deals: {'Company': 'Tech Startup XYZ', 'Deal Date': 'May 15, 2023', 'Deal Type': 'Series B', 'Deal Size': '$30M', 'Deal Status': 'Completed', 'Location': 'San Francisco, CA', 'Representing': 'Sequoia Capital'}\n\nInvestor bio: Robert King is a seasoned investor with a focus on tech startups. He has led successful deals in the Series B funding stage, including the recent investment in Tech Startup XYZ.", metadata={'source': 'data/text/Robert_King_pitchbook.html'}),
 Document(page_content="Article 1: {'Title': 'Robert King donates $10 million to local charity', 'Date': 'March 10, 2023', 'Abstract': 'In a generous act of philanthropy, Robert King has donated $10 million to a local charity. The donation will support various community programs and initiatives, making a significant impact on the lives of many.'}", metadata={'source': 'data/text/Robert_King_google.html'}),
 Document(page_content='Stock sold - Equity Transactions (Last 36 Months): $20 mi

Vectorstore

In [19]:
embedding = OpenAIEmbeddings()

In [24]:
persist_directory = 'data/chroma/'
override = True

In [31]:
if os.path.exists(persist_directory) and override:  # clean out existing data
    for f in glob.glob(persist_directory+'index/*'):
        os.remove(f)
    
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=persist_directory
)

In [43]:
q = 'tell me about his board memberships'
docs = vectordb.similarity_search(q, k=1)
docs

[Document(page_content='Boards & Committees (Corporate): Tech Company X, Board Director\n\nBoards & Committees (Nonprofit): San Francisco Museum of Modern Art, Trustee\n\nFormer/Prior Boards & Committees (Corporate): Investment Firm Y, Board Director\n\nFormer/Prior Boards & Committees (Nonprofit): Local Charity Z, Trustee', metadata={'source': 'data/text/Robert_King_relsci.html'})]

todo:
connect openai and get llm summary
metadata filters - what are they useful for