Build sample tearsheet

In [2]:
import os
import re
import glob
import time
import openai

# langchain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# ibm models
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes

# authentication
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
#openai.api_key = os.environ["OPENAI_API_KEY"]

ibm_credentials = {
    "url": os.environ['IBM_URL'],
    "apikey": os.environ['IBM_API_KEY']
}

ibm_project_id = os.environ['IBM_PROJECT_ID']

In [3]:
# ibm models and parameters

parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
    GenParams.MIN_NEW_TOKENS: 1,
    GenParams.MAX_NEW_TOKENS: 1024
}

model = Model(
    model_id=ModelTypes.LLAMA_2_70B_CHAT,
    params=parameters,
    credentials=ibm_credentials,
    project_id=ibm_project_id
)

Load local html files here, local files have to be loaded one by one, apparently

In [4]:
urls = glob.glob('data/text/Robert_King*html')
urls

['data/text/Robert_King_pitchbook.html',
 'data/text/Robert_King_google.html',
 'data/text/Robert_King_equilar.html',
 'data/text/Robert_King_linkedin.html',
 'data/text/Robert_King_relsci.html',
 'data/text/Robert_King_zoominfo.html',
 'data/text/Robert_King_wealthx.html']

In [5]:
docs = []
for url in urls:
    doc = UnstructuredHTMLLoader(url).load()
    # infer client name and add to metadata
    root = url.split('/')[-1]
    toks = root.split('_')
    client_name = toks[:-1]
    doc_type = toks[-1][:-5]
    # manually edit metadata
    doc[0].metadata['client_name'] = ' '.join(client_name)
    doc[0].metadata['doc_type'] = doc_type
    #print(doc[0].metadata['client_name'], doc_type)
    docs.extend(doc)

docs

[Document(page_content='Lead partner on deals:\n\nCompany: Tech Startup XYZ\n\nDeal Date: May 15 2023\n\nDeal Type: Series B\n\nDeal Size: $30M\n\nDeal Status: Completed\n\nLocation: San Francisco, CA\n\nRepresenting: Sequoia Capital\n\nOther Partners: Rich Dude I, Rich Guy II, Notso Rich III, Rich Wannabe\n\nInvestor bio: Robert King is a highly successful investor with a strong track record in the hedge fund industry. He has led numerous successful deals and has a deep understanding of the startup ecosystem. Robert is known for his strategic thinking and ability to identify promising investment opportunities.', metadata={'source': 'data/text/Robert_King_pitchbook.html', 'client_name': 'Robert King', 'doc_type': 'pitchbook'}),
 Document(page_content='Article 1:\n\nTitle: Robert King donates $1 million to local charity\n\nDate: March 10, 2023\n\nAbstract: In a generous act of philanthropy, Robert King has donated $1 million to a local charity.                          The donation will

Vectorstore

In [6]:
# optionally persist the vector db to disk
persist_directory = 'data/chroma/'
override = False

if os.path.exists(persist_directory) and override:  # clean out existing data
    for f in glob.glob(persist_directory+'index/*'):
        os.remove(f)
    
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(),
    #embedding=HuggingFaceEmbeddings(),
    #persist_directory=persist_directory  # uncomment to persist on disk
)

In [24]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains import RetrievalQA

In [11]:
metadata_field_info = [
    AttributeInfo(
        name="client_name",
        description="The name of the client associated with the document.",
        type="string",
    ),
    
    AttributeInfo(
        name="doc_type",
        description="The lecture the chunk is from, should be one of `equilar`, \
            `google`, `linkedin`, `pitchbook`, `relsci`, `wealthx`, or `zoominfo`",
        type="string",
    ),
]

In [20]:
document_content_description = "Client documents"

#llm = model.to_langchain()
llm = OpenAI()

sq_retriever = SelfQueryRetriever.from_llm(  # sq_retriever = self-query retriever
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True
)

In [21]:
### sample q & a

print(f'### Question: {q1} ###\n')
q1 = 'Tell me about Robert King\'s family assets, board memberships, and venture capital deals'
docs = sq_retriever.get_relevant_documents(q1)
for d in docs:
    print(d.metadata)

### self-query only returns 4 documents, not sure why
### was hoping wealthx would be included

### Question: Tell me about Robert King's family assets, board memberships, and venture capital deals ###

query='Robert King Family Assets Board Memberships Venture Capital Deals' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='client_name', value='Robert King') limit=None
{'client_name': 'Robert King', 'doc_type': 'pitchbook', 'source': 'data/text/Robert_King_pitchbook.html'}
{'client_name': 'Robert King', 'doc_type': 'linkedin', 'source': 'data/text/Robert_King_linkedin.html'}
{'client_name': 'Robert King', 'doc_type': 'relsci', 'source': 'data/text/Robert_King_relsci.html'}
{'client_name': 'Robert King', 'doc_type': 'google', 'source': 'data/text/Robert_King_google.html'}


questions to answer:

bio:
  - what is the current employment
  - summarize employment prior to the current position
  - describe any roles as a board member
  - describe any educational credentials and where they were obtained
  - describe the organization where client is currently employed (prone to hallucination)

table:
  - list the current employment title
  - summarize the education as an itemized list, more advanced degrees first
  - list prior employments as an itemized list
  - list the current board memberships
  - list prior board affiliations

In [38]:
multi_doc_prompt_dict = {'1':
                             {'q': 'according to linkedin, what is the current position of {client}?',
                              'a': ''},
                         '2':
                             {'q': 'where did {client} work prior to the current position?',
                              'a': ''},
                         '3':
                             {'q': 'according to linkedin, relsci, and pitchbook, what boards did the {client} serve on? What roles did they have on those boards?',
                              'a': ''},
                         '4': {
                             'q': 'what education credentials does {client} have',
                             'a': ''},
                         '5': {
                             'q': 'describe the nature (industry, purpose) of the organization where {client} currently works',
                             'a': ''},
                        }

client = 'Robert King'

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

for key in multi_doc_prompt_dict.keys():
    ### retrieve exact document per client
    q = multi_doc_prompt_dict[key]['q'].format(client=client)
    response = qa_chain({"query": q})
    multi_doc_prompt_dict[key]['a'] = response['result']

    print(f'{q}: {a}')

    #response = qa_chain({'query': 'Summarize the context in 100 words or less.', 'input_docs': docs})

    ### summarize the document
    #doc_content = '\n'.join([d.page_content for d in docs])
    #response = llm.call_as_llm(doc_summary_template.format(context=doc_content))
    
    #multi_doc_prompt_dict[key]['a'] = response

print('\n\n### Results:')
for key in multi_doc_prompt_dict.keys():
    print(f"{key}, q: {multi_doc_prompt_dict[key]['q']}, a: {multi_doc_prompt_dict[key]['a']}")

query='Robert King' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='linkedin') limit=1
according to linkedin, what is the current position of Robert King?: {'query': 'describe the organization where Robert King currently works', 'result': " I don't know."}
query='Robert King' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='linkedin') limit=1
where did Robert King work prior to the current position?: {'query': 'describe the organization where Robert King currently works', 'result': " I don't know."}
query='Robert King' filter=Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='linkedin'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='relsci'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='doc_type', value='pitchbook')]) limit=None
according to linkedin, relsci, and pitchbook, what boards did the Robert King serve

In [37]:
print('\n\n### Results:')
for key in multi_doc_prompt_dict.keys():
    print(f"{key}, q: {multi_doc_prompt_dict[key]['q']}, a: {multi_doc_prompt_dict[key]['a']}")



### Results:
1, q: according to linkedin, what is the current position of {client}?, a:  CEO of Hedge Fund A in San Francisco
2, q: where did {client} work prior to the current position?, a:  Robert King worked as a Senior Portfolio Manager at Hedge Fund B prior to his current position as the CEO of Hedge Fund A.
3, q: according to linkedin, relsci, and pitchbook, what boards did the {client} serve on? What roles did they have on those boards?, a:  According to the information provided, Robert King served as a Board Director for Tech Company X, a Trustee for the San Francisco Museum of Modern Art, a Board Director for Investment Firm Y, and a Trustee for Local Charity Z.
4, q: what education credentials does {client} have, a:  Robert King has an MBA from Stanford University.
5, q: describe the nature (industry, purpose) of the organization where {client} currently works, a:  I don't know.


### Q&A with retrieval

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [None]:
### generation without retrieval
q3 = 'Who is Robert King?'
print(f'### Question: {q3} ###\n')
llm.call_as_llm(q3)

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q3})
print(f'### Question: {q3} ###\n')
result['result']

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q2})
print(f'### Question: {q2} ###\n')
result['result']

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=sq_retriever
)

result = qa_chain({"query": q1})
print(f'### Question: {q1} ###\n')
result['result']

In [None]:
a

### Combine multiple LLM calls to construct one coherent biography

Experiment 1:

    a. Summarize each document individually
    b. Feed summaries into one prompt
    c. Write the biography, emphasizing specific information

In [None]:
### Summarize individual documents
multi_doc_prompt_dict = {'linkedin': {'q': '', 'a': ''},
                         'google': {'q': '', 'a': ''},
                         'equilar': {'q': '', 'a': ''},
                         'pitchbook': {'q': '', 'a': ''},
                         'relsci': {'q': '', 'a': ''},
                         'wealthx': {'q': '', 'a': ''},
                         'zoominfo': {'q': '', 'a': ''},
                        }

client = 'Robert King'
doc_retriever_template = 'Retrieve the {doc_type} document for {client}.' # Summarize the provided context in 100 or fewer words.'

doc_summary_template = '''
Summarize the context below in 100 words or less.

Context:
{context}
'''

for key in multi_doc_prompt_dict.keys():
    ### retrieve exact document per client
    q = doc_retriever_template.format(doc_type=key, client=client)
    docs = sq_retriever.get_relevant_documents(q)
    multi_doc_prompt_dict[key]['q'] = q
    #response = qa_chain({'query': 'Summarize the context in 100 words or less.', 'input_docs': docs})

    ### summarize the document
    doc_content = '\n'.join([d.page_content for d in docs])
    response = llm.call_as_llm(doc_summary_template.format(context=doc_content))
    
    multi_doc_prompt_dict[key]['a'] = response

print('\n\n### Results:')
for key in multi_doc_prompt_dict.keys():
    print(f"document_type: {key}, summary: {multi_doc_prompt_dict[key]['a']}")

In [None]:
### construct one prompt with all of the above summaries

bio_prompt_template = '''
You are a writer and biographer. You specialize in writing
accurate life summarizes given large input documes.
Below is information from several documents about a single
client named {client}.

Prepare a biography that includes in the following order:
1. The client's name
2. Their professional work history
3. Board member activities
4. Philantropic activities
5. Their education
6. Any details about their family

Format the output as prose rather than an ordered list.

Input context:
{context}

Your response here:
'''

context = ''
for key in multi_doc_prompt_dict:
    context += ('\n\n' + multi_doc_prompt_dict[key]['a'])

formatted_prompt = bio_prompt_template.format(client=client, context=context)
print(f'Input prompt:\n\n{formatted_prompt}')

response = llm.call_as_llm(formatted_prompt)

In [None]:
print(response)