Process for generating html documents with raw tearsheet inputs.

In [1]:
import os
import re
import time
import json
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Helper function to parse the output from a one or more prompts combined
# expose the temperature setting to increase the degree of randomness

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0, verbose=False):
    """
    Prime the model with one or more input prompts
        messages - list of prompts (each prompt is a dictionary of "role" and "content")
        model - ChatGPT model to use
        temperature - Single values in the range [0, 2]. Higher values
            produce more randomness.
        verbose - if True, print the full response and then
            return only the chat response
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model"s output
    )
    if verbose:
        print(f"full response:\n{response}")
    return response.choices[0].message["content"]

In [3]:
# Output schema:

output_schema = {"_linkedin.html": {
                  "Employment 1": "Big Bank, Senior Data Scientist",
                  "Employment 2": "Small Bank, Data Scientist",
                  "Education": "UC Berkeley, PhD Astronomy",
                  "Board Member": "Girls and Boys Club, Chairman",
                  "Bio": "biography of up to 100 words that is consistent with the above information",
                 },

                 "_wealthx.html": {
                    "Estimated Net Worth": "At least $19.5 million",
                    "Estimated Liquid Assets": "At least $11.5 million",
                    "Estimated Household Wealth": "At least $31.5 million",
                    "Estimated Household Liquid Assets": "At least $11.5 million",
                    "Estimated Family Net Worth": "At least $226.1 billion",
                    "Interests, Passions, Hobbies" : "Tennis and golf"
                 },

                 "_relsci.html": {
                     "Boards & Committees (Corporate)": "Paramount Global, Board Director",
                     "Boards & Committees (Nonprofit)": "Brentwood School - California, Trustee",
                     "Former/Prior Boards & Committees (Corporate)": "The Walt Disney Company, Board Director",
                     "Former/Prior Boards & Committees (Nonprofit)": "The Paley Center for Media, Trustee",
                 },

                 "_equilar.html": {
                    "Stock sold - Equity Transactions (Last 36 Months)": "$215.3 million",
                    "New Equity Grants - Equity Transactions (Last 36 Months)": "$93 million",
                    "Options Exercised - Equity Transactions (Last 36 Months)": "$17.5 million",
                    "Equity Holdings - Equity Transactions": "$167.5 million",
                    "Annual Compensation": "$11.9 million",
                    "Stock Sold": "CEO and Chairperson, \
                        $1.6M at Cerevel Therapeutics Holdings, Inc. (50,000 shares), \
                        June 5 2023 (SEC) | June 1 2023 (Effective)",
                 },

                 "_zoominfo.html": {
                     "Personal Email": "email@domain.com",
                 },

                 "_pitchbook.html": {
                     "Lead partner on deals": {
                          "Company": "Harvey (Business/Productivity Software)",
                          "Deal Date": "April 26 2023",
                          "Deal Type": "Early Stage VC",
                          "Deal Size": "$21M",
                          "Deal Status": "Completed",
                          "Location": "Los Angeles, CA",
                          "Representing": "Sequoia Capital",
                     },
                    "Investor bio": "A brief summary of the investor and his investment deals"
                 },

                 "_google.html": {
                     "Article 1": {
                         "Title": 'John Smith granted key to the city',
                         'Date': 'January 1, 2023',
                         "Abstract": '''In an expensive public ceremony, the mayor granted John Smith the key to the city. \
                         The mayor then spoke for 20 minutes on how great of a person is John Smith and how luck we are \
                         to be his contemporaries.'''
                         },
                 },

                }

In [40]:
#client_description = """Robert King is a prominent business man and CEO \
#at a San Francisco hedge fund. His undergrad degree is in political science. \
#He serves on the board of the local boy scouts chapter.  His net worth exceeds \
#$100M and his family net worth is even greater. \
#"""

clients = {
    #"Robert King": {'description': """has worked at various hedge funds. He is currently \
    #    CEO at one in San Francisco. He serves on multiple local boards and is very wealthy.""",
    #               }

    "Julia Harpman": {'description': """is a crime reporter for the Daily News. She has one husband and
    no appreciable individual or family wealth.""",
                   }
    }

message_1_content = """***Client description: {client_description}*** \
                          <Your repsonse here>
                     """

messages =  [  
{"role":"system", "content":f"""You are a banking expert at a prosperous regional bank. You will
                               generate documents about imaginary, yet plausible, clients. Your
                               output will follow the format below. The prompt from the user is your
                               starting point. Be creative in making up details but stay consistent
                               with the input prompt.

                               Output schema: {output_schema}
                            """},    
{"role":"user", "content": None},    
]

print(messages)

[{'role': 'system', 'content': "You are a banking expert at a prosperous regional bank. You will\n                               generate documents about imaginary, yet plausible, clients. Your\n                               output will follow the format below. The prompt from the user is your\n                               starting point. Be creative in making up details but stay consistent\n                               with the input prompt.\n\n                               Output schema: {'_linkedin.html': {'Employment 1': 'Big Bank, Senior Data Scientist', 'Employment 2': 'Small Bank, Data Scientist', 'Education': 'UC Berkeley, PhD Astronomy', 'Board Member': 'Girls and Boys Club, Chairman', 'Bio': 'biography of up to 100 words that is consistent with the above information'}, '_wealthx.html': {'Estimated Net Worth': 'At least $19.5 million', 'Estimated Liquid Assets': 'At least $11.5 million', 'Estimated Household Wealth': 'At least $31.5 million', 'Estimated Household Liquid 

In [41]:
for name in clients.keys():
    print(name)
    client_description = name + ' ' + clients[name]['description']
    messages[1]['content'] = message_1_content.format(client_description=client_description)
    
    response = get_completion_from_messages(messages, temperature=0.0)
    clients[name]['docs'] = json.loads(response.replace("'", "\""))

Julia Harpman


In [42]:
for name in clients.keys():
    print(clients[name]['docs'])

{'_linkedin.html': {'Employment 1': 'Daily News, Crime Reporter', 'Employment 2': '', 'Education': '', 'Board Member': '', 'Bio': 'Julia Harpman is a dedicated crime reporter for the Daily News. With her sharp investigative skills and passion for justice, she has been covering crime stories for several years. Julia is known for her in-depth reporting and ability to uncover the truth behind the headlines. She is committed to bringing important stories to light and making a difference in her community.'}, '_wealthx.html': {'Estimated Net Worth': 'Not applicable', 'Estimated Liquid Assets': 'Not applicable', 'Estimated Household Wealth': 'Not applicable', 'Estimated Household Liquid Assets': 'Not applicable', 'Estimated Family Net Worth': 'Not applicable', 'Interests, Passions, Hobbies': ''}, '_relsci.html': {'Boards & Committees (Corporate)': '', 'Boards & Committees (Nonprofit)': '', 'Former/Prior Boards & Committees (Corporate)': '', 'Former/Prior Boards & Committees (Nonprofit)': ''},

In [38]:
# convert to html files
def html_dump(clients):
    '''Convert to html from clients dictionary'''
    for name in clients.keys():
        docs = clients[name]['docs']
        for doc_name in docs.keys():
            doc = docs[doc_name]
            output_name = 'data/text/' + (name.replace(' ', '_') + doc_name)
            html = ''
            for key, val in doc.items():
                html += f'<p>{key}: {val}</p>\n'

            with open(output_name, 'w') as fout:
                fout.write(html)    

In [39]:
html_dump(clients)