Process for generating HTML documents with raw tearsheet inputs.

In [1]:
import ast
import html
import json
import os
import re
import time

import openai
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv().
# The return value is bound to `_` because it is not used.
_ = load_dotenv(find_dotenv())

# Configure the OpenAI client from the environment; this raises KeyError
# when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Helper function that sends one or more combined prompts to the model and returns the reply text.
# The temperature setting is exposed so the degree of randomness can be increased.

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0, verbose=False):
    """
    Send a chat conversation to the model and return the text of its reply.
        messages - list of prompts (each prompt is a dictionary of "role" and "content")
        model - name of the chat model to query
        temperature - value in the range [0, 2]; higher values make the
            output more random
        verbose - when True, the full API response is printed before the
            reply text is returned
    """
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,  # degree of randomness of the model's output
    }
    completion = openai.ChatCompletion.create(**request)

    if verbose:
        print(f"full response:\n{completion}")

    # Only the first choice is used.
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [23]:
# Output schema:

# Example schema that is interpolated into the system prompt to show the
# model which documents and fields to produce. Each top-level key is a
# filename suffix: html_dump appends it to the client's name (spaces replaced
# by underscores) to build the output filename. The values are illustrative
# sample contents, not real data.
output_schema = {"_linkedin.html": {
                  "Employment 1": "Big Bank, Senior Data Scientist",
                  "Employment 2": "Small Bank, Data Scientist",
                  "Education": "UC Berkeley, PhD Astronomy",
                  "Board Member": "Girls and Boys Club, Chairman",
                  "Bio": "biography of up to 100 words that is consistent with the above information",
                 },

                 "_wealthx.html": {
                    "Estimated Net Worth": "At least $19.5 million",
                    "Estimated Liquid Assets": "At least $11.5 million",
                    "Estimated Household Wealth": "At least $31.5 million",
                    "Estimated Household Liquid Assets": "At least $11.5 million",
                    "Estimated Family Net Worth": "At least $226.1 billion",
                    "Interests, Passions, Hobbies" : "Tennis and golf",
                 },

                 "_relsci.html": {
                     "Boards & Committees (Corporate)": "Paramount Global, Board Director",
                     "Boards & Committees (Nonprofit)": "Brentwood School - California, Trustee",
                     "Former/Prior Boards & Committees (Corporate)": "The Walt Disney Company, Board Director",
                     "Former/Prior Boards & Committees (Nonprofit)": "The Paley Center for Media, Trustee",
                     "Top donations (Nonprofit)": "Greenpeace - $2M, UCSF - $1M, Kiva - $500K",
                     "Top donations (Political parties)": "Kamala Harris - $100K, CA Democratic Committee - $250K",
                 },

                 "_equilar.html": {
                    "Stock sold - Equity Transactions (Last 36 Months)": "$215.3 million",
                    "New Equity Grants - Equity Transactions (Last 36 Months)": "$93 million",
                    "Options Exercised - Equity Transactions (Last 36 Months)": "$17.5 million",
                    "Equity Holdings - Equity Transactions": "$167.5 million",
                    "Annual Compensation": "$11.9 million",
                    # NOTE(review): the backslash continuations below join the
                    # lines but keep each following line's leading indentation,
                    # so this value contains long runs of spaces in the prompt.
                    "Stock Sold": "CEO and Chairperson, \
                        $1.6M at Cerevel Therapeutics Holdings, Inc. (50,000 shares), \
                        June 5 2023 (SEC) | June 1 2023 (Effective)",
                 },

                 "_zoominfo.html": {
                     "Personal Email": "email@domain.com",
                 },

                 "_pitchbook.html": {
                     # Nested dict: html_dump renders a dict-valued field as a
                     # bulleted list instead of a single paragraph.
                     "Lead partner on deals": {
                          "Company": "Harvey (Business/Productivity Software)",
                          "Deal Date": "April 26 2023",
                          "Deal Type": "Early Stage VC",
                          "Deal Size": "$21M",
                          "Deal Status": "Completed",
                          "Location": "Los Angeles, CA",
                          "Representing": "Sequoia Capital",
                          "Other Partners": "Rich Dude I, Rich Guy II, Notso Rich III, Rich Wannabe",
                          #"Other Partners emails": "richdude@aristocracy.com, richguy@aristocracy.com",
                     },
#                     "Investor bio": "A brief summary of the investor, his peers, and his investment deals"
                 },

                 "_google.html": {
                     "Article 1": {
                         "Title": 'John Smith granted key to the city',
                         'Date': 'January 1, 2023',
                         # NOTE(review): same continuation-indentation issue as
                         # "Stock Sold" above; "how luck we are" looks like a
                         # typo for "how lucky we are" (left as-is: prompt text).
                         "Abstract": '''In an expensive public ceremony, the mayor granted John Smith the key to the city. \
                         The mayor then spoke for 20 minutes on how great of a person is John Smith and how luck we are \
                         to be his contemporaries.'''
                         },
                 },
                 ## Should we add info on:
                     # criminal background
                     # if existing client, current relationship depth (e.g. CRB info)
                }

In [116]:
#client_description = """Robert King is a prominent business man and CEO \
#at a San Francisco hedge fund. His undergrad degree is in political science. \
#He serves on the board of the local boy scouts chapter.  His net worth exceeds \
#$100M and his family net worth is even greater. \
#"""

# Seed data for the imaginary clients. Each key is the client's name and each
# value is a record with a free-text 'description'. The generation loop builds
# the prompt as  name + ' ' + description,  so a description is expected to
# read as the continuation of a sentence that starts with the name. The same
# loop later stores the parsed model output on each record under 'docs'.
# Line breaks and indentation inside the triple-quoted strings are part of the
# text that is sent to the model.
clients = {
    "Robert King": {'description': """has worked at various hedge funds. He is currently \
                    CEO at one in San Francisco. He serves on multiple local boards and is very wealthy.""",
                   },

    "Julia Harpman": {'description': """is a crime reporter for the Daily News. She has one husband and
                    no appreciable individual or family wealth.""",
                   },
    "Hanna Smith": {'description': """is self-employed. She bakes french pastries and sells them at Palo Alto farmers market on Sundays. Her two teenage daughters help her in packing and selling the goods.""",
                   },
    "Jerry Smith": {'description': """is a race car driver at a motorsports company with five years’ experience making speedy
                    turns for several hours, helping his thousands of fans to fulfill their entertainment needs.
                    He has helped raising money for multiple charities and has been one of the biggest contributers himself.""",
                   },

    # NOTE(review): unlike the entries above, this description does not continue
    # from the name ("Mariann Avocado  Before being named CEO ..."); it also
    # contains "ofsupply" (missing space). Confirm whether that is intended.
    "Mariann Avocado":{'description':""" Before being named CEO in August 2011, she was Enertech
                        chief operating officer and was responsible for all of the company’s worldwide sales and operations,
                        including end-to-end management ofsupply chain, sales activities, and service and support in all markets
                        and countries. She played a key role in the continued development of strategic reseller and supplier relationships.""",
                    },
    # NOTE(review): this description starts with "She", so the combined prompt
    # reads "Velvet Throat She grew up ...". The cosmetics line is called
    # "VIPglow" and then "Velvetglow" — confirm which name is intended.
    "Velvet Throat":{'description':"""She grew up in Arizona, where she demonstrated an early ability in music that included composing her own songs.
                A child actress, she landed starring roles on the TV series called  One Piece and Secondary School Musical: The saga 
                She released the heartrending debut single Doctor License in March 2020. The song hit No. 1 on the Billboard Hot 100 
                and was the first song with more than one billion streams in 2020. 
                She followed that up with the acclaimed album Sweet (2021).\
                She also started her own line of cosmetics, VIPglow, and is the sole shareowner.\
                Velvetglow operates strictly online and has limited market penetration""",
                },
    
    "Jared Livinglife": {'description': """is retired. He lives in the San Francisco East Bay.
                        He enjoys playing soccer and occasional dog-sitting. He used to work at a small bank. He loves charity and volunteering.""",
                       },
    
    "Aphrodite Greek": {'description': """is running a small craft store on Etsy selling jewellery.
                        Inspired during the pandemic, she learned jewellery making and is now quite renowned in the small business circuit. 
                        She lives in the San Francisco East Bay.
                        Before opening her own store, she was a full time parent focused on her two kids.""",
                       },
    
    "Helen Troy": {'description': """is the lead model for multiple luxury brand campaigns. She has recently been elected to the city council for Arcadia, California.
                   She is also known for her charity work for female victims of domestic abuse."""
                  },
    
    "Zeus Manly": {'description': """is a professional athlete that won 3 Olympic gold medals for discus throwing. 
                    He lives in Montana with his two pitbulls - Ares and Apollo."""
                  }
     }

# Template for the user message; the generation loop fills in
# {client_description} with str.format before each request.
# NOTE(review): the trailing backslash joins the first two lines, so the
# second line's leading indentation becomes part of the prompt. "repsonse"
# looks like a typo for "response"; it is left unchanged here because it is
# text sent to the model.
message_1_content = """***Client description: {client_description}*** \
                          <Your repsonse here>
                     """

# Conversation template reused for every client: a fixed system prompt that
# carries the output schema, followed by a user slot whose content is filled
# in per client by the generation loop.
# Note: {output_schema} is interpolated as the dict's Python string form
# (single-quoted), not as JSON.
messages = [
    dict(
        role="system",
        content=f"""You are a banking expert at a prosperous regional bank. You will
                               generate documents about imaginary, yet plausible, clients. Your
                               output will follow the format below. The prompt from the user is your
                               starting point. Be creative in making up details but stay consistent
                               with the input prompt.

                               Output schema: {output_schema}
                            """,
    ),
    dict(role="user", content=None),  # placeholder until a client prompt is set
]

# print(messages)

In [117]:
def _parse_docs(response):
    """
    Turn the model's reply into a dictionary of documents.
        response - raw reply text from the model

    The system prompt shows the schema as a Python dict, so the reply may be
    either JSON or a Python-style literal with single quotes. JSON is tried
    first, then ast.literal_eval, which only evaluates literals and is
    therefore safe on model output. Text before the first "{" or after the
    last "}" (for example a Markdown code fence) is ignored.

    Returns the parsed dict. Raises ValueError when no dict can be parsed.
    """
    start = response.find('{')
    end = response.rfind('}')
    if start == -1 or end < start:
        raise ValueError(f"no dictionary found in model response: {response[:200]!r}")
    payload = response[start:end + 1]

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"could not parse model response: {response[:200]!r}") from err

    if not isinstance(parsed, dict):
        raise ValueError(f"model response is not a dictionary: {response[:200]!r}")
    return parsed


# Generate the documents for every client and store them on the client's
# record under 'docs'.
for name, client in clients.items():
    print(name)
    # The description is written as the continuation of a sentence that
    # starts with the client's name.
    client_description = name + ' ' + client['description']
    messages[1]['content'] = message_1_content.format(client_description=client_description)
    print('content created')
    response = get_completion_from_messages(messages, temperature=0.0)
    print('response created')
    # Parse structurally rather than rewriting quote characters, so that
    # apostrophes, '=' signs and values starting with "s" survive intact.
    client['docs'] = _parse_docs(response)

Zeus Manly
content created
response created


In [118]:
# Show the parsed documents that were generated for each client.
for name, record in clients.items():
    print(record['docs'])

{'_linkedin.html': {'Employment 1': 'Professional Athlete, Discus Throwing', 'Education': 'N/A', 'Board Member': 'N/A', 'Bio': 'Zeus Manly is a professional athlete who has achieved great success in the sport of discus throwing. He has won 3 Olympic gold medals for his exceptional skills and dedication. Zeus currently resides in Montana with his two beloved pitbulls, Ares and Apollo.'}, '_wealthx.html': {'Estimated Net Worth': 'At least $10 million', 'Estimated Liquid Assets': 'At least $5 million', 'Estimated Household Wealth': 'At least $15 million', 'Estimated Household Liquid Assets': 'At least $5 million', 'Estimated Family Net Worth': 'At least $50 million', 'Interests, Passions, Hobbies': 'Discus throwing, outdoor activities'}, '_relsci.html': {'Boards & Committees (Corporate)': 'N/A', 'Boards & Committees (Nonprofit)': 'N/A', 'Former/Prior Boards & Committees (Corporate)': 'N/A', 'Former/Prior Boards & Committees (Nonprofit)': 'N/A', 'Top donations (Nonprofit)': 'N/A', 'Top don

In [119]:
# convert to html files
def html_dump(clients, output_dir='data/text/'):
    '''Write every client's generated documents to disk as simple HTML files.

    clients - dictionary mapping a client name to a record whose 'docs' entry
        maps a filename suffix (e.g. "_linkedin.html") to that document's
        fields
    output_dir - directory the files are written to; created when missing

    One file is written per (client, document) pair, named after the client
    (spaces replaced by underscores) followed by the suffix. Each field becomes
    a <p> element; a field whose value is a dictionary is rendered as a <ul>
    list. Keys and values are HTML-escaped because they come from model output
    and may contain "&", "<" or ">".

    Raises KeyError when a client record has no 'docs' entry.
    '''
    def esc(value):
        # quote=False: the text is only used as element content, never inside
        # an attribute, so quotes can stay readable.
        return html.escape(str(value), quote=False)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for name, record in clients.items():
        for doc_name, doc in record['docs'].items():
            output_name = os.path.join(output_dir, name.replace(' ', '_') + doc_name)
            parts = []
            if isinstance(doc, dict):
                for key, val in doc.items():
                    if isinstance(val, dict):  # nested dict is presented as a list
                        parts.append(f'<p>{esc(key)}:<ul>\n')
                        for nested_key, nested_val in val.items():
                            parts.append(f'\t<li>{esc(nested_key)}: {esc(nested_val)}</li>\n')
                        parts.append('</ul></p>\n')
                    else:
                        parts.append(f'<p>{esc(key)}: {esc(val)}</p>\n')
            else:
                # The model returned plain text instead of a field mapping.
                parts.append(f'<p>{esc(doc)}</p>\n')

            # Explicit UTF-8 so non-ASCII characters are written the same way
            # on every platform.
            with open(output_name, 'w', encoding='utf-8') as fout:
                fout.write(''.join(parts))

In [120]:
# Write the generated documents for every client to disk as HTML files.
html_dump(clients)