In [142]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search
import fitz

# Model identifiers used by the embedding and chat-completion requests in this notebook.
EMBEDDING_MODEL = "text-embedding-3-large"
GPT_MODEL = "gpt-4-turbo-preview"

# Relative paths of the two PDFs this notebook works with.
first_pdf = "./1/21200-input-limit.pdf"
second_pdf = "./1/21200-output-atbay.pdf"

# API client. The key is read from the OPENAI_API_KEY environment variable; when
# it is unset the placeholder string below is passed instead. Prefer exporting
# the variable over pasting a real key into the notebook.
client = OpenAI(
    api_key=os.getenv(
        "OPENAI_API_KEY",
        "<your OpenAI API key if not set as env var>",
    )
)

In [284]:
def extract_text_and_fields(pdf_path):
    """Return one string per page, combining text blocks and form-field values.

    For every page, each text block and each form widget is paired with the
    second coordinate (top edge, ``y0``) of its rectangle. The pairs are sorted
    and their text joined with newlines, so content reads roughly from the top
    of the page to the bottom. Widgets are rendered as
    ``[field_name: field_value]``. Image blocks are not filtered out; they show
    up as PyMuPDF's ``<image: ...>`` placeholder text.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of strings, one per page, in page order.
    """
    pages_text = []
    # The context manager closes the document even if a page fails to parse;
    # a bare open()/close() pair would leak the handle on any exception.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # "blocks" tuples are (x0, y0, x1, y1, text, block_no, block_type):
            # index 1 is the sort coordinate, index 4 the block's text.
            positioned = [(block[1], block[4]) for block in page.get_text("blocks")]
            positioned.extend(
                (widget.rect[1], f'[{widget.field_name}: {widget.field_value}]')
                for widget in page.widgets()
            )
            # Tuples sort by coordinate first, then by text when coordinates tie.
            pages_text.append('\n'.join(text for _, text in sorted(positioned)))

    return pages_text

In [285]:
# Extract per-page text and form-field values from the first PDF.
first_pdf_page_list = extract_text_and_fields(first_pdf)

# Print every page, each one followed by a visible separator.
for page in first_pdf_page_list:
    print(page, end="\n\n\n--------PAGE BREAK--------\n\n\n")

<image: DeviceRGB, width: 900, height: 198, bpc: 8>
Your application was submitted successfully. Your quotes will be emailed to your
insurance broker in a few minutes. They should follow up with you soon. Please contact
us if you need to correct any information in this application.

1 General Information

* What is the full name of the entity applying for insurance, as it should be listed in the policy?

[(entity_name): LUMBER CO LTD]
LUMBER CO LTD

If applicable, please list any additional named insureds.

Additional Insured Name

[(0_additional_named_insureds): ]
Additional Insured Name

[(1_additional_named_insureds): ]
Additional Insured Name

[(2_additional_named_insureds): ]
Please make sure application answers reflect all Named Insureds' exposure information. More than one Named

Insured will subject the application to underwriter review in most markets, and results generally from multiple

entities with common corporate ownership seeking to purchase just one policy.

* Total Nu

In [286]:
# Embed every page of the first PDF in a single request.
embeddings = []
response = client.embeddings.create(
    input=first_pdf_page_list,
    model=EMBEDDING_MODEL,
)
# Each returned item reports its own position; check it matches the input order.
for position, item in enumerate(response.data):
    assert position == item.index
embeddings = [item.embedding for item in response.data]

# One row per page: the extracted text next to its embedding vector.
df = pd.DataFrame({"text": first_pdf_page_list, "embedding": embeddings})

In [287]:
# Preview the first five rows of the text/embedding table.
df.head(n=5)

Unnamed: 0,text,embedding
0,"<image: DeviceRGB, width: 900, height: 198, bp...","[0.004281056113541126, 0.008859566412866116, 0..."
1,Street Line 2\n\n[(mailing_address.street_line...,"[-0.0014177850680425763, -0.021501507610082626..."
2,* What primary area of business does the Appli...,"[0.01169702410697937, 0.03385864198207855, -0...."
3,[(year_established): ]\n* What kind of entity ...,"[-0.004613890778273344, -0.016248919069767, -0..."
4,Explanation\n\n[(common_cyber_disallowed_indus...,"[-0.010090897791087627, -0.039588723331689835,..."


In [288]:
# Extract per-page text and form-field values from the second PDF.
second_pdf_page_list = extract_text_and_fields(second_pdf)

# Print every page, each one followed by a visible separator.
for page in second_pdf_page_list:
    print(page, end="\n\n\n--------PAGE BREAK--------\n\n\n")

Cyber
Insurance Short Application

AB-CYB-SAP-COV 04/2022 ©2022



--------PAGE BREAK--------


Notice

By completing this Application, the Applicant is applying for a Policy which contains one or more Insuring Agreements,
some of which provide liability for Claims ﬁrst made against any Insured during the Policy Period, or any applicable
Extended Reporting Period, and reported to us pursuant to the terms of this Policy. Claim Expenses shall reduce the
applicable Aggregate Limit of Insurance and Sub-Limits of Insurance and are subject to the applicable Retentions. 

Please read the entire Application and Policy carefully before signing. 

Whenever used in this Application, the term "Applicant" shall mean the Named Insured and all Subsidiaries, unless
otherwise stated. All other terms which appear in bold type herein are used in this Application with the same respective
meanings as set forth in the Cyber Insurance Policy (AB-CYB-001 Ed.08/2018). 



--------PAGE BREAK--------


General I

In [289]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 3
) -> tuple[list[str], list[float]]:
    """Return the texts in ``df`` most related to ``query``, with their scores.

    The query is embedded with ``EMBEDDING_MODEL`` through the module-level
    ``client``; every row of ``df`` is then scored against it.

    Args:
        query: Text to search for.
        df: Frame with a ``"text"`` column and an ``"embedding"`` column.
        relatedness_fn: Called as ``relatedness_fn(query_embedding, row_embedding)``;
            larger values rank first. Defaults to ``1 - cosine distance``.
        top_n: Maximum number of results to return.

    Returns:
        ``(strings, relatednesses)``: two lists of equal length, ordered from
        most to least related. Both are empty when ``df`` has no rows.
    """
    # Nothing to rank: skip the embedding request and avoid unpacking an empty zip.
    if len(df) == 0:
        return [], []

    query_embedding = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    ).data[0].embedding

    # Iterate the two columns directly; this avoids building a Series per row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    best = scored[:top_n]
    # Return real lists, as the annotation promises.
    return [text for text, _ in best], [score for _, score in best]

In [291]:
# Example: show the three pages most related to a sample query, best match first.
strings, relatednesses = strings_ranked_by_relatedness(
    "Do you take credit cards",
    df,
    top_n=3,
)
for text, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(text)

relatedness=0.324


'Explanation\n\n[(six_hour_outage_within_past_3_years.explanation): ]\n* Does the Applicant accept payment cards in exchange for goods or services rendered?\n\n[(accepts_credit_cards-yes): Off]\nYes\n\n[(accepts_credit_cards-no): Yes]\nNo\n\n* If the answer to the Credit Cards question is "yes", is the Applicant PCI compliant?\n\n[(pci_compliance_with_na-yes): Off]\nYes\n\n[(pci_compliance_with_na-no): Off]\nNo\n\n[(pci_compliance_with_na-na): Off]\nN/A\n\n* If the answer to the PCI Compliance question is "no", is the Applicant\'s outsourced payment\nprocessor PCI compliant?\n\n[(pci_compliant_payments-yes): Off]\nYes\n\n[(pci_compliant_payments-no): Off]\nNo\n\n[(pci_compliant_payments-na): Off]\nN/A\n\n*Does the Applicant encrypt information that is stored on mobile computing devices, including\nbut not limited to laptops and smart phones?\n\n[(encrypt_mobile_devices-yes): Off]\nYes\n\n[(encrypt_mobile_devices-no): Off]\nNo\n\n* Does the Applicant have formal processes for backing up

relatedness=0.310


"Affirmative answer is required to be eligible for Cowbell's Social Engineering endorsement\n\n[(prevent_unauthorized_employee_wire_transfers-yes): Off]\nYes\n\n[(prevent_unauthorized_employee_wire_transfers-no): Off]\nNo\n\n*Prior to executing an electronic payment, does the Applicant require out-of-band\nauthentication? � Out-of-band authentication is a secondary verification method with the\nrequestor of a funds transfer through a communication channel separate from the original\nrequest.)\n\nPlease check — this is a default answer\n\n[(out_of_band_authentication-yes): Yes]\nYes\n\n[(out_of_band_authentication-no): Off]\nNo\n\n* Does the Applicant protect all of their devices with anti-virus, anti-malware, and endpoint\nprotection software?\n\n[(endpoint_protection_for_all_devices-yes): Yes]\nYes\n\n[(endpoint_protection_for_all_devices-no): Off]\nNo\n\nWhat Endpoint Detection and Response � EDR�  provider does the applicant use?\n\nAt-Bay and TMHCC offer lower premiums for some pro

relatedness=0.281


"[(encrypt_pos_terminals-yes): Off]\nYes\n\n[(encrypt_pos_terminals-no): Off]\nNo\n\n[(encrypt_pos_terminals-na): Off]\nN/A\n\n* Have you experienced a theft or unintended release, disclosure or loss of personal or protected\nrecords in the past five years?\n\nPlease check — this is a default answer\n\n[(loss_of_records_past_5_years-yes): Off]\nYes\n\n[(loss_of_records_past_5_years-no): Yes]\nNo\n\nHave you within the past 12 months completed or agreed to, or do you plan entering into within\nthe next 12 months, a merger, acquisition, or consolidation?\n\n[(merger_acquisition_or_consolidation_past_or_next_12_months-yes): Off]\nYes\n\n[(merger_acquisition_or_consolidation_past_or_next_12_months-no): Off]\nNo\n\n*Does the Applicant have a formal process, including a separate take-down process when\n\nneeded, to ensure any products or services do not infringe on the intellectual property rights of\nothers?\n\n[(ip_infringement_prevention_and_takedown_process-yes): Off]\nYes\n\n[(ip_infrin

In [292]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens ``text`` encodes to for ``model``.

    Args:
        text: String to tokenize.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        Length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an encoding.
        # Fall back to cl100k_base so token budgeting keeps working; the count
        # is then an approximation for that model.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

In [311]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the prompt: instructions, retrieved Document1 passages, then the Document2 text.

    Passages come from ``strings_ranked_by_relatedness(query, df)`` and are added
    in ranked order. Adding stops at the first passage that would push the token
    count of the whole prompt (as measured by ``num_tokens`` for ``model``) above
    ``token_budget``. The Document2 section built from ``query`` is always
    appended last.
    """
    passages, _scores = strings_ranked_by_relatedness(query, df)
    introduction = """
        Questions and answers have been extracted from Document1 in order to provide you \
        with the information required to answer the questions in Document2. The answers in \
        Document1 are within square brackets like this [first_part: second_part]. The \
        first_part is a label representing the field name, and the second_part is the \
        true answer. The extraction process was imperfect, so along with these answers in \
        square brackets, there are also repeats of these answers outside the square brackets. \
        Ignore these. If there are any texts resembling answers within Document2, ignore these \
        as well and provide your own answers. Those left in answers were not able to be \
        extracted. The output should be in the format of "(Document2 question) \n (answer from \
        Document1 information) \n\n\n" for each question within Document2. Answer all questions \
        labelled as "Optional".
    """
    question = f'\n\nDocument2 questions:\n"""\n{query}\n"""'

    # Collect the accepted pieces and join them once at the end.
    parts = [introduction]
    for passage in passages:
        candidate = f'\n\nDocument1 information:\n"""\n{passage}\n"""'
        prospective = "".join(parts) + candidate + question
        # The first passage that does not fit ends the loop; later ones are not tried.
        if num_tokens(prospective, model=model) > token_budget:
            break
        parts.append(candidate)

    parts.append(question)
    return "".join(parts)

In [312]:
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer ``query`` with a chat completion grounded in passages retrieved from ``df``.

    The prompt is built by ``query_message`` and sent together with a fixed
    system message at temperature 0.

    Args:
        query: Text to answer (here, the text of a Document2 page).
        df: Frame of texts and embeddings to retrieve from. The default is the
            module-level ``df`` as it existed when this function was defined.
        model: Chat model name; also passed to ``query_message``.
        token_budget: Token limit passed to ``query_message``.
        print_message: When true, print the assembled prompt before sending it.

    Returns:
        The content of the first choice's message.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions about the provided texts."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )
    return completion.choices[0].message.content

In [313]:
# Send each page of the second PDF to ask() and print the reply, one request per page.
for page in second_pdf_page_list:
    answer = ask(page)
    print(answer)

There are no questions provided in Document2 to answer.
There is no question provided in Document2 to answer.
(Name of Applicant) LUMBER CO LTD 

(Applicant's DBA) 

(Applicant's address) 123 Main St 

(Optional) Suite, Floor, Unit, etc... 

(Applicant's previous ﬁscal year-end revenue) $80,000,000 

(Applicant's primary website) https://www.lumberco.com
(Applicant's industry) None of the above
(None of the above) 
Yes
(Security Controls Does the Applicant store or process personal, health, or credit card information of more than 500,000 individuals?) 

(Does the Applicant keep ofﬂine backups that are disconnected from its network or store backups with a cloud service provider?) Yes

(Does the Applicant have multi-factor authentication enforced on all email access?) Yes

(Does the Applicant have multi-factor authentication enforced on all remote access including VPN or other remote network access?) No
(Appriver: None of the above) 

(Avanan: None of the above) 

(Barracuda: None of the