In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import PyPDF2

def split_rec(full_text):
    # Initialize tiktoken encoding for the specified model
    encoding = tiktoken.encoding_for_model(OPENAI_MODEL)    
    # Count the number of tokens in the extracted text
    num_tokens = len(encoding.encode(full_text))
    
    # Check if the number of tokens exceeds the limit (128K)
    # or the number of characters 
    if num_tokens > 100000 or len(full_text) > 900000:
        print("splitting")
        # Split the pages into two roughly equal parts
        mid_point = len(full_text) // 2
        part1 = full_text[:mid_point]
        part2 = full_text[mid_point:]
        
        split1 = split_rec(part1)
        split2 = split_rec(part2)
        
        return split1 + split2
    else:
        # Return the full text as a single-element list
        return [full_text]    

def text_from_pdf(file_path):
    # Initialize an empty list to store the text of each page
    pages_text = []

    # Read the PDF file
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f, strict=False)
        for page in reader.pages:
            pages_text.append(page.extract_text())
    
    # Combine all the text to check the total token count
    full_text = "\n".join(pages_text)

    return split_rec(full_text)

In [3]:
OPENAI_MODEL="gpt-4o-2024-08-06"

In [4]:
import tiktoken

def merge_usage_report_items(items):
    return {
        "my_tokens": sum([i["my_tokens"] for i in items]),
        "prompt_tokens": sum([i["prompt_tokens"] for i in items]),
        "completion_tokens": sum([i["completion_tokens"] for i in items]),
        "total_tokens": sum([i["total_tokens"] for i in items]),
        "total_cost_usd": sum([i["total_cost_usd"] for i in items]),
    }

enc = tiktoken.encoding_for_model(OPENAI_MODEL)
input_token_cost_usd_per_1m_tokens = 2.5
output_token_cost_usd_per_1m_tokens = 10
_1m = 1000000

def get_usage_report(messages, response):
    content = " ".join([m["content"] for m in messages])
    my_tokens = len(enc.encode(content))
    print(f"My Tokens: {my_tokens}")

    prompt_tokens = response.usage.prompt_tokens
    print(f"Prompt Tokens: {prompt_tokens}")

    completion_tokens = response.usage.completion_tokens
    print(f"Completion Tokens: {completion_tokens}")

    prompt_cost_in_usd = (prompt_tokens / _1m) * input_token_cost_usd_per_1m_tokens
    completion_cost_in_usd = (completion_tokens / _1m) * output_token_cost_usd_per_1m_tokens
    total_cost_usd = prompt_cost_in_usd + completion_cost_in_usd
    print(f"Cost: ${total_cost_usd}")
    
    total_tokens = response.usage.total_tokens
    
    if total_tokens != prompt_tokens + completion_tokens:
        print("WARN: token counts don't match")
        print(total_tokens)
        print(prompt_tokens)
        print(completion_tokens)
        print(prompt_tokens+completion_tokens)
    
    return {
        "my_tokens": my_tokens,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
        "total_cost_usd": total_cost_usd,        
    }

In [5]:
from pydantic import BaseModel
from typing import Optional
from enum import Enum

class Metric(str, Enum):
    rad_expenses = "research and development expenses"
    risk_management_spending = "risk management spending"
#    debt_to_equity_ratio = "Debt-To-Equity ratio"
#    number_of_stores = "Number of stores"
    return_on_assets = "Return on Assets (ROA)"
#    return_on_equity = "Return on Equity (ROE)"
    customer_acquisition_spending = "customer acquisition spending"
    operating_margin = "operating margin"
    market_capitalization = "market capitalization"
    sustainability_initiatives_spending = "sustainability initiatives spending"
    gross_profit_margin = "gross profit margin"
    net_profit_margin = "net profit margin"
    total_liabilities = "total liabilities"
    total_assets = "total assets"
    intangible_assets = "intangible assets"
    marketing_spending = "marketing spending"
    free_cash_flow = "free cash flow"
    earnings_per_share = "earnings per share (EPS)"
    accounts_receivable = "accounts_receivable"
    acquisition_costs = "acquisition costs"
    shareholders_equity = "shareholders' equity"
    operating_cash_flow = "operating cash flow"
    quick_ratio = "Quick Ratio"
    net_income = "net income"
    inventory = "inventory"
    total_revenue = "total revenue"

#class CompanyRole(str, Enum):
#    ceo = "Chief Executive Officer (CEO)"
#    cfo = "Chief Financial Officer (CFO)"
#    coo = "Chief Operating Officer (COO)"
#    clo = "Chief Legal Officer (CLO)"
#    board_chairman = "Board Chairman"

class Currency(str, Enum):
    euro = "EUR"
    us_dollar = "USD"
    great_britain_pound = "GBP"
    australian_dollar = "AUD"
    other = "OTHER"

class DocumentDataPoint(BaseModel):
    metric_type: Metric
    value: float
    currency: Optional[Currency]
    point_in_time_as_iso_date: str

#class CompanyRoleAssignment(BaseModel):
#    role_type: CompanyRole
#    person_name: str
#    role_assignment_started_as_iso_date: Optional[str]
#    role_assignment_ended_as_iso_date: Optional[str]

class DocumentContent(BaseModel):
    data_points: list[DocumentDataPoint]
#    company_role_assignments: list[CompanyRoleAssignment]

In [6]:
def extract_document_content(text):
    system_prompt = ("You are an assistant with the task of extracting precise information from long documents. "
                     "You will be prompted with the contents of a document. Your task is to extract various metrics "
#                     "as well as company role assignments "
                     "from this document. With each metric, supply the point in "
                     "time when the metric was measured according to the document,"
                     "as well as the currency (if applicable). "
                     "If the metric is an amount, extract the exact amount (e.g. "
                     "if the amount in the document is given as '100 (in thousands)' "
                     "or '100k', extract the value '100000')."
#                     "With each role assignment, supply when the role assignment started and ended, if possible."
                     "\n\n"
                     "Do your best to include as many metrics for as many points in time as possible!")                     
    
    from openai import OpenAI
    client = OpenAI()
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
      ]
    
    
    response = client.beta.chat.completions.parse(
      model=OPENAI_MODEL,
      messages=messages,
      response_format=DocumentContent
    )
    
    usage_report = get_usage_report(messages, response)
    
    data_points = [
        {
            "metric_type": x.metric_type.value,
            "value": x.value,
            "currency": x.currency.value if x.currency else None,
            "point_in_time": x.point_in_time_as_iso_date
        }
        for x in response.choices[0].message.parsed.data_points
    ]
    
#    role_assignments = [
#        {
#            "role_type": x.role_type.value,
#            "person_name": x.person_name,
#            "role_assignment_started_as_iso_date": x.role_assignment_started_as_iso_date,
#            "role_assignment_ended_as_iso_date": x.role_assignment_ended_as_iso_date
#        }
#        for x in response.choices[0].message.parsed.company_role_assignments
#    ]
    
    result = {
        "data_points": data_points,
#        "role_assignments": role_assignments
    }
    return result, usage_report

In [7]:
import os
import csv
import json


# Define the paths
samples_dir = 'samples'
output_dir = 'output'

usage_reports = {}

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Read the CSV file
with open('dataset.csv', 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    for row in csv_reader:
        name = row['sha1'].strip().replace(',', '').replace('"', '')  # Clean up the name to be used in filenames
        pdf_path = os.path.join(samples_dir, f'{name}.pdf')
        
        # Check if the PDF file exists
        if os.path.exists(pdf_path):
            # Define the output path for the JSON file
            output_path = os.path.join(output_dir, f'{name}.json')

            if os.path.exists(output_path):
                print(f'{output_path} already exists; skipping.')
            else:
                print(f'Processing {pdf_path}...')
    
                try:    
                    # Extract text from the PDF
                    pdf_texts = text_from_pdf(pdf_path)
        
                    # Extract structured content from the text
                    # Involves "unzipping" following https://stackoverflow.com/questions/12974474/how-to-unzip-a-list-of-tuples-into-individual-lists.
                    [structured_datas, usage_report_items] = [list(t) for t in zip(*[extract_document_content(pdf_text) for pdf_text in pdf_texts])]
                    
                    usage_reports[name] = merge_usage_report_items(usage_report_items)
                    
                    structured_data = {
                        "company_name": row['name'],
                        "data_points": [item for d in structured_datas for item in d["data_points"]],
#                        "role_assignments": [item for d in structured_datas for item in d["role_assignments"]]
                    }

                    # Save the structured data as JSON
                    with open(output_path, 'w') as json_file:
                        json.dump(structured_data, json_file, indent=4)
        
                    print(f'Saved structured data to {output_path}.')
                except Exception as error:
                    print(error)
                    print("Exception caught; skipping...")
                # Uncomment this to only work with the first PDF.
                # break
        else:
            # The file was not found. We ignore this, since we are only working
            # with a small sample.
            pass

usage_report = {
    "summary": merge_usage_report_items(usage_reports.values()),
    "details": usage_reports
}

with open('usage_report_create_knowledge_base.json', 'w') as json_file:
    json.dump(usage_report, json_file, indent=4)

Processing samples/ac9aa244462c80705c3ff046542c02c459989742.pdf...
My Tokens: 79230
Prompt Tokens: 79512
Completion Tokens: 628
Cost: $0.20506000000000002
Saved structured data to output/ac9aa244462c80705c3ff046542c02c459989742.json.
Processing samples/e2b19d2cc2ccab2fd9022326b56b38fb0e772e73.pdf...
My Tokens: 98287
Prompt Tokens: 98568
Completion Tokens: 1252
Cost: $0.25894
Saved structured data to output/e2b19d2cc2ccab2fd9022326b56b38fb0e772e73.json.
Processing samples/e62b2ebe3012cd7e6c57507bc950a46d06b3d06e.pdf...
My Tokens: 51326
Prompt Tokens: 51608
Completion Tokens: 529
Cost: $0.13430999999999998
Saved structured data to output/e62b2ebe3012cd7e6c57507bc950a46d06b3d06e.json.
Processing samples/e765cdd472cb47fa74ee6a52700c61aca645bbee.pdf...
splitting
My Tokens: 48532
Prompt Tokens: 48814
Completion Tokens: 673
Cost: $0.12876500000000002
My Tokens: 52216
Prompt Tokens: 52498
Completion Tokens: 891
Cost: $0.140155
Saved structured data to output/e765cdd472cb47fa74ee6a52700c61aca64