In [1]:
from dotenv import load_dotenv

# Run python-dotenv's loader before any other cell executes.
# NOTE(review): presumably this provides the API key that the OpenAI client
# created later in the notebook relies on — confirm against the .env file.
load_dotenv()

True

In [2]:
import PyPDF2

def text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to read; it is opened in binary mode.

    Returns:
        A single string in which the extracted text of each page is followed
        by a newline, in page order. A PDF without pages yields ``""``.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by ``PyPDF2.PdfReader`` or page extraction is
        propagated unchanged.
    """
    with open(file_path, 'rb') as f:
        # strict=False is kept from the original call to PdfReader.
        reader = PyPDF2.PdfReader(f, strict=False)
        # Defensive: a page whose extraction result is None/empty contributes
        # just its newline instead of raising TypeError on `None + "\n"`.
        # str.join also avoids rebuilding the string once per page.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

In [3]:
# Model identifier shared by the notebook: it selects the tiktoken encoding
# and is passed as the `model` argument of the chat-completions call.
OPENAI_MODEL="gpt-4o-2024-08-06"

In [4]:
import tiktoken

# Tokenizer looked up for OPENAI_MODEL; created once when this cell runs and
# reused by prompt_info for every token count.
enc = tiktoken.encoding_for_model(OPENAI_MODEL)
def prompt_info(messages, price_per_million_tokens=5):
    """Print the token count and an estimated cost for a list of chat messages.

    The ``content`` of all messages is joined with single spaces and encoded
    with the module-level ``enc`` tokenizer, so the count covers message text
    only.

    Args:
        messages: Sequence of dicts, each with a string ``"content"`` entry.
        price_per_million_tokens: Price charged per 1,000,000 tokens. The
            default of 5 is the value that used to be hard-coded here.
            NOTE(review): the result is printed with a euro sign; confirm the
            rate and currency against the current pricing of the model in use.

    Returns:
        None. The two figures are printed to stdout.
    """
    content = " ".join(m["content"] for m in messages)
    tokens = len(enc.encode(content))
    print(f"Tokens: {tokens}")
    # Same operation order as before so the printed float is unchanged for the default rate.
    cost_in_eur = (tokens / 1000000) * price_per_million_tokens
    print(f"Cost: {cost_in_eur}€")

In [5]:
# FIRST ATTEMPT

# system_prompt = ("You are an assistant with the task of extracting precise information from long documents. "
#                  "You will be prompted with a USER QUESTION and a DOCUMENT. Your task is to answer the USER QUESTION "
#                  "precisely and concisely, using only information from the DOCUMENT. If the document does not contain "
#                  "enough information to answer the question, answer only with the text 'N/A'.")
# 
# 
# question = "What was the total revenue in 2022?"
# 
# prompt = ("USER QUESTION\n\n"
#           f"{question}\n\n"
#           "DOCUMENT\n\n"
#           f"{text}")
# 
# from openai import OpenAI
# client = OpenAI()
# 
# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user", "content": prompt},
#   ]
# 
# prompt_info(messages)

In [6]:
from pydantic import BaseModel
from typing import Optional
from enum import Enum

class Metric(str, Enum):
    """Closed set of metrics that may be extracted from a document.

    Each member's value is the human-readable metric name; it is what ends up
    in the response schema and in the ``metric_type`` field of the output.
    """
    rad_expenses = "research and development expenses"
    risk_management_spending = "risk management spending"
    debt_to_equity_ratio = "Debt-To-Equity ratio"
    number_of_stores = "Number of stores"
    return_on_assets = "Return on Assets (ROA)"
    # Previously read "Return on Assets (ROE)", a copy-paste slip from the line above.
    return_on_equity = "Return on Equity (ROE)"
    customer_acquisition_spending = "customer acquisition spending"
    operating_margin = "operating margin"
    market_capitalization = "market capitalization"
    sustainability_initiatives_spending = "sustainability initiatives spending"
    gross_profit_margin = "Gross Profit Margin"

class CompanyRole(str, Enum):
    """Closed set of company roles a role assignment can refer to.

    Each member's value is the human-readable role title, with its
    abbreviation in parentheses where one is given.
    """
    ceo = "Chief Executive Officer (CEO)"
    cfo = "Chief Financial Officer (CFO)"
    coo = "Chief Operating Officer (COO)"
    clo = "Chief Legal Officer (CLO)"
    board_chairman = "Board Chairman"

class Currency(str, Enum):
    """Currency codes that can be attached to an extracted data point."""
    euro = "EUR"
    us_dollar = "USD"
    pound_sterling = "GBP"
    # Presumably the catch-all for any currency not listed above — confirm intended use.
    other = "OTHER"

class DocumentDataPoint(BaseModel):
    """One extracted metric reading: the metric, its value, an optional currency and a date."""
    # Which of the predefined Metric members this reading belongs to.
    metric_type: Metric
    # Numeric value of the reading.
    # NOTE(review): unit/scale (percent vs. fraction, thousands vs. units) is not constrained here.
    value: float
    # Declared Optional with no default value; None is allowed for metrics without a currency.
    currency: Optional[Currency]
    # Plain string; the ISO date format is implied by the field name only and is not validated.
    point_in_time_as_iso_date: str

class CompanyRoleAssignment(BaseModel):
    """One person holding one company role, with optional start and end dates."""
    # Which of the predefined CompanyRole members is held.
    role_type: CompanyRole
    # Name of the person holding the role, as free text.
    person_name: str
    # Both dates are Optional plain strings with no default; the ISO format is
    # implied by the field names only and is not validated.
    role_assignment_started_as_iso_date: Optional[str]
    role_assignment_ended_as_iso_date: Optional[str]

class DocumentContent(BaseModel):
    """Top-level extraction result; used as the `response_format` of the model call."""
    # All metric readings found in the document.
    data_points: list[DocumentDataPoint]
    # All role assignments found in the document.
    company_role_assignments: list[CompanyRoleAssignment]

In [7]:
def extract_document_content(text):
    """Extract metrics and company role assignments from a document's text.

    The text is sent as the user message of a chat completion whose
    ``response_format`` is ``DocumentContent``; the parsed result is converted
    into plain, JSON-serialisable dictionaries. Token count and estimated cost
    of the prompt are printed via ``prompt_info`` before the request is made.

    Args:
        text: Full text of the document to analyse.

    Returns:
        A dict with two keys:

        * ``"data_points"``: list of dicts with ``metric_type``, ``value``,
          ``currency`` (``None`` when absent) and ``point_in_time``.
        * ``"company_role_assignments"``: list of dicts with ``role_type``,
          ``person_name``, ``role_assignment_started`` and
          ``role_assignment_ended``.

    Raises:
        ValueError: If the response carries no parsed content (for example
            because the model refused to answer).
        Any exception raised by the OpenAI client is propagated unchanged.
    """
    system_prompt = ("You are an assistant with the task of extracting precise information from long documents. "
                     "You will be prompted with the contents of a document. Your task is to extract various metrics "
                     "as well as company role assignments from this document. With each metric, supply the point in "
                     # The separating space after the comma was missing ("document,as well as").
                     "time when the metric was measured according to the document, "
                     "as well as the currency (if applicable). "
                     "With each role assignment, supply when the role assignment started and ended, if possible")

    # Imported locally, as in the original cell, so this definition does not
    # depend on another cell having imported the SDK.
    from openai import OpenAI
    client = OpenAI()

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    prompt_info(messages)
    response = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=messages,
        response_format=DocumentContent,
    )

    message = response.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Fail with a clear reason instead of an AttributeError on `None.data_points`.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no structured content (refusal: {refusal!r})")

    return {
        "data_points": [
            {
                "metric_type": point.metric_type.value,
                "value": point.value,
                "currency": point.currency.value if point.currency else None,
                "point_in_time": point.point_in_time_as_iso_date,
            }
            for point in parsed.data_points
        ],
        # The role assignments are requested from the model and were previously
        # discarded; they are now returned alongside the data points.
        "company_role_assignments": [
            {
                "role_type": assignment.role_type.value,
                "person_name": assignment.person_name,
                "role_assignment_started": assignment.role_assignment_started_as_iso_date,
                "role_assignment_ended": assignment.role_assignment_ended_as_iso_date,
            }
            for assignment in parsed.company_role_assignments
        ],
    }

In [8]:
import os
import csv
import json

# Batch driver: for every row of dataset.csv whose PDF is present in
# `samples_dir`, extract structured content and store it as JSON in
# `output_dir`. Rows whose output file already exists are skipped, so the cell
# can be re-run to resume after a failure.

# Define the paths
samples_dir = 'samples'
output_dir = 'output'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Read the CSV file. newline='' lets the csv module do its own newline
# handling, as its documentation recommends for files passed to a reader.
with open('dataset.csv', 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    for row in csv_reader:
        name = row['sha1'].strip().replace(',', '').replace('"', '')  # Clean up the name to be used in filenames
        pdf_path = os.path.join(samples_dir, f'{name}.pdf')

        # Rows without a matching PDF are silently ignored.
        if not os.path.exists(pdf_path):
            continue

        # Define the output path for the JSON file
        output_path = os.path.join(output_dir, f'{name}.json')

        if os.path.exists(output_path):
            print(f'{output_path} already exists; skipping.')
            continue

        print(f'Processing {pdf_path}...')

        try:
            # Extract text from the PDF
            pdf_text = text_from_pdf(pdf_path)

            # Extract structured content from the text
            structured_data = extract_document_content(pdf_text)

            structured_data["company_name"] = row['name']

            # Serialise before opening the file: a serialisation error must not
            # leave a partial JSON file behind, because an existing file makes
            # later runs skip this document.
            serialized = json.dumps(structured_data, indent=4)

            # Save the structured data as JSON
            with open(output_path, 'w') as json_file:
                json_file.write(serialized)

            print(f'Saved structured data to {output_path}.')
        except Exception as exc:
            # Best-effort batch: keep going, but report why the document
            # failed. Catching Exception (not a bare except) lets
            # KeyboardInterrupt still stop the run.
            print(f"Exception caught; skipping... ({type(exc).__name__}: {exc})")

output/d81bbc64a4160b9946fea7a895f80e6201f52f27.json already exists; skipping.
output/608c5097dfc6e83505fd2259ad862dcec11a3f96.json already exists; skipping.
output/3696c1b29566acc1eafc704ee5737fb3ae6f3d1d.json already exists; skipping.
output/99be213e4e689294ebae809bfa6a1b5024076286.json already exists; skipping.
output/71b04e0248ecf758990a0ab77bd69344be63bcf4.json already exists; skipping.
Processing samples/6b79f1c1de9d0e39a4576dcd4585849b9465b402.pdf...
Tokens: 219727
Cost: 1.098635€
Exception caught; skipping...
Processing samples/40b5cfe0d7bbf59e186492bfbe1b5002d44af332.pdf...
Tokens: 77178
Cost: 0.38588999999999996€
Saved structured data to output/40b5cfe0d7bbf59e186492bfbe1b5002d44af332.json.
Processing samples/faf8d7d79152d61279eda1cfb58b8236ce2f82fa.pdf...
Tokens: 27773
Cost: 0.138865€
Saved structured data to output/faf8d7d79152d61279eda1cfb58b8236ce2f82fa.json.
Processing samples/4b525836a5d7cb75489f6d93a3b1cf2b8f039bf2.pdf...
Tokens: 136290
Cost: 0.68145€
Exception caught;