In [32]:
from dotenv import load_dotenv

# Load variables from a local .env file into the environment
# (presumably including the OpenAI API key needed further down — confirm against your .env).
load_dotenv()

True

In [33]:
# Location of the sample PDF document processed in this notebook.
file_path = "samples/053b7cb83115789346e2a9efc7e2e640851653ff.pdf"

In [34]:
import PyPDF2

# Extract the text of every page and concatenate it into one string,
# terminating each page's text with a newline.
with open(file_path, 'rb') as f:
    reader = PyPDF2.PdfReader(f, strict=False)
    # The generator is consumed by join() while the file is still open.
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

In [41]:
# Model identifier used both to pick the tokenizer and for the completion request.
OPENAI_MODEL = "gpt-4o-2024-08-06"

In [36]:
import tiktoken

# Tokenizer looked up for the configured model; prompt_info() uses it to count tokens.
enc = tiktoken.encoding_for_model(OPENAI_MODEL)
def prompt_info(messages, price_per_million_tokens=5, encoding=None):
    """Print the token count and an estimated cost for a list of chat messages.

    Args:
        messages: Iterable of dicts, each holding a string under the
            ``"content"`` key.
        price_per_million_tokens: Price charged per one million tokens.
            Defaults to 5, the value that was previously hard-coded.
            NOTE(review): the result is printed with a "€" suffix — confirm
            that this price and currency match the provider's current
            pricing for the model in use.
        encoding: Object exposing ``encode(str)`` that returns a sized
            sequence of tokens. Defaults to the module-level ``enc``.

    Returns:
        None. The token count and cost are printed to stdout.
    """
    encoder = enc if encoding is None else encoding
    # Only the message contents are counted, joined by single spaces; any
    # per-message overhead added by the chat format is not included, so the
    # figure is an estimate.
    content = " ".join(m["content"] for m in messages)
    tokens = len(encoder.encode(content))
    print(f"Tokens: {tokens}")
    cost_in_eur = (tokens / 1000000) * price_per_million_tokens
    print(f"Cost: {cost_in_eur}€")

In [37]:
# FIRST ATTEMPT

# system_prompt = ("You are an assistant with the task of extracting precise information from long documents. "
#                  "You will be prompted with a USER QUESTION and a DOCUMENT. Your task is to answer the USER QUESTION "
#                  "precisely and concisely, using only information from the DOCUMENT. If the document does not contain "
#                  "enough information to answer the question, answer only with the text 'N/A'.")
# 
# 
# question = "What was the total revenue in 2022?"
# 
# prompt = ("USER QUESTION\n\n"
#           f"{question}\n\n"
#           "DOCUMENT\n\n"
#           f"{text}")
# 
# from openai import OpenAI
# client = OpenAI()
# 
# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user", "content": prompt},
#   ]
# 
# prompt_info(messages)

In [48]:
from pydantic import BaseModel
from typing import Optional
from enum import Enum

# Closed set of financial metrics that a data point may describe.
# Subclassing str makes each member usable as its plain string value.
# Used as the type of DocumentDataPoint.metric_type.
class FinMetric(str, Enum):
    # Income-statement and balance-sheet totals
    total_revenue = "total revenue"
    net_income = "net income"
    total_assets = "total assets"
    total_liabilities = "total liabilities"
    shareholders_equity = "shareholders' equity"
    # Individual balance-sheet positions
    intangible_assets = "intangible assets"
    inventories = "inventories"
    accounts_receivable = "accounts receivable"
    accounts_payable = "accounts payable"
    # Cash-flow figures
    operating_cash_flow = "operating cash flow"
    free_cash_flow = "free cash flow"
    capital_expenditures = "capital expenditures"
    # Expense categories ("rad" abbreviates research and development)
    rad_expenses = "research and development expenses"
    marketing_expenses = "marketing expenses"
    acquisition_costs = "acquisition costs"

# Currencies a data point can be tagged with; anything that is neither
# EUR nor USD is represented by the catch-all "OTHER" member.
class Currency(str, Enum):
    euro = "EUR"
    us_dollar = "USD"
    other = "OTHER"

# One extracted financial figure: which metric, its numeric value, its
# currency and the date it refers to.
class DocumentDataPoint(BaseModel):
    metric_type: FinMetric
    value: float
    # May be None (Optional), so consumers must not access .value unconditionally.
    currency: Optional[Currency]
    # Typed as a plain string; this model does not itself validate the date format.
    point_in_time_as_iso_date: str

# Top-level container for all data points extracted from one document;
# passed as response_format to the completion request.
class DocumentContent(BaseModel):
    data_points: list[DocumentDataPoint]

ImportError: cannot import name 'StrEnum' from 'enum' (/usr/lib/python3.10/enum.py)

In [45]:
# System instructions for the metric-extraction request. Adjacent string
# literals are concatenated, so every fragment except the last must end
# with a space.
system_prompt = ("You are an assistant with the task of extracting precise information from long documents. "
                 "You will be prompted with the contents of a document. Your task is to extract financial metrics "
                 "from this document. With each metric, supply the point in time when the metric was measured according "
                 "to the document, as well as the currency (if applicable).")

from openai import OpenAI
# Client is constructed without explicit arguments
# (presumably it reads its credentials from the environment — confirm).
client = OpenAI()

# Two-message conversation: the instructions as the system message,
# followed by the full document text as the user message.
messages = [
    {"role": role, "content": content}
    for role, content in (("system", system_prompt), ("user", text))
]

# Report token count and estimated cost before sending the request.
prompt_info(messages)

Tokens: 79633
Cost: 0.398165€


In [46]:
# Send the conversation and have the reply parsed into a DocumentContent instance.
response = client.beta.chat.completions.parse(
    model=OPENAI_MODEL,
    messages=messages,
    response_format=DocumentContent,
)

# Show the parsed object of the first choice.
print(response.choices[0].message.parsed)

data_points=[DocumentDataPoint(metric_type=<FinMetric.total_revenue: 'total revenue'>, value=137283000.0, currency=<Currency.us_dollar: 'USD'>, point_in_time_as_iso_date='2022-12-31'), DocumentDataPoint(metric_type=<FinMetric.net_income: 'net income'>, value=19996000.0, currency=<Currency.us_dollar: 'USD'>, point_in_time_as_iso_date='2022-12-31'), DocumentDataPoint(metric_type=<FinMetric.total_assets: 'total assets'>, value=1393261000.0, currency=<Currency.us_dollar: 'USD'>, point_in_time_as_iso_date='2022-12-31'), DocumentDataPoint(metric_type=<FinMetric.total_liabilities: 'total liabilities'>, value=744196000.0, currency=<Currency.us_dollar: 'USD'>, point_in_time_as_iso_date='2022-12-31'), DocumentDataPoint(metric_type=<FinMetric.shareholders_equity: "shareholders' equity">, value=649065000.0, currency=<Currency.us_dollar: 'USD'>, point_in_time_as_iso_date='2022-12-31'), DocumentDataPoint(metric_type=<FinMetric.operating_cash_flow: 'operating cash flow'>, value=76541000.0, currency=<

In [60]:
# Flatten the parsed pydantic objects into plain dicts holding only
# primitive values (enum members are replaced by their string values).
formatted = {
    "data_points": [
        {
            "metric_type": point.metric_type.value,
            "value": point.value,
            # currency is Optional on DocumentDataPoint; keep None as None
            # instead of failing on `.value`.
            "currency": point.currency.value if point.currency is not None else None,
            "point_in_time": point.point_in_time_as_iso_date,
        }
        for point in response.choices[0].message.parsed.data_points
    ]
}

print(formatted)

{'data_points': [{'metric_type': 'total revenue', 'value': 137283000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'net income', 'value': 19996000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'total assets', 'value': 1393261000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'total liabilities', 'value': 744196000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': "shareholders' equity", 'value': 649065000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'operating cash flow', 'value': 76541000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'capital expenditures', 'value': 185201000.0, 'currency': 'USD', 'point_in_time': '2022-12-31'}, {'metric_type': 'total revenue', 'value': 115936000.0, 'currency': 'USD', 'point_in_time': '2021-12-31'}, {'metric_type': 'net income', 'value': 18342000.0, 'currency': 'USD', 'point_in_time': '2021-12-31'}, {'metric_type'

In [55]:
# Sanity check: print the plain string value behind one enum member.
print(FinMetric.accounts_payable.value)

accounts payable
