In [1]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
# NOTE(review): presumably this is what supplies the OpenAI API key used later — confirm.
load_dotenv()

True

In [2]:
# Model identifier used both for the tokenizer lookup and for the chat completion request.
OPENAI_MODEL="gpt-4o-2024-08-06"

In [3]:
import tiktoken

def merge_usage_report_items(items):
    """Aggregate per-request usage reports into one summary dict.

    Each entry of ``items`` must be a mapping with the keys ``my_tokens``,
    ``prompt_tokens``, ``completion_tokens``, ``total_tokens`` and
    ``total_cost_usd``. The result has the same keys, each holding the sum
    of that field over all entries (``0`` for an empty ``items``).
    """
    summed_fields = (
        "my_tokens",
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "total_cost_usd",
    )
    return {
        field: sum(report[field] for report in items)
        for field in summed_fields
    }

# Tokenizer matching OPENAI_MODEL; get_usage_report uses it to count the
# tokens of the message contents locally.
enc = tiktoken.encoding_for_model(OPENAI_MODEL)
# Prices in USD per one million tokens (input = prompt, output = completion).
# NOTE(review): presumably the list prices for OPENAI_MODEL — confirm against current pricing.
input_token_cost_usd_per_1m_tokens = 2.5
output_token_cost_usd_per_1m_tokens = 10
# Divisor that converts a raw token count into millions of tokens.
_1m = 1000000

def get_usage_report(messages, response):
    """Print and return token and cost figures for one chat completion call.

    Args:
        messages: The chat messages that were sent; each must have a string
            ``"content"`` entry.
        response: The completion response; its ``usage`` attribute supplies
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.

    Returns:
        A dict with the keys ``my_tokens`` (local token count of the
        space-joined message contents), ``prompt_tokens``,
        ``completion_tokens``, ``total_tokens`` and ``total_cost_usd``.
    """
    # Local estimate: tokenize all message contents joined by single spaces.
    joined_contents = " ".join(message["content"] for message in messages)
    my_tokens = len(enc.encode(joined_contents))
    print(f"My Tokens: {my_tokens}")

    usage = response.usage

    prompt_tokens = usage.prompt_tokens
    print(f"Prompt Tokens: {prompt_tokens}")

    completion_tokens = usage.completion_tokens
    print(f"Completion Tokens: {completion_tokens}")

    # Price prompt and completion tokens separately, then add them up.
    total_cost_usd = (
        (prompt_tokens / _1m) * input_token_cost_usd_per_1m_tokens
        + (completion_tokens / _1m) * output_token_cost_usd_per_1m_tokens
    )
    print(f"Cost: ${total_cost_usd}")

    total_tokens = usage.total_tokens

    # Sanity check: the reported total should equal prompt + completion.
    expected_total = prompt_tokens + completion_tokens
    if total_tokens != expected_total:
        print("WARN: token counts don't match")
        for value in (total_tokens, prompt_tokens, completion_tokens, expected_total):
            print(value)

    return dict(
        my_tokens=my_tokens,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=total_tokens,
        total_cost_usd=total_cost_usd,
    )

In [4]:
import os
import json

# Directory containing the JSON files to load.
directory = 'output'

# Decoded content of every JSON file found in the directory, one entry per file.
json_list = []

# os.listdir() returns names in arbitrary order; sort them so json_list (and
# everything derived from it) comes out in the same order on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.json'):  # Skip anything that is not a JSON file
        filepath = os.path.join(directory, filename)
        # Decode as UTF-8 explicitly rather than relying on the platform default.
        with open(filepath, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        json_list.append(data)

# json_list now holds the decoded content of each *.json file.


In [5]:
# Reduce every loaded file to the two fields that are sent to the model.
# "company_role_assignments" is currently left out (it was disabled in this cell).
database = [
    {field: entry[field] for field in ("company_name", "data_points")}
    for entry in json_list
]

In [6]:
from pydantic import BaseModel
from typing import Optional

# Response schema that ask_question selects for "number" questions.
# Per the system prompt, `answer` is null (None) when the question cannot be answered.
class NumberResponse(BaseModel):
    answer: Optional[float]


# Response schema that ask_question selects for "name" questions.
# Per the system prompt, `person_name` is null (None) when the question cannot be answered.
class NameResponse(BaseModel):
    person_name: Optional[str]


# Response schema that ask_question selects for "boolean" questions.
# Per the system prompt, `answer` is null (None) when the question cannot be answered.
class BooleanResponse(BaseModel):
    answer: Optional[bool]

def ask_question(question, schema):
    system_prompt = ("You are an assistant with the task of answering QUESTIONS based on a KNOWLEDGE DATABASE. "
                     "If you cannot answer the question, indicate this with a `null` response.")

             
    prompt = ("QUESTION\n\n"
              f"{question}\n\n"
              "KNOWLEDGE DATABASE\n\n"
              f"{json.dumps(database)}")
    
    
    from openai import OpenAI
    client = OpenAI()
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
      ]
    
    match schema:
        case "number": 
            response_format = NumberResponse
            value_extractor = lambda x: x.answer
        case "name": 
            response_format = NameResponse
            value_extractor = lambda x: x.person_name
        case "boolean": 
            response_format = BooleanResponse
            value_extractor = lambda x: x.answer
        case _: raise f"unknown schema {schema}"
    
    response = client.beta.chat.completions.parse(
      model=OPENAI_MODEL,
      messages=messages,
      response_format=response_format
    )
    
    usage_report = get_usage_report(messages, response)
    
    return value_extractor(response.choices[0].message.parsed), usage_report

In [None]:
results = []
usage_reports = []

# Read the questions up front with an explicit encoding, so decoding does not
# depend on the platform default and the file is not held open during the
# API calls below.
with open('samples/questions.json', 'r', encoding='utf-8') as json_file:
    items = json.load(json_file)

for item in items:
    question = item["question"]
    print(question)
    answer, item_usage_report = ask_question(question, item["schema"])
    print(answer)
    results.append({
        "question": question,
        "schema": item["schema"],
        # A null answer from the model is recorded as "n/a".
        "answer": "n/a" if answer is None else answer
    })
    usage_reports.append(item_usage_report)

with open('results.json', 'w', encoding='utf-8') as json_file:
    json.dump(results, json_file, indent=4)

# Totals across all questions plus the per-question breakdown.
usage_report = {
    "summary": merge_usage_report_items(usage_reports),
    "details": usage_reports
}

with open('usage_report_answer_question.json', 'w', encoding='utf-8') as json_file:
    json.dump(usage_report, json_file, indent=4)

What was the Net Profit Margin of "Oesterreichische Kontrollbank" in June 30, 2023?
My Tokens: 24333
Prompt Tokens: 24388
Completion Tokens: 5
Cost: $0.06102
None
What was the total liabilities of "CrossFirst Bank" in the fiscal year 2023?
My Tokens: 24328
Prompt Tokens: 24383
Completion Tokens: 10
Cost: $0.0610575
5070761400.0
How much more did "Astral Resources NL" spend on marketing compared to "TSX_Y" in June 30, 2021?
My Tokens: 24338
Prompt Tokens: 24393
Completion Tokens: 5
Cost: $0.0610325
None
Which company had a higher free cash flow: "TSX_ACQ", "QUIDELORTHO CORPORATION" or "PowerFleet, Inc.", in the fiscal year 2022?
My Tokens: 24348
Prompt Tokens: 24405
Completion Tokens: 6
Cost: $0.061072499999999995
None
What was the earnings per share (EPS) of "Holley Inc." in June 30, 2023?
My Tokens: 24333
Prompt Tokens: 24388
Completion Tokens: 5
Cost: $0.06102
None
What was the accounts receivable of "Petra Diamonds" in the fiscal year 2023?
My Tokens: 24329
Prompt Tokens: 24384
Comp