In [42]:
import os
import PyPDF2
import json
import pickle
import tiktoken
import pandas as pd

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings.base import OpenAIEmbeddings

from dotenv import load_dotenv

load_dotenv()
# Place a .env file in the root directory with "OPENAI_API_KEY" set

True

## Get Companies

In [43]:
# SAMPLES_PATH = r"C:\Users\felix.krause\code\trustbit\enterprise-rag-challenge\samples"
SAMPLES_PATH = r"data/pdfs"

In [44]:
df = pd.read_csv("data/dataset.csv") 
df

Unnamed: 0,sha1,date,name,size
0,ce9e5024041b2ece2bafa2a9d9516bb174ee8949,2022-10-31,"Anixa Biosciences, Inc.",3996701
1,f71415f9ca0cff70e5fa193616b6197f361130ed,2023-02-21,"Maravai LifeSciences Holdings, Inc.",4033642
2,4a9d2b853e05970776121a810460f0962a18c5a1,2022-XX,KLA Corporation,1181894
3,f973dd219c534accb0d4e72d8e12f51284d48d10,2023-01-01,"Ameresco, Inc.",10648267
4,4e27f4c3402c657d548760cb3a164b036cefaabb,2022-12-31,Battery Minerals Limited,3650701
...,...,...,...,...
7490,a2afcd8165a6dbd0058682680b65d3638a5800eb,2023-02-02,"Arrow Electronics, Inc",922750
7491,215df84494756bd4feebc973657835ef7f14ee16,2022-12-31,Synertec,10474497
7492,20fb970d8705289e835b408c575351295ac16f5f,2022-09-30,TE Connectivity,5753097
7493,c7f3a8c0a38c756438950ce3085076adb4241a32,2022-01-01,OTC_ADDDF,12603054


In [45]:
# load all file names in the folder
sha1_dict_full = df.set_index("sha1").to_dict()["name"]

included_companies = []

for file in os.listdir(SAMPLES_PATH):
    file = file[:-4]
    try:
        included_companies.append(sha1_dict_full[file])
    except KeyError:
        print(file)

In [46]:
# Create sample for development
# COMPANIES = ["Ethernity Networks Ltd", "Limbach Holdings, Inc.", "Accuray Incorporated"] # for development
COMPANIES = included_companies

if COMPANIES:
    df_sample = df[df.name.isin(COMPANIES)]
else:
    df_sample = df.copy()
    
sha1_dict = df_sample.set_index("name").to_dict()["sha1"]

In [47]:
df_sample

Unnamed: 0,sha1,date,name,size
689,ac9aa244462c80705c3ff046542c02c459989742,2022-01-01,Mercia Asset Management PLC,1977782
977,e2b19d2cc2ccab2fd9022326b56b38fb0e772e73,2022-12-31,CrossFirst Bank,2962631
1601,e62b2ebe3012cd7e6c57507bc950a46d06b3d06e,2022-01-01,Sleep Country Canada Holdings Inc.,7576716
1715,e765cdd472cb47fa74ee6a52700c61aca645bbee,2022-12-31,"First Mid Bancshares, Inc.",1284989
1757,6054ec55767fbe6585598ced7afacf5cb8619a13,2022-03-31,MITSUI O.S.K. LINES,6862882
2713,9d7a72445aba6860402c3acce75af02dc045f74d,2022-01-01,TSX_Y,1457663
2779,a706b44ba275c97b8633b0808cd2f90cbb7fe473,2022-01-01,Oesterreichische Kontrollbank,5090936
2992,a8077fe1983a64dc77bddfafbf48242e66111a89,2023-03-28,"PowerFleet, Inc.",7797234
3172,84749ef5c2bbf2a302b6614f31727a95bf29f309,2022-12-31,BAKER STEEL RESOURCES TRUST LIMITED,886428
3321,ba5852cb6c20da35da2ce6ebaafc711d06fe8c1e,2022-12-31,"Caixa Geral de Depósitos, S.A.",16061289


## Load LLM data

In [48]:
# from llama_parse import LlamaParse

# parser = LlamaParse(
#     result_type="markdown",  # "markdown" and "text" are available
#     verbose=True
# )

# Function to load text from different file types
def load_text_from_file(file_path):
    _, file_extension = os.path.splitext(file_path)
    text = ""
    
    if file_extension == ".pdf":
        # TODO https://www.reddit.com/r/LangChain/comments/18yxacm/extracting_data_from_pdf_containing_complex_tables/
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f, strict=False)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    else:
        pass
    
    return text


# Function to load documents from a folder
def load_documents_from_folder(folder_path, companies=[]):
    documents = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file[:-4] in companies: #or len(companies) == 0:
                print("Loading", file)
                file_path = os.path.join(root, file)

                text = load_text_from_file(file_path)
                if text:
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
    return documents

In [49]:
# Could try LLama approach
# TODO https://medium.com/the-ai-forum/rag-on-complex-pdf-using-llamaparse-langchain-and-groq-5b132bd1f9f3
# doc = parser.load_data(os.path.join(FOLDER_PATH, "e51b7204b91cbe7709bd3218e7d2d0c2b8dbb438.pdf"))
# index = VectorStoreIndex.from_documents(doc)

## Try to detect tables from pdf

In [9]:
"""
import camelot

# Extract tables from a PDF file
tables = camelot.read_pdf(os.path.join(FOLDER_PATH, "e51b7204b91cbe7709bd3218e7d2d0c2b8dbb438.pdf"), pages="all")

# Convert the first table to a DataFrame
df = tables[0].df

# Save the DataFrame as a CSV file
tables[0]
"""

'\nimport camelot\n\n# Extract tables from a PDF file\ntables = camelot.read_pdf(os.path.join(FOLDER_PATH, "e51b7204b91cbe7709bd3218e7d2d0c2b8dbb438.pdf"), pages="all")\n\n# Convert the first table to a DataFrame\ndf = tables[0].df\n\n# Save the DataFrame as a CSV file\ntables[0]\n'

In [50]:
# tables[40].df

## Build retriever per company

In [50]:
def get_store_name(string):
    string = string.lower()
    return string.replace(" ", "_").replace(",", "").replace(".", "").replace("'", "")

In [51]:
# WARNING: This will build all retrievers for all companies in the sample (takes around 5 min)
# Needs to be run only once 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=300, separators=["\n\n", "\n"])

build_token_usage = {}

for company in df_sample.name.to_list():
    # check if file already exists
    if os.path.exists(f"dbs/{get_store_name(company)}.db"):
        print("Skipping", company)
        continue
    
    try:
        company_docs = load_documents_from_folder(SAMPLES_PATH, [sha1_dict[company]])
        
        if not company_docs:
            raise ValueError(f"No documents found for company: {company}")
        
        texts = text_splitter.split_documents(company_docs)
        embeddings = OpenAIEmbeddings()
        db = FAISS.from_documents(texts, embeddings)
        
        
        # count embeddings tokens
        tokenizer = tiktoken.get_encoding("cl100k_base")
        token_counts = [len(tokenizer.encode(doc.page_content)) for doc in texts]
        build_token_usage[company] = sum(token_counts)
        
        with open(f"dbs/{get_store_name(company)}.db", "wb") as f:
            pickle.dump(db.serialize_to_bytes(), f)
    except Exception as e:
        print(f"Error for company {company}: {e}")
        # raise e,

Loading ac9aa244462c80705c3ff046542c02c459989742.pdf
Loading e2b19d2cc2ccab2fd9022326b56b38fb0e772e73.pdf
Loading e62b2ebe3012cd7e6c57507bc950a46d06b3d06e.pdf
Loading e765cdd472cb47fa74ee6a52700c61aca645bbee.pdf
Loading 6054ec55767fbe6585598ced7afacf5cb8619a13.pdf
Loading 9d7a72445aba6860402c3acce75af02dc045f74d.pdf
Loading a706b44ba275c97b8633b0808cd2f90cbb7fe473.pdf
Loading a8077fe1983a64dc77bddfafbf48242e66111a89.pdf
Loading 84749ef5c2bbf2a302b6614f31727a95bf29f309.pdf
Loading ba5852cb6c20da35da2ce6ebaafc711d06fe8c1e.pdf
Loading 609042c64a759c0ac63e7cf18742be4dd3cc5cd5.pdf
Loading e0d6bb578bfc233c3bdbded229960fed01624631.pdf
Loading 43437bccca01aeae222ec7cc706ba3eaf50be2d7.pdf
Loading 194000c9109c6fa628f1fed33b44ae4c2b8365f4.pdf
Loading f06d7ecc8072de616a4ea35c74e20199de6b0691.pdf
Loading 85fb23ba2910de45e27f8f40170c0f3576043916.pdf
Loading 2779336b845a41544348abb7b3e6e5bd2ff893a2.pdf
Loading f721fa86aa0f17b194fa21917aeabbf8df6511c2.pdf
Loading cbd8fb252e8743dc53961adc53e2b4de9ff603

In [52]:
# store build_token_usage
with open("dbs/build_token_usage.json", "w") as f:
    json.dump(build_token_usage, f)

In [53]:
def get_retriever_dict(companies):
    retriever_dict = {}
    
    for company in companies:
        with open(f"dbs/{get_store_name(company)}.db", "rb") as f:
            db_bytes = pickle.load(f)
        
        db_temp = FAISS.deserialize_from_bytes(db_bytes, OpenAIEmbeddings(),
                                      allow_dangerous_deserialization=True)
        
        retriever_dict[company] = db_temp.as_retriever()
    
    return retriever_dict

In [54]:
retriever_dict = get_retriever_dict(df_sample.name.to_list())

In [55]:
def get_context(query, company):
    # TODO add sophisticated retriever for tables?
    return retriever_dict[company].get_relevant_documents(query)

## Structured inference

In [56]:
# MODEL = "gpt-4o-mini-2024-07-18"
MODEL = "gpt-4o-2024-08-06"

# load system prompt from .md
with open("instructions/answer_guidelines.md") as f:
    ANSWER_GUIDELINES = f.read()
    
with open("instructions/fin_info.md") as f:
    FIN_INFO = f.read()


system_prompt = f"""
You are an intelligent assistant tasked with retrieving and analyzing company data to answer specific questions based on the provided CONTEXT.

Your Objectives:
    - Chain of Thought: Document your reasoning process in 'chain_of_thought', including all intermediary steps. If your analysis involves calculations (e.g., ratios), explicitly state the numerator and denominator.
    - Handling Temporal Data: If the temporal scope of the data isn't an exact match (e.g., full-year data instead of a specific date), use the most relevant available data in the CONTEXT to formulate your response.
    - Final Answer: Based on your reasoning, provide a final answer in 'answer' that strictly adheres to the guidelines outlined in. Accuracy and format compliance are crucial: \n {ANSWER_GUIDELINES} \n

Format:
    - chain_of_thought: Your reasoning process, including intermediary steps
    - answer: Your final answer following the answer guidelines

CONTEXT: 
<CONTEXT>
"""

# TODO enhance by financial knowledge, similar to Pedros approach of giving definitions/explanations of financial terms
# f"Your primary task is to accurately identify and extract specific financial metrics, ratios, and counts, even when synonymous "
# f"or contextually similar terms are used. Here is some useful information about these: \n {FIN_INFO}"

In [58]:
from pydantic import BaseModel
from openai import OpenAI

client = OpenAI()

class AnswerEvent(BaseModel):
    chain_of_thought: str
    answer: int | float | str


def ask_company_expert(query, company):
    print("### ASKING COMPANY EXPERT:", company)
    context = get_context(query, company)
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt.replace("<CONTEXT>", str(context))},
            {"role": "user", "content": query},
        ],
        response_format=AnswerEvent,
    )
    
    return completion.choices[0].message.parsed, completion.usage

In [16]:
ask_company_expert("How many assets does 'Accuray Incorporated' have in 2022 in dollars?", "Accuray Incorporated")[0]  # 472849000

### ASKING COMPANY EXPERT: Accuray Incorporated


  warn_deprecated(


AnswerEvent(chain_of_thought="To find the total assets, I refer to the balance sheet, which provides the total liabilities and stockholders' equity values, accounting for all assets as well. However, the original breakdown of liabilities alone is not explicitly listed, we have the sum of liabilities and stockholders’ equity, which is the relevant figure for total assets. In June 2022, the total liabilities and stockholders’ equity are reported as $472,849. This figure represents the total assets of the company, following the accounting equation: Assets = Liabilities + Equity. Thus, the total assets held by Accuray Incorporated in 2022 is $472,849.", answer=472849)

In [17]:
ask_company_expert("How many assets does 'Ethernity Networks Ltd' have in 2022 in dollars?", "Ethernity Networks Ltd")[0]

### ASKING COMPANY EXPERT: Ethernity Networks Ltd


AnswerEvent(chain_of_thought="To answer this question, we need to identify the section of the financial statements that provides the Statement of Financial Position. This statement includes details on the company's assets as of December 31, 2022. However, the context provided does not include this section, and there is no specific mention of the asset figures for Ethernity Networks Ltd in 2022. As this critical data is missing, we're unable to calculate or provide an answer regarding the total assets.", answer='n/a')

In [None]:
ask_company_expert("What are the total R&D expenses of 'Ethernity Networks Ltd' in 2021?", "Ethernity Networks Ltd")[0]

## Multiple agents

In [59]:
# DELEGATION MANAGER

INVOLVED_COMPANIES = df_sample.name.to_list()

system_prompt_delegation = f"""
You will receive queries related to one or more companies. You have access to specialized agents capable of retrieving data for each specific company. To delegate tasks to these agents, follow these guidelines:

    1. Identify Companies: Extract the exact company names from the query and list them under 'companies'. Ensure that the company names match exactly with those provided in {INVOLVED_COMPANIES}. If the query mentions companies that are not in this list, enter 'SKIP' for both 'companies' and 'queries', and note this in the 'chain_of_thought'. If the company name in the query is incomplete (e.g., missing 'Inc.', 'Ltd.'), attempt to match it with the correct company name from the list.

    2. Formulate Queries: 
        - For each company identified, create a precise query that the specialized agents can use to retrieve the required information. 
        - List these queries under 'queries'. 
        - Add some information on where in annual company reports the specialized agents can usually find the information. 
        - If the query involves ratios, comparisons, or requires multiple metrics, suggest how the specialized agents should approach the problem to ensure accurate results. For instance, if calculating a ratio, indicate which metrics to retrieve and compare.
        - If using abbreviations also mention the full phrase or name ((e.g., for 'R&D' also use 'research and development').
        - If the query asks specifically for data for a specific date, also ask to provide the best available data if only full-year data is available. Be transparent about it in the 'chain_of_thought'.
    
    4. Chain of Thought: Provide a 'chain_of_thought' explaining how you identified the companies, formulated the queries, and any assumptions or steps taken to reach your final output. If you marked any queries as 'SKIP', explain why.
"""


class DelegationRequest(BaseModel):
    chain_of_thought: str
    companies: list[str]
    queries: list[str]


def ask_delegation_manager(query):    
    print("### ASKING DELEGATION MANAGER: ")
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt_delegation},
            {"role": "user", "content": query},
        ],
        response_format=DelegationRequest,
    )
    
    return completion.choices[0].message.parsed, completion.usage

In [19]:
response, tokens = ask_delegation_manager("Has 'Ethernity Networks Ltd' more liabilities than 'Accuray Incorporated' in 2022?")
response

### ASKING DELEGATION MANAGER: 


DelegationRequest(chain_of_thought='The query mentions two companies: \'Ethernity Networks Ltd\' and \'Accuray Incorporated.\' Both names match exactly with the list provided. Therefore, they are extracted as companies whose data need retrieval.\n\nThe query is asking for a comparison of liabilities between these two companies for the year 2022. To address this, specialized agents need to extract the total liabilities as reported in each company\'s balance sheet for 2022. Annual company reports usually contain the balance sheet under the section titled "Financial Statements" or "Balance Sheet." If specific 2022 data is not available, agents should provide the nearest full-year data available and mention this adjustment in the results.\n\nThis approach ensures a direct comparison of liabilities as requested.', companies=['Ethernity Networks Ltd', 'Accuray Incorporated'], queries=['Retrieve the total liabilities from the balance sheet for Ethernity Networks Ltd for the year 2022, or prov

In [20]:
tokens

CompletionUsage(completion_tokens=249, prompt_tokens=609, total_tokens=858)

In [None]:
ask_delegation_manager("How many assets does 'Ethernity Networks Ltd' have in 2022 in dollars?")[0]

In [None]:
ask_delegation_manager("Did Global Medical REIT Inc. have a greater Debt-to-Equity ratio than Zegona Communications plc in Q2 2021?")[0]

In [60]:
# EXECUTION MANAGER

system_prompt_execution = f"""
You are an intelligent assistant tasked with synthesizing a final answer to the USER_QUERY using specific company data provided in the CONTEXT. The CONTEXT includes information generated by expert systems along with their reasoning process.

Your Objectives:
    - Final Answer Construction: Your primary task is to deliver a final, accurate response to the USER_QUERY based on the provided CONTEXT.
    - Handling Temporal Data: If the temporal scope of the data isn't an exact match (e.g., full-year data instead of Q4), use the best available data from the CONTEXT to formulate your answer.
    - Chain of Thought: Document your reasoning process in 'chain_of_thought', including intermediary steps and considerations that led to your final answer.
    - Strict Adherence: The final answer must be compliant with the specific guidelines. Accuracy, clarity, and adherence to these guidelines are critical: \n {ANSWER_GUIDELINES} \n 

Format:
    - chain_of_thought: Your reasoning process, including intermediary steps
    - answer: Your final answer following the guidelines. It should be of format: <OUTPUT_TYPE>, e.g., 'name', 'number' (including 0), 'boolean', etc.

Provided Inputs:

USER_QUERY: \n <QUERY> \n
CONTEXT: \n <CONTEXT>
"""

class ExecutionRequest(BaseModel):
    chain_of_thought: str
    answer: int | float | str

def ask_execution_manager(query, context, output_type):
    system_prompt_execution_full = system_prompt_execution.replace("<QUERY>", query).replace("<CONTEXT>", context).replace("<OUTPUT_TYPE>", output_type)
    
    print("### ASKING EXECUTION MANAGER: ")

    print("Context:", context)
    
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt_execution_full},
            {"role": "user", "content": query},
        ],
        response_format=ExecutionRequest,
    )
    
    return completion.choices[0].message.parsed, completion.usage

In [69]:
def get_total_token_usage(tokens_list):
    summed_token_usage = {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}
    if type(tokens_list[0]) is dict:
        for usage in tokens_list:
            summed_token_usage["completion_tokens"] += usage["completion_tokens"]
            summed_token_usage["prompt_tokens"] += usage["prompt_tokens"]
            summed_token_usage["total_tokens"] += usage["total_tokens"]
    else:
        for usage in tokens_list:
            summed_token_usage["completion_tokens"] += usage.completion_tokens
            summed_token_usage["prompt_tokens"] += usage.prompt_tokens
            summed_token_usage["total_tokens"] += usage.total_tokens
                
    return summed_token_usage

In [62]:
# FINAL PIPELINE

def ask_question(query, output_type):
    print("## Query:", query)
    delegations, token_usage_delegations = ask_delegation_manager(query)
    print("Delegations: \n", delegations)
    
    expert_context = {}
    token_usage_expert = []
    for company, company_query in zip(delegations.companies, delegations.queries):
        if company.lower() == "skip" or company_query.lower() == "skip":
            expert_context[company] = "No data available"
        else:
            expert_context[company], token_usage_expert_tmp = ask_company_expert(company_query, company)
            token_usage_expert.append(token_usage_expert_tmp)
    
    final_answer, token_usage_execution = ask_execution_manager(query, str(expert_context), output_type)
    
    tokens_used = [token_usage_delegations] + token_usage_expert + [token_usage_execution]
    
    return final_answer, get_total_token_usage(tokens_used)

In [23]:
response, tokens = ask_question("Who is the CEO of Tesla Inc.?", "name") # n/a
response

## Query: Who is the CEO of Tesla Inc.?
### ASKING DELEGATION MANAGER: 
Delegations: 
 chain_of_thought="The query asks for the CEO of Tesla Inc., but Tesla Inc. is not listed among the companies provided in the instructions. Therefore, according to the guidelines, I have to mark this as 'SKIP'. The companies I have to work with are specific, and Tesla Inc. is not among them." companies=['SKIP'] queries=['SKIP']
### ASKING EXECUTION MANAGER: 
Context: {'SKIP': 'No data available'}


ExecutionRequest(chain_of_thought='The context provided indicates that there is no data available (\'SKIP\': \'No data available\'). Based on the strict adherence guidelines, when information is not available within the provided context, the response should be "n/a". Therefore, although I know Elon Musk is the CEO as of my training data until October 2023, in this task, I\'m required to rely strictly on the provided context, which has no data available.', answer='n/a')

In [35]:
response, tokens = ask_question("Who is the CEO of Ethernity Networks?", "name") # David Levi
response

## Query: Who is the CEO of Ethernity Networks?
### ASKING DELEGATION MANAGER: 
Delegations: 
 chain_of_thought="The query specifically mentions 'Ethernity Networks', which matches exactly with 'Ethernity Networks Ltd' from the provided list of companies. Therefore, I identified 'Ethernity Networks Ltd' as the company of interest.\n\nTo find out the CEO, the specialized agent should look in the company's latest annual report or the most recent corporate announcement section, as these usually provide details about current executives and any recent changes in leadership." companies=['Ethernity Networks Ltd'] queries=["Retrieve the name of the current CEO of Ethernity Networks Ltd from the company's latest annual report or the most recent corporate announcement."]
### ASKING COMPANY EXPERT: Ethernity Networks Ltd
### ASKING EXECUTION MANAGER: 
Context: {'Ethernity Networks Ltd': AnswerEvent(chain_of_thought='From the provided CONTEXT, the Statutory and Other Information section lists the 

ExecutionRequest(chain_of_thought='The CONTEXT clearly indicates that the list of current directors of Ethernity Networks includes David Levi as the Chief Executive Officer. This information is considered reliable as it reflects the latest available details from that subsection, which typically provides current position details within such documents.', answer='David Levi')

In [38]:
tokens

{'completion_tokens': 269, 'prompt_tokens': 4658, 'total_tokens': 4927}

In [None]:
ask_question("Has 'Ethernity Networks Ltd' more liabilities than 'Accuray Incorporated' in 2022?", "boolean")[0] # Accuray: 419,660,000$, Ethernity: 12,257,291$ -> no is correct answer

In [None]:
ask_question('Did "Global Medical REIT Inc." have a greater Debt-to-Equity ratio than "Zegona Communications plc" in Q2 2021?', "boolean")[0]

In [None]:
ask_question("What are the total R&D expenses of 'Ethernity Networks Ltd' in 2021?", "number")[0]

## Obtain final results 

In [71]:
# 5 min runtime for 40 questions

results = []
token_usages = []

with open('data/questions.json', 'r') as json_file:
    items = json.load(json_file)
    for item in items:
        question = item["question"]
        response, token_usage = ask_question(question, item["schema"]) # TODO could introduce schema as Daniel with arg=item["schema"] and classes/pydantic
        print(response)
        results.append({
            "question": question,
            "schema": item["schema"],
            "answer": response.answer,
        })
        token_usages.append(token_usage)
        
    total_token_usages = get_total_token_usage(token_usages)

# with open('results/results_dev.json', 'w') as json_file:
#     json.dump(results, json_file, indent=4)

## Query: What was the Net Profit Margin of "Oesterreichische Kontrollbank" in June 30, 2023?
### ASKING DELEGATION MANAGER: 
Delegations: 
 chain_of_thought='The query asks about "Oesterreichische Kontrollbank," which matches exactly with the list of known companies. The request is for the \'Net Profit Margin\' on a specific date, June 30, 2023. To provide this information, the Net Profit Margin for that specific date needs to be calculated, which involves retrieving both \'Net Profit\' and \'Revenue\' for the period ending June 30, 2023. Since companies may primarily report full-year or quarterly results, if exact half-year data is unavailable, the closest available quarterly data should be used. The company’s financials section should contain the \'Net Profit\' and \'Revenue\' metrics necessary for this calculation. If only aggregate figures are reported, they should be adjusted proportionally to estimate the June 30 figure.' companies=['Oesterreichische Kontrollbank'] queries=["Ret

In [72]:
final_token_usage = {
    "prefill_prompt_tokens": 0,
    "prefill_completion_tokens": sum([v for v in build_token_usage.values()]),
    "answer_prompt_tokens": total_token_usages["prompt_tokens"],
    "answer_completion_tokens": token_usage["completion_tokens"],
}

In [73]:
final_token_usage

{'prefill_prompt_tokens': 0,
 'prefill_completion_tokens': 2508475,
 'answer_prompt_tokens': 248578,
 'answer_completion_tokens': 913}

In [75]:
total_tokens = sum([v for v in final_token_usage.values()])
cost = total_tokens / 1_000_000 * 5
cost

13.78983