In [27]:
import xml.etree.ElementTree as ET
import re

def find_elements_by_context_ref(xml_file, context_id):
    """Collect compact renderings of the us-gaap facts bound to one XBRL context.

    Every element whose ``contextRef`` attribute equals ``context_id`` and whose
    tag contains ``"us-gaap"`` is serialised and then shrunk to save tokens:
    its text is cut to 100 characters, every attribute (including namespace
    declarations) is removed, the closing tag is shortened to ``</>`` and runs
    of whitespace are collapsed to a single space. Elements whose serialised
    form contains ``"TextBlock"`` or ``"style="`` are skipped.

    Args:
        xml_file: Path of the XML file to parse.
        context_id: Value of the ``contextRef`` attribute to match.

    Returns:
        The compact renderings joined with newlines. An empty string is
        returned when nothing matches, or when the file does not exist (an
        error message is printed in that case).

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    try:
        root = ET.parse(xml_file).getroot()
    except FileNotFoundError:
        print(f"Error: XML file not found: {xml_file}")
        return ""

    matching_elements = []
    for element in root.iter():
        if element.get("contextRef") != context_id or "us-gaap" not in element.tag:
            continue

        # Truncate content. The tree is private to this call, so mutating the
        # element in place has no effect outside the function.
        element.text = element.text[:100] if element.text else ""

        # The serialiser emits the element's own namespace under the generated
        # prefix "ns0"; rename it to the conventional "us-gaap" prefix.
        ele = ET.tostring(element, encoding="unicode").replace("ns0", "us-gaap")
        if "TextBlock" in ele or "style=" in ele:
            continue

        ele = re.sub(r"</.*?>", "</>", ele)  # Remove closing tag text (to reduce token count)
        # Remove every attribute, including namespace declarations. The name
        # pattern accepts prefixed names ("xmlns:us-gaap", "xsi:nil") so no
        # "xmlns:us-" / "xsi:" residue is left behind, whatever the taxonomy
        # year of the namespace URI is.
        ele = re.sub(r"[\w:.\-]+=\".*?\"", "", ele)
        ele = re.sub(r"\s+", " ", ele)  # Remove consecutive spaces

        matching_elements.append(ele)

    return "\n".join(matching_elements)


def add_xml(qa_string, limit=1000000, doc_root="DowJones30/"):
    """Expand a ``<doc_path,id:context_id>`` placeholder into XBRL content.

    The first ``<...>`` group that contains ``,id:`` is treated as the
    placeholder. The part before ``,id:`` is the document path (relative to
    ``doc_root``) and the part after it is the context id. The matching facts
    are fetched with ``find_elements_by_context_ref`` and substituted for the
    placeholder, followed by a blank line.

    Args:
        qa_string: Prompt text that may contain a placeholder.
        limit: Maximum number of characters of XML content to insert.
        doc_root: Prefix prepended verbatim to the document path.

    Returns:
        ``qa_string`` with every occurrence of the placeholder replaced, or
        ``qa_string`` unchanged when it contains no placeholder.
    """
    # Require "<", ",id:" and ">" to belong to the same bracket pair, so that a
    # stray "<" or ">" elsewhere in the text (e.g. "x > 1") cannot be mistaken
    # for the placeholder boundaries.
    match = re.search(r"<([^<>]*,id:[^<>]*)>", qa_string)
    if match is None:
        return qa_string

    placeholder = match.group(1)
    parts = placeholder.split(",id:")
    doc_path = doc_root + parts[0]
    context_id = parts[1]

    # Get the XML content using the custom grep function
    xml_content = find_elements_by_context_ref(doc_path, context_id)[:limit]

    # Replace the placeholder with the XML content
    return qa_string.replace(f"<{placeholder}>", xml_content + "\n\n")


In [37]:
import json
from typing import List, Dict
from tqdm import tqdm
import re
import random
import os.path

def get_xbrl_dataset(data: List[Dict], example_q=None, example_a=None, max_context_chars: int = 24000):
    """Build de-duplicated prompt/target pairs for XBRL question answering.

    For each entry a prompt is assembled from a fixed instruction text, the
    XBRL facts of the entry's first context id (expanded by ``add_xml``), an
    optional example question/answer pair and the entry's question with any
    parenthesised fragments removed.

    Entries are skipped when:
      * an entry with the same ``(doc_path, answer, first context id)`` was
        already kept,
      * the entry has no context id,
      * the document does not exist under ``DowJones30/``,
      * the expanded prompt is longer than ``max_context_chars`` characters.

    Args:
        data: Entries with the keys ``query``, ``doc_path``, ``contextID``,
            ``answer`` and ``raw_answer``.
        example_q: Optional example question added to every prompt.
        example_a: Optional example answer; the example is only added when
            both ``example_q`` and ``example_a`` are given.
        max_context_chars: Maximum allowed length of the expanded prompt.

    Returns:
        A list of ``{"context": prompt, "target": str(raw_answer)}`` dicts in
        the order the entries were first seen.
    """
    # The example block is the same for every entry, so build it once.
    example_qa = ""
    if example_q is not None and example_a is not None:
        example_qa = f"\nExample question: {example_q}\nExample answer: {example_a}"

    results = {}
    for entry in tqdm(data):
        context_ids = entry["contextID"]
        if not context_ids:
            # Without a context id there is nothing to look up in the document.
            continue

        doc_path = entry["doc_path"]
        key = (doc_path, entry["answer"], context_ids[0])
        if key in results:
            continue

        if not os.path.isfile('DowJones30/' + doc_path):
            continue

        question = re.sub(r"\(.*?\)", "", entry["query"])

        # NOTE(review): the literal below opens with four quote characters, so
        # the prompt text itself starts with a stray '"'. It is kept unchanged
        # here because it is part of the generated prompt format.
        context = \
        f""""You are a knowledgeable XBRL assistant that can answer questions based on XML data. 
             You will be provided with a context extracted from an XBRL file and a question related to it. The example question can help you to learn the format of the answer.
             Your task is to analyze the XBRL context and provide an accurate and very concise answer to the question, DO NOT output xml, code, explanation or create new question.
            \nXBRL file:\n ```xml\n <{doc_path},id:{context_ids[0]}> ```\n
            {example_qa}
            \nQuestion: {question}
            \nAnswer:"""

        context_xml = add_xml(context)
        if len(context_xml) > max_context_chars:
            continue

        results[key] = {"context": context_xml, "target": str(entry["raw_answer"])}

    print("final length", len(results))
    return list(results.values())

def save_entries_for_finetuning(dataset, cat, test_size=100):
    """Write a dataset to ``xbrl_{cat}_train.jsonl`` and ``xbrl_{cat}_test.jsonl``.

    The first ``test_size`` examples become the test split and the remainder
    the training split. Each example is written as one JSON document per line.
    Both files are created (or overwritten) in the current working directory,
    even when a split is empty.

    Args:
        dataset: Sequence of JSON-serialisable examples.
        cat: Category name used in the output file names.
        test_size: Number of leading examples reserved for the test split.
    """
    splits = {
        f"xbrl_{cat}_train.jsonl": dataset[test_size:],
        f"xbrl_{cat}_test.jsonl": dataset[:test_size],
    }
    for path, examples in splits.items():
        with open(path, "w") as handle:
            handle.writelines(json.dumps(example) + "\n" for example in examples)
        
def gen_xbrl(cat, example_q, example_a, max_candidates=2500, max_examples=1500, seed=None):
    """Generate the train/test JSONL files for one question category.

    Loads ``data/XBRL.json``, keeps the entries whose ``category1`` or
    ``category2`` equals ``cat``, shuffles them, turns the first
    ``max_candidates`` entries into prompt/target pairs with
    ``get_xbrl_dataset`` and saves at most ``max_examples`` of them with
    ``save_entries_for_finetuning``.

    Args:
        cat: Category to select; also used in the output file names.
        example_q: Example question embedded in every prompt.
        example_a: Example answer embedded in every prompt.
        max_candidates: Number of shuffled entries considered.
        max_examples: Upper bound on the number of saved examples.
        seed: When given, the shuffle uses its own ``random.Random(seed)`` so
            the selection is reproducible. When ``None`` the module-level
            ``random`` state is used.
    """
    # Only the load needs the file handle; release it before the slow part.
    with open("data/XBRL.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    filtered_data = [entry for entry in data if entry['category1'] == cat or entry['category2'] == cat]
    print(f"Total data size for this {cat}: {len(filtered_data)}")

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(filtered_data)

    dataset = get_xbrl_dataset(filtered_data[:max_candidates], example_q, example_a)
    save_entries_for_finetuning(dataset[:max_examples], cat)
        

In [36]:
# Build the train/test files for the "xbrl_tags" category.
gen_xbrl(
    "xbrl_tags",
    example_q="What is the US GAAP XBRL tag for Cash and Cash Equivalents as reported by Example Company Inc for the Fiscal Year ending in FY 2022",
    example_a="us-gaap:AnExampleTagName",
)

Total data size for this xbrl_tags: 2730


100%|███████████████████████████████████████| 2730/2730 [00:30<00:00, 88.33it/s]

final length 546





In [38]:
# Build the train/test files for the "value" category. The example question is
# copied verbatim into every generated prompt, so its spelling matters.
gen_xbrl("value", 
         example_q = "What is the value of Example company's income for the Fiscal year ending in FY 2020?", 
         example_a = "2540000000")

Total data size for this value: 12600


100%|███████████████████████████████████████| 2500/2500 [01:47<00:00, 23.32it/s]


final length 1574


In [21]:
# Build the train/test files for the "formula_formatted_with_tags" category.
gen_xbrl(
    "formula_formatted_with_tags",
    example_q="What is the formula for the Gross Profit Margin of Example Inc, formatted with the relevant US GAAP XBRL tags, for the fiscal year ending in FY 2020?",
    example_a="(us-gaap:SomeExampleTag / us-gaap:OtherExampleTag) * 100",
)

Total data size for this formula_formatted_with_tags: 835


100%|█████████████████████████████████████████| 835/835 [00:10<00:00, 82.51it/s]

final length 166





In [4]:
# Build the train/test files for the "formula_calculation" category.
gen_xbrl(
    "formula_calculation",
    example_q="How much was Example Inc's Quick Ratio for the Fiscal Year concluding in FY 2010?",
    example_a="0.05",
)

Total data size for this formula_calculation: 4195


  2%|█████▋                                                                                                                                                                                                                                                                  | 82/3795 [00:04<03:03, 20.19it/s]


KeyboardInterrupt: 

In [39]:
# Rebuild the combined training file xbrl_train.jsonl from the per-category
# training files in the working directory.
# `*tags_train*` is listed three times, so every file matching that glob is
# concatenated three times while files matching `*value_train*` are included
# once (presumably to up-weight the tag examples -- confirm intent). The glob
# matches any file name containing "tags_train", e.g. both
# xbrl_xbrl_tags_train.jsonl and xbrl_formula_formatted_with_tags_train.jsonl.
# NOTE(review): `rm` prints an error when xbrl_train.jsonl does not exist yet;
# the `>` redirect below creates or truncates the file regardless.
!rm xbrl_train.jsonl
!cat *value_train* *tags_train* *tags_train* *tags_train*> xbrl_train.jsonl

