In [1]:
import requests

# Download the 40 structured regulatory documents (1.json ... 40.json) from the
# ObliQA GitHub repository. Each successful response is parsed as JSON and
# appended to `data` in URL order.
urls = [f'https://raw.githubusercontent.com/RegNLP/ObliQADataset/refs/heads/main/StructuredRegulatoryDocuments/{i}.json' for i in range(1, 41)]

data = []

# A single Session is reused for all requests instead of opening a new one per URL.
with requests.Session() as session:
    for url in urls:
        try:
            # requests applies no timeout by default; bound each request so a
            # stalled connection cannot hang the cell forever.
            response = session.get(url, timeout=30)
        except requests.RequestException as error:
            # Report transport errors the same way as bad status codes and keep going.
            print(f"Failed to download {url}: {error}")
            continue
        if response.status_code == 200:
            data.append(response.json())
        else:
            print(f"Failed to download {url}")

# `data` holds one parsed document per successful download (40 when nothing failed)
print(f"Downloaded {len(data)} files.")

Downloaded 40 files.


In [2]:
import json
from datasets import Dataset

# Build, for every downloaded document, three views of its passages grouped by
# top-level section (the part of the PassageID before its first '.'):
#   'sections_dict_list' - list of per-section lists of "<PassageID> <Passage>" lines
#   'sections'           - each section's lines joined with newlines
#   'full_text'          - all lines of all sections joined with newlines
# Entries are keyed by the DocumentID of the document's first passage.
document_to_sections = {}
for doc_passages in data:
    grouped = {}
    for entry in doc_passages:
        pid = entry['PassageID']
        body = entry['Passage']
        top_level = pid.split('.')[0]
        # A passage with empty text contributes only its ID.
        if len(body) > 0:
            line = f"{pid} {body}"
        else:
            line = f"{pid}"
        grouped.setdefault(top_level, []).append(line)

    doc_id = doc_passages[0]['DocumentID']
    document_to_sections[doc_id] = {
        'id': doc_id,
        'sections_dict_list': list(grouped.values()),
        'sections': ['\n'.join(lines) for lines in grouped.values()],
        'full_text': '\n'.join(line for lines in grouped.values() for line in lines)
    }

# Persist the mapping as pretty-printed JSON next to the notebook.
with open('obliqa_documents.json', 'w') as out_file:
    serialized = json.dumps(document_to_sections, indent=4)
    out_file.write(serialized)

In [3]:
# Spot check: display the entry stored under DocumentID 1 in the mapping built above.
document_to_sections[1]

{'id': 1,
 'sections_dict_list': [['1. INTRODUCTION',
   '1.1 Jurisdiction',
   '1.1.1',
   '1.1.1.(1) The AML Rulebook is made in recognition of the application of the Federal AML Legislation in the Abu Dhabi Global Market ("ADGM").',
   '1.1.1.(2) Nothing in the AML Rulebook affects the operation of Federal AML Legislation.',
   '1.2 Application',
   '1.2.1',
   "1.2.1.(1) Subject to (2), the AML Rulebook applies to:\n(a)\tevery Relevant Person in respect of all its activities carried out in or from the ADGM; and\n(b)\tthe Persons specified in Rule \u200e1.3.3 as being responsible for a Relevant Person's compliance with the AML Rulebook.",
   '1.2.1.(2) In respect of a Relevant Person that is:\n(a)\tan Authorised Person, other than a Credit Rating Agency, and a Recognised Body, only the requirements of Chapters \u200e1 to \u200e14 of the AML Rulebook apply;\n(b)\ta Representative Office, only the requirements of Chapters \u200e1 to \u200e6 and \u200e11 to \u200e14 of the AML Rulebook

In [4]:
import json
from datasets import load_dataset

# Remote JSON file for each ObliQA question split, fetched directly from GitHub.
obliqa_split_urls = {
    'test': 'https://raw.githubusercontent.com/RegNLP/ObliQADataset/refs/heads/main/ObliQA_test.json',
    'train': 'https://raw.githubusercontent.com/RegNLP/ObliQADataset/refs/heads/main/ObliQA_train.json',
}

# Load both splits through the generic 'json' builder.
dataset = load_dataset('json', data_files=obliqa_split_urls)


In [5]:
# Pretty-print one raw test example (index 5) to show the record schema.
print(json.dumps(dataset['test'][5], indent=4))

{
    "QuestionID": "84a753ca-2a42-4909-bf85-e4d5654cc605",
    "Question": "What should an Authorized Person aim to do with a mixed remittance before it is credited to the Client Account?",
    "Passages": [
        {
            "DocumentID": 3,
            "Passage": "Whenever possible the Authorised Person should seek to split a mixed remittance before crediting the Client Account.",
            "PassageID": "14.4.6.Guidance.4."
        }
    ],
    "Group": 1
}


In [8]:
from datasets import DatasetDict

# Number of leading training questions to keep; the test split is kept whole.
TRAIN_SUBSET_SIZE = 1000

# Clamp to the split length so the selection cannot index past the end of a
# train split that has fewer rows than TRAIN_SUBSET_SIZE.
train_subset_size = min(TRAIN_SUBSET_SIZE, len(dataset['train']))

dataset = DatasetDict({
    'train': dataset['train'].select(range(train_subset_size)),
    'test': dataset['test']
})
dataset

DatasetDict({
    train: Dataset({
        features: ['QuestionID', 'Question', 'Passages', 'Group'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['QuestionID', 'Question', 'Passages', 'Group'],
        num_rows: 2786
    })
})

In [9]:
def process_record(record, documents=None):
    """Convert one ObliQA question record into the generation-task schema.

    Args:
        record: Mapping with 'QuestionID', 'Question' and 'Passages'. Each
            passage is a mapping that provides 'DocumentID' and 'Passage'.
        documents: Optional mapping from DocumentID to an entry providing
            'full_text' and 'sections'. When omitted, the notebook-level
            `document_to_sections` mapping is used.

    Returns:
        dict with keys:
            'docid': the QuestionID converted to a string.
            'previous_text': the question text.
            'gold_text': the passages' texts joined with newlines, in record order.
            'citations': one [document id as str, full document text] pair per
                distinct cited document, in first-appearance order.
            'oracle_documents_passages': one [document id as str, section text]
                pair for every section of every distinct cited document.

    Raises:
        KeyError: if a passage's DocumentID is not present in `documents`.
    """
    if documents is None:
        documents = document_to_sections

    passages = record['Passages']
    gold_text = '\n'.join(passage['Passage'] for passage in passages)

    # Several passages of one question can come from the same document. Keep
    # each document once (dict keys preserve first-seen order) so its full text
    # and sections are not repeated once per passage.
    document_ids = list(dict.fromkeys(passage['DocumentID'] for passage in passages))

    citations = [
        [str(document_id), documents[document_id]['full_text']]
        for document_id in document_ids
    ]
    oracle_documents_passages = [
        [str(document_id), section]
        for document_id in document_ids
        for section in documents[document_id]['sections']
    ]

    return {
        'docid': str(record['QuestionID']),
        'previous_text': record['Question'],
        'gold_text': gold_text,
        'citations': citations,
        'oracle_documents_passages': oracle_documents_passages,
    }

# Run process_record over the examples of each split using 40 worker processes.
# The returned fields are added as columns; `dataset` is rebound to the result.
dataset = dataset.map(process_record, num_proc=40)

Map (num_proc=40):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
# Keep only the fields produced by process_record; the original ObliQA columns are dropped.
output_columns = ['docid', 'previous_text', 'gold_text', 'citations', 'oracle_documents_passages']
dataset = dataset.select_columns(output_columns)
dataset

DatasetDict({
    train: Dataset({
        features: ['docid', 'previous_text', 'gold_text', 'citations', 'oracle_documents_passages'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['docid', 'previous_text', 'gold_text', 'citations', 'oracle_documents_passages'],
        num_rows: 2786
    })
})

In [11]:
# Publish the processed splits to the Hugging Face Hub dataset repository, passing data_dir='data'.
# NOTE(review): presumably requires an authenticated Hub session with write access to this repo - confirm.
dataset.push_to_hub('ylkhayat/OBLI_QA-generation-workshop', data_dir='data')

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/716 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ylkhayat/OBLI_QA-generation-workshop/commit/72808267e83c8811b714792267f58c26cd6178b5', commit_message='Upload dataset', commit_description='', oid='72808267e83c8811b714792267f58c26cd6178b5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ylkhayat/OBLI_QA-generation-workshop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ylkhayat/OBLI_QA-generation-workshop'), pr_revision=None, pr_num=None)

In [12]:
# Pretty-print the first processed training example to check the final schema.
print(json.dumps(dataset['train'][0], indent=4))

{
    "docid": "a10724b5-ad0e-4b69-8b5e-792aef214f86",
    "previous_text": "Under Rules 7.3.2 and 7.3.3, what are the two specific conditions related to the maturity of a financial instrument that would trigger a disclosure requirement?",
    "gold_text": "Events that trigger a disclosure. For the purposes of Rules 7.3.2 and 7.3.3, a Person is taken to hold Financial Instruments in or relating to a Reporting Entity, if the Person holds a Financial Instrument that on its maturity will confer on him:\n(1)\tan unconditional right to acquire the Financial Instrument; or\n(2)\tthe discretion as to his right to acquire the Financial Instrument.\n",
    "citations": [
        [
            "11",
        ]
    ],
    "oracle_documents_passages": [
        [
            "11",
            "1. INTRODUCTION\n1.1 Application\n1.1.1\n1.1.1.(1) The Rules in this Rulebook (\"MKT\") are made for the purposes of the Financial Services and Markets Regulations 2015 (\"FSMR\") and apply to every Person to

In [None]:
# - no answer
# - passages concatenated (gold text)
# - evaluate (correctness) -> generated / gold text
# - faithfulness (generated / top k)
# - retrieval over the documents not all