In [2]:
from datasets import load_dataset
from more_itertools import windowed_complete
from tqdm import tqdm
import ast
import os


# Number of worker processes for Dataset.map: leave five cores free, but never
# go below one worker. os.cpu_count() can return None, and on machines with five
# or fewer cores the plain subtraction would yield zero or a negative number.
num_proc = max(1, (os.cpu_count() or 1) - 5)

# Load the "RashidHaddad/ECTHR-PCR" dataset with `datasets.load_dataset`; the
# result is indexed by split name further down (only 'train' is used).
ecthr_dataset = load_dataset("RashidHaddad/ECTHR-PCR")
def parse_strings_to_arrays(example):
    """Decode the string-encoded list columns of a single record.

    For each of the ``facts``, ``law`` and ``citations`` fields, a ``str`` value
    is parsed with ``ast.literal_eval``; a value of any other type is left as is.
    The record is updated in place and the same object is returned.
    """
    for column in ('facts', 'law', 'citations'):
        raw_value = example[column]
        if isinstance(raw_value, str):
            example[column] = ast.literal_eval(raw_value)
    return example

# Run the parser over the loaded dataset with `num_proc` worker processes, then
# keep only the 'train' split. The bare name on the last line displays the result.
ecthr_dataset = ecthr_dataset.map(parse_strings_to_arrays, num_proc=num_proc)['train']
ecthr_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['appno', 'date', 'citations', 'facts', 'law'],
    num_rows: 15729
})

In [3]:
ecthr_dataset[0]

{'appno': '214/56',
 'date': '1962-03-27T00:00:00',
 'citations': [],
 'facts': ['I',
  '1. Mr. Raymond De Becker, journalist and writer of Belgian nationality, presently living in Paris, was condemned to death by the Brussels Conseil de Guerre on 24th July 1946 on the ground that between 13th June 1940 and 5th October 1943 he had collaborated with the German authorities in Belgium in various ways and capacities, principally in the exercise of his functions as general editor of the Belgian daily newspaper Le Soir (Articles 66, 113, 117 and 118 bis of the Belgian Penal Code).',
  'De Becker was, in particular, found guilty of having "participated in the enemy ’ s transformation of legal institutions or organisations, of having undermined the loyalty of Belgian citizens to the King and the State in time of war" and of having "furthered the enemy ’ s policy and designs"; of having "deliberately directed, practised, incited, promoted and encouraged propaganda against resistance to the enem

In [4]:
# Keep only records whose 'citations' list has more than five entries, then take
# the first 7000 of them as the working set.
# NOTE(review): select(range(7000)) presumably fails if fewer than 7000 records
# pass the filter — confirm before reusing this on a different dataset.
ecthr_workshop = ecthr_dataset.filter(lambda x: len(x['citations']) > 5)
current_ecthr_workshop = ecthr_workshop.select(range(7000))
current_ecthr_workshop

Dataset({
    features: ['appno', 'date', 'citations', 'facts', 'law'],
    num_rows: 7000
})

In [5]:
import ipdb


def construct_prev_text_gold_text(record):
    """Pick the first two-sentence window of the law section that cites enough cases.

    The ``law`` sentences are scanned with a sliding window of two consecutive
    sentences. The first window whose space-joined text contains at least two of
    the record's ``citations`` (plain substring match) becomes the gold passage.

    Args:
        record: Mapping with ``facts`` and ``law`` (lists of strings) and
            ``citations`` (list of citation strings).

    Returns:
        A dict with three keys:
          * ``prev_text`` - the facts joined by spaces, followed by a space and
            the law sentences that precede the chosen window (facts only when no
            window qualifies);
          * ``gold_text`` - the chosen window joined by a space, or ``None`` when
            no window qualifies;
          * ``citations`` - the citations found in ``gold_text``, or the
            unchanged input list when no window qualifies.
    """
    window_size = 2
    min_citations = 2  # same as clerc

    citations = record['citations']
    law_sentences = record['law']
    facts_text = ' '.join(record['facts'])
    no_match = {
        "prev_text": facts_text,
        "gold_text": None,
        "citations": citations
    }

    # windowed_complete raises ValueError when the sequence is shorter than the
    # window; such a record simply has no candidate passage.
    if len(law_sentences) < window_size:
        return no_match

    # Consume the windows lazily so the scan stops at the first qualifying one
    # instead of joining the text of every window up front.
    for preceding, window, _ in windowed_complete(law_sentences, window_size):
        candidate_text = ' '.join(window)
        found_citations = [citation for citation in citations if citation in candidate_text]
        if len(found_citations) >= min_citations:
            return {
                "prev_text": ' '.join([facts_text, ' '.join(preceding)]),
                "gold_text": candidate_text,
                "citations": found_citations
            }
    return no_match
    
# Apply construct_prev_text_gold_text to every record. The function returns
# 'prev_text', 'gold_text' and 'citations'; records without a qualifying passage
# get gold_text=None and are filtered out in the following cell.
current_ecthr_workshop = current_ecthr_workshop.map(construct_prev_text_gold_text, num_proc=num_proc)

In [6]:
# Drop records for which no gold passage was found, keep only the four columns
# used from here on, and cap the working set at the first 6000 records.
current_ecthr_workshop = current_ecthr_workshop.filter(lambda x: x['gold_text'] is not None)
current_ecthr_workshop = current_ecthr_workshop.select_columns(['appno', 'prev_text', 'gold_text', 'citations'])
current_ecthr_workshop = current_ecthr_workshop.select(range(6000))
current_ecthr_workshop

Dataset({
    features: ['appno', 'prev_text', 'gold_text', 'citations'],
    num_rows: 6000
})

In [7]:
# Hold out 1000 records for the test split; the remaining records form the train
# split. The size is given as an absolute count rather than as a hand-tuned
# fraction of the 6000 records, and the fixed seed makes the shuffled split
# reproducible across kernel restarts.
split_current_ecthr_workshop = current_ecthr_workshop.train_test_split(test_size=1000, seed=42)
# Rename the context column to the name used in the published dataset.
split_current_ecthr_workshop = split_current_ecthr_workshop.rename_column('prev_text', 'previous_text')
split_current_ecthr_workshop

DatasetDict({
    train: Dataset({
        features: ['appno', 'previous_text', 'gold_text', 'citations'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['appno', 'previous_text', 'gold_text', 'citations'],
        num_rows: 1000
    })
})

In [8]:
split_current_ecthr_workshop

DatasetDict({
    train: Dataset({
        features: ['appno', 'previous_text', 'gold_text', 'citations'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['appno', 'previous_text', 'gold_text', 'citations'],
        num_rows: 1000
    })
})

In [9]:

# Repository name of the dataset under the "ylkhayat" namespace. This is a plain
# string: the previous f-prefix had no placeholders to interpolate.
echr_workshop_hf_name = "ECHR-generation-workshop"
# Upload the train/test splits to that repository, storing the files under "data".
split_current_ecthr_workshop.push_to_hub(f"ylkhayat/{echr_workshop_hf_name}", data_dir="data")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/528 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ylkhayat/ECHR-generation-workshop/commit/35f02732a99b1e447bac4b5ee763aa2d845ac88c', commit_message='Upload dataset', commit_description='', oid='35f02732a99b1e447bac4b5ee763aa2d845ac88c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ylkhayat/ECHR-generation-workshop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ylkhayat/ECHR-generation-workshop'), pr_revision=None, pr_num=None)

In [10]:
from pymongo import MongoClient
import re
import pandas as pd

# db connection setup
# Connection settings can be overridden through environment variables, so the
# notebook no longer has to be edited (or lines commented in and out) to switch
# host or credentials. The defaults reproduce the previous hard-coded connection.
#   ECHR_MONGO_HOST      "f27se1.in.tum.de" when working locally (default),
#                        "localhost" when running on the server itself
#   ECHR_MONGO_USER      database user
#   ECHR_MONGO_PASSWORD  database password
# NOTE(review): the default credentials are still stored in the notebook; drop
# the defaults if this account should not be shared together with the file.
from urllib.parse import quote_plus

MONGO_HOST = os.environ.get("ECHR_MONGO_HOST", "f27se1.in.tum.de")
MONGO_USER = os.environ.get("ECHR_MONGO_USER", "echr_read")
MONGO_PASSWORD = os.environ.get("ECHR_MONGO_PASSWORD", "echr_read")

# Percent-escape the credentials: characters such as '@', ':' or '/' in a user
# name or password would otherwise corrupt the connection URI.
URI = "mongodb://%s:%s@%s:27017/echr" % (quote_plus(MONGO_USER), quote_plus(MONGO_PASSWORD), MONGO_HOST)
client = MongoClient(URI)
database = client['echr']
# db setup
hejud = database["hejud"]

In [11]:
# Overview of all of the fields currently available. Some post-processing fields are optional, so not all documents have them.
['_id', 'originatingbody', 'ECHRRanking', 'appnoparts', 'representedby', 'sharepointid', 'typedescription', 'resolutionnumber', 'nonviolation', 'scl', 'organisations', 'documentcollectionid', 'judges', 'courts', 'conclusion', 'documentcollectionid2', 'meetingnumber', 'externalsources', 'doctypebranch', 'appno', 'respondent', 'application', 'importance', 'extractedappno', 'kpdateAsText', 'rulesofcourt', 'ecli', 'isplaceholder', 'Rank', 'violation', 'publishedby', 'judgementdate', 'dmdocnumber', 'sclappnos', 'separateopinion', 'doctype', 'languageisocode', 'introductiondate', 'reportdate', 'kpthesaurus', 'issue', 'applicability', 'languagenumber', 'docname', 'article', 'counter', 'kpdate', 'doctext_html', 'doctext_pdf', 'scl_array', 'doc_text', 'pdf', 'html', 'START', 'PROCEDURE', 'INTRODUCTION', 'PROCEDURE_AND_FACTS', 'FACTS', 'RELEVANT_LEGAL_FRAMEWORK', 'RELEVANT_DOMESTIC_LAW', 'LAW', 'PROCEEDINGS_BEFORE_THE_COMMISSION', 'FINAL_SUBMISSIONS_MADE_TO_THE_COURT_BY_THE_GOVERNMENT', 'FINAL_SUBMISSIONS_TO_THE_COURT', 'COURT_CONCLUSION', 'SEPARATE_OPINION', 'SUPPLEMENTARY_OBSERVATIONS', 'FULL_TEXT', 'sentences', 'FACTS_segmented', 'FACTS_segmented_no_headers', 'FACTS_segmented_new', 'FACTS_segmented_no_headers_new', 'PCR_FACTS', 'PCR_REMAINDER', 'PCR_CONCLUSION', 'PCR_REMAINDER_REMAINDER', 'SCL_EXTRACTIONS', 'articles_from_conclusion', 'articles_from_header_sentences', 'articles_merged', 'V_DOT_EXTRACTIONS', 'SCL_APPNO_DIRECT', 'APPNO_DIRECT', 'APPNOS_MERGE', 'APPNOS_MERGE2', 'split_votes']

['_id',
 'originatingbody',
 'ECHRRanking',
 'appnoparts',
 'representedby',
 'sharepointid',
 'typedescription',
 'resolutionnumber',
 'nonviolation',
 'scl',
 'organisations',
 'documentcollectionid',
 'judges',
 'courts',
 'conclusion',
 'documentcollectionid2',
 'meetingnumber',
 'externalsources',
 'doctypebranch',
 'appno',
 'respondent',
 'application',
 'importance',
 'extractedappno',
 'kpdateAsText',
 'rulesofcourt',
 'ecli',
 'isplaceholder',
 'Rank',
 'violation',
 'publishedby',
 'judgementdate',
 'dmdocnumber',
 'sclappnos',
 'separateopinion',
 'doctype',
 'languageisocode',
 'introductiondate',
 'reportdate',
 'kpthesaurus',
 'issue',
 'applicability',
 'languagenumber',
 'docname',
 'article',
 'counter',
 'kpdate',
 'doctext_html',
 'doctext_pdf',
 'scl_array',
 'doc_text',
 'pdf',
 'html',
 'START',
 'PROCEDURE',
 'INTRODUCTION',
 'PROCEDURE_AND_FACTS',
 'FACTS',
 'RELEVANT_LEGAL_FRAMEWORK',
 'RELEVANT_DOMESTIC_LAW',
 'LAW',
 'PROCEEDINGS_BEFORE_THE_COMMISSION',
 'FI

In [None]:
# Gather the citations and application numbers of both splits.
all_citations = split_current_ecthr_workshop['train']['citations'] + split_current_ecthr_workshop['test']['citations']
all_appnos = split_current_ecthr_workshop['train']['appno'] + split_current_ecthr_workshop['test']['appno']
# Flatten the per-record citation lists into one set of distinct application numbers.
all_set_citations = {item for sublist in all_citations for item in sublist}
all_set_appnos = set(all_appnos)

# Maps an application number to the full text of a document that carries it.
all_citations_dict_text = {}
# Only 'appno' and 'sentences' are read below, so request just those fields
# instead of transferring every field of each matching document.
cited_docs_cursor = hejud.find(
    {"appno": {"$in": list(all_set_citations)}},
    {"appno": 1, "sentences": 1},
)
for doc in tqdm(cited_docs_cursor):
    full_text = ' '.join(doc['sentences'])
    # Every application number listed on the document gets the same text. If
    # several documents share a number, the one returned last by the cursor wins.
    for app_no in doc['appno']:
        all_citations_dict_text[app_no] = full_text




0it [00:00, ?it/s]

3413it [00:03, 1013.77it/s]


In [14]:
def get_citation_text(record):
    """Pair every citation of a record with the full text of the cited document.

    Each entry of ``record['citations']`` is looked up in the module-level
    ``all_citations_dict_text`` mapping.

    Args:
        record: Mapping whose ``citations`` value is a list of citation keys.

    Returns:
        ``{"citations": [[citation, citation_text], ...]}`` in the input order.

    Raises:
        ValueError: if a citation is missing from the mapping or maps to an
            empty text.
    """
    new_citations = []
    for citation in record['citations']:
        # Use .get() rather than indexing: a missing key must reach the
        # descriptive ValueError below instead of surfacing as a bare KeyError.
        citation_text = all_citations_dict_text.get(citation)
        if not citation_text:
            raise ValueError(f"[!] oh no! {citation} not found!")
        new_citations.append([citation, citation_text])
    return {"citations": new_citations}
        
# Replace each record's 'citations' with the [citation, citation_text] pairs
# returned by get_citation_text.
# NOTE(review): afterwards 'citations' no longer holds plain keys, so this cell
# presumably cannot be re-run on its own output — confirm.
split_current_ecthr_workshop = split_current_ecthr_workshop.map(get_citation_text, num_proc=num_proc)

Map (num_proc=43):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=43):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
# Push to the same repository again so that the published dataset carries the
# citation texts attached by get_citation_text.
split_current_ecthr_workshop.push_to_hub(f"ylkhayat/{echr_workshop_hf_name}", data_dir="data")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/528 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ylkhayat/ECHR-generation-workshop/commit/1a6f34611f697db0faae17f21657821c5059696f', commit_message='Upload dataset', commit_description='', oid='1a6f34611f697db0faae17f21657821c5059696f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ylkhayat/ECHR-generation-workshop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ylkhayat/ECHR-generation-workshop'), pr_revision=None, pr_num=None)

In [13]:
import json


# Persist the citation texts as JSON Lines: one single-key {key: value} object
# per line, in the iteration order of all_citations_dict_text.
with open("all_citations_dict_text.jsonl", "w") as f:
    f.writelines(
        json.dumps({appno: text}) + '\n'
        for appno, text in all_citations_dict_text.items()
    )

In [None]:
# Getting an example document. NOTE (!!!): Because not all documents include all fields, might need to rerun this to find
# a useful document.
# doc = hejud.find_one() # one way, doesn't randomly sample

# Sample at random:
docs = list(hejud.aggregate([{'$sample': {'size': 25}}]))

# For each sampled document, report whether 'THE LAW' occurs in its
# 'PCR_REMAINDER_REMAINDER' field. .get() with an empty default prints False
# for documents that lack this optional field instead of aborting the loop
# with a KeyError.
for doc in docs:
    print('THE LAW' in doc.get('PCR_REMAINDER_REMAINDER', []))
# The cells below use the first sampled document as the running example.
doc = docs[0]

In [None]:
# Note: Earlier ones are original from the database. Fields post 'START' field have been added in postprocessing. Rashid's work begins
# at field 'sentences' and those are the most accurate.
print(doc.keys())

In [None]:
# From the original metadata, the most important fields when working with ECHR data are:
print(doc['_id'])
print(doc['appno'])
print(doc['docname'])

In [22]:
# The db is structured into docs which mirror the pages on the HUDOC website. A case has a unique application number,
# but can be linked to multiple such documents if there were revisions.

In [23]:
# I have parsed the html carefully and the resulting fields are the fairest compromise between granularity and
# error rate. Unfortunately, the documents had deceptively similar, but not perfectly consistent structure. I
# handled as many exceptions as possible.

# Overall, the lowest common denominator was to preserve sentence level splits, where a sentence is most commonly a paragraph
# from the case. A paragraph typically begins with a number, other than the document and section headers. Note that subparagraphs
# can include their own numbering systems.

In [None]:
# In the event that you would like to draw upon the original html, refer to:
print(doc['html'])

In [None]:
# Most likely, you would like to access one abstraction level higher: the sentence level plain text.
# This is preserved as a list of strings.

print(doc['sentences'])

In [None]:
# The advantage of the ECHR judgements is that they have distinct sections for facts, legal reasoning, etc.
# The breakpoints are somewhat regular (common headers), and I handled most exceptions. The following fields encode lists
# of strings for each section:
print([doc_key for doc_key in doc.keys() if "REMAINDER" in doc_key])
# Facts
print(doc['PCR_FACTS'])
# Law
print(doc['PCR_REMAINDER_REMAINDER'])
# Conclusion
print(doc['PCR_CONCLUSION'])

In [None]:
# ECHR judgements cite prior cases if they are relevant. The citation structure is only somewhat consistent.
# A lot of effort was invested to parse a citation graph across the documents. The precision and recall are 0.86 and 0.89 respectively.
# The various mining strategies and their precursors are stored in the following fields:

# Strasbourg case law citations (the scrape included these citations in a metadata page, which appear to be manually extracted from the doc).
# Some include appnos, which we stored in APPNO_DIRECT. Others required more sophisticated, error-prone matching.
# print(doc['SCL_EXTRACTIONS'])
# print(doc['SCL_APPNO_DIRECT'])

# Citations from the text, whenever v. occurred. Not all citations included a defendant, so v. was not a sufficient indicator of a citation.
# print(doc['V_DOT_EXTRACTIONS'])

# Citations where the appnos are mentioned. Also not consistently available. Occasionally erroneous when referring to a different case
# numbering system, like from a national court (mostly dealt with).
# print(doc['APPNO_DIRECT'])


# A merged set of these strategies. This is the final list of cases cited by a given case.
# print(doc['APPNOS_MERGE'])
print(doc['APPNOS_MERGE2'])
# print(doc['APPNO_DIRECT'])


In [None]:
# Judgements are made at the article level for each article that is believed to be violated.
# The list of such alleged article violations has been mined and stored at:
print(doc['articles_merged'])

# The set originates from two mining strategies applied to different sections. For the components, see:
# print(doc['articles_from_conclusion'])
# print(doc['articles_from_header_sentences'])

In [None]:
# Judgements are made by a vote of multiple judges for each article. The outcome of these votes has been mined and stored at:
print(doc['split_votes'])