In [None]:
# This introduces TUM Legal Tech's scrape of the HUDOC website, stored in a MongoDB on the chair's server. 
# This portion is restricted to the English language judgements.
# Website URL: https://hudoc.echr.coe.int/#{%22documentcollectionid2%22:[%22GRANDCHAMBER%22,%22CHAMBER%22]}

# written by Rashid Haddad, HiWi at TUM Legal Tech Chair

In [1]:
from pymongo import MongoClient
import re
import pandas as pd

# db connection setup
URI = "mongodb://%s:%s@f27se1.in.tum.de:27017/echr" % ("echr_read", "echr_read")# local
# URI = "mongodb://%s:%s@localhost:27017/echr" % ("echr_read", "echr_read") # server
client = MongoClient(URI)
database = client['echr']
# db setup
hejud = database["hejud"]

In [2]:
# Getting an example document. NOTE (!!!): Because not all documents include all fields, might need to rerun this to find
# a useful document.
# doc = hejud.find_one() # one way, doesn't randomly sample

# Sample at random:
docs = hejud.aggregate([{ '$sample': { 'size': 25 } }])
docs = list(docs)


for doc in docs:
    print('THE LAW' in doc['PCR_REMAINDER_REMAINDER'])
doc = docs[0]

False
True
True
True
True
True
False
True
True
False
True
True
False
True
True
True
True
True
True
True
True
False
True
True
True


In [3]:
# Note: Earlier ones are original from the database. Fields post 'START' field have been added in postprocessing. Rashid's work begins
# at field 'sentences' and those are the most accurate.
print(doc.keys())

dict_keys(['_id', 'originatingbody', 'ECHRRanking', 'appnoparts', 'representedby', 'sharepointid', 'typedescription', 'resolutionnumber', 'nonviolation', 'scl', 'organisations', 'documentcollectionid', 'judges', 'courts', 'conclusion', 'documentcollectionid2', 'meetingnumber', 'externalsources', 'doctypebranch', 'appno', 'respondent', 'application', 'importance', 'extractedappno', 'kpdateAsText', 'rulesofcourt', 'ecli', 'isplaceholder', 'Rank', 'violation', 'publishedby', 'judgementdate', 'dmdocnumber', 'sclappnos', 'separateopinion', 'doctype', 'languageisocode', 'introductiondate', 'reportdate', 'kpthesaurus', 'issue', 'applicability', 'languagenumber', 'docname', 'article', 'counter', 'kpdate', 'referencedate', 'doctext_html', 'doctext_pdf', 'scl_array', 'doc_text', 'pdf', 'html', 'START', 'PROCEDURE', 'INTRODUCTION', 'PROCEDURE_AND_FACTS', 'FACTS', 'RELEVANT_LEGAL_FRAMEWORK', 'RELEVANT_DOMESTIC_LAW', 'LAW', 'PROCEEDINGS_BEFORE_THE_COMMISSION', 'FINAL_SUBMISSIONS_MADE_TO_THE_COURT_B

In [4]:
# From the original metadata, the most important fields when working with ECHR data are:
print(doc['_id'])
print(doc['appno'])
print(doc['docname'])

001-57580
['7151/75', '7152/75']
CASE OF SPORRONG AND LÖNNROTH v. SWEDEN


In [22]:
# The db is structured into docs which mirror the pages on the HUDOC website. A case has a unique application number,
# but can be linked to multiple such documents if there were revisions.

In [23]:
# I have parsed the html carefully and the resulting fields are the fairest compromise between granularity and
# error rate. Unfortunately, the documents had deceptively similar, but not perfectly consistent structure. I
# handled as many exceptions as possible.

# Overall, the lowest common denominator was to preserve sentence level splits, where a sentence is most commonly a paragraph
# from the case. A paragraph typically begins with a number, other than the document and section headers. Note that subparagraphs
# can include their own numbering systems.

In [None]:
# In the event that you would like to draw upon the original html, refer to:
print(doc['html'])

In [None]:
# Most likely, you would like to access one abstraction level higher: the sentence level plain text.
# This is preserved as a list of strings.

print(doc['sentences'])

In [None]:
# The advantage of the ECHR judgements is that they have distinct sections for facts, legal reasoning, etc.
# The breakpoints are somewhat regular (common headers), and I handled most exceptions. The following fields encode lists
# of strings for each section:
print([doc_key for doc_key in doc.keys() if "REMAINDER" in doc_key])
# Facts
print(doc['PCR_FACTS'])
# Law
print(doc['PCR_REMAINDER_REMAINDER'])
# Conclusion
print(doc['PCR_CONCLUSION'])

In [None]:
# ECHR judgements cite prior cases if they are relevant. The citation structure is only somewhat consistent.
# A lot of effort was invested to parse a citation graph accross the documents. The precision and recall are 0.86 and 0.89 respectively.
# The various mining strategies and their precursors are stored in the following fields:

# Strasbourg case law citations (the scrape included these citations in a metadata page, which appear to be manually extracted from the doc.
# Some include appnos, which we stored in APPNO_DIRECT. Others required more sophisticated, error prone matching.
# print(doc['SCL_EXTRACTIONS'])
# print(doc['SCL_APPNO_DIRECT'])

# Citations from the text, whenever v. occured. Not all citations included a defendant, so v. was not a sufficient indicator of a citation.
# print(doc['V_DOT_EXTRACTIONS'])

# Citations where the appnos are mentioned. Also not consistently available. Occasionally erroneous when referring to a different case
# numbering system, like from a national court (mostly dealt with).
# print(doc['APPNO_DIRECT'])


# A merged set of these strategies. This is the final list of cases cited by a given case.
# print(doc['APPNOS_MERGE'])
print(doc['APPNOS_MERGE2'])
# print(doc['APPNO_DIRECT'])


In [None]:
# Judgements are made at the article level for each article that is believed to be violated.
# The list of such alleged article violations has been mined and stored at:
print(doc['articles_merged'])

# The set originates from to mining strategies of different sections. For the components, see:
# print(doc['articles_from_conclusion'])
# print(doc['articles_from_header_sentences'])

In [None]:
# Judgements are made by a vote of multiple judges for each article. The outcome of these votes has been mined and stored at:
print(doc['split_votes'])