Nornamlisation

Import

In [1]:
import json
from pathlib import Path
from collections import defaultdict, Counter
import copy
import re


STEP 1 : Load data

In [6]:
file_path = Path("../data/tagged_citations.jsonl")
citations = []

with file_path.open("r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        citations.append(data)

example_citations = citations[:1]
example_citations


[{'ecli': 'ECLI:DE:OVGSH:2004:0407.2LB73.03.00.txt',
  'citation': 'BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3',
  'label': 'CITATION',
  'tags': [{'token': 'BVerwG', 'tag': 'B-Gericht'},
   {'token': ',', 'tag': 'O'},
   {'token': 'Urt', 'tag': 'O'},
   {'token': '.', 'tag': 'O'},
   {'token': 'v', 'tag': 'O'},
   {'token': '.', 'tag': 'O'},
   {'token': '26', 'tag': 'B-Datum'},
   {'token': '.', 'tag': 'I-Datum'},
   {'token': '06', 'tag': 'I-Datum'},
   {'token': '.', 'tag': 'I-Datum'},
   {'token': '1987', 'tag': 'I-Datum'},
   {'token': '-', 'tag': 'O'},
   {'token': '8', 'tag': 'B-Aktenzeichen'},
   {'token': 'C', 'tag': 'I-Aktenzeichen'},
   {'token': '21', 'tag': 'I-Aktenzeichen'},
   {'token': '/', 'tag': 'I-Aktenzeichen'},
   {'token': '86', 'tag': 'I-Aktenzeichen'},
   {'token': '-', 'tag': 'O'},
   {'token': ',', 'tag': 'O'},
   {'token': 'BVerwGE', 'tag': 'B-Zeitschrift'},
   {'token': '78', 'tag': 'B-Nummer'},
   {'token': ',', 'tag': 'O'},
   {'token': '3', 

In [7]:


def split_citation_entry(entry, idx):
    split_indices = [i for i, t in enumerate(entry["tags"]) if t["token"] == ";" and t["tag"] == "O"]
    if not split_indices:
        entry["id"] = f"cit{idx}"
        return [entry]

    split_indices.append(len(entry["tags"]))
    
    new_entries = []
    start = 0
    for i, end in enumerate(split_indices):
        tags_part = entry["tags"][start:end]
        text_part = " ".join(t["token"] for t in tags_part).strip()
        new_entry = copy.deepcopy(entry)
        new_entry["tags"] = tags_part
        new_entry["citation"] = text_part
        new_entry["id"] = f"cit{idx}.split{i}"
        new_entries.append(new_entry)

        start = end + 1

    return new_entries

citations_split = []
for idx, citation in enumerate(citations):  # citations_raw is the original list
    citations_split.extend(split_citation_entry(citation, idx))

citations = citations_split



In [8]:
for c in citations[:1]:
    print("Citation:", c["citation"])
    print("ID:", c["id"])
    print("Tokens and Tags:")
    for token in c["tags"]:
        print(f"  {token['token']:<15} {token['tag']}")
    print("-" * 60)


Citation: BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3
ID: cit0
Tokens and Tags:
  BVerwG          B-Gericht
  ,               O
  Urt             O
  .               O
  v               O
  .               O
  26              B-Datum
  .               I-Datum
  06              I-Datum
  .               I-Datum
  1987            I-Datum
  -               O
  8               B-Aktenzeichen
  C               I-Aktenzeichen
  21              I-Aktenzeichen
  /               I-Aktenzeichen
  86              I-Aktenzeichen
  -               O
  ,               O
  BVerwGE         B-Zeitschrift
  78              B-Nummer
  ,               O
  3               B-Seite-Beginn
------------------------------------------------------------


Step 2 : Extract and normalize fields

In [5]:
def normalize_date(tokens):
    digits = [t for t in tokens if t.isdigit()]
    if len(digits) == 3:
        day, month, year = digits
        return f"{day}-{month.zfill(2)}-{year.zfill(2)}"
    return " ".join(tokens)


def normalize_case_number(tokens):
    return " ".join(tokens)

def normalize_field(field, tokens):
    if field == "Datum":
        return normalize_date(tokens)
    elif field == "Aktenzeichen":
        return normalize_case_number(tokens)
    else:
        return " ".join(tokens)

def extract_normalized_fields(tagged_tokens):
    fields = defaultdict(list)
    current_label = None
    i = 0

    while i < len(tagged_tokens):
        token = tagged_tokens[i]["token"]
        tag = tagged_tokens[i]["tag"]

        if (
            token.lower() == "s"
            and i + 2 < len(tagged_tokens)
            and tagged_tokens[i + 1]["token"] == "."
            and tagged_tokens[i + 2]["tag"] == "B-Seite-Beginn"
        ):
            page_token = tagged_tokens[i + 2]["token"]
            fields["Seite-Beginn"].append(["Seite", page_token])
            i += 3
            continue

        if tag.startswith("B-"):
            current_label = tag[2:]
            fields[current_label].append([token])
        elif tag.startswith("I-") and current_label:
            fields[current_label][-1].append(token)
        else:
            current_label = None
        i += 1

    normalized_fields = {}
    for label, chunks in fields.items():
        normalized_fields[label] = [normalize_field(label, chunk) for chunk in chunks]

    return normalized_fields


for citation in citations:
    citation["normalized_fields"] = extract_normalized_fields(citation["tags"])
    
for example in citations[:10]:
    print("Original:", example["citation"])
    print("Normalized Fields:", example["normalized_fields"])
    print("ID:", example["id"])
    print("-" * 80)



Step 3 : Expand Abbreviation

In [22]:

abbreviation_dict = {
    "BGH": "Bundesgerichtshof",
    "BVerwG": "Bundesverwaltungsgericht",
    "OVG": "Oberverwaltungsgericht",
    "BVerfG": "Bundesverfassungsgericht",
    "Urt.": "Urteil",
    "v.": "vom",
    "Az.": "Aktenzeichen",
    "NJW": "Neue Juristische Wochenschrift",
    "BVerwGE": "Entscheidungen des Bundesverwaltungsgerichts",
    "GVOBl": "Gesetz- und Verordnungsblatt",
    "FEVS": "Entscheidungen der Verwaltungsgerichte in Schleswig-Holstein",
    "NVwZ": "Neue Zeitschrift für Verwaltungsrecht",
    "LPK": "Loseblattsammlung Praxis-Kommentar"
}

def expand_abbreviations(normalized_fields):
    expanded = {}
    for field, values in normalized_fields.items():
        expanded_values = []
        for val in values:
            words = val.split()
            expanded_words = [
                abbreviation_dict.get(w.strip("."), w) for w in words
            ]
            expanded_values.append(" ".join(expanded_words))
        expanded[field] = expanded_values
    return expanded

for citation in citations:
    citation["expanded_fields"] = expand_abbreviations(citation["normalized_fields"])

for example in citations[:10]:
    print("Original:", example["citation"])
    print("Expanded Fields:", example["expanded_fields"])
    print("ID:", example["id"])
    print("-" * 80)


Original: BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3
Expanded Fields: {'Gericht': ['Bundesverwaltungsgericht'], 'Datum': ['26-06-1987'], 'Aktenzeichen': ['8 C 21 / 86'], 'Zeitschrift': ['Entscheidungen des Bundesverwaltungsgerichts'], 'Nummer': ['78'], 'Seite-Beginn': ['3']}
ID: cit0
--------------------------------------------------------------------------------
Original: Urt. d. Senats v. 26.01.2000 - 2 L 236/98 -
Expanded Fields: {'Datum': ['26-01-2000'], 'Aktenzeichen': ['2 L 236 / 98']}
ID: cit1
--------------------------------------------------------------------------------
Original: BVerwG, Urt. v. 27.10.1982 - 3 C 6.82 -, BVerwGE 66, 218
Expanded Fields: {'Gericht': ['Bundesverwaltungsgericht'], 'Datum': ['27-10-1982'], 'Aktenzeichen': ['3 C 6 . 82'], 'Zeitschrift': ['Entscheidungen des Bundesverwaltungsgerichts'], 'Nummer': ['66'], 'Seite-Beginn': ['218']}
ID: cit2
--------------------------------------------------------------------------------
Original: hierzu BV

Step 3b : Cleaning spaces

In [23]:
def clean_spacing(text):
    return re.sub(r"\s*([/.])\s*", r"\1", text)

def clean_all_expanded_fields(expanded_fields):
    cleaned = {}
    for field, values in expanded_fields.items():
        cleaned[field] = [clean_spacing(val) for val in values]
    return cleaned

for citation in citations:
    citation["expanded_fields"] = clean_all_expanded_fields(citation["expanded_fields"])

for example in citations[:10]:
    print("Original:", example["citation"])
    print("Expanded Fields:", example["expanded_fields"])
    print("ID:", example["id"])
    print("-" * 80)



Original: BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3
Expanded Fields: {'Gericht': ['Bundesverwaltungsgericht'], 'Datum': ['26-06-1987'], 'Aktenzeichen': ['8 C 21/86'], 'Zeitschrift': ['Entscheidungen des Bundesverwaltungsgerichts'], 'Nummer': ['78'], 'Seite-Beginn': ['3']}
ID: cit0
--------------------------------------------------------------------------------
Original: Urt. d. Senats v. 26.01.2000 - 2 L 236/98 -
Expanded Fields: {'Datum': ['26-01-2000'], 'Aktenzeichen': ['2 L 236/98']}
ID: cit1
--------------------------------------------------------------------------------
Original: BVerwG, Urt. v. 27.10.1982 - 3 C 6.82 -, BVerwGE 66, 218
Expanded Fields: {'Gericht': ['Bundesverwaltungsgericht'], 'Datum': ['27-10-1982'], 'Aktenzeichen': ['3 C 6.82'], 'Zeitschrift': ['Entscheidungen des Bundesverwaltungsgerichts'], 'Nummer': ['66'], 'Seite-Beginn': ['218']}
ID: cit2
--------------------------------------------------------------------------------
Original: hierzu BVerwG, 

Step 4 : Reconstruction

In [24]:
def format_date_german(iso_date):
    try:
        year, month, day = iso_date.split("-")
        return f"{day}.{month}.{year}"
    except Exception:
        return iso_date

def reconstruct_citation(citation):
    expanded_fields = citation["expanded_fields"]

    def get(field):
        return expanded_fields.get(field, [])

    def first(field):
        return get(field)[0] if get(field) else None

    def join_list(field):
        joined = " / ".join(get(field)) if get(field) else None
        return re.sub(r'\s*/\s*', '/', joined) if joined else None  # clean slashes

    court = first("Gericht")
    date = first("Datum")
    case = first("Aktenzeichen")
    journal = first("Zeitschrift")
    volume = first("Nummer")
    page = first("Seite-Beginn") or first("Seite-Fundstelle")
    law = first("Gesetz")
    paragraph = " ".join(get("Paragraph")).replace("§", "").strip()
    randnummer = " ".join(get("Randnummer"))
    title = " ".join(get("Titel"))
    authors = join_list("Autor")
    edition = " ".join(get("Auflage")).replace("Aufl", "").strip()
    year = first("Jahr")

    parts = []

    if court or case or journal:
        if court:
            parts.append(court)
        if date:
            parts.append(f"Urteil vom {format_date_german(date)}")
        if case:
            parts.append(f"- {case} -")
        if journal and volume and page:
            parts.append(f"{journal} {volume}, {page}")
        elif journal and page:
            parts.append(f"{journal}, {page}")
        elif journal:
            parts.append(journal)
        if year and not volume:
            parts.append(year)

    if authors or law or paragraph or randnummer or title or edition:
        subparts = []
        if authors:
            subparts.append(authors)
        if title:
            subparts.append(title)
        if edition:
            subparts.append(f"{edition} Aufl.")
        if law:
            subparts.append(law)
        if paragraph:
            subparts.append(f"§ {paragraph}")
        if randnummer:
            subparts.append(f"Rdnr. {randnummer}")
        if year and not journal:
            subparts.append(year)
        if subparts:
            parts.append(", ".join(subparts))

    inline_text = " ".join([t["token"] for t in citation.get("tags", [])])
    if re.search(r"\ba\.a\.O\.\b", inline_text, flags=re.IGNORECASE):
        parts.append("a.a.O.")
    elif re.search(r"\bebd\.\b", inline_text, flags=re.IGNORECASE):
        parts.append("ebd.")



    return ", ".join(parts).strip()



for citation in citations:
    citation["normalized_text"] = reconstruct_citation(citation)

for example in citations[:10]:
    print("Original:", example["citation"])
    print("Normalized:", example["normalized_text"])
    print("ID:", example["id"])
    print("-" * 80)



Original: BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3
Normalized: Bundesverwaltungsgericht, Urteil vom 1987.06.26, - 8 C 21/86 -, Entscheidungen des Bundesverwaltungsgerichts 78, 3
ID: cit0
--------------------------------------------------------------------------------
Original: Urt. d. Senats v. 26.01.2000 - 2 L 236/98 -
Normalized: Urteil vom 2000.01.26, - 2 L 236/98 -
ID: cit1
--------------------------------------------------------------------------------
Original: BVerwG, Urt. v. 27.10.1982 - 3 C 6.82 -, BVerwGE 66, 218
Normalized: Bundesverwaltungsgericht, Urteil vom 1982.10.27, - 3 C 6.82 -, Entscheidungen des Bundesverwaltungsgerichts 66, 218
ID: cit2
--------------------------------------------------------------------------------
Original: hierzu BVerwG, Urt. v. 26.06.1987, a.a.O.
Normalized: Bundesverwaltungsgericht, Urteil vom 1987.06.26
ID: cit3
--------------------------------------------------------------------------------
Original: GVOBl. S. 239
Normalized: 

Step 4b : Checking reconstruction

In [3]:
unhandled_citations = [c for c in citations if not c.get("normalized_text")]

for c in unhandled_citations[:5]:
    print("Original:", c["citation"])
    print("ECLI:", c.get("ecli", "—"))
    print("Tokens:", " ".join([t["token"] for t in c.get("tags", [])]))
    print("ID:", c["id"])
    print("-" * 80)

print(f"\nTotal citations with empty normalization: {len(unhandled_citations)}")

output_path = Path("unhandled_citations.jsonl")
with output_path.open("w", encoding="utf-8") as f:
    for c in unhandled_citations:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")


NameError: name 'citations' is not defined

In [26]:
tag_counter = Counter()
for citation in unhandled_citations:
    tags = [t["tag"] for t in citation.get("tags", []) if t["tag"] != "O"]
    tag_counter.update(tags)

tag_counter.most_common()

[('I-Datum', 3330),
 ('B-Datum', 974),
 ('B-Seite-Beginn', 970),
 ('I-Titel', 423),
 ('I-Aktenzeichen', 375),
 ('B-Jahr', 303),
 ('B-Seite-Fundstelle', 245),
 ('B-Nummer', 121),
 ('I-Gericht', 93),
 ('I-Randnummer', 81),
 ('B-Wort:aaO', 43),
 ('I-Zeitschrift', 21),
 ('I-Paragraph', 17),
 ('I-Gesetz', 11),
 ('I-Nummer', 8),
 ('B-Wort:Auflage', 2),
 ('B-Editor', 2)]

These citations are not completely empty — most have 1 or 2 low-signal tags, but no core anchor field like:
B-Gericht
B-Datum
B-Aktenzeichen
B-Autor, B-Titel
Many likely refer to continuations or secondary references, e.g.:
"= NVwZ-RR 1999, 66" — a page in another journal
"vgl. Rdnr. 29" — points to a number in an earlier citation

Step 4c : labeling

Labeling as partial (potential reference), maybe later with the ecli code we can discover what they are refering to
A citation is "partial" if:
It lacks key identifying fields (Gericht, Aktenzeichen, Zeitschrift, etc.)
OR it contains inline reference markers (a.a.O., ebd.)
EVEN IF a minimal normalization was produced (e.g., just Datum)

In [27]:
for citation in citations:
    expanded = citation.get("expanded_fields", {})
    inline_text = " ".join([t["token"] for t in citation.get("tags", [])])
    has_inline_ref = re.search(r"\ba\.a\.O\.\b|\bebd\.\b", inline_text, flags=re.IGNORECASE)

    has_core_info = any(field in expanded for field in [
        "Gericht", "Aktenzeichen", "Zeitschrift", "Autor", "Titel"
    ])

    if not expanded:
        citation["citation_type"] = "non-legal"
    elif not has_core_info or has_inline_ref:
        citation["citation_type"] = "partial"
    else:
        citation["citation_type"] = "full"


In [97]:
for c in citations[:1]:
    print("Original:", c["citation"])
    print("Normalized:", c.get("normalized_text", ""))
    print("Citation Type:", c["citation_type"])
    print("ID:", c["id"])
    print("-" * 80)

Original: BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3
Normalized: Bundesverwaltungsgericht, Urteil vom 1987.06.26, - 8 C 21/86 -, Entscheidungen des Bundesverwaltungsgerichts 78, 3
Citation Type: full
ID: cit0
--------------------------------------------------------------------------------


In [None]:
for c in citations:
    if c.get("citation_type") == "non-legal":
        print("-" * 80)
        print("Original:", c["citation"])
        print("Normalized:", c.get("normalized_citation", ""))
        print("Citation Type:", c.get("citation_type"))
        print("ID:", c["id"])


Check 1 – Non-legal citations with strong legal fields

In [85]:
suspicious_legal_fields = ["Gericht", "Aktenzeichen", "Zeitschrift", "Paragraph", "Titel", "Gesetz"]
for cit in citations:
    if cit.get("citation_type") == "non-legal":
        fields = cit.get("expanded_fields", {})
        if any(field in fields for field in suspicious_legal_fields):
            print("-" * 80)
            print("❗️Possible misclassified legal citation")
            print("Original:", cit["citation"])
            print("Normalized Fields:", fields)
            print("ID:", cit["id"])




Check 2 – Non-legal citations with legal-looking patterns

In [86]:
legal_patterns = [
    r"\bAz\b", r"\bAz\.", r"\bBGBl\b", r"\bBVerwG\b", r"\b§\s*\d+", r"\bUrteil\b", r"\bECLI:",
    r"\d+\s*[A-Z]+\s*\d+/\d+",  # Aktenzeichen patroon
]

for cit in citations:
    if cit.get("citation_type") == "non-legal":
        text = cit.get("citation", "")
        if any(re.search(pat, text) for pat in legal_patterns):
            print("-" * 80)
            print("❗️Legal-looking pattern in non-legal citation")
            print("Original:", cit["citation"])
            print("ID:", cit["id"])


Relabel Non-legal citations with legal-looking patterns

In [28]:
fallback_patterns = [
    r"\bAz\b", r"\bAz\.", r"\bBGBl\b", r"\bBVerwG\b", r"\bUrteil\b", r"\bBeschluss\b", 
    r"\bSenat\b", r"\b§\s*\d+", r"\bECLI:", r"\ba\.a\.O\.", r"\bm\.w\.N\.", r"\bjuris\b", 
    r"\bständige\s+Rechtsprechung\b", r"\b\d+\s*[A-Z]+\s*\d+/\d+", r"\bEntscheidung\b"
]

for citation in citations:
    if citation.get("citation_type") == "non-legal":
        if any(re.search(p, citation.get("citation", ""), flags=re.IGNORECASE) for p in fallback_patterns):
            citation["citation_type"] = "partial"


# Save as JSONL

In [29]:
import json
from pathlib import Path

output_path = Path("../data/normalized_citations.jsonl")

with output_path.open("w", encoding="utf-8") as f:
    for citation in citations:
        json_line = json.dumps(citation, ensure_ascii=False)
        f.write(json_line + "\n")
