In [1]:
import json
from tqdm import tqdm

# Load the data: the file is JSON Lines, i.e. one JSON object per line.
# Whitespace-only lines (such as a stray empty line at the end of the file)
# are skipped, because json.loads() raises JSONDecodeError on empty input.
with open("../data/retagged_all_citations.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Group consecutive tokens into chunks based on their BIO tags (B-XXX / I-XXX)
def group_chunks_by_bio(tags):
    """Split a tagged token sequence into chunks following the BIO scheme.

    Args:
        tags: iterable of dicts; each dict must provide a "tag" entry
            holding the BIO tag string (for example "B-Datum", "I-Datum", "O").

    Returns:
        A list of chunks, in input order. Each chunk is a list of the
        original token dicts: one "B-" token followed by the directly
        adjacent "I-" tokens that carry the same label.

    Notes:
        - Every "B-" tag opens a new chunk, closing any chunk still open.
        - An "I-" tag extends the open chunk only if its label equals the
          label of that chunk.
        - Any other token (a non-B/I tag, an "I-" tag with a different label,
          or an "I-" tag with no open chunk) closes the open chunk and is
          itself not included in any chunk.
    """
    finished = []
    open_chunk = []
    open_label = None

    for item in tags:
        tag = item["tag"]
        opens_chunk = tag.startswith("B-")

        # Continuation of the chunk that is currently being built.
        if not opens_chunk and tag.startswith("I-") and open_label == tag[2:]:
            open_chunk.append(item)
            continue

        # Every other token terminates whatever chunk is still open.
        if open_chunk:
            finished.append(open_chunk)

        if opens_chunk:
            open_chunk, open_label = [item], tag[2:]
        else:
            open_chunk, open_label = [], None

    # Flush the trailing chunk, if the sequence ended while one was open.
    if open_chunk:
        finished.append(open_chunk)

    return finished

# Build a human-readable summary of the BIO groups for inspection.
# Only the first 20 loaded entries are processed here.
grouped_chunks_output = []
for entry in tqdm(data[:20], desc="Grouping by BIO"):
    chunks = group_chunks_by_bio(entry["tags"])

    # One record per chunk: the label is the first token's tag without its
    # two-character "B-" prefix; the tokens are the chunk's surface strings.
    group_records = []
    for chunk in chunks:
        group_records.append({
            "label": chunk[0]["tag"][2:],
            "tokens": [tok["token"] for tok in chunk],
        })

    grouped_chunks_output.append({
        "citation": entry["citation"],
        "groups": group_records,
    })

import pandas as pd

# Show full cell contents so long group lists are not truncated in the table.
pd.set_option('display.max_colwidth', None)
df_chunks = pd.DataFrame(grouped_chunks_output)
df_chunks.head(10)


Grouping by BIO: 100%|███████████████████████| 20/20 [00:00<00:00, 48573.29it/s]


Unnamed: 0,citation,groups
0,"BVerwG, Urt. v. 26.06.1987 - 8 C 21/86 -, BVerwGE 78, 3","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['26', '.', '06', '.', '1987']}, {'label': 'Aktenzeichen', 'tokens': ['8', 'C', '21', '/', '86']}, {'label': 'Zeitschrift', 'tokens': ['BVerwGE']}, {'label': 'Nummer', 'tokens': ['78']}, {'label': 'Seite-Beginn', 'tokens': ['3']}]"
1,Urt. d. Senats v. 26.01.2000 - 2 L 236/98 -,"[{'label': 'Datum', 'tokens': ['26', '.', '01', '.', '2000']}, {'label': 'Aktenzeichen', 'tokens': ['2', 'L', '236', '/', '98']}]"
2,"BVerwG, Urt. v. 27.10.1982 - 3 C 6.82 -, BVerwGE 66, 218","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['27', '.', '10', '.', '1982']}, {'label': 'Aktenzeichen', 'tokens': ['3', 'C', '6', '.', '82']}, {'label': 'Zeitschrift', 'tokens': ['BVerwGE']}, {'label': 'Nummer', 'tokens': ['66']}, {'label': 'Seite-Beginn', 'tokens': ['218']}]"
3,"hierzu BVerwG, Urt. v. 26.06.1987, a.a.O.","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['26', '.', '06', '.', '1987']}]"
4,GVOBl. S. 239,"[{'label': 'Zeitschrift', 'tokens': ['GVOBl']}, {'label': 'Seite-Beginn', 'tokens': ['239']}]"
5,"BVerwG, Urt. v. 18.03.1999 - 5 C 11.98 -, FEVS 49, 443","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['18', '.', '03', '.', '1999']}, {'label': 'Aktenzeichen', 'tokens': ['5', 'C', '11', '.', '98']}, {'label': 'Zeitschrift', 'tokens': ['FEVS']}, {'label': 'Nummer', 'tokens': ['49']}, {'label': 'Seite-Beginn', 'tokens': ['443']}]"
6,"BVerwG, Urt. v. 06.02.2003 - 5 C 15.02 -, BVerwGE 117, 364","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['06', '.', '02', '.', '2003']}, {'label': 'Aktenzeichen', 'tokens': ['5', 'C', '15', '.', '02']}, {'label': 'Zeitschrift', 'tokens': ['BVerwGE']}, {'label': 'Nummer', 'tokens': ['117']}, {'label': 'Seite-Beginn', 'tokens': ['364']}]"
7,"hierzu: BVerwG, Urt. v. 06.02.2003, a.a.O. ; LPK-BSHG, 6. Aufl., § 107 Rdnr. 20","[{'label': 'Gericht', 'tokens': ['BVerwG']}, {'label': 'Datum', 'tokens': ['06', '.', '02', '.', '2003']}, {'label': 'Titel', 'tokens': ['LPK']}, {'label': 'Gesetz', 'tokens': ['BSHG']}, {'label': 'Auflage', 'tokens': ['6']}, {'label': 'Auflage', 'tokens': ['Aufl']}, {'label': 'Paragraph', 'tokens': ['§', '107']}, {'label': 'Randnummer', 'tokens': ['20']}]"
8,"Urt. v. 18.03.1999, a.a.O.","[{'label': 'Datum', 'tokens': ['18', '.', '03', '.', '1999']}]"
9,"so auch noch OVG Lüneburg, Urt. v. 13.11.1990 - 9 K 11/89 -, NVwZ-RR 1992, 40 ; im Anschluss daran der Senat als obiter dictum in seinem Urteil v. 24.08.2000 - 2 L 226/98 -, insoweit in NordÖR 2001, 221 nicht abgedruckt","[{'label': 'Gericht', 'tokens': ['OVG', 'Lüneburg']}, {'label': 'Datum', 'tokens': ['13', '.', '11', '.', '1990']}, {'label': 'Aktenzeichen', 'tokens': ['9', 'K', '11', '/', '89']}, {'label': 'Zeitschrift', 'tokens': ['NVwZ', '-', 'RR']}, {'label': 'Jahr', 'tokens': ['1992']}, {'label': 'Seite-Beginn', 'tokens': ['40']}, {'label': 'Datum', 'tokens': ['24', '.', '08', '.', '2000']}, {'label': 'Aktenzeichen', 'tokens': ['2', 'L', '226', '/', '98']}, {'label': 'Zeitschrift', 'tokens': ['NordÖR']}, {'label': 'Jahr', 'tokens': ['2001']}, {'label': 'Seite-Beginn', 'tokens': ['221']}]"


Fix typos using fuzzy string matching (`difflib` similarity ratio, not a true Levenshtein distance)

In [None]:
from copy import deepcopy
from difflib import get_close_matches
from collections import Counter

# Known legal abbreviations (expand as needed)
known_abbreviations = [
    "BVerwG", "BVerfG", "OVG", "BGH", "VG", "SG", "LSG", "BSG", "BFH",
    "BGBl", "NJW", "BVerwGE", "BayVBl", "ECLI", "BGB", "GG", "StGB", "VwGO"
]

# `citations` was not defined in the preceding cell; the loaded entries live
# in `data`. Work on a deep copy so the raw entries stay untouched and this
# cell reports the same corrections every time it is re-run.
citations = deepcopy(data)

corrections = []                # (original, corrected) pair for every replaced token
correction_counter = Counter()  # number of replacements per BIO tag

# NOTE(review): get_close_matches scores with SequenceMatcher.ratio()
# (2 * matches / total length), so the 0.8 cutoff also rewrites valid
# abbreviations that are merely missing from the list above: "LG" scores
# exactly 0.8 against "LSG", and "BVerfGE" scores ~0.92 against "BVerfG".
# Extend `known_abbreviations` (or restrict the tags that are checked)
# before trusting the corrected tokens.
for citation in citations:
    for token in citation["tags"]:
        # Only tokens that are part of a tagged entity are candidates.
        if token["tag"].startswith(("B-", "I-")):
            original = token["token"]
            matches = get_close_matches(original, known_abbreviations, n=1, cutoff=0.8)
            # An exact hit is returned as the best match and is left unchanged.
            if matches and matches[0] != original:
                corrected = matches[0]
                corrections.append((original, corrected))
                correction_counter[token["tag"]] += 1
                token["token"] = corrected  # Apply correction (on the copy)


correction_counts = Counter(corrections)

# Sort and print the unique corrections with how many times each occurred
print("🔧 Unique corrections made (with counts):")
for (original, corrected), count in correction_counts.most_common():
    print(f"  - {original} → {corrected}  ({count}×)")

print("\n🧮 Total corrections by tag type:")
for tag, count in correction_counter.items():
    print(f"  {tag:<15}: {count}")

