In [18]:
import pandas as pd
import re
import os
from tqdm import tqdm
from pathlib import Path

This notebook is for experimentation and exploration only. The normal preprocessing pipeline will be run by the corresponding .py file.

In [2]:
import nltk, stanza
nltk.download("punkt")
nltk.download("stopwords")
stanza.download("de", package="gsd", processors="tokenize,mwt,pos,lemma")  # German model  

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\timgr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timgr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 4.29MB/s]                    
2025-11-01 19:43:09 INFO: Downloaded file to C:\Users\timgr\stanza_resources\resources.json
2025-11-01 19:43:09 INFO: Downloading these customized packages for language: de (German)...
| Processor       | Package      |
----------------------------------
| tokenize        | gsd          |
| mwt             | gsd          |
| pos             | gsd_charlm   |
| lemma           | gsd_nocharlm |
| pretrain        | conll17      |
| backward_charlm | newswiki     |
| forward_charlm  | newswik

In [3]:
from datasets import load_dataset
dataset = load_dataset("fincorpus-de-10k.py", split="train")

For the sake of exploration and trying out different stuff, we will use a small subset of the entire corpus

In [4]:
subset = dataset.shuffle(seed=42).select(range(10))

In [5]:
df = pd.DataFrame(subset)
df["char_len"] = df["text"].str.len()
df["num_lines"] = df["text"].str.count(r"\n") + 1
df.describe(include="all")

Unnamed: 0,filename,text,char_len,num_lines
count,10,10,10.0,10.0
unique,10,10,,
top,txt/Final_terms/DE000DB9U1Y1.pdf.txt,Endgültige Bedingungen Nr. 11 vom 16. Juni 202...,,
freq,1,1,,
mean,,,64451.1,1833.4
std,,,32495.523321,977.078662
min,,,10266.0,476.0
25%,,,45457.25,1101.25
50%,,,55709.0,1573.5
75%,,,86111.5,2486.25


we can see here that some of our files are already very large

# domain specific normalization
We want to keep Abbreviations that cause sentence-break carnage. For that we will replace dots with ⟂ in the abbreviations that we know and revert that after tokenization

In [6]:
ABBREV = [
    r"Abs\.", r"Nr\.", r"Art\.", r"Kap\.", r"Anm\.", r"z\. ?B\.", r"u\. ?a\.", r"vgl\.", r"ca\.", r"bzw\.",
    r"Dr\.", r"Dipl\.", r"Prof\.", r"Hr\.", r"Fr\.", r"i\. ?V\.", r"i\. ?S\.", r"i\. ?d\. ?R\.",
]
ABBREV_RX = re.compile(r"(" + "|".join(ABBREV) + r")")

In [7]:
def protect_abbrev(text: str) -> str:
    #replace the '.' in known abbreviations with '⟂'
    def repl(m):
        return m.group(0).replace(".", "⟂")
    return ABBREV_RX.sub(repl, text)

def unprotect(text: str) -> str:
    return text.replace("⟂", ".")

#to keep § while still normalizing spaces
def normalize_paragraph_sign(text: str) -> str:
    return re.sub(r"§\s+", "§ ", text)

def light_clean(text: str) -> str:
    t = text.strip()
    t = normalize_paragraph_sign(t)
    t = re.sub(r"[ \t]+", " ", t)          
    t = re.sub(r"\n{3,}", "\n\n", t)        
    t = protect_abbrev(t)
    return t

df["text_norm"] = df["text"].map(light_clean)

# Stanza pipeline and sentence segmentations

In [8]:
nlp = stanza.Pipeline(
    "de", processors="tokenize,pos,lemma",
    tokenize_no_ssplit=False,  
    use_gpu=False, download_method=stanza.DownloadMethod.REUSE_RESOURCES
)

2025-11-01 19:43:37 INFO: Loading these models for language: de (German):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd_charlm   |
| lemma     | gsd_nocharlm |

2025-11-01 19:43:37 INFO: Using device: cpu
2025-11-01 19:43:37 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-11-01 19:43:38 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-11-01 19:43:38 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2025-11-01 19:43:39 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-11-01 19:43:39 INFO: Done loading processors!


In [9]:
def annotate(text: str):
    #if stanza blows up on pathological monstrosities, slice or skip
    return nlp(text)


doc = annotate(df["text_norm"].iloc[0])
[(i, len(s.words)) for i, s in enumerate(doc.sentences)][:10]

[(0, 8),
 (1, 35),
 (2, 17),
 (3, 40),
 (4, 19),
 (5, 13),
 (6, 5),
 (7, 37),
 (8, 31),
 (9, 15)]

## lets look at some and see if our sentence segmentation works properly 

In [14]:
import random
sample_indices = random.sample(range(len(df)), 2)
print("Inspecting docs:", sample_indices)

for idx in sample_indices:
    text = df.loc[idx, "text_norm"]
    print(f"\n================= DOC {idx} =================")
    print(text[:500], "...\n")  

    doc = nlp(text)
    print(f"Num sentences: {len(doc.sentences)}")
    for i, s in enumerate(doc.sentences[:3]):  
        print(f"\n--- Sentence {i+1} ---")
        print(" ".join([w.text for w in s.words]))

Inspecting docs: [6, 9]

Datum der Endgültigen Bedingungen und des ersten öffentlichen Angebots: 17.01.2023 
 
 
ENDGÜLTIGE BEDINGUNGEN 
Landesbank Baden-Württemberg 
(LEI: B81CK4ESI35472RHJ606) 
 
 
 
 
24.000.000 EUR 
LBBW 3,41 % Festzins-Anleihe 
festverzinsliche Schuldverschreibungen 
(die "Schuldverschreibungen") 
 
ISIN-Code: DE000LB38W46 
 
 
 
emittiert unter dem 
 
Angebotsprogramm zur Emission von Schuldverschreibungen und Pfandbriefen 
 
Die Gültigkeit des Basisprospekts der Landesbank Baden-Württemberg (die  ...

Num sentences: 312

--- Sentence 1 ---
Datum der Endgültigen Bedingungen und des ersten öffentlichen Angebots : 17.01.2023

--- Sentence 2 ---
ENDGÜLTIGE BEDINGUNGEN Landesbank Baden - Württemberg ( LEI : B81CK4ESI35472RHJ606 )

--- Sentence 3 ---
24.000.000 EUR LBBW 3,41 % Festzins - Anleihe festverzinsliche Schuldverschreibungen ( die " Schuldverschreibungen " )

Endgültige Bedingungen Nr⟂ 397 vom 01.04.2022
zum Basisprospekt B vom 27. April 2021
geändert durch d

In [15]:
i = sample_indices[0]
doc = nlp(df.loc[i, "text_norm"])

for s_i, sent in enumerate(doc.sentences[:2]):
    print(f"\nSentence {s_i+1} tokens:")
    print(f"{'TEXT':15} {'LEMMA':15} {'POS':10}")
    print("-"*40)
    for w in sent.words:
        print(f"{w.text:15} {w.lemma:15} {w.upos:10}")


Sentence 1 tokens:
TEXT            LEMMA           POS       
----------------------------------------
Datum           Datum           NOUN      
der             der             DET       
Endgültigen     endgültig       ADJ       
Bedingungen     Bedingung       NOUN      
und             und             CCONJ     
des             der             DET       
ersten          erst            ADJ       
öffentlichen    öffentlich      ADJ       
Angebots        Angebot         NOUN      
:               :               PUNCT     
17.01.2023      17.01.2023      NUM       

Sentence 2 tokens:
TEXT            LEMMA           POS       
----------------------------------------
ENDGÜLTIGE      ENDGÜLTIGE      PROPN     
BEDINGUNGEN     BEDINGUNGEN     PROPN     
Landesbank      Landesbank      PROPN     
Baden           Baden           PROPN     
-               -               PUNCT     
Württemberg     Württemberg     PROPN     
(               (               PUNCT     
LEI             LE

the processed seemed to have worked fine so we can now export it into CoNLL

In [19]:
from stanza.utils.conll import CoNLL
from tqdm import tqdm
import os

EXPORT_N = len(df)  # or 1000 if you have time

out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(EXPORT_N)):
    text = df.loc[i, "text_norm"]
    doc = nlp(text)
    out_path = out_dir / f"doc_{i:05d}.conllu"
    CoNLL.write_doc2conll(doc, str(out_path))

100%|██████████| 10/10 [08:03<00:00, 48.38s/it]


In [20]:
sample_file = sorted(out_dir.glob("*.conllu"))[0]
print(sample_file)
print("\n".join(sample_file.read_text(encoding="utf-8").splitlines()[:30]))

data\processed\doc_00000.conllu
# text = Endgültige Bedingungen Nr⟂ 11 vom 16.
# sent_id = 0
1	Endgültige	endgültig	ADJ	ADJA	Case=Nom|Degree=Pos|Gender=Fem|Number=Plur	0	_	_	start_char=0|end_char=10
2	Bedingungen	Bedingung	NOUN	NN	Case=Nom|Gender=Fem|Number=Plur	1	_	_	start_char=11|end_char=22
3	Nr⟂	Nr⟂	PROPN	NN	Case=Nom|Gender=Fem|Number=Sing	2	_	_	start_char=23|end_char=26
4	11	11	PROPN	CARD	NumType=Card	3	_	_	start_char=27|end_char=29
5-6	vom	_	_	_	_	_	_	_	start_char=30|end_char=33
5	von	von	ADP	APPR	_	4	_	_	_
6	dem	der	DET	ART	Case=Dat|Definite=Def|Gender=Neut|Number=Sing|PronType=Art	5	_	_	_
7	16	16	NUM	CARD	NumType=Card	6	_	_	start_char=34|end_char=36|SpaceAfter=No
8	.	.	PUNCT	$.	_	7	_	_	start_char=36|end_char=37

# text = Juni 2020 DEUTSCHE BANK AG Emission von bis zu 5.000.000 Marktzinsanleihen (entspricht Produkt-Nr⟂ 41 in der Wertpapierbeschreibung für Schuldverschreibungen) zu je EUR 100,00 mit einem Gesamtnennbetrag von bis zu EUR 500.000.000
# sent_id = 1
1	Juni	Juni	PROPN