In [2]:
# install requirements once
!pip install spacy enchant

import pickle, re
import spacy
from spacy.cli import download as spacy_download
from transformers import pipeline

# 1) Load your merged pickle
with open('/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added.pkl', 'rb') as f:
    df = pickle.load(f)

# 2) Make sure spaCy German is there
try:
    nlp = spacy.load('de_core_news_sm')
except OSError:
    spacy_download('de_core_news_sm')
    nlp = spacy.load('de_core_news_sm')

# 3) Set up your spelling corrector
corrector = pipeline(
    task='text2text-generation',
    model='oliverguhr/spelling-correction-german-base',
    device=-1
)

# optional: small dictionary check fallback
try:
    import enchant
    dict_de = enchant.Dict('de_DE')
    is_valid = lambda w: dict_de.check(w)
except Exception:
    is_valid = lambda w: w.isalpha()

# 4) Grab the first article, take its first 10 sentences
full_text = df.loc[0, 'plainpagefulltext']
doc = nlp(full_text)
first_10_sents = [sent.text for sent in doc.sents][:10]
snippet = " ".join(first_10_sents)
print("=== ORIGINAL SNIPPET ===")
print(snippet)

# 5) Run correction only on that snippet
def correct_text(text):
    corrected = text
    # find all word-tokens
    toks = re.findall(r"\b\w+\b", text)
    for tok in toks:
        # skip if looks valid
        if is_valid(tok):
            continue
        # generate a correction
        prompt = f"Korrigiere das falsch erkannte Wort '{tok}' im deutschen Satz: \"{text}\". Gib nur das Ersatzwort zurück."
        out = corrector(prompt, max_length=16, num_return_sequences=1)
        corr = out[0]['generated_text'].strip() or tok
        # substitute globally
        corrected = re.sub(rf"\b{re.escape(tok)}\b", corr, corrected)
    return corrected

fixed_snippet = correct_text(snippet)
print("\n=== CORRECTED SNIPPET ===")
print(fixed_snippet)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


Device set to use cpu


=== ORIGINAL SNIPPET ===
fft, M — Schwäbischer Merkllr 3 Stuttgart — Donnerstag, 19. September 1940 Starke Erbitterung im Londoner Osten Zunehmende Trunkenheit als Zeichen der versagenden Widerstandskraft Kopenhagen 19. Sept. (Sonderdienst j Amerikanische Korrespondenten er- immer deutlicher die Auflösungserscheinnn- ttt in der englischen Hauptstadt. Londons untere Kevolkerungsschichten , so be- *.L t beispielsweise der Vertreter der „Newyork l’Lg« verlören in erschreckend wachsen- »em Maße jedes Vertrauen zu der -vmtssühning. Als Zeichen für die zunehmende Rmweiflung führt der Berichterstatter an, daß die in den Armuts- und Industrievierteln des Mns wohnungslos gewordene Bevölkerung immer drehender und nachdrücklicher von der Regierung JLc, daßdieHäuserdeswohlhaben- den Westens, deren Besitzer es sich hätten Wen können, ihre Wohnungen in der Stadt mit m Aufenthalt auf dem sicheren Lande zu der- lwschen, den obdachlosen Londoner« zur Verfügung Allt würden. Stärkste Erbitterung herrsche

Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== CORRECTED SNIPPET ===
fft, M — Schwäbischer Merkllr 'Fft, M - Schwäbischer Merkllr 3 Stuttgart - Donnerstag, 'Fft, M - Schwäbischer Merkllr 3 Stuttgart - Donnerstag, 'Fft, M - Schwäbischer Merkllr 3 Stuttgart - Donnerstag, 19. September 1940 Starke Erbitterung im Londoner Osten Zunehmende Trunkenheit als Zeichen der versagenden Widerstandskraft Kopenhagen 19. Sept. (Sonderdienst j Amerikanische Korrespondenten er- immer deutlicher die Auflösungserscheinnn- ttt in der englischen Hauptstadt. Londons untere Bevolkerungsschichten , so be- c’tv beispielsweise der Vertreter der (New York l’Agr) verlören in erschreckend wachsen- mußem Maße jedes Vertrauen zu der -Vomtssühning. Als Zeichen für die zunehmende Reiflung führt der Berichterstatter an, daß die in den Armuts- und Industrievierteln des Mns wohnungslos gewordene Bevölkerung immer drehender und nachdrücklicher von der Regierung JL, daß die Häuser des wohlhaben- den Westen, deren Bes. September 'Fft, M - Schwäbischer Merkllr 3 Stut

In [1]:
pip install spacy enchant


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pickle
import re

# Optional dictionary check via pyenchant or fallback to simple alpha check
try:
    import enchant
    # Ensure Dict is available
    if hasattr(enchant, 'Dict'):
        dict_de = enchant.Dict('de_DE')
        def is_valid_word(token):
            return dict_de.check(token)
    else:
        raise ImportError("enchant.Dict not found")
except Exception:
    print("Warning: pyenchant not available or 'Dict' missing; using alphabetic fallback.")
    def is_valid_word(token):
        # Treat tokens without digits and only letters as valid
        return token.isalpha()

from transformers import pipeline
import spacy
from spacy.cli import download as spacy_download
import pandas as pd



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#load data
with open('/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added.pkl', 'rb') as f:
    articles = pickle.load(f)

In [4]:
# Inspect the loaded data structure
print("Loaded 'articles' object type:", type(articles))
try:
    print("Length of articles:", len(articles))
except Exception:
    pass
# Show first three entries for structure
for i in range(min(3, len(articles))):
    print(f"--- Entry {i} repr ---")
    try:
        entry = articles.iloc[i]
        print(entry.to_dict())  # show full row as dict
    except Exception as e:
        print("Error accessing row via iloc:", e)
        entry = articles[i] if isinstance(articles, list) else None
        print(repr(entry)[:500])
    print()


Loaded 'articles' object type: <class 'pandas.core.frame.DataFrame'>
Length of articles: 104236
--- Entry 0 repr ---
{'page_id': 'UYPNVCGFNZRHE4VD4AV6LMBHODX7QDNA-FILE_0007_DDB_FULLTEXT', 'pagenumber': 7, 'publication_date': Timestamp('1940-09-19 12:00:00'), 'place_of_distribution': ['Stuttgart'], 'language': ['ger'], 'plainpagefulltext': 'fft, M — Schwäbischer Merkllr 3 Stuttgart — Donnerstag, 19. September 1940 Starke Erbitterung im Londoner Osten Zunehmende Trunkenheit als Zeichen der versagenden Widerstandskraft Kopenhagen 19. Sept. (Sonderdienst j Amerikanische Korrespondenten er- immer deutlicher die Auflösungserscheinnn- ttt in der englischen Hauptstadt. Londons untere Kevolkerungsschichten, so be- *.L t beispielsweise der Vertreter der „Newyork l’Lg« verlören in erschreckend wachsen- »em Maße jedes Vertrauen zu der -vmtssühning. Als Zeichen für die zunehmende Rmweiflung führt der Berichterstatter an, daß die in den Armuts- und Industrievierteln des Mns wohnungslos gewordene Bev

In [5]:
# If loaded as DataFrame, use it; else convert list of dicts to DataFrame
if isinstance(articles, pd.DataFrame):
    df = articles
elif isinstance(articles, list) and isinstance(articles[0], dict):
    df = pd.DataFrame(articles)
else:
    raise ValueError("Unsupported data format: expected DataFrame or list of dicts")


In [6]:
# Identify the text column
TEXT_COL = 'plainpagefulltext'
if TEXT_COL not in df.columns:
    raise KeyError(f"Expected column '{TEXT_COL}' not found in DataFrame")


In [7]:
#Initialize tools
# Ensure spaCy German model is installed and loaded
def load_german_model(name='de_core_news_sm'):
    try:
        return spacy.load(name)
    except OSError:
        print(f"Model '{name}' not found. Downloading...")
        spacy_download(name)
        return spacy.load(name)

nlp = load_german_model('de_core_news_sm')

In [8]:
# 2. Initialize spaCy German model
try:
    nlp = spacy.load('de_core_news_sm')
except OSError:
    spacy_download('de_core_news_sm')
    nlp = spacy.load('de_core_news_sm')

Take correction code for German, found on Hugging Face

In [9]:
corrector = pipeline(
    task='text2text-generation',
    model='oliverguhr/spelling-correction-german-base',
    device=-1  # set to -1 if using CPU only
)

# Cache for corrected tokens
correction_cache = {}

Device set to use cpu


In [10]:
def correct_text(text):
    doc = nlp(text)
    corrected_sentences = []

    for sent in doc.sents:
        s = sent.text
        tokens = re.findall(r"\b\w+\b", s)
        for tok in tokens:
            if is_valid_word(tok) and tok not in correction_cache:
                # valid and not previously corrected
                continue
            if tok in correction_cache:
                replacement = correction_cache[tok]
            else:
                prompt = (
                    f"Korrigiere das falsch erkannte Wort '{tok}' im deutschen Satz: \"{s}\"."
                    "Gib nur das Ersatzwort zurück."
                )
                out = corrector(prompt, max_length=16, num_return_sequences=1)
                replacement = out[0].get('generated_text', '').strip() or tok
                correction_cache[tok] = replacement

            s = re.sub(rf"\b{re.escape(tok)}\b", replacement, s)
        corrected_sentences.append(s)

    return ' '.join(corrected_sentences)

In [11]:
# 4. Test on first 3 entries
small = df.head(3).copy()
small['corrected_text'] = small[TEXT_COL].fillna('').apply(correct_text)
df['corrected_text'] = corrected_texts

Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

In [10]:
df_small = df.head(10).copy()

# Apply corrections row-wise on df_small only
df_small['corrected_text'] = df_small[TEXT_COL].fillna('').apply(correct_text)

# Attach corrected column back or use df_small directly for saving
df['corrected_text'] = corrected_texts


Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

KeyboardInterrupt: 

In [11]:
# 5. Save results
out_path = '/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added_corrected.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(df, f)
print(f"Saved corrected DataFrame with {len(df)} rows to {out_path}")

Saved corrected DataFrame with 104236 rows to /Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added_corrected.pkl


In [13]:


# 5. Save small results only
test_file = '/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added_corrected_3.pkl'
with open(test_file, 'wb') as f:
    pickle.dump(small, f)
print(f"Saved 3 corrected articles to {test_file}")

# Quick display of test subset
for idx, row in small.iterrows():
    print(f"--- Entry {idx} ---")
    print("Original:", row[TEXT_COL][:200].replace('',' '))
    print("Corrected:", row['corrected_text'][:200].replace('',' '))
    print()

# 6. Load and inspect full corrected data
print("Loading full corrected dataset...")
with open('/Users/maren/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added_corrected.pkl', 'rb') as f:
    corrected_full = pickle.load(f)

if isinstance(corrected_full, pd.DataFrame):
    print("Corrected DataFrame shape:", corrected_full.shape)
    print(corrected_full[['page_id', TEXT_COL, 'corrected_text']].head(3))
else:
    print("Loaded list of items. Showing first 3 entries:")
    for item in corrected_full[:3]:
        if isinstance(item, dict):
            print({k: item[k] for k in ['page_id', TEXT_COL, 'corrected_text'] if k in item})
        else:
            print(item[:200])

Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


: 

In [None]:
# 4. Apply corrections row-wise
corrected_texts = []
for _, row in df.iterrows():
    raw = row[TEXT_COL] or ''
    corrected_texts.append(correct_text(raw))

Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=16) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

RuntimeError: Invalid buffer size: 50.86 GB

In [18]:
# Quick check: print before/after for first 5 rows
for idx in range(min(5, len(df))):
    print(f"--- Row {idx} ---")
    print("Original:\n", df.at[idx, TEXT_COL][:200])
    print("Corrected:\n", df.at[idx, 'corrected_text'][:200])
    print()

Sample before/after for first 5 items:
--- Item 0 ---
'Page-ID' im deutschen Satz 'Page-ID' gibt nur das Ersatzwort zurück.
--- Item 1 ---
pagenumber
--- Item 2 ---
'Publication-Date' im deutschen Satz 'publication-date' gibt nur das Ersatzwort zurück.
--- Item 3 ---
Place-of-Distribution' im deutschen Satz 'Place-of-Distribution' gibt nur das Ersatzwort zurück.
--- Item 4 ---
language


In [19]:
import pickle

# Paths
raw_path      = '/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added.pkl'
corrected_path = '/Users/marencordts/Desktop/Semantic_Data_Stories/course-data-stories/merged_all_added_corrected.pkl'

# Load
with open(raw_path, 'rb') as f:
    raw_articles = pickle.load(f)

with open(corrected_path, 'rb') as f:
    corr_articles = pickle.load(f)

# Compare first few
for i in range(5):
    raw  = raw_articles[i] if isinstance(raw_articles[i], str) else raw_articles[i].get('text', '')
    corr = corr_articles[i] if isinstance(corr_articles[i], str) else corr_articles[i].get('corrected_text', '')
    print(f"--- Article {i} ---")
    print("RAW      :", raw[:200].replace('\n',' '))
    print("CORRECTED:", corr[:200].replace('\n',' '))
    print()


KeyError: 0