In [1]:
import spacy
from spacy.tokens import Doc

# Load the German `de_core_news_sm` pipeline once; the lemmatization cells below
# all reuse this single `nlp` object (and its vocab) instead of reloading it.
nlp = spacy.load("de_core_news_sm")

In [8]:
# Baseline: hand spaCy the raw sentence so that it tokenizes the text itself
# and then applies its pipeline components to the resulting Doc.
text = "Die Kinder liefen schnell."
doc = nlp(text)

# One lemma string per token, in document order. These are the reference
# lemmas that the pre-tokenized experiments in the later cells should match.
lemmas = [word.lemma_ for word in doc]
print(lemmas)

['der', 'Kind', 'laufen', 'schnell', '--']


In [9]:
# Pre-tokenized input (e.g. from a BERT tokenizer or a plain split). The Doc is
# built directly from these words, so the token boundaries are exactly the ones
# given here rather than whatever spaCy's tokenizer would produce.
tokens = ["Die", "Kinder", "liefen", "schnell", "."]
doc = Doc(nlp.vocab, words=tokens)

# Experiment: apply ONLY the lemmatizer component to the hand-built Doc.
# NOTE: run in isolation like this, every lemma comes back identical to the
# surface form (see the printed result). Running "tok2vec" on the Doc first, or
# running all components, produces real lemmas -- presumably the lemmatizer
# relies on the tok2vec output; TODO confirm against the spaCy documentation.
lemmatizer = nlp.get_pipe("lemmatizer")
doc = lemmatizer(doc)

# Caution: nlp(doc.text) is not a substitute here. It passes spaCy a plain
# string, so spaCy's own tokenizer runs again and the predefined token
# boundaries are no longer guaranteed to be preserved.

# Print surface form and lemma side by side.
for token in doc:
    print(f"{token.text} → {token.lemma_}")


Die → Die
Kinder → Kinder
liefen → liefen
schnell → schnell
. → .


In [10]:
# Pre-tokenized input: the Doc is built straight from these words, so the
# token boundaries stay exactly as given.
tokens = ["Die", "Kinder", "liefen", "schnell", "."]
doc = Doc(nlp.vocab, words=tokens)

# Run the full spaCy pipeline on the existing Doc. Calling nlp() with a Doc
# (instead of a string) skips tokenization and applies every enabled component
# in order -- the same effect as looping over nlp.pipeline by hand, but through
# the public entry point. Requires spaCy >= 3.2, where nlp() accepts a Doc.
# For many inputs, nlp.pipe(docs) accepts Doc objects the same way.
doc = nlp(doc)

# Print surface form and lemma side by side.
for token in doc:
    print(f"{token.text} → {token.lemma_}")


Die → der
Kinder → Kind
liefen → laufen
schnell → schnell
. → --


In [11]:
# Display the (name, component) pairs that make up the loaded pipeline, to see
# which components exist and can be fetched individually with nlp.get_pipe(name).
list(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f812d2829e0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f812d263340>),
 ('morphologizer',
  <spacy.pipeline.morphologizer.Morphologizer at 0x7f812d263580>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f8130e25690>),
 ('lemmatizer',
  <spacy.pipeline.edit_tree_lemmatizer.EditTreeLemmatizer at 0x7f812d263400>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f812d023340>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f8130606810>)]

In [12]:
# Pre-tokenized input (e.g. from a BERT tokenizer or a plain split); the Doc is
# built directly from these words so the token boundaries stay as given.
tokens = ["Die", "Kinder", "liefen", "schnell", "."]
doc = Doc(nlp.vocab, words=tokens)

# Minimal component subset for lemmatization: "tok2vec" followed by
# "lemmatizer". The remaining components (tagger, morphologizer, parser,
# attribute_ruler, ner) are deliberately skipped; the printed lemmas are the
# same as when every component is run.
for component_name in ("tok2vec", "lemmatizer"):
    doc = nlp.get_pipe(component_name)(doc)

# Caution: nlp(doc.text) is not an equivalent shortcut. It passes spaCy a plain
# string, so spaCy's own tokenizer runs again and the predefined token
# boundaries are no longer guaranteed to be preserved.

# Print surface form and lemma side by side.
for token in doc:
    print(f"{token.text} → {token.lemma_}")


Die → der
Kinder → Kind
liefen → laufen
schnell → schnell
. → --


In [13]:
# Pre-tokenized input (e.g. from a BERT tokenizer or a plain split); the Doc is
# built directly from these words so the token boundaries stay as given.
tokens = ["Die", "Kinder", "liefen", "schnell", "."]
doc = Doc(nlp.vocab, words=tokens)

# Apply every enabled component in pipeline order, fetching each one by name.
# This checks that nlp.get_pipe(name) returns the same callables that
# nlp.pipeline yields: the printed lemmas match the full-pipeline run.
# Iterating nlp.pipe_names avoids unpacking a component object that is never used.
for name in nlp.pipe_names:
    doc = nlp.get_pipe(name)(doc)

# Caution: nlp(doc.text) is not an equivalent shortcut. It passes spaCy a plain
# string, so spaCy's own tokenizer runs again and the predefined token
# boundaries are no longer guaranteed to be preserved.

# Print surface form and lemma side by side.
for token in doc:
    print(f"{token.text} → {token.lemma_}")


Die → der
Kinder → Kind
liefen → laufen
schnell → schnell
. → --


In [None]:
# Fetch every embedding row whose lemma is 'der', joined with the text of the
# sentence it belongs to, and print each row as a {column name: value} dict.
# NOTE(review): `cursor` is not created in any visible cell -- it presumably
# comes from a database-connection cell that is not shown; confirm it exists
# before relying on Restart & Run All.
query = (
    "select s.text, e.sentence_id, e.token_index, e.word, e.lemma, e.embedding "
    "from embeddings__dbmdz__bert_base_german_cased__test as e join sentences as s on e.sentence_id = s.sentence_id "
    "where lemma='der';"
)
cursor.execute(query)
rows = cursor.fetchall()

# cursor.description holds one entry per result column; element 0 is read as
# the column name.
colnames = [column[0] for column in cursor.description]
for row in rows:
    print(dict(zip(colnames, row)))