Downloading and setting up the Stanza model for Afrikaans

In [1]:
!pip install stanza


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [2]:
import stanza  # Import the stanza library

# Fetch the default Afrikaans model package (this also refreshes resources.json).
stanza.download('af')  # Download the Afrikaans language model

# Create the text processing pipeline used for preprocessing:
# sentence splitting + tokenization, POS tagging, and lemmatization.
# stanza.download() above has just refreshed resources.json, so tell the
# pipeline to reuse it instead of fetching the same file a second time.
nlp = stanza.Pipeline(
    'af',
    processors='tokenize,pos,lemma',
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: af (Afrikaans) ...


Downloading https://huggingface.co/stanfordnlp/stanza-af/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/af/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: af (Afrikaans):
| Processor | Package            |
----------------------------------
| tokenize  | afribooms          |
| pos       | afribooms_charlm   |
| lemma     | afribooms_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


Text preprocessing function (lemmatization, with optional replacement of proper nouns, personal pronouns, and numerals by placeholder tokens)

In [3]:
import os
import re
from collections import OrderedDict
from typing import Callable

# Determiners that are emitted as their lower-cased surface form, not their lemma.
_ARTICLES = frozenset({"die", "'n", "al", "hierdie", "daardie", "sommige", "enigste"})
# Pronoun surface forms that are eligible for [PRON] replacement.
_PERSONAL_PRONOUNS = frozenset({"ek", "jy", "hy", "sy", "dit", "ons", "julle", "hulle", "u"})
# Verb prefixes tried, in this fixed order, when handle_separable_verbs is on.
_COMMON_VERB_PREFIXES = ("op", "af", "uit", "aan", "mee", "in")


def _normalize_word(
    word,
    replace_pron: bool,
    replace_propn: bool,
    replace_num: bool,
    merge_compound_nouns: bool,
    handle_separable_verbs: bool,
) -> list:
    """Return the output token(s) for one analysed word ([] for punctuation)."""
    text = word.text.lower()
    # Fall back to the surface form when the lemmatizer produced nothing.
    lemma = word.lemma.lower() if word.lemma else text
    upos = word.upos

    if upos == "PUNCT":
        return []

    if upos == "DET" and text in _ARTICLES:
        return [text]

    if upos == "PRON":
        if replace_pron and text in _PERSONAL_PRONOUNS:
            return ["[PRON]"]
        return [text]

    if upos == "PROPN":
        return ["[PROPN]" if replace_propn else text]

    if upos == "NUM":
        return ["[NUM]" if replace_num else text]

    if upos == "VERB" and handle_separable_verbs:
        for prefix in _COMMON_VERB_PREFIXES:
            # The length check keeps a lemma that *is* the prefix intact.
            if lemma.startswith(prefix) and len(lemma) > len(prefix):
                # Emit the stem first, then the detached prefix.
                return [lemma[len(prefix):], prefix]
        return [lemma]

    if merge_compound_nouns and upos == "NOUN" and "_" not in lemma and len(lemma) > 6:
        # Naive split: first three characters, then the remainder.
        return [lemma[:3], lemma[3:]]

    return [lemma]


def prepare_afrikaans_text_stanza(
    input_path: str,
    output_path: str,
    nlp: Callable,
    replace_pron: bool = True,
    replace_propn: bool = False,
    replace_num: bool = True,
    merge_compound_nouns: bool = False,
    handle_separable_verbs: bool = False,
) -> None:
    """
    Processes Afrikaans text using Stanza NLP with optional normalization.

    The input file is read line by line; blank lines are skipped. Every
    non-blank line is passed to ``nlp`` and each resulting sentence is written
    to the output file as one line of space-separated, lower-cased tokens.
    Punctuation is dropped, and sentences that end up with no tokens are not
    written. Words are emitted as their lemma, except that listed articles,
    pronouns, proper nouns and numerals keep their surface form (unless they
    are replaced by a placeholder).

    Args:
        input_path (str): Path to input text file (UTF-8).
        output_path (str): Path to write processed text (UTF-8, overwritten).
        nlp (Callable): Initialized Stanza pipeline for Afrikaans. It is called
            with one line of text and must return an object exposing
            ``.sentences``, whose items expose ``.words`` with ``.text``,
            ``.lemma`` and ``.upos``.
        replace_pron (bool): Replace personal pronouns with [PRON].
        replace_propn (bool): Replace proper nouns with [PROPN].
        replace_num (bool): Replace numerals with [NUM].
        merge_compound_nouns (bool): Despite the name, naively *splits* NOUN
            lemmas longer than six characters into ``lemma[:3]`` and
            ``lemma[3:]``.
        handle_separable_verbs (bool): Naively separate verb prefixes: a VERB
            lemma starting with a known prefix is emitted as stem then prefix.

    Raises:
        ValueError: If ``input_path`` and ``output_path`` name the same file.
    """
    # Opening the output in "w" mode truncates it immediately; if it is also
    # the input, the corpus would be wiped before a single line is read.
    if os.path.abspath(input_path) == os.path.abspath(output_path):
        raise ValueError(
            "input_path and output_path must be different files; "
            "writing the output would truncate the input before it is read"
        )

    with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue

            doc = nlp(line)
            for sent in doc.sentences:
                tokens = []
                for word in sent.words:
                    tokens.extend(
                        _normalize_word(
                            word,
                            replace_pron,
                            replace_propn,
                            replace_num,
                            merge_compound_nouns,
                            handle_separable_verbs,
                        )
                    )

                if tokens:
                    fout.write(" ".join(tokens) + "\n")


Running preprocessing on the corpus

In [5]:
# Preprocess the corpus with the default normalization options, using the
# Afrikaans pipeline created earlier in the notebook.
prepare_afrikaans_text_stanza(
    input_path='input_afrikaans.txt',
    output_path='output_afrikaans.txt',
    nlp=nlp,
)
