In [548]:
# If ipytest isn't installed in your environment, run:
# %pip install -q ipytest pytest

## configuration

In [549]:
# "Księga [numer księgi]"
CHAPTER_TITLE_REGEX_PL = r'^\s*Księga\s+(\w+|[IVXLCDM\d]+)\s*$'

DEFAULT_CONFIG_PL = {
    'top_longest_words_number': 10,
    'top_most_common_words_number': 10,
    'ignored_words': [
        'i', 'a', 'że', 'lub', "w", "się", "z", "na", "nie", "jak", "do", "to", "o", "za",
        "po", "co", "od", "lecz", "bo", "gdy", "ja",
    ],
    'ignored_intro_lines': [
        'Adam Mickiewicz',
        'Pan Tadeusz',
        'czyli ostatni zajazd na Litwie',
        'ISBN 978-83-288-2495-9'
    ],
    'ignored_regexps': [
        r'^.*?ISBN\s+978-83-\d{3}-\d{4}-\d{1,2}\s*',
        CHAPTER_TITLE_REGEX_PL
    ]
}

# dla testowego pliku z teksrami piosenek [Chorus] [Verse 1] etc
LYRICS_TAGS_REGEX = r'\[.*?\]'

DEFAULT_CONFIG_EN = {
    'top_longest_words_number': 10,
    'top_most_common_words_number': 10,
    'ignored_intro_lines': [],
    'ignored_words': ['a', 'the', 'do'],
    'ignored_regexps': [
        LYRICS_TAGS_REGEX
    ]
}


Technical functions (file loading, handling errors etc)

In [550]:
def ensure_string(input_text):
    """Raises TypeError if input_text is not a string."""
    if not isinstance(input_text, str):
        raise TypeError(
            f"expected a string, got {type(input_text).__name__}"
        )


def load_text(filename):
    """Loads text from a file."""
    ensure_string(filename)
    if not filename.endswith(".txt"):
        raise ValueError(f"Expected a .txt file, got {filename!r} instead.")
    file = open(filename, "r", encoding="utf-8")
    text = file.read()
    file.close()

    return text

In [551]:
# TODO remove me !!
def tokenize_by_split(input_text):
    """Tokenizes text using string.split().
    DO NOT USE IT.
    This is a simple naive implementation that ignores punctuation etc
    """
    ensure_string(input_text)
    return input_text.split()


print(tokenize_by_split('Ala ma kota, a kot ma Alę!'))

['Ala', 'ma', 'kota,', 'a', 'kot', 'ma', 'Alę!']


In [552]:
import re


def tokenize_by_regexp(input_text):
    """Tokenizes text using a regular expression to find word boundaries.
    - Converts text to lowercase.
    - Extracts word-like tokens (letters, numbers, underscores, digits).
    - Raises TypeError for non-string inputs.
    """
    ensure_string(input_text)
    return re.findall(r"\b\w+\b", input_text.lower())


In [553]:
def longest_words(input_text, n=10):
    """returns top n longest workds in input_text
    it is using the build-in function sorted
    """
    ensure_string(input_text)
    all_words = tokenize_by_regexp(input_text)
    words = sorted(all_words, key=len, reverse=True)[:n]
    return words


In [554]:
def most_common_words(input_text, n=10):
    """returns top n most common workds in input_text
    it is using the collections.Counter class
    """
    from collections import Counter

    ensure_string(input_text)
    all_words = tokenize_by_regexp(input_text)
    word_counts = Counter(all_words)
    most_common = word_counts.most_common(n)
    return [word for word, count in most_common]

In [555]:
# TODO: remove me. not needed
def most_common_words_in_text_file(filename, n=10):
    """returns top n most common words in filename from current folder
    """
    return most_common_words(load_text(filename), n)


most_common_words_in_text_file('one-more-cup-of-coffee.txt')

['your', 'the', 'to', 'of', 'is', 'one', 'more', 'cup', 'coffee', 'and']

In [556]:
# TODO: remove me. not needed
def longest_words_in_text_file(filename, n=10):
    """returns top n longest workds in filename from current folder
    """
    return longest_words(load_text(filename), n)


In [557]:
def cleanup_text(text, config=DEFAULT_CONFIG_PL):
    """Removes words from text."""
    flags = re.IGNORECASE | re.DOTALL | re.MULTILINE
    for regexp in config.get("ignored_regexps", []):
        text = re.sub(regexp, "", text, flags=flags)

    # remove ignored common words
    for word in config.get("ignored_words", []):
        regex = r'\b' + re.escape(word) + r'\b'
        text = re.sub(regex, '', text, flags=re.IGNORECASE)
    return text

In [558]:
def ex01_imp():
    cleaned_text = cleanup_text(load_text('pan-tadeusz.txt'))
    print("Najdłuższe słowa w Panu Tadeuszu to:")
    print(', '.join(longest_words(cleaned_text)))

    print("Najczęściej występujące słowa w Panu Tadeuszu to:")
    print(', '.join(most_common_words(cleaned_text)))


ex01_imp()

Najdłuższe słowa w Panu Tadeuszu to:
niebezpieczeństwach, białopiotrowiczowi, najprzykładniejszy, niebezpieczeństwem, niebezpieczeństwem, niebezpieczeństwem, nierozstrzygniony, białopiotrowiczem, niebezpieczeństwo, niebezpieczeństwa
Najczęściej występujące słowa w Panu Tadeuszu to:
już, tak, pan, jest, ale, był, nim, rzekł, go, tylko


### Unit Tests

In [559]:
import ipytest
import pytest

ipytest.autoconfig()  # integrate pytest with the notebook

test longest_words

In [560]:

@pytest.mark.parametrize(
    "text, n, expected",
    [
        ("Ala ma kota", 1, ["kota"]),
        ("Ala ma kota", 2, ["kota", "ala"]),
        ("Ala ma kota", 3, ["kota", "ala", "ma"]),
        ("To be, or not to be, that is the question.", 2, ["question", "that"])
    ],
)
def test_longest_words(text, n, expected):
    assert longest_words(text, n) == expected

test tokenize_by_regexp

In [561]:
tokenize = tokenize_by_regexp


@pytest.mark.parametrize(
    "text, expected",
    [
        ("123 abc", ["123", "abc"]),
        # skipping interpunction
        ("Ala ma kota, kot ma Alę!", ["ala", "ma", "kota", "kot", "ma", "alę"]),
        # other witespace characters
        ("123 abc\n456\tdef", ["123", "abc", "456", "def"]),
        # edge cases
        ("", []),
        ("   \t\n  ", []),
    ],
)
def test_tokenize(text, expected):
    assert tokenize(text) == expected

test most_common_words

In [562]:

@pytest.mark.parametrize(
    "text, n, expected",
    [
        ("world world world", 1, ['world']),
        ("world world world", 3, ['world']),
        ("world world world.", 4, ['world']),
        ("World WORLD, worLD!", 3, ['world']),
        ("Hello WORLD! Hello World, Hello Bob", 3, ['hello', 'world', "bob"]),

        ("", 3, []),
    ]
)
def test_most_common_words(text, n, expected):
    assert most_common_words(text, n) == expected

In [563]:
def test_cleanup_text():
    text = load_text('one-more-cup-of-coffee.txt')
    config = DEFAULT_CONFIG_EN
    cleaned_text = cleanup_text(text, config)

    assert "[Chorus]" not in cleaned_text
    assert "[Verse 1]" not in cleaned_text
    assert "[Verse 2]" not in cleaned_text

    # "the" is part of "annother"
    assert "another" in cleaned_text
    # the is removed as a separate word
    assert "the " not in cleaned_text

    assert "One more cup of coffee for  road" in cleaned_text

In [564]:
ipytest.run()

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                             [100%][0m
[32m[32m[1m16 passed[0m[32m in 0.02s[0m[0m


<ExitCode.OK: 0>