Statistikoj de Esperantaj tekstoj, prenitaj el https://tekstaro.com/elshuti.html
La tekstoj estas kreitaj la 4-an de marto 2025.

In [None]:
# utility methods

import csv
from collections import defaultdict

def write_items_into_file(items, file_name, header=('key', 'value')):
    """
    Writes rows into a CSV file, preceded by a single header row.

    Args:
        items (iterable): The rows to write; each row is an iterable of cell
            values, e.g. a list of (key, value) tuples or ``dict.items()``.
        file_name (str): The name of the output CSV file. An existing file
            is overwritten.
        header (sequence): The column names written as the first row.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(file_name, 'w', encoding='utf-8', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(header)
        csvwriter.writerows(items)
    
    
def read_csv_into_dict(file_name, skip_header=True):
    """
    Reads a two-column CSV file into a dictionary with keys from the first column
    and integer values from the second column.

    If a key occurs in several rows, its values are summed. Blank rows are ignored.

    Args:
        file_name (str): The path to the CSV file.
        skip_header (bool): If True, the first row is treated as a header and skipped.

    Returns:
        defaultdict: A ``defaultdict(int)`` with keys and summed values from the CSV file.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns or its
            second column is not an integer.
    """
    data_dict = defaultdict(int)
    # newline='' is the mode the csv module expects for files it parses.
    with open(file_name, 'r', encoding='utf-8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        if skip_header:
            # The None default avoids StopIteration on a completely empty file.
            next(csvreader, None)
        for row in csvreader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            key, value = row
            data_dict[key] += int(value)
    return data_dict




In [4]:
import os

# Directory that holds the input files processed by the following steps.
base_dir = "./html_kun_streketoj/tekstoj"
print(f'base_dir= {base_dir}')

# os.listdir() returns the entries in arbitrary order; sorting makes every run
# process the files in the same order.
file_names = sorted(os.listdir(base_dir))
print(f'there are = {len(file_names)} files')
# for file_name in sorted(file_names):
#     print(file_name)


base_dir= ./html_kun_streketoj/tekstoj
there are = 120 files


In [None]:
#  read all files and extract the text between <p id[^>]+> and </p>
#  and replace <span class="streketo">•</span> with '_'
#  and write all texts into one file 'all_texts_joined.html'

import re

# Every <p id=...>...</p> paragraph of every input file, reduced to its inner text.
all_texts = []

for entry in file_names:
    print(f'processing file: {entry}')
    with open(os.path.join(base_dir, entry), encoding='utf-8') as source:
        html = source.read()
    # re.DOTALL lets a paragraph span several lines.
    for raw_paragraph in re.findall(r'<p id[^>]+>.*?</p>', html, re.DOTALL):
        # Cut the opening <p id=...> tag and the closing </p> off the match.
        inner = re.sub(r'^<p id[^>]+>|</p>$', '', raw_paragraph)
        # Each "streketo" bullet span is replaced by a plain underscore.
        all_texts.append(inner.replace('<span class="streketo">•</span>', '_'))
    print('   done')

# One paragraph per line in the joined output file.
output_file = 'all_texts_joined.html'
print(f'writing texts {len(all_texts)} lines into {output_file}')
with open(output_file, 'w', encoding='utf-8') as joined:
    joined.write('\n'.join(all_texts))

print(" all done!")

# 18.4 s
# writing texts 250518 lines into all_texts_joined.html

In [8]:
# Read the file 'all_texts_joined.html', remove all html tags, write the result into 'all_texts_joined.txt'

# Load the whole joined HTML file into memory ('r' is the default open mode).
with open('all_texts_joined.html', encoding='utf-8') as html_in:
    content = html_in.read()

def remove_html_tags(text):
    """
    Removes everything that looks like an HTML tag from the text.

    Two passes are made in this order: first every ``<...>`` span is deleted,
    then every ``</...>`` span that is present after the first pass.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with the matched spans removed.
    """
    for tag_pattern in ('<[^<]+?>', '</[^<]+?>'):
        text = re.sub(tag_pattern, '', text)
    return text

# Strip the HTML tags from `content` and save the plain text.
cleaned_content = remove_html_tags(content)

output_txt_file = 'all_texts_joined.txt'
with open(output_txt_file, 'w', encoding='utf-8') as txt_out:
    txt_out.write(cleaned_content)

print(f"Content without html tags written to {output_txt_file}")

Content without html tags written to all_texts_joined.txt


In [9]:
# Read the file 'all_texts_joined.txt', remove all punctuation characters, digits, and extra spaces


with open('all_texts_joined.txt', encoding='utf-8') as plain_in:
    content = plain_in.read()

# Substitutions are applied strictly in this order.
for pattern, replacement in (
    (r'(?<=\w)-(?=\w)', '_'),  # a minus sign inside a word becomes an underscore
    (r'[^\w\s]', ''),          # drop punctuation: neither word character nor whitespace
    (r'\d+', ''),              # drop all digits
    (r'\t+', ' '),             # runs of tabs become one space
    (r'\s+', ' '),             # collapse whitespace runs, newlines included
):
    content = re.sub(pattern, replacement, content)

# Save the cleaned text.
output_file = 'all_texts_without_punctuation.txt'
with open(output_file, 'w', encoding='utf-8') as cleaned_out:
    cleaned_out.write(content)

print(f"all texts without punctuation saved to '{output_file}'")

# 13.4s

all texts without punctuation saved to 'all_texts_without_punctuation.txt'


In [12]:
# calculate the frequency of each word in 'all_texts_without_punctuation.txt'
# and write the result into 'words_frequency.csv'

from collections import Counter

# Read the file 'all_texts_without_punctuation.txt'
with open('all_texts_without_punctuation.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Lowercase the text, then split it into words on whitespace
all_words = content.lower().split()

# Remove all words starting with "http"
all_words = [word for word in all_words if not word.startswith("http")]

# Keep only words made up solely of Esperanto letters and the underscore.
# A frozenset gives constant-time membership tests over millions of words.
esperanto_chars = "abcĉdefgĝhĥijĵklmnoprsŝtuŭvz_"
allowed_chars = frozenset(esperanto_chars)
eo_words = [word for word in all_words if allowed_chars.issuperset(word)]

# Count only the filtered words; counting all_words first would be discarded work.
word_counts = Counter(eo_words)

print(f"total words = {len(all_words)}, Esperanto words= {len(eo_words)}, unique words = {len(word_counts)}")

# (word, count) pairs by descending count; ties keep first-encountered order
sorted_word_counts = word_counts.most_common()

# Write the result into 'words_frequency.csv'
output_csv_file = 'words_frequency.csv'
write_items_into_file(sorted_word_counts, output_csv_file, header=['word', 'count'])

print(f"Word frequencies written to '{output_csv_file}'")

# 16.9s
# total words = 12605345, Esperanto words= 12532375, unique words = 371937


total words = 12605345, Esperanto words= 12532375, unique words = 371937
Word frequencies written to 'words_frequency.csv'


In [None]:
# read csv file 'words_frequency.csv' 
# convert each word to normal form 
# put normal form into new dictionary, updating frequency, if this form is already in the dictionary
# write the result into 'words_frequency_normalized.csv'

def normalize_word(word):
    """
    Converts an underscore-segmented word to its normal form by rewriting the
    final ending.

    A trailing "_n" is removed, then a trailing "_j". After that a trailing
    "_as", "_is", "_os", "_us" or "_u" is replaced by "_i".

    Args:
        word (str): The word, with its parts separated by underscores.

    Returns:
        str: The normalized word.
    """
    # Order matters: "_n" is checked before "_j", each at most once.
    for ending in ("_n", "_j"):
        if word.endswith(ending):
            word = word[:-len(ending)]

    if word[-3:] in ("_as", "_is", "_os", "_us"):
        return word[:-2] + "i"
    if word[-2:] == "_u":
        return word[:-1] + "i"
    return word

# Load the word counts and merge every word into its normalized form.
words_counts = read_csv_into_dict('words_frequency.csv')

normalized_word_counts = defaultdict(int)
for raw_word in words_counts:
    normalized_word_counts[normalize_word(raw_word)] += int(words_counts[raw_word])
print(f"normalized words count = {len(normalized_word_counts)}")

# Save the merged counts, most frequent first.
output_file_name = 'words_frequency_normalized.csv'
sorted_items = sorted(
    normalized_word_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)
write_items_into_file(sorted_items, output_file_name, header=['word', 'count'])

print(f"Normalized word frequencies written to '{output_file_name}'")

# normalized words count = 261004

normalized words count = 261004
Normalized word frequencies written to 'words_frequency_normalized.csv'


In [None]:
from collections import defaultdict

# Read the file 'words_frequency_normalized.csv' and divide the words into two dictionaries

words_counts = read_csv_into_dict('words_frequency_normalized.csv')

# Route every word to one of two tables depending on whether it contains '_'.
words_with_underscore = defaultdict(int)
words_without_underscore = defaultdict(int)
for word, count in words_counts.items():
    target = words_with_underscore if '_' in word else words_without_underscore
    target[word] += int(count)

# Words with underscores, most frequent first.
print(f'count of words with underscores = {len(words_with_underscore)}')
with_underscore_sorted = sorted(
    words_with_underscore.items(), key=lambda item: item[1], reverse=True
)
write_items_into_file(with_underscore_sorted, 'words_with_underscore_frequency.csv', header=['word', 'count'])
print("Words with underscores written to 'words_with_underscore_frequency.csv'")

# Words without underscores, most frequent first.
print(f'count of words without underscores = {len(words_without_underscore)}')
without_underscore_sorted = sorted(
    words_without_underscore.items(), key=lambda item: item[1], reverse=True
)
write_items_into_file(without_underscore_sorted, 'words_without_underscore_frequency.csv', header=['word', 'count'])
print("Words without underscores written to 'words_without_underscore_frequency.csv'")

# count of words with underscores = 199703
# count of words without underscores = 61301


count of words with underscores = 199703
Words with underscores written to 'words_with_underscore_frequency.csv'
count of words without underscores = 61301
Words without underscores written to 'words_without_underscore_frequency.csv'


Manual step: open `words_without_underscore_frequency.csv` and select all Esperanto words that have no endings (numbers, prepositions, correlative words). They are at the top of the file, in about the first 160 lines.
Copy them into the `words_with_underscore_frequency.csv` file.


word,count
la,1131201
de,538681
kaj,405750
en,270668
al,175182
mi,149265
ne,141823
kiu,130081
li,121481
por,112062
ke,111441
pri,92462
tiu,90918
sed,74336
vi,72739
ili,72719
ĝi,67035
kun,66680
kiel,59181
ni,56211
el,55851
tio,55670
pli,49601
oni,46296
ŝi,43419
per,40944
ĉiu,40224
aŭ,37631
sur,35458
nur,35275
ankaŭ,34969
da,33852
ĉu,29811
se,29521
dum,29069
kiam,28706
ĉi,28663
ĉar,28193
unu,27514
plej,27333
si,25989
kio,25881
pro,25086
post,23574
ol,22731
ĉe,22487
jam,22416
nun,22008
tiel,21865
inter,21804
eĉ,21760
tre,20887
laŭ,19923
tamen,18970
du,18094
antaŭ,17922
ĝis,16851
tie,16528
do,16474
kontraŭ,14611
tia,14264
mem,14027
je,13817
kie,13198
ankoraŭ,13057
iu,13046
tiam,12680
ja,12251
iom,11156
ĉio,10946
sen,10672
ĉiam,10046
uea,9902
jen,9857
neniu,8453
plu,8183
sub,8142
tri,7902
io,7832
tuj,7658
kies,7502
neniam,7218
kvankam,7129
nek,7126
preskaŭ,7115
kvazaŭ,6916
tra,6582
tro,6577
ia,6527
jes,6266
nenio,6202
kial,6186
tial,6071
tiom,6042
kia,5496
ajn,5340
ĉirkaŭ,4549
nu,4516
ho,4506
ambaŭ,4321
hodiaŭ,4128
krom,4039
dek,3985
almenaŭ,3942
apud,3890
iam,3877
kiom,3851
super,3774
zamenhof,3764
ktp,3742
kvar,3673
malgraŭ,3479
ties,3457
kvin,3371
baldaŭ,3326
mil,3226
anstataŭ,2932
apenaŭ,2757
ĉia,2753
ĵus,2382
nenia,2286
des,2241
tejo,2227
ekster,2152
ses,2147
for,2115
sep,1831
izrael,1664
cent,1629
po,1511
david,1480
ĉie,1452
iel,1372
ok,1259
neniel,1236
trans,1164
eksa,1147
ci,966
morgaŭ,933
ju,788
hieraŭ,746
naŭ,726
ve,716
preter,471
nenie,357
neniom,311
adiaŭ,311
ĉies,305
ajna,255
ies,251
far,194


In [None]:
import locale
from collections import defaultdict

# Read the file 'words_with_underscore_frequency.csv' into dictionary word -> count
words_with_frequencies = read_csv_into_dict('words_with_underscore_frequency.csv')

# Collapse every run of underscores into a single one and trim underscores at
# both ends; counts of words that become identical are summed.
updated_words_with_frequencies = defaultdict(int)
for word, count in words_with_frequencies.items():
    # '_+' also handles runs of three or more, which a plain '__' -> '_'
    # replacement would leave doubled.
    updated_word = re.sub(r'_+', '_', word).strip('_')
    if not updated_word:
        # The word consisted of underscores only, so nothing is left to count.
        continue
    updated_words_with_frequencies[updated_word] += int(count)

# Sort the words by frequency in descending order, then by word in alphabetical order with locale Esperanto.
# setlocale raises locale.Error if the 'eo.UTF-8' locale is not available on this system.
locale.setlocale(locale.LC_ALL, 'eo.UTF-8')
words_with_frequencies = sorted(updated_words_with_frequencies.items(), key=lambda x: (-x[1], locale.strxfrm(x[0])))

# Write the sorted words into 'words_frequency_1.csv'
output_file_name = 'words_frequency_1.csv'
write_items_into_file(words_with_frequencies, output_file_name, header=['word', 'count'])

print(f'count of words = {len(words_with_frequencies)}')
print(f"Words with frequencies written to '{output_file_name}'")

# 199571

count of words = 199571
Words with frequencies written to 'words_frequency_1.csv'


Remove from `words_frequency_1.csv` all words whose frequency is 1.

count of words = 93190

In [None]:
from collections import defaultdict
import locale

# Load the word -> count table to split into parts (this reads 'words_frequency_3.csv').
words_with_frequencies = read_csv_into_dict('words_frequency_3.csv')

# Every non-empty part of a word receives the frequency of the whole word.
part_frequencies = defaultdict(int)
for word, frequency in words_with_frequencies.items():
    parts = word.split('_')
    last_index = len(parts) - 1
    for index, part in enumerate(parts):
        if not part:
            continue
        # A dash marks each side on which another part of the word follows or
        # precedes; a word with a single part gets no dash at all.
        leading = '-' if index > 0 else ''
        trailing = '-' if index < last_index else ''
        part_frequencies[leading + part + trailing] += frequency

# Descending frequency first, then alphabetical order under the Esperanto locale.
locale.setlocale(locale.LC_ALL, 'eo.UTF-8')
sorted_part_frequencies = sorted(
    part_frequencies.items(),
    key=lambda item: (-item[1], locale.strxfrm(item[0])),
)

# Save the parts; the helper's default header is used here.
output_file = 'word_parts_frequency.csv'
write_items_into_file(sorted_part_frequencies, output_file)

print(f'count of word parts = {len(sorted_part_frequencies)}')
print(f"Word parts with frequencies written to '{output_file}'")

# count of word parts = 16186

count of word parts = 16186
Word parts with frequencies written to 'word_parts_frequency.csv'


In [42]:
from collections import defaultdict

part_frequencies = read_csv_into_dict('word_parts_frequency.csv')

# Classify every part by the side(s) on which it carries a dash:
#   "x-" -> prefix, "-x" -> suffix, "-x-" or no dash at all -> inner part.
prefixes = defaultdict(int)
suffixes = defaultdict(int)
inner_parts = defaultdict(int)
for part, frequency in part_frequencies.items():
    dash_left = part.startswith('-')
    dash_right = part.endswith('-')
    if dash_left and not dash_right:
        suffixes[part] += frequency
    elif dash_right and not dash_left:
        prefixes[part] += frequency
    else:
        inner_parts[part] += frequency


def _by_count_then_alphabet(item):
    # Descending count first, then locale-aware alphabetical order of the part.
    return (-item[1], locale.strxfrm(item[0]))


# Each group is sorted and written into its own file.
locale.setlocale(locale.LC_ALL, 'eo.UTF-8')

sorted_prefixes = sorted(prefixes.items(), key=_by_count_then_alphabet)
print(f'count of prefixes = {len(sorted_prefixes)}')
write_items_into_file(sorted_prefixes, 'word_parts_prefixes.csv', header=['prefix', 'count'])
print("prefixes written into 'word_parts_prefixes.csv'")

sorted_inner_parts = sorted(inner_parts.items(), key=_by_count_then_alphabet)
print(f'count of inner parts = {len(sorted_inner_parts)}')
write_items_into_file(sorted_inner_parts, 'word_parts_inner_parts.csv', header=['inner_part', 'count'])
print("inner parts written into 'word_parts_inner_parts.csv'")

sorted_suffixes = sorted(suffixes.items(), key=_by_count_then_alphabet)
print(f'count of suffixes = {len(sorted_suffixes)}')
write_items_into_file(sorted_suffixes, 'word_parts_suffixes.csv', header=['suffix', 'count'])
print("suffixes written into 'word_parts_suffixes.csv'")


# count of prefixes = 11909
# count of inner parts = 4255
# count of suffixes = 110

count of prefixes = 11848
prefixes written into 'word_parts_prefixes.csv'
count of inner parts = 4240
inner parts written into 'word_parts_inner_parts.csv'
count of suffixes = 98
suffixes written into 'word_parts_suffixes.csv'


There are a lot of 'suffixes'; most of them are not Esperanto words.
These words need manual filtering, and the file `words_frequency_1.csv` has to be updated accordingly.
The result will be stored in `words_frequency_2.csv`.

Run the following filtering code on `words_frequency_2.csv` or `words_frequency_3.csv`,
then group by prefix, inner part and suffix again. Repeat until there are 100 suffixes.

In [43]:
# Load the current word -> count table. This step reads 'words_frequency_3.csv'
# and, at the end, writes the filtered table back into the same file.
words_with_frequencies = read_csv_into_dict('words_frequency_3.csv')
print(f'total count of words = {len(words_with_frequencies)}')

suffixes = read_csv_into_dict('word_parts_suffixes.csv')

# Keep the suffixes counted fewer than 9 times; the first character of each key
# is dropped (the suffix file stores keys with a leading '-').
rare_suffixes = {}
for suffix, count in suffixes.items():
    if count < 9:
        rare_suffixes[suffix[1:]] = count
# NOTE: the number printed here is the count of rare suffixes, not of words.
print(f'count of words with rare suffixes = {len(rare_suffixes)}')

rare_sorted = sorted(rare_suffixes.items(), key=lambda item: item[1], reverse=True)
write_items_into_file(rare_sorted, 'rare_suffixes.csv', header=['suffix', 'count'])
print("rare suffixes written into 'rare_suffixes.csv'")

# Collect the words whose last part is a rare suffix. Deletion happens after
# the loop, because a dict must not change size while it is being iterated.
words_to_delete = []
for word, count in words_with_frequencies.items():
    suffix = word.rsplit('_', 1)[-1]
    if suffix == word or suffix not in rare_suffixes:
        # Either the word has no underscore at all, or its ending is not rare.
        continue
    words_to_delete.append(word)
    print(f'removed word with rare suffix: {word},{count} because of suffix: {suffix},{rare_suffixes[suffix]}')

for word in words_to_delete:
    del words_with_frequencies[word]

print(f'count of words after removing words with rare suffixes = {len(words_with_frequencies)}')
# Save the remaining words, most frequent first.
output_file_name = 'words_frequency_3.csv'
remaining_sorted = sorted(words_with_frequencies.items(), key=lambda item: item[1], reverse=True)
write_items_into_file(remaining_sorted, output_file_name, header=['word', 'count'])
print(f"Words with frequencies written to '{output_file_name}'")




total count of words = 92047
count of words with rare suffixes = 1
rare suffixes written into 'rare_suffixes.csv'
removed word with rare suffix: nu_nu,2 because of suffix: nu,2
count of words after removing words with rare suffixes = 92046
Words with frequencies written to 'words_frequency_3.csv'
