## Описание
Из папки data со всеми статьями газеты берём тексты на эвенском. И собираем частотный словарь.

Зачем?

Чтобы 1 раз разметить слово, а потом брать разметку частотных не от анализазатора, а из словаря. + чтобы понимать, с какими случаями анализатору важнее справляться.

In [1]:
pip install razdel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [2]:
import os
import re
from razdel import sentenize
from collections import Counter, OrderedDict

In [3]:
PUNCTUATION = "!\"#$%&'()*+, -—./:;<=>?@[\]^_`{|}~«»"

In [4]:
TRANSLIT_TABLE = {"дя" : "d'a",
    "тя" : "t'a",
    "ня" : "n'a",
    "де" : "d'e",
    "те" : "t'e",
    "че" : "če",
    "фе" : "f'e",
    "не" : "n'e",
    "дю" : "d'u",
    "тю" : "t'u",
    "ню" : "n'u",
    "дё" : "d'o",
    "тё" : "t'o",
    "нё" : "n'o",
    "сё" : "s'o",
    "ди" : "d'i",
    "а" : "a",
    "б" : "b",
    "в" : "w",
    "ҕ" : "γ",
    "г" : "g",
    "дь" : "d'",
    "ть" : "t'",
    "д" : "d",
    "ж" : "ž",
    "з" : "z",
    "и" : "i",
    "й" : "j",
    "ӄ" : "q",
    "к" : "k",
    "ль" : "l'",
    "л" : "l",
    "м" : "m",
    "нь" : "n'",
    "н" : "n",
    "ӈ" : "ŋ",
    "о" : "o",
    "п" : "p",
    "р" : "r",
    "с" : "s",
    "т" : "t",
    "у" : "u",
    "ф" : "f",
    "ч" : "č",
    "ц" : "c",
    "ш" : "š",
    "щ" : "š'",
    "ы" : "ɨ",
    "э" : "e",
    "ья" : "ja",
    "ью" : "ju",
    "ъя" : "ja",
    "ъю" : "ju",
    "ю" : "ju",
    "я" : "ja",
    "е" : "je",
    "ъ" : "",
    "ь" : "'",
    "." : ".",
    "," : ",",
    "?" : "?",
    "!" : "!",
    "-" : "-",
    }

translit = OrderedDict(TRANSLIT_TABLE)
vowels = {"е":"\'e", "я":"\'a", "ю":"\'u", "ё":"\'o"}

In [5]:
def transliterate(string, translit, vowels):
    pattern = re.compile(r'(?<=[врпсгклзцвбмшжӈҕӄВРПСГКЛЗЦВБМШЖӇӃҔ])[еяюё]')
    string = pattern.sub(lambda x:vowels[x.group()], string)
    for cyr, ipa in translit.items():
        string = string.replace(cyr, ipa).replace(cyr.title(), ipa.title())
    return string
    
def simple_symbols(string):
    string = string.replace('γ','g').replace('q','k')
    return string

In [6]:
filenames = []
file_list = os.listdir('data')
for filename in file_list:
    if "eve" in filename:
        filenames.append(filename)

all_words = []
for filename in filenames:
    path = 'data/'+filename
    with open(path, encoding="utf-8") as f:
        text = f.read()
    sentences = []
    for prgraph in text.split('\n'):
        for sent in list(sentenize(prgraph)):
            sentences.append(sent.text)
    sentences = [sent for sent in sentences if sent != '']
    for sent in sentences:
        words = sent.split()
        for word in words:
           if word.strip(PUNCTUATION) != '':
               to_analyse = simple_symbols(transliterate(word.strip(PUNCTUATION).lower(),
                                       translit, vowels))
               all_words.append(to_analyse)

In [7]:
counter = Counter(all_words).most_common()

In [8]:
frequency_list = '\n'.join(['\t'.join(list(map(str, pair))) for pair in counter])

In [9]:
with open('frequency_list.csv', 'w', encoding='utf-8') as file:
    file.write(frequency_list)

In [None]:
frequency_list

"bisin\t331\naič\t318\nn'an\t252\nerek\t224\ntik\t198\nbi\t191\nbu\t186\nmut\t164\nnoŋɨn\t157\nerew\t155\non\t150\naj\t145\nkuŋal\t140\ntarmɨ\t120\nmer\t117\nbiďin\t116\noja\t103\ng'ekiw\t98\nbiden\t97\nbaldun\t95\nač\t94\nťemi\t93\ni\t92\nunɨt\t89\ntačin\t88\ntemi\t84\nele\t80\nbisni\t80\nbis\t78\nčeleďur\t78\norɨčil\t77\nibďiri\t77\nmun\t76\nbisɨkɨn\t76\nerɨgɨr\t76\ntala\t75\noďin\t73\negďen\t72\nnoŋɨrtɨn\t72\ntaraw\t69\nollɨw\t68\nesni\t66\nn'anda\t66\nibďiril\t66\nbiwettɨn\t64\načča\t62\nmen\t61\nanŋɨnu\t60\nentekeje\t59\nojaw\t59\norɨmŋɨl\t57\ng'eki\t56\nodɨkɨn\t55\nečin\t54\norɨm\t54\nčelewen\t53\ngurgew\t53\ntarɨkɨm\t53\nw\t53\ntarak\t52\na\t52\ntadu\t51\numɨttu\t51\namnɨk\t50\njawda\t48\ngurge\t48\nmin\t48\nďulle\t46\njami\t45\norɨn\t45\njak\t45\nbisil\t45\nmunu\t45\nbɨstrinskij\t44\nbisitɨn\t44\nordu\t44\ngunin\t43\nekič\t42\nbej\t42\n3\t42\nkobalan\t40\nebďendɨ\t40\nokat\t39\nmerker\t39\nďulɨpki\t39\ninɨŋu\t39\n1\t39\nbɨj\t39\ntaduk\t38\nďur\t36\nmundu\t36\nadukanow\t36\nnurg

In [10]:
len(counter), len(all_words)

(18613, 42174)