# Making a Frequency Distribution for Transliterated Greek

In [1]:
from collections import Counter
import numpy as np
import pickle

from tqdm import tqdm
from cltk.corpus.readers import get_corpus_reader
from cltk.prosody.latin.string_utils import remove_punctuation_dict
from sklearn.pipeline import Pipeline

from joblib import load

In [2]:
from tqdm import tqdm
from cltk.corpus.readers import get_corpus_reader
from cltk.prosody.latin.string_utils import remove_punctuation_dict
from sklearn.pipeline import Pipeline

from joblib import load

### Load the Transliterated Greek pipeline built in the `loanwords_problems_solutions` notebook

In [3]:
import os
import sys
import inspect  # no longer needed by this cell; retained in case other cells rely on it

# Make modules in the notebook's parent directory importable.
# A notebook cell has no dependable source filename: depending on the
# IPython/ipykernel version, the frame's filename is either a bare
# "<ipython-input-...>" placeholder or a path inside a temporary directory,
# so deriving the directory from the current frame is fragile. The kernel's
# working directory is used as the anchor instead.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)

# Guarded so that re-running this cell does not keep stacking duplicate entries.
if not sys.path or sys.path[0] != parentdir:
    sys.path.insert(0, parentdir)

In [7]:
# Deserialize the Greek text-processing pipeline saved by the other notebook.
# joblib archives are pickle-based: only load files from a trusted source.
# NOTE(review): the "0.22.1" in the file name presumably records the
# scikit-learn version used to build it — confirm it matches this environment.
process_greek_pipeline = load(filename='process_greek_text_pipeline.0.22.1.joblib')

### Create Perseus Greek Corpus Reader

In [8]:
# Corpus reader over the Perseus Greek texts.
perseus_greek = get_corpus_reader(corpus_name='greek_text_perseus', language='greek')
# Identifiers of the files in the corpus; the counting loop iterates over these.
greek_texts = perseus_greek.fileids()

In [9]:
# Corpus-wide frequency table of the tokens produced by the pipeline.
word_counter = Counter()

# tqdm infers the total from len(greek_texts), so no explicit `total=` is needed.
for fileid in tqdm(greek_texts, unit='files'):
    # The pipeline is fed a batch containing one document (the file's word list),
    # and that document's output is read back from index 0.
    # NOTE(review): fit_transform re-fits the loaded pipeline on every file. If
    # every step is stateless, transform() would give the same result — confirm
    # against the pipeline definition before changing it.
    X_greek_transliterated = process_greek_pipeline.fit_transform([list(perseus_greek.words(fileid))])
    # Counter.update tallies every element of an iterable in one call, instead of
    # building a throwaway one-entry dict per token.
    word_counter.update(X_greek_transliterated[0])

100%|██████████| 222/222 [04:45<00:00,  1.29s/files]


In [10]:
# Show the ten most frequent tokens with their raw counts.
word_counter.most_common(n=10)

[('kai', 96043),
 ('de', 49241),
 ('men', 23247),
 ('tōn', 21936),
 ('to', 19808),
 ('tēn', 19392),
 ('en', 19085),
 ('ho', 18969),
 ('d', 17139),
 ('tou', 16483)]

In [12]:
# Turn raw counts into relative frequencies: each token's count divided by the
# total number of counted tokens.
total_words = sum(word_counter.values())
word_probabilities = dict(
    (token, count / total_words) for token, count in word_counter.items()
)
# Spot-check the result with one token.
word_probabilities['kai']

0.05150631420658943

### Let's save the word probabilities (relative frequencies) for reuse.

In [13]:
# Write the token -> relative-frequency mapping to disk as a pickle for reuse.
with open('freq_dist.greek.transliterated.pkl', mode='wb') as pickle_file:
    pickle.dump(word_probabilities, pickle_file)