# Making a Frequency Distribution for Transliterated Greek

In [1]:
from collections import Counter
import numpy as np
import pickle

In [2]:
from tqdm import tqdm
from cltk.corpus.readers import get_corpus_reader
from cltk.prosody.latin.string_utils import remove_punctuation_dict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from joblib import load

### Add parent directory to path so we can access our common code

In [3]:
import os, sys, inspect
# Resolve the notebook's directory from the kernel's working directory.
# The earlier inspect.getfile(inspect.currentframe()) approach reports where the
# cell's compiled source lives; on recent ipykernel releases that can be a
# temporary file, which makes the computed "parent" point at the temp
# directory's parent instead of the project root.
# Assumes the kernel was started in the notebook's own directory (Jupyter's default).
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Prepend so the project's common code is found first on import.
sys.path.insert(0, parentdir)

In [4]:
from mlyoucanuse.corpus_analysis_fun import create_probability_dist

### Load the Greek transliteration pipeline built in another notebook, `loanwords_problems_solutions`

In [6]:
# Serialized pipeline produced by the `loanwords_problems_solutions` notebook.
# NOTE(review): the 0.20.2 in the filename presumably records the scikit-learn
# version used to build it — confirm before loading under a different version.
pipeline_path = 'process_greek_text_pipeline.0.20.2.joblib'
process_greek_pipeline = load(pipeline_path)

### Create Perseus Greek Corpus Reader

In [7]:
# Reader over the Perseus Greek corpus; its file ids drive the counting loop below.
perseus_greek = get_corpus_reader(
    corpus_name='greek_text_perseus',
    language='greek',
)
greek_texts = perseus_greek.fileids()

In [8]:
# Start from an empty counter every time so re-running this cell does not
# double-count the corpus.
word_counter = Counter()

for fileid in tqdm(greek_texts, total=len(greek_texts), unit='files'):
    # The pipeline is given a one-document batch (a list holding one token
    # list), and the single transformed document is read back from index 0.
    # NOTE(review): fit_transform refits the loaded pipeline on every file; if
    # its steps are stateless, transform() alone would do — confirm.
    X_greek_transliterated = process_greek_pipeline.fit_transform([list(perseus_greek.words(fileid))])
    # Counter.update accepts an iterable and adds one per element, so there is
    # no need to build a one-entry dict for every token.
    word_counter.update(X_greek_transliterated[0])

100%|██████████| 222/222 [06:57<00:00,  1.63s/files]


In [9]:
# Sanity check: display the ten most frequent tokens and their raw counts.
word_counter.most_common(10)

[('kai', 94747),
 ('de', 49182),
 ('men', 23222),
 ('tōn', 21808),
 ('to', 19620),
 ('tēn', 19248),
 ('en', 18851),
 ('ho', 18834),
 ('d', 17112),
 ('tou', 16393)]

### The raw counts aren't very usable on their own, so we'll normalize them

In [10]:
# Convert the raw counts into normalized scores using the project helper.
# NOTE(review): the most frequent word, 'kai', scores ~0.999999, so the helper
# appears to rescale relative to the maximum count rather than produce
# probabilities that sum to 1 — confirm against create_probability_dist.
word_probabilities = create_probability_dist(word_counter)
word_probabilities['kai']

0.9999990000000001

### Now that normalized number looks more manageable. Let's save the normalized distribution for reuse.

In [11]:
# Persist the normalized distribution for later reuse.
output_path = 'freq_dist.greek.transliterated.pkl'
with open(output_path, mode='wb') as pkl_file:
    pickle.dump(word_probabilities, pkl_file)