# Making a Frequency Distribution
### When working with a corpus, or with a body of words drawn from one, it is often helpful to compute a frequency distribution: how often each word occurs relative to the total number of words.


In [1]:
%load_ext autoreload
%autoreload 2

### Standard library and third-party imports

In [2]:
from collections import Counter
import numpy as np
import pickle

from tqdm import tqdm
from cltk.corpus.readers import get_corpus_reader
from cltk.prosody.latin.string_utils import remove_punctuation_dict
from cltk.stem.latin.j_v import JVReplacer

### Add parent directory to path so we can access our common code

In [3]:
import os,sys,inspect
# Put the notebook's parent directory on sys.path so the project's common code
# (imported in the next code cell) can be found.
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook cell that call
# returns the kernel's synthetic cell filename (on recent ipykernel releases a
# path under the system temp directory), not the notebook's own location.
# NOTE(review): assumes the kernel was started in this notebook's directory
# (the Jupyter default) — confirm if the notebook is launched differently.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

### Custom library imports

In [4]:
from mlyoucanuse.aeoe_replacer import AEOEReplacer 

In [5]:
# Open the CLTK corpus reader for 'latin_text_latin_library'; the cells below
# pull file ids and word tokens from it.
latin_reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')

In [6]:
# Build a corpus-wide count of normalized word forms.
word_counter = Counter()
jv_replacer = JVReplacer()
aeoe_replacer = AEOEReplacer()

latin_texts = latin_reader.fileids()

for file in tqdm(latin_texts, total=len(latin_texts), unit='files'):
    # Hand Counter.update an iterable of tokens (one bulk call per file)
    # instead of building a throwaway {word: 1} dict for every token.
    # Only purely alphabetic tokens are kept; each one is passed through
    # jv_replacer and then aeoe_replacer before being counted.
    # NOTE(review): tokens are not case-folded here, so differently-cased
    # forms are counted separately unless a replacer lowercases — confirm.
    word_counter.update(
        aeoe_replacer.replace(jv_replacer.replace(word))
        for word in latin_reader.words(file)
        if word.isalpha()
    )

100%|██████████| 2141/2141 [09:17<00:00,  3.84files/s]


In [7]:
# Show the ten most frequent word forms together with their raw counts.
word_counter.most_common(10)

[('et', 426293),
 ('in', 264106),
 ('est', 170471),
 ('non', 154878),
 ('ad', 127206),
 ('ut', 115909),
 ('cum', 100822),
 ('quod', 95409),
 ('qui', 86340),
 ('si', 79408)]

In [8]:
# Total number of counted tokens; used as the denominator for the relative
# frequencies computed in the following cells.
total_words = sum(word_counter.values())
# Raw count of 'et' (bare last expression, so the notebook displays it).
word_counter['et']

426293

In [9]:
# Relative frequency of 'et': its count divided by the total token count.
word_counter['et']/float(total_words)

0.032510768653979456

### kai is the Greek word for 'and', transliterated into Latin characters. It is one of the most common words in Greek, and thus it is the Greek word most likely to appear as a loanword. As such, we could use its frequency as a threshold for detecting whether or not a given word is a candidate for being a transliterated Greek loanword; we'll try this in another notebook.

In [10]:
# Raw count of 'kai'; a Counter returns 0 for a key it has never seen.
word_counter['kai']

482

In [11]:
# Relative frequency of 'kai': its count divided by the total token count.
word_counter['kai'] / float(total_words)

3.675920198365466e-05

In [16]:
# Convert raw counts into relative frequencies: each word form's share of all
# counted tokens.
total_words = sum(word_counter.values())
word_probabilities = {
    word: count / total_words
    for word, count in word_counter.items()
}

### Let's save the frequency distribution (the word probabilities) for reuse.

In [17]:
# Persist the word -> probability mapping as a pickle file in the current
# working directory.
with open('freq_dist.latin.pkl', mode='wb') as pickle_file:
    pickle.dump(word_probabilities, pickle_file)

### Let's prove that we can load and use what we just saved

In [18]:
# Round-trip check: read the pickle back into a separate variable.
# pickle.load must only be used on trusted files; this one is produced by this
# notebook itself.
latin_frequency_dist = None
with open('freq_dist.latin.pkl', mode='rb') as pickle_file:
    latin_frequency_dist = pickle.load(pickle_file)

In [19]:
# Look up one word in the reloaded mapping to confirm it is usable.
# A plain dict lookup raises KeyError if the word is absent.
latin_frequency_dist['rex']

0.0005071549651271857

## That's all for now, folks!