In [1]:
import os.path
from collections import Counter
from glob import glob

import inspect
import os
import pickle
import sys
from cltk.corpus.latin.phi5_index import PHI5_INDEX
from cltk.corpus.readers import get_corpus_reader
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tokenize.latin.sentence import SentenceTokenizer
from cltk.tokenize.word import WordTokenizer
from random import sample
from tqdm import tqdm
from typing import List, Dict, Tuple
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
 
from mlyoucanuse.aeoe_replacer import AEOEReplacer
from mlyoucanuse.text_cleaners import ( normalize_accents, disappear_angle_brackets,
                                        drop_punct, disappear_round_brackets,
                                        truecase, dehyphenate, accept_editorial,
                                        swallow_braces, swallow_obelized_words,
                                        swallow_square_brackets)

In [2]:
import cltk
cltk.__version__

'0.1.121'

## Text Cleaning
from http://udallasclassics.org/wp-content/uploads/maurer_files/APPARATUSABBREVIATIONS.pdf

[...] Square brackets, or in recent editions wavy brackets "{...}", enclose words etc. that an editor thinks should be deleted (see "del.") or marked as out of place (see "secl.").

[...] Square brackets in a papyrus text, or in an inscription, enclose places where words have been lost through physical damage. If this happens in mid-line, editors use "[...]". If only the end of the line is missing, they use a single bracket "[...". If the line's beginning is missing, they use "...]". Within the brackets, often each dot represents one missing letter.

[[...]] Double brackets enclose letters or words deleted by the medieval copyist himself.

(...) Round brackets are used to supplement words abbreviated by the original copyist; e.g. in an inscription: "trib(unus) mil(itum) leg(ionis) III"

`<...>` Diamond ( = elbow = angular) brackets enclose words etc. that an editor has added (see "suppl.")

† An obelus (pl. obeli) means that the word(s etc.) is very plainly corrupt, but the editor cannot see how to emend. If only one word is corrupt, there is only one obelus, which precedes the word; if two or more words are corrupt, two obeli enclose them. (Such at least is the rule--but that rule is often broken, especially in older editions, which sometimes dagger several words using only one obelus.) To dagger words in this way is to "obelize" them.

## Load/Build Truecasing dictionary; count all cased tokens, use to normalize cases later

In [3]:
truecase_file = 'truecase_counter.latin.pkl'

if os.path.exists(truecase_file):
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook wrote itself.
    with open(truecase_file, 'rb') as fin:
        case_counts = pickle.load(fin)
else:
    # No cache yet: count every cased token form in the Tesserae Latin corpus.
    tesserae = get_corpus_reader(corpus_name='latin_text_tesserae', language='latin')
    case_counts = Counter()
    jv_replacer = JVReplacer()
    aeoe_replacer = AEOEReplacer()
    toker = WordTokenizer('latin')

    # Ask the reader for its file ids once; they drive both the loop and the
    # progress-bar total.
    fileids = tesserae.fileids()
    for fileid in tqdm(fileids, total=len(fileids)):
        for sent in tesserae.sents(fileid):
            sent = aeoe_replacer.replace(jv_replacer.replace(drop_punct(sent)))
            sent = normalize_accents(sent)
            sent = accept_editorial(sent)
            # Counter.update with an iterable adds one per token.
            case_counts.update(toker.tokenize(sent))

    # Persist the counter so later runs take the fast branch above.
    with open(truecase_file, 'wb') as fout:
        pickle.dump(case_counts, fout)

len(case_counts)
# 344393, 322711  
# 318451
# 316722
# 311399
# 310384
# 310567
# 309529

100%|██████████| 762/762 [03:10<00:00,  3.99it/s]


309529

In [4]:
# Spot-check 25 randomly drawn (token, count) pairs from the truecasing counter.
# No random seed is set in the visible cells, so the selection differs per run.
print(sample(list(case_counts.items()), 25))

[('litigatur', 4), ('excreare', 4), ('cantharum', 14), ('circumdare', 40), ('Caesariem', 2), ('cultori', 5), ('semitari', 1), ('amplectatur', 4), ('Retraham', 1), ('totonderunt', 1), ('sonco', 1), ('Laurentium', 2), ('epicurei', 1), ('obrizo', 2), ('Teutomatus', 2), ('iniustissume', 1), ('parassent', 5), ('cognoscendas', 2), ('δηρόν', 1), ('Erconualdo', 1), ('audiar', 4), ('Melito', 1), ('Paxaea', 1), ('ramorum', 58), ('rge', 1)]


In [5]:
def get_word_counts(files:List[str])->Tuple[Dict[str, int], Dict[str, int]]:
    """
    Clean and tokenize the given text files and count their words.

    Each file is read whole; "-\\n" sequences are removed and remaining
    newlines become spaces. The text is passed through the JVReplacer and
    AEOEReplacer and split into sentences. Every sentence is run through the
    text cleaners (in a fixed order) and tokenized; purely numeric tokens are
    skipped. Both the surface form and the lemma of each remaining token are
    truecased against the module-level ``case_counts`` before being counted.

    :param files: paths of the text files to process
        (read as UTF-8 -- assumed encoding of the PHI5 files, TODO confirm)
    :return: tuple ``(lemma_counter, inflected_counter)``: Counters for the
        lemmatized words and for the inflected words in the documents
    """
    word_counter = Counter()
    inflected_word_counter = Counter()
    jv_replacer = JVReplacer()
    aeoe_replacer = AEOEReplacer()
    toker = WordTokenizer('latin')
    sent_toker = SentenceTokenizer()
    lemmatizer = LemmaReplacer('latin')

    # Cleaners are applied to every sentence in exactly this order.
    sentence_cleaners = (
        dehyphenate,  # because it's Phi5
        swallow_braces,
        swallow_square_brackets,
        disappear_round_brackets,
        swallow_obelized_words,
        disappear_angle_brackets,
        drop_punct,
        normalize_accents,
    )

    for file in tqdm(files , total=len(files), unit='files'):
        # Explicit encoding: the default is locale-dependent. The handle is
        # released as soon as the text is in memory.
        with open(file, 'rt', encoding='utf-8') as fin:
            text = fin.read()

        text = text.replace("-\n", "").replace("\n", " ")
        text = aeoe_replacer.replace(jv_replacer.replace(text))
        for sent in sent_toker.tokenize(text):
            for clean in sentence_cleaners:
                sent = clean(sent)
            for word in toker.tokenize(sent):
                if word.isnumeric():
                    continue
                inflected_word_counter[truecase(word, case_counts)] += 1
                # lemmatizer prefers lower; truecase afterwards to normalize capitals
                lemma = lemmatizer.lemmatize(word.lower(), return_string=True)
                word_counter[truecase(lemma, case_counts)] += 1
    return word_counter, inflected_word_counter


In [10]:
def word_stats(author:str, lemma_counter:Counter, 
               inflected_counter:Counter)->Tuple[float, float]:
    """
    Print vocabulary statistics for a corpus and return its hapax rates.

    A "single count" word is one whose count in the counter is exactly 1.
    Each rate is the number of such words divided by the total token count
    of the same counter. An empty counter yields a rate of 0.0 instead of
    raising ZeroDivisionError.

    :param author: label used in the printed messages
    :param lemma_counter: counts of lemmatized words
    :param inflected_counter: counts of inflected words
    :return: tuple ``(inflected_rate, lemma_rate)``
    """
    nw = sum(lemma_counter.values())
    # The inflected rate needs its own denominator; the two totals only
    # coincide when both counters were built from the same token stream.
    inflected_total = sum(inflected_counter.values())
    print(f"Total count of all tokens in {author} corpus: {nw:,}")
    print(f"Total number of distinct inflected words/tokens in {author} corpus: {len(inflected_counter):,}")
    print(f"Total number of lemmatized words/tokens in {author} corpus {len(lemma_counter):,}")
    ciw1 = sum(1 for count in inflected_counter.values() if count == 1)
    print(f"Count of inflected tokens only occuring once {ciw1:,}")
    cw1 = sum(1 for count in lemma_counter.values() if count == 1)
    print(f"Count of lemmatized tokens only occuring once {cw1:,}")
    Piu_one = ciw1 / inflected_total if inflected_total else 0.0
    print(f"Probability of a single count unigram occuring in the {author} corpus: {Piu_one:.3f}") 
    Plu_one = cw1 / nw if nw else 0.0
    print(f"Probability of a single count unigram in the lemmatized {author} corpus: {Plu_one:.3f}") 
    return (Piu_one, Plu_one)

In [7]:
# Cicero works
cicero_files = glob(f"{os.path.expanduser('~')}/cltk_data/latin/text/phi5/individual_works/LAT0474.TXT-0*.txt")
len (cicero_files)              

75

In [8]:
# Count lemmatized and inflected words over all of the Cicero files;
# get_word_counts returns the lemma counter first, the inflected counter second.
cicero_lemmas, cicero_inflected_words = get_word_counts(cicero_files)

100%|██████████| 75/75 [00:47<00:00,  1.59files/s]


In [11]:
# Print Cicero's vocabulary statistics; the returned tuple, shown as the cell
# output, is (inflected single-count rate, lemma single-count rate).
word_stats(author='Cicero', 
           lemma_counter=cicero_lemmas,
           inflected_counter=cicero_inflected_words)

Total count of all tokens in Cicero corpus: 1,196,512
Total number of distinct inflected words/tokens in Cicero corpus: 75,705
Total number of lemmatized words/tokens in Cicero corpus 23,345
Count of inflected tokens only occuring once 34,608
Count of lemmatized tokens only occuring once 10,656
Probability of a single count unigram occuring in the Cicero corpus: 0.029
Probability of a single count unigram in the lemmatized Cicero corpus: 0.009


(0.02892407263780054, 0.008905886443261747)

In [12]:
cicero_lemmas_counter_file = 'cicero_lemmas_counter.pkl'
cicero_inflected_counter_file = 'cicero_inflected_counter.pkl'

# Persist each Cicero counter once; a file that already exists is left untouched.
for pickle_path, counter in ((cicero_lemmas_counter_file, cicero_lemmas),
                             (cicero_inflected_counter_file, cicero_inflected_words)):
    if os.path.exists(pickle_path):
        continue
    with open(pickle_path, 'wb') as fout:
        pickle.dump(counter, fout)

In [41]:
# Invert PHI5_INDEX so its values (used below as author names) become the keys
# and its keys (used below as file-name prefixes) become the values.
# The Cicero entry is left out: Cicero is the reference corpus.
CICERO_AUTHOR_NAME = 'Marcus Tullius Cicero, Cicero, Tully'
author_index = {
    author_name: phi5_id
    for phi5_id, author_name in PHI5_INDEX.items()
    if author_name != CICERO_AUTHOR_NAME
}

In [20]:
def get_phi5_author_files(author_name, author_index):
    """
    Return the paths of the PHI5 individual-work files for one author.

    :param author_name: key to look up in ``author_index``
    :param author_index: mapping from author name to the file-name prefix
        of that author's works
    :return: list of matching ``.txt`` paths under
        ``~/cltk_data/latin/text/phi5/individual_works`` (possibly empty)
    :raises KeyError: if ``author_name`` is not in ``author_index``
    """
    file_prefix = author_index[author_name]
    pattern = '~/cltk_data/latin/text/phi5/individual_works/' + f'{file_prefix}*.txt'
    return glob(os.path.expanduser(pattern))


## Visualization of our corpus comparison: 
If you took one page from one author and placed it into Cicero, how surprising would it be?

If the other author's vocabulary were substantially different, it would be noticeable. We can quantify this.

Consequently, because we want the language model's predictions to stay as close as possible to the target author's usage, we should train it only on corpora whose vocabularies fall within a reasonable window of surprise relative to that author.

In [22]:
def _unseen_type_rate(author_counts, reference_vocab):
    """
    Return the number of distinct words in ``author_counts`` that are absent
    from ``reference_vocab``, divided by the author's total token count.

    Returns 0.0 when the author has no tokens, rather than dividing by zero.
    """
    total_tokens = sum(author_counts.values())
    if not total_tokens:
        return 0.0
    unseen_types = author_counts.keys() - reference_vocab
    return len(unseen_types) / total_tokens


# Cicero's vocabularies do not change inside the loop, so build them once.
cicero_lemma_vocab = set(cicero_lemmas.keys())
cicero_inflected_vocab = set(cicero_inflected_words.keys())

# One (author, lemma rate, inflected rate) tuple per author in author_index.
results = []

for author in author_index:
    files = get_phi5_author_files(author, author_index)
    author_lemmas, author_inflected_words = get_word_counts(files)

    P_one_x_lemma_unigram = _unseen_type_rate(author_lemmas, cicero_lemma_vocab)
    P_one_x_inflected_unigram = _unseen_type_rate(author_inflected_words,
                                                  cicero_inflected_vocab)
    results.append((author, P_one_x_lemma_unigram, P_one_x_inflected_unigram))




100%|██████████| 1/1 [00:00<00:00,  2.22files/s]
100%|██████████| 1/1 [00:01<00:00,  1.03s/files]
100%|██████████| 8/8 [00:03<00:00,  2.54files/s]
100%|██████████| 1/1 [00:00<00:00, 331.88files/s]
100%|██████████| 1/1 [00:00<00:00, 170.88files/s]
100%|██████████| 1/1 [00:00<00:00, 620.73files/s]
100%|██████████| 1/1 [00:00<00:00, 358.92files/s]
100%|██████████| 2/2 [00:00<00:00, 94.10files/s]
100%|██████████| 1/1 [00:03<00:00,  3.36s/files]
100%|██████████| 75/75 [00:48<00:00,  1.55files/s]
100%|██████████| 1/1 [00:00<00:00, 504.49files/s]
100%|██████████| 1/1 [00:00<00:00, 513.82files/s]
100%|██████████| 2/2 [00:00<00:00, 200.66files/s]
100%|██████████| 1/1 [00:00<00:00, 148.79files/s]
100%|██████████| 2/2 [00:00<00:00, 389.86files/s]
100%|██████████| 1/1 [00:00<00:00, 454.13files/s]
100%|██████████| 1/1 [00:00<00:00, 551.59files/s]
100%|██████████| 1/1 [00:00<00:00, 169.64files/s]
100%|██████████| 1/1 [00:00<00:00, 27.88files/s]
100%|██████████| 1/1 [00:00<00:00, 198.92files/s]
100%|

100%|██████████| 1/1 [00:00<00:00, 370.23files/s]
100%|██████████| 1/1 [00:00<00:00, 106.77files/s]
100%|██████████| 2/2 [00:00<00:00, 390.86files/s]
100%|██████████| 1/1 [00:00<00:00,  2.85files/s]
100%|██████████| 1/1 [00:00<00:00, 131.23files/s]
100%|██████████| 1/1 [00:00<00:00, 504.85files/s]
100%|██████████| 1/1 [00:00<00:00, 253.68files/s]
100%|██████████| 1/1 [00:00<00:00, 548.20files/s]
100%|██████████| 1/1 [00:00<00:00, 178.60files/s]
100%|██████████| 3/3 [00:00<00:00, 21.33files/s]
100%|██████████| 1/1 [00:00<00:00,  3.79files/s]
100%|██████████| 1/1 [00:01<00:00,  1.06s/files]
100%|██████████| 7/7 [00:00<00:00, 22.14files/s]
100%|██████████| 1/1 [00:00<00:00, 422.13files/s]
100%|██████████| 1/1 [00:00<00:00, 483.83files/s]
100%|██████████| 1/1 [00:00<00:00, 357.39files/s]
100%|██████████| 1/1 [00:00<00:00, 374.49files/s]
100%|██████████| 1/1 [00:00<00:00, 123.99files/s]
100%|██████████| 1/1 [00:00<00:00, 535.12files/s]
100%|██████████| 2/2 [00:00<00:00, 273.65files/s]
100%|

100%|██████████| 1/1 [00:00<00:00, 541.41files/s]
100%|██████████| 1/1 [00:00<00:00, 507.54files/s]
100%|██████████| 2/2 [00:00<00:00, 13.09files/s]
100%|██████████| 1/1 [00:00<00:00, 194.18files/s]
100%|██████████| 1/1 [00:00<00:00, 442.25files/s]
100%|██████████| 1/1 [00:00<00:00, 451.05files/s]
100%|██████████| 2/2 [00:02<00:00,  1.09s/files]
100%|██████████| 1/1 [00:00<00:00, 116.08files/s]
100%|██████████| 1/1 [00:00<00:00, 159.87files/s]
100%|██████████| 1/1 [00:00<00:00, 225.49files/s]
100%|██████████| 1/1 [00:00<00:00, 356.78files/s]
100%|██████████| 1/1 [00:00<00:00, 183.86files/s]
100%|██████████| 1/1 [00:00<00:00, 72.62files/s]
100%|██████████| 1/1 [00:00<00:00, 201.98files/s]
100%|██████████| 7/7 [00:14<00:00,  2.13s/files]
100%|██████████| 1/1 [00:00<00:00,  5.13files/s]
100%|██████████| 1/1 [00:00<00:00, 443.89files/s]
100%|██████████| 1/1 [00:00<00:00, 162.05files/s]
100%|██████████| 1/1 [00:34<00:00, 34.09s/files]
100%|██████████| 1/1 [00:00<00:00, 156.06files/s]
100%|█

In [46]:
# sorted(results, key=lambda x:x[1])

In [30]:
# Index the result tuples by author name: author -> (lemma rate, inflected rate).
results_map = {
    author_name: (lemma_rate, inflected_rate)
    for author_name, lemma_rate, inflected_rate in results
}

In [45]:
# Report only authors that have at least three PHI5 work files.
for author in author_index:
    if len(get_phi5_author_files(author, author_index)) < 3:
        continue
    print(author, results_map[author])
# For reference, word_stats returned (0.02892407263780054, 0.008905886443261747)
# for Cicero. Those are single-count rates in (inflected, lemma) order, whereas
# the tuples printed here are (lemma, inflected) rates of words absent from Cicero.

Gaius Iulius Caesar, Caesar (0.016170899832329378, 0.0464137117307334)
Apuleius Madaurensis (0.039956560814859196, 0.12101183343319354)
Caelius Apicius (0.04383594547528974, 0.09950159130486999)
Anonymi Comici et Tragici (0.05979473449352968, 0.10397144132083891)
C. Iul. Caes. Augustus Octavianus (0.16793743890518084, 0.20527859237536658)
Publius Papinius Statius (0.03662215849687846, 0.1022791767482152)
Lucius Accius (0.0845518118245391, 0.16634880271243907)
Gaius Caesius Bassus (0.040359504832965916, 0.07953196540613872)
Publius Vergilius Maro, Virgil, Vergil (0.03315200072836527, 0.0929348568307006)
Publius Ovidius Naso (0.023965644822556705, 0.06525858344775079)
Gnaeus Naevius (0.11655300681959083, 0.20644761314321142)
Fragmenta Bobiensia (0.07398076042143839, 0.1385707741639945)
Scriptores Historiae Augustae (0.03177853760216489, 0.071072022819111)
Publius Terentius Afer, Terence (0.028577576089507863, 0.058641733823644474)
Aulus Cornelius Celsus (0.017332921313593843, 0.055884859

In [None]:
# grab prose authors

# grab poets

# consider individual files

# Gaius Iulius Caesar, Caesar (0.016170899832329378, 0.0464137117307334)
# Apuleius Madaurensis (0.039956560814859196, 0.12101183343319354)
# Caelius Apicius (0.04383594547528974, 0.09950159130486999)
# Anonymi Comici et Tragici (0.05979473449352968, 0.10397144132083891)
# C. Iul. Caes. Augustus Octavianus (0.16793743890518084, 0.20527859237536658)
# Publius Papinius Statius (0.03662215849687846, 0.1022791767482152)
# Lucius Accius (0.0845518118245391, 0.16634880271243907)
# Gaius Caesius Bassus (0.040359504832965916, 0.07953196540613872)
# Publius Vergilius Maro, Virgil, Vergil (0.03315200072836527, 0.0929348568307006)
# Publius Ovidius Naso (0.023965644822556705, 0.06525858344775079)
# Gnaeus Naevius (0.11655300681959083, 0.20644761314321142)
# Fragmenta Bobiensia (0.07398076042143839, 0.1385707741639945)
# Scriptores Historiae Augustae (0.03177853760216489, 0.071072022819111)
# Publius Terentius Afer, Terence (0.028577576089507863, 0.058641733823644474)
# Aulus Cornelius Celsus (0.017332921313593843, 0.0558848592109822)
# Gaius Suetonius Tranquillus (0.033629947836759745, 0.0958944461491255)
# Marcus Terentius Varro, Varro (0.045866176600832524, 0.093891152245151)
# Appendix Vergiliana (0.0500247341083354, 0.1418501113034875)
# Annius Florus (0.038297569987210456, 0.09140969162995595)
# Pomponius Porphyrio (0.04030915576694411, 0.09312987184568636)
# Marcus Valerius Probus (0.03835521769177609, 0.08431237042156185)
# Quintus Ennius (0.05652467883705206, 0.12021636240703178)
# Didascaliae et Per. in Terentium (0.0782967032967033, 0.13598901098901098)
# Cornelius Tacitus (0.02469418086200983, 0.07631488690859423)
# Titus Livius, Livy (0.011407436246836674, 0.03913716547549524)
# Lucius Annaeus Seneca senior (0.01619733327917297, 0.052095498258405856)
# Quintus Horatius Flaccus, Horace (0.04486396446418656, 0.12253192670738479)
# Gaius Asinius Pollio (0.03592814371257485, 0.08982035928143713)
# Gaius Sallustius Crispus (0.020570966643975494, 0.059330326752893126)
# C. Plinius Caecilius Secundus, Pliny (0.01694301397770358, 0.06551977816761927)
# Marcus Fabius Quintilianus (0.009342494688624445, 0.0416682017463066)
# Hyginus Gromaticus (0.0285692634131555, 0.08320703243407093)
# Titus Lucretius Carus (0.022190184885737107, 0.06787585965048998)
# Claudius Caesar Germanicus (0.04035804020100502, 0.12861180904522612)
# Gaius, iur., Gaius (0.011268643689753487, 0.035144203727768185)
# Quintus Terentius Scaurus (0.04715169618092597, 0.09174311926605505)
# Lucius Livius Andronicus (0.14615384615384616, 0.25)
# Marcus Cornelius Fronto (0.03605195520469984, 0.08350927115843583)
# Didascaliae et Argum. in Plautum (0.07712590639419907, 0.14831905075807514)
# Argum. Aen. et Tetrast. (0.07066381156316917, 0.1441827266238401)
# Anonymi Epici et Lyrici (0.09684487291849254, 0.19237510955302367)
# Marcus Porcius Cato, Cato (0.061287538049157236, 0.13079823724501385)
# Sextus Iulius Frontinus (0.03041633518960488, 0.09337045876425351)
# Lucius Annaeus Seneca iunior (0.012655345175352984, 0.05447654369184723)
# Titus Maccius Plautus (0.02682148990105487, 0.062141513731995376)
# Maurus Servius Honoratus, Servius (0.025347881711764008, 0.05923711189138313)
# Quintus Asconius Pedianus (0.010382059800664452, 0.029663028001898434)
