### Assessing NER regularization of Latin corpus texts
In Cythia Damon's masterful edition of Julius Caesar's _Bellum Civile_ she provides rules for the spelling regularizations she performed in an Appendix Orthographica. This notebook tries to quantify the quality and scope of these suggested regularizations when applied to a larger Latin corpus, specifically using her NER regularization rules.

Applying these rules to a corpus could help boost the values of embeddings, especially for limited resource language.

In [1]:
import os.path
from collections import Counter
import inspect
import os
import sys
from glob import glob
from typing import List, Dict, Tuple

from cltk.corpus.readers import get_corpus_reader
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.latin.sentence import SentenceTokenizer
from cltk.tokenize.word import WordTokenizer
from cltk.corpus.latin.phi5_index import PHI5_INDEX
from tqdm import tqdm

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
 
from mlyoucanuse.aeoe_replacer import AEOEReplacer
from mlyoucanuse.text_cleaners import ( normalize_accents, disappear_angle_brackets,
                                        drop_punct, disappear_round_brackets,
                                         dehyphenate, accept_editorial,
                                        swallow_braces, swallow_obelized_words,
                                        swallow_square_brackets)

# Latin Proper Name Prefix Regularizations
Adapted from the Appendix Orthographica of Cythia Damon _C. Iuli Caesaris commentarii de bello civili_ (Bellum civile, or Civil War) (Oxford Classical Texts), pp. 168-172

In [2]:
# The first string in the tuple is the preferred prefix
# The second string in the tuple is the anomaly to be regularized
# we will title case after construction, lower cased here for readability

regularizations =[
                    ('acarnan', 'acharnan'),
                    ('achillam', 'achillan'), 
                    ('achillam', 'achila'), 
                    ('aeginium', 'eginium'), 
                    ('aeginium', 'eginum'), 
                    ('aegypt', 'aegipt'),
                    ('aegypt', 'agipt'),
                    ('aegypt', 'egypt'), 
                    ('aegypt', 'egipt'),
                    ('aetoli', 'aetholi'),
                    ('aetoli', 'etoli'),
                    ('aetoli', 'etholi'), 
                    ('afrani', 'affrani'),
                    ('afric', 'affric'),
                    ('albic', 'albicc'),
                    ('alexandri', 'alexandre'),
                    ('amantini', 'amatthini'),
                    ('amantini', 'amatini'),                  
                    ('ambraci', 'ambrachi'),
                    ('ambraci', 'abrachi'),
                    ('amphiloc', 'amfiloc'),
                    ('androsthen', 'androsten'),
                    ('antoni', 'anthoni'),
                    ('antioch', 'anthioch'),
                    ('antioch', 'antioc'),
                    ('antoni', 'anthoni'),
                    ('apollonia', 'appollon'),
                    ('apollonia', 'apollin'),
                    ('arelate', 'arelatae'),
                    ('arimin', 'harimin'),
                    ('arretium', 'aretium'),
                    ('arretium', 'aretiuum'),
                    ('asia', 'asya'),
                    ('aspara', 'aspera'),
                    ('aspara', 'haspara'),
                    ('atti', 'ati'),
                    ('atti', 'acti'),                               
                    ('bess', 'beiss'),
                    ('bithyni', 'bithini'),
                    ('boeoti', 'boeti'),
                    ('brundisi', 'brundusi'),
                    ('bylli', 'bibly'),
                    ('bylli', 'bibli'),
                    ('byllidens', 'billidens'),
                    ('caeli', 'coeli'),                  
                    ('claudi', 'clodi'),
                    ('caesa', 'cesa'),
                    ('caesa', 'cessa'),# problematic with cesso, cessare
                    ('caesa', 'caessa'), # problematic with cesso, cessare
                    ('calydo', 'calido'),
                    ('cappadoci','capodoci'),
                    ('cassi', 'casi'),                  
                    ('cilici', 'ciliti'),
                    ('clipe', 'clippe'),
                    ('clipe', 'clupe'),
                    # ('cn.','cnei'),
                    ('commagen', 'commagin'),
                    ('corcyr', 'corcir'),
                    ('cott', 'coct'),
                    ('cott', 'coty'),
                    ('cyclad', 'ciclad'),
                    ('cypr', 'cipr'),
                    ('cyren', 'ciren'),
                    ('domiti', 'domici'),
                    ('dyrrachi', 'dyrachi'),
                    ('dyrrachi', 'dirrachi'),
                    ('dyrrachi', 'dirachy'),                  
                    ('dyrrachi', 'dirachi'),
                    ('eli', 'aeli'),
                    ('ephes', 'aephes'),
                    ('epir', 'epyr'),
                    ('epir', 'aepyr'),
                    ('fauoni', 'faboni'),
                    ('flegmat', 'flecmat'),
                    ('fufi', 'fufei'),
                    ('gabini', 'gauini'),
                    ('gallia', 'galia'),
                    ('gallograeci', 'gallogreci'),
                    ('gerg', 'gorg'),
                    ('gomph', 'gonph'),
                    ('gomph', 'gonf'),
                    ('gomph', 'gimph'),
                    ('gomph', 'gymph'),
                    ('graec', 'grec'),
                    ('hadriatic', 'adriatic'),
                    ('hadrumet', 'hadrument'),
                    ('hadrumet', 'adrumet'),
                    ('haliacm', 'aliacm'),
                    ('hegesaret', 'haegesaret'),
                    ('hegesaret', 'egesaret'),
                    ('hirr', 'hyrr'),
                    ('hispan', 'hyspan'),
                    ('hispan', 'ispan'),
                    ('ilerd', 'ylerd'),
                    ('illyric', 'illiric'),
                    ('illyric', 'illic'),
                    ('illyric', 'ylliric'),
                    ('issa', 'hissa'),
                    ('itali', 'ithali'),
                    ('lacedaemon', 'lacedemon'),
                    ('lacedaemon', 'lacaedemon'),
                    ('laeli', 'leli'),
                    ('larisa', 'larissa'),
                    ('liss', 'lys'),
                    ('luccei', 'lucei'),
                    ('macedon', 'machedon'),
                    ('marrucin', 'marrutin'),
                    ('massilien', 'massalien'),
                    ('massilien', 'massillien'),
                    ('massilien', 'masilien'),
                    ('maure', 'mauri'),
                    ('minuci', 'minuti'),
                    ('mitylen', 'mytilen'),
                    ('nymphae', 'nymphe'),
                    ('oric', 'oryc'),
                    ('oric', 'orig'),
                    ('otacili', 'octacili'),
                    ('otoges', 'octoges'),
                    ('otoges', 'otogens'),
                    ('otoges', 'octogens'),
                    ('paelign', 'pelign'),
                    ('parthin', 'partin'),
                    ('parthin', 'parthyn'),
                    ('parthin', 'partyn'),
                    ('parthin', 'parthyin'),
                    ('pharu', 'faru'),
                    ('pharu', 'pharo'),
                    #('philipp', 'philip'), # need to match with regex
                    ('philipp', 'phylipp'),
                    ('phoenic', 'faenic'),
                    ('phoenic', 'phenic'),
                    ('placenti', 'placenci'),
                    ('ploti', 'plochi'),
                    ('pompe', 'pope'),
                    ('pothin', 'phothin'),
                    ('ptolome', 'ptholome'),
                    ('ptolome', 'ptholomae'),
                    ('ptolome', 'tholome'),
                    ('ptolome', 'tholomae'),
                    ('pyrenae', 'pyrene'),
                    ('pyrenae', 'pirene'),
                    ('rhascypoli', 'rascipoli'),
                    ('rhascypoli', 'rascipolu'),
                    ('roucill', 'rogill'),
                    ('roucill', 'rouic'),
                    ('rutili', 'rutuli'),
                    ('scipi', 'scypi'),
                    ('sicili', 'sycili'),
                    ('sicor', 'sycor'),
                    ('sicor', 'siccor'),
                    ('spinth', 'spint'),
                    ('spinth', 'sphint'),
                    ('sull', 'syll'),
                    ('sull', 'sill'),
                    ('sulpici', 'sulpiti'),
                    ('syri', 'siri'),
                    ('syri', 'syrri'),
                    ('tarraci', 'tarrachi'),
                    ('tarracon', 'tarraconn'),
                    ('therm','term'),                  
                    ("thessali", "tessali"),
                    ("thessali", "thesali"),
                    ("thessali", "tessalli"),
                    ("thraci", "traci"),
                    ("thur", "tur"),
                    ("tiburti", "tiburci"),
                    ("tiburti", "tyburti"),
                    # ("uar", "uarr" ), # bar ? # fails for: Marcus Terentius Varro
                    ("uespill", "uispill"),   
                    ("uespill", "bispill"),
                    ("uespill", "bispull"),
                    ("uetto", "uecto"),    
                    ("uetto", "ueto"),  
                    ("uetto", "betto"),  
                    ("uetto", "beto"),
                    ("uibi", "biui"),
                    ("uibull", "uibul"),   
                    ("uibull", "uibell"),   
                    ("uibull", "bibul"),
                    ("uibull", "bibull"),
                    ("uolca", "uolga"),
                    ("uolca", "uulga"),
                    ("uolcaci", "uolcati")
                 ]
regularizations = [(tmp.title(), tmp2.title()) for tmp, tmp2 in regularizations]

In [3]:
# Some useful common functions

### In the following sections, we will show Damon's preferred prefix first

def get_matches(prefix:str, word_counts:Dict[str,int])->Tuple[int, List[str]]:
    egs = [(key, val) for key,val in word_counts.items() if key.startswith(prefix)]
    if len(egs) == 0:
        return 0, []
    vals, counts = zip(*egs)
    egs = sorted(egs, key=lambda x:x[1], reverse=True)
    return sum(counts), egs

def print_proposed_regularizations(word_counts):
    """show the proposed regularizations"""
    total_transformations = 0
    for prefix_one, prefix_two in regularizations:
        prefix_one_num, prefix_one_egs = get_matches(prefix_one, word_counts) 
        prefix_two_num, prefix_two_egs = get_matches(prefix_two, word_counts) 
        if prefix_two_num == 0:
            continue
        print(f"# prefix examples: {prefix_one}- {prefix_one_num:,}, {prefix_two}- {prefix_two_num:,}")
        common_transforms = []
        for word, count in prefix_two_egs:
            transformed = word.replace(prefix_two, prefix_one)
            if transformed in word_counts:
                common_transforms.append((word, '->', transformed, count))
                total_transformations += count
        if common_transforms:
            print(f'exact transform matches: {len(common_transforms)}')                        
            for item in common_transforms:
                print(item)
        else:
            print("top 5 unmatched candidates", prefix_two_egs[:5])
    print(f"total transformations proposed {total_transformations:,}")     

# Tesserae Corpus: Proposed NER regularizatons

In [4]:
tesserae = get_corpus_reader(corpus_name='latin_text_tesserae', language='latin')
word_counts = Counter()
jv_replacer = JVReplacer()
aeoe_replacer = AEOEReplacer()
toker = WordTokenizer('latin')
sent_toker = SentenceTokenizer() 

for file in tqdm(tesserae.fileids(), total=len(tesserae.fileids())):
    for sent in tesserae.sents(file):
        sent = aeoe_replacer.replace(jv_replacer.replace(drop_punct(sent)))
        sent = normalize_accents(sent)
        sent = accept_editorial(sent)
        for token in toker.tokenize(sent):
            word_counts.update({token : 1})

print("Proposed regularizations for Tesserae")
print_proposed_regularizations(word_counts)

100%|██████████| 762/762 [03:39<00:00,  3.48it/s]


Proposed regularizations for Tesserae
# prefix examples: Achillam- 6, Achila- 3
top 5 unmatched candidates [('Achilae', 3)]
# prefix examples: Alexandri- 426, Alexandre- 73
exact transform matches: 4
('Alexandream', '->', 'Alexandriam', 39)
('Alexandrea', '->', 'Alexandria', 18)
('Alexandreae', '->', 'Alexandriae', 15)
('Alexandre', '->', 'Alexandri', 1)
# prefix examples: Antioch- 799, Antioc- 799
top 5 unmatched candidates [('Antiocho', 180), ('Antiochus', 164), ('Antiochum', 159), ('Antiochi', 112), ('Antiochiam', 64)]
# prefix examples: Apollonia- 73, Apollin- 433
exact transform matches: 1
('Apolline', '->', 'Apolloniae', 36)
# prefix examples: Aspara- 4, Aspera- 25
top 5 unmatched candidates [('Aspera', 23), ('Asperat', 2)]
# prefix examples: Atti- 635, Ati- 227
exact transform matches: 6
('Atia', '->', 'Attia', 4)
('Atius', '->', 'Attius', 3)
('Atinas', '->', 'Attinas', 3)
('Atium', '->', 'Attium', 1)
('Atio', '->', 'Attio', 1)
('Atii', '->', 'Attii', 1)
# prefix examples: Atti-

# prefix examples: Uibull- 23, Bibul- 123
exact transform matches: 1
('Bibuli', '->', 'Uibulli', 28)
# prefix examples: Uolca- 93, Uolga- 1
top 5 unmatched candidates [('Uolgares', 1)]
# prefix examples: Uolca- 93, Uulga- 11
top 5 unmatched candidates [('Uulgari', 2), ('Uulgatum', 1), ('Uulgatissimum', 1), ('Uulgaribus', 1), ('Uulgarem', 1)]
# prefix examples: Uolcaci- 17, Uolcati- 18
exact transform matches: 3
('Uolcatio', '->', 'Uolcacio', 9)
('Uolcatium', '->', 'Uolcacium', 5)
('Uolcatius', '->', 'Uolcacius', 4)
total transformations proposed 1,070


# Phi5 Proposed NER regularizatons

In [5]:
author_index = {val:key for key,val in PHI5_INDEX.items()}  

def get_phi5_author_files(author_name, author_index):
    stub = author_index[author_name]
    return glob(os.path.expanduser(f'~/cltk_data/latin/text/phi5/individual_works/{stub}*.txt'))

word_counts = Counter()
jv_replacer = JVReplacer()
aeoe_replacer = AEOEReplacer()
toker = WordTokenizer('latin')
sent_toker = SentenceTokenizer() 

for author in tqdm(author_index):
    files = get_phi5_author_files(author, author_index)
    for file in files:
        with open(file, 'rt') as fin:
            text = fin.read()
            text = text.replace("-\n", "") # Phi5 has some hyphenated line endings
            text = text.replace("\n", " ")
            text = aeoe_replacer.replace(jv_replacer.replace(text))
            for sent in sent_toker.tokenize(text):
                sent = dehyphenate(sent) # because it's Phi5
                sent = swallow_braces(sent)
                sent = swallow_square_brackets(sent)
                sent = disappear_round_brackets(sent)
                sent = swallow_obelized_words(sent)
                sent = disappear_angle_brackets(sent)            
                sent = drop_punct(sent)
                sent = normalize_accents(sent)            
                for word in toker.tokenize(sent):
                    if word.isnumeric():
                        continue
                    word_counts.update({word : 1}) 

print('Phi5 Proposed regularizatons:')
print_proposed_regularizations(word_counts)

100%|██████████| 362/362 [03:14<00:00,  1.86it/s]


Phi5 Proposed regularizatons:
# prefix examples: Aegypt- 1,073, Egypt- 1
exact transform matches: 1
('Egyptiaci', '->', 'Aegyptiaci', 1)
# prefix examples: Alexandri- 610, Alexandre- 72
exact transform matches: 3
('Alexandream', '->', 'Alexandriam', 39)
('Alexandreae', '->', 'Alexandriae', 18)
('Alexandrea', '->', 'Alexandria', 15)
# prefix examples: Antioch- 769, Antioc- 769
top 5 unmatched candidates [('Antiocho', 224), ('Antiochum', 161), ('Antiochus', 147), ('Antiochi', 120), ('Antiochiam', 29)]
# prefix examples: Apollonia- 106, Apollin- 729
exact transform matches: 1
('Apolline', '->', 'Apolloniae', 81)
# prefix examples: Aspara- 7, Aspera- 8
top 5 unmatched candidates [('Aspera', 7), ('Asperat', 1)]
# prefix examples: Atti- 893, Ati- 323
exact transform matches: 9
('Atiae', '->', 'Attiae', 6)
('Atia', '->', 'Attia', 4)
('Atius', '->', 'Attius', 3)
('Atinas', '->', 'Attinas', 2)
('Ati', '->', 'Atti', 1)
('Atium', '->', 'Attium', 1)
('Atii', '->', 'Attii', 1)
('Atio', '->', 'Attio

# prefix examples: Uespill- 1, Uispill- 1
exact transform matches: 1
('Uispillo', '->', 'Uespillo', 1)
# prefix examples: Uetto- 63, Uecto- 2
exact transform matches: 2
('Uectonicam', '->', 'Uettonicam', 1)
('Uectonicae', '->', 'Uettonicae', 1)
# prefix examples: Uetto- 63, Ueto- 1
exact transform matches: 1
('Ueto', '->', 'Uetto', 1)
# prefix examples: Uibi- 91, Biui- 1
exact transform matches: 1
('Biuium', '->', 'Uibium', 1)
# prefix examples: Uibull- 23, Uibul- 40
top 5 unmatched candidates [('Uibullius', 9), ('Uibullium', 8), ('Uibulanus', 7), ('Uibulli', 5), ('Uibulenus', 3)]
# prefix examples: Uibull- 23, Uibell- 9
exact transform matches: 3
('Uibellio', '->', 'Uibullio', 3)
('Uibellius', '->', 'Uibullius', 3)
('Uibellium', '->', 'Uibullium', 2)
# prefix examples: Uibull- 23, Bibul- 133
exact transform matches: 1
('Bibuli', '->', 'Uibulli', 33)
# prefix examples: Uolca- 148, Uolga- 3
top 5 unmatched candidates [('Uolgaria', 1), ('Uolgari', 1), ('Uolgaris', 1)]
# prefix examples: 