# Building Good Training Data
Building a high-quality classifier requires good, thorough, unbiased training data. Sometimes you have to build the dataset yourself.
### In this notebook we will:
1. Build a transformer to convert data
1. Evaluate our transformer against a sample dataset
1. Transform several source texts to create a good training dataset to be used in training a classifier

In [14]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Some open-source imports

In [15]:
from collections import Counter, defaultdict
import re
from bisect import bisect
import statistics

from cltk.corpus.readers import get_corpus_reader
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# Apply matplotlib's 'fivethirtyeight' style sheet to figures created afterwards.
plt.style.use('fivethirtyeight')

## Load Plato's Apology (Greek text from the Perseus corpus)

In [16]:
# Open the Perseus Greek corpus and collect the file ids that mention Plato.
perseus_greek = get_corpus_reader(corpus_name='greek_text_perseus', language='greek')
plato = [fileid for fileid in perseus_greek.fileids() if 'plato' in fileid]

# The Apology lives in 'plato__apology__grc.json'. Keep only the tokens for
# which str.isalpha() is true, filtering while the corpus words are read.
plato_apologia_gk = [
    token
    for token in perseus_greek.words('plato__apology__grc.json')
    if token.isalpha()
]

# Report the token count and the vocabulary size, one line each.
print(
    f"Number of words in Plato's Apology: {len(plato_apologia_gk):,}",
    f"Number of distinct words: {len(set(plato_apologia_gk)):,}",
    sep='\n',
)

Number of words in Plato's Apology: 8,516
Number of distinct words: 2,544


### Load the Perseus transliteration

In [17]:
# Read the reference (Perseus) transliteration of the Apology and split it
# into tokens, keeping only the tokens for which str.isalpha() is true.
gk_tokenizer = WordTokenizer('greek')

# The encoding is given explicitly because the transliteration contains
# non-ASCII letters such as 'ē' and 'ō'; without it open() falls back to the
# platform's locale encoding.
# NOTE(review): UTF-8 is assumed for this file — confirm.
with open('greek.transliterated.plato.apologia.txt', 'rt', encoding='utf-8') as reader:
    # A dedicated name is used for the text so that `perseus_greek` keeps
    # referring to the corpus reader created earlier in the notebook.
    perseus_transliteration_text = reader.read()

plato_perseus_transliteration = [
    tmp.strip()
    for tmp in gk_tokenizer.tokenize(perseus_transliteration_text)
    if tmp.isalpha()
]

## Make our transliteration

In [18]:
from romanizer import Romanizer

# Transliterate every Greek token of the Apology with the project's Romanizer,
# stripping surrounding whitespace from each result.
romanizer = Romanizer()
plato_our_transliteration = [
    romanizer.transliterate(greek_word).strip()
    for greek_word in plato_apologia_gk
]

# Count the distinct transliterations shared with the Perseus reference list.
print(f'Number words in common: {len(set(plato_our_transliteration) & set(plato_perseus_transliteration))}')

Number words in common: 2323


In [6]:
# Eyeball a 100-token window of the three token lists, printed in this order:
# the Perseus transliteration, our transliteration, the original Greek.
# NOTE(review): the recorded output shows different words at the same indices
# in the first two lists, so the streams do not appear to be index-aligned.
start = 4700
end = start + 100
print(
    plato_perseus_transliteration[start:end],
    plato_our_transliteration[start:end],
    plato_apologia_gk[start:end],
    sep='\n',
)

['onta', 'hoion', 'egō', 'legō', 'ouk', 'eme', 'meizō', 'blapsete', 'ē', 'humas', 'autous', 'eme', 'men', 'gar', 'ouden', 'an', 'blapseien', 'oute', 'Melētos', 'oute', 'gar', 'an', 'gar', 'oiomai', 'themiton', 'einai', 'ameinoni', 'andri', 'hupo', 'kheironos', 'blaptesthai', 'apokteineie', 'mentan', 'isōs', 'ē', 'exelaseien', 'ē', 'atimōseien', 'alla', 'tauta', 'houtos', 'men', 'isōs', 'oietai', 'kai', 'allos', 'tis', 'pou', 'megala', 'kaka', 'egō', 'd', 'ouk', 'oiomai', 'alla', 'polu', 'mallon', 'poiein', 'ha', 'houtosi', 'nun', 'poiei', 'andra', 'adikōs', 'epikheirein', 'apokteinunai', 'nun', 'oun', 'ō', 'andres', 'Athēnaioi', 'pollou', 'deō', 'egō', 'huper', 'emautou', 'apologeisthai', 'hōs', 'tis', 'an', 'oioito', 'alla', 'huper', 'humōn', 'mē', 'ti', 'examartēte', 'peri', 'tēn', 'tou', 'theou', 'dosin', 'humin', 'emou', 'katapsēphisamenoi', 'ean', 'gar', 'me', 'apokteinēte', 'ou']
['ouden', 'gar', 'an', 'nun', 'de', 'ou', 'gar', 'estin', 'ei', 'mē', 'ara', 'hoson', 'an', 'egō', 'd

In [7]:
# Distinct transliterations we produce that never occur in the Perseus list.
set(plato_our_transliteration) - set(plato_perseus_transliteration)

{'Ektora',
 'Euēnos',
 'Hesiodō',
 'apopsēphisasthai',
 'apothanē',
 'arourēs',
 'autoskhediazōmen',
 'engus',
 'engutatō',
 'enguterō',
 'enguētai',
 'ephēstha',
 'hetoimos',
 'petrēs',
 'sigōn',
 'sungenesthai',
 'tethnaiēn',
 'Ēran'}

In [8]:
# Distinct Greek tokens of the Apology, sorted with Python's default string
# ordering.
plato_order = list(set(plato_apologia_gk))
plato_order.sort()

In [9]:
# Print the complete sorted vocabulary (every distinct token) for inspection.
print(plato_order)

['Αἰακὸς', 'Αἰαντόδωρος', 'Αἰσχίνου', 'Αἴαντι', 'Γοργίας', 'Δίʼ', 'Δία', 'Δελφοὺς', 'Δελφοῖς', 'Δηλίῳ', 'Δημοδόκου', 'Διός', 'Διὸς', 'Εὔηνον', 'Εὔηνος', 'Θέτιδος', 'Θεάγης', 'Θεοζοτίδου', 'Θεόδοτος', 'Καλλία', 'Καλλίᾳ', 'Κεῖος', 'Κηφισιεὺς', 'Κλαζομενίου', 'Κρίτων', 'Κριτοβούλου', 'Κριτόβουλος', 'Λέοντα', 'Λεοντῖνος', 'Λυσανίας', 'Λύκων', 'Μέλητε', 'Μέλητον', 'Μέλητος', 'Μέλητός', 'Μίνως', 'Μελήτου', 'Μελήτῳ', 'Μουσαίῳ', 'Νικόστρατος', 'Πάριος', 'Παλαμήδει', 'Παράλιος', 'Πατρόκλῳ', 'Πλάτων', 'Ποτειδαίᾳ', 'Πρόδικος', 'Πυθία', 'Σίσυφον', 'Σαλαμίνιον', 'Σαλαμῖνα', 'Σαλαμῖνος', 'Σφήττιος', 'Σωκράτη', 'Σωκράτης', 'Σωκράτους', 'Σώκρατες', 'Τελαμῶνος', 'Τριπτόλεμος', 'Τροίαν', 'Τροίᾳ', 'Χαιρεφῶν', 'Χαιρεφῶντα', 'αἰνίττεσθαι', 'αἰνίττεται', 'αἰσθανόμενος', 'αἰσχροῦ', 'αἰσχρόν', 'αἰσχρὸν', 'αἰσχυνθῆναι', 'αἰσχύνην', 'αἰσχύνομαι', 'αἰσχύνῃ', 'αἰτίαν', 'αἰτιῶν', 'αἱ', 'αἱρεῖ', 'αἱροῦμαι', 'αἱρῇ', 'αἴνιγμα', 'αἴσθησίς', 'αἴσθησιν', 'αἴτια', 'αἴτιον', 'αἴτιόν', 'αὐθαδέστερον', 'αὐθαδιζόμενος', 'αὐλ

In [10]:
# Transliterations to locate in the source text. All entries except
# 'ehilesthe' and 'uhios' also appear in the set difference between our
# transliteration and the Perseus one computed earlier in the notebook.
# NOTE(review): 'ehilesthe' and 'uhios' match nothing in the recorded output;
# presumably they are left over from an earlier Romanizer version — confirm.
CHECKS = {
    'Ektora',
    'Euēnos',
    'Hesiodō',
    'apopsēphisasthai',
    'apothanē',
    'arourēs',
    'autoskhediazōmen',
    'ehilesthe',
    'engus',
    'engutatō',
    'enguterō',
    'enguētai',
    'ephēstha',
    'hetoimos',
    'petrēs',
    'sigōn',
    'sungenesthai',
    'tethnaiēn',
    'uhios',
    'Ēran',
}

# Print the position, Greek token and transliteration of every token whose
# transliteration is listed in CHECKS.
for idx, word in enumerate(plato_apologia_gk):
    # Strip the result exactly as plato_our_transliteration does; the CHECKS
    # entries carry no surrounding whitespace, so an unstripped result with
    # leading or trailing whitespace could never match.
    result = romanizer.transliterate(word).strip()
    if result in CHECKS:
        print(idx, word, result)

1124 Ἕκτορα Ektora
1142 Ἕκτορα Ektora
1149 Ἕκτορα Ektora
1151 ἑτοῖμος hetoimos
1177 τεθναίην tethnaiēn
1191 ἀρούρης arourēs
1554 ἀποθανῇ apothanē
1714 ἐγγυτέρω enguterō
3266 πέτρης petrēs
3297 ἀποψηφίσασθαι apopsēphisasthai
4578 σιγῶν sigōn
4753 ἐγγυηταὶ enguētai
4818 ἐγγύς engus
5688 συγγενέσθαι sungenesthai
5692 Ἡσιόδῳ Hesiodō
6186 Εὔηνος Euēnos
6281 αὐτοσχεδιάζωμεν autoskhediazōmen
6609 ἔφησθα ephēstha
7798 Ἥραν Ēran
8018 ἐγγυτάτω engutatō
8195 συγγενέσθαι sungenesthai


In [11]:
# Show the Romanizer's greek_to_roman table as a sorted list of
# (greek, roman) pairs.
data = sorted(romanizer.greek_to_roman.items())
print(data)

[('ʼ', ' '), ('ΐ', 'i'), ('ά', 'a'), ('έ', 'e'), ('ή', 'ē'), ('ί', 'i'), ('ό', 'o'), ('ύ', 'u'), ('ώ', 'ō'), ('ἀ', 'a'), ('ἁ', 'ha'), ('ἂ', 'a'), ('ἃ', 'ha'), ('ἄ', 'a'), ('ἅ', 'ha'), ('ἆ', 'a'), ('Ἀ', 'A'), ('Ἄ', 'A'), ('Ἅ', 'Ha'), ('ἐ', 'e'), ('ἑ', 'he'), ('ἓ', 'he'), ('ἔ', 'e'), ('ἕ', 'he'), ('Ἐ', 'E'), ('Ἕ', 'E'), ('ἠ', 'ē'), ('ἡ', 'hē'), ('ἢ', 'ē'), ('ἣ', 'hē'), ('ἤ', 'ē'), ('ἥ', 'hē'), ('ἦ', 'ē'), ('ἧ', 'hē'), ('Ἠ', 'Ē'), ('Ἡ', 'He'), ('Ἥ', 'Ē'), ('ἰ', 'i'), ('ἱ', 'hi'), ('ἳ', 'hi'), ('ἴ', 'i'), ('ἵ', 'hi'), ('ἶ', 'i'), ('ἷ', 'hi'), ('Ἱ', 'Hi'), ('ὀ', 'o'), ('ὁ', 'ho'), ('ὂ', 'o'), ('ὃ', 'ho'), ('ὄ', 'o'), ('ὅ', 'ho'), ('Ὀ', 'O'), ('Ὁ', 'Ho'), ('ὐ', 'u'), ('ὑ', 'hu'), ('ὓ', 'hu'), ('ὔ', 'u'), ('ὕ', 'hu'), ('ὖ', 'u'), ('ὗ', 'hu'), ('ὠ', 'ō'), ('ὡ', 'hō'), ('ὢ', 'ō'), ('ὤ', 'ō'), ('ὥ', 'hō'), ('ὦ', 'ō'), ('ὧ', 'hō'), ('ὰ', 'a'), ('ὲ', 'e'), ('ὴ', 'ē'), ('ὶ', 'i'), ('ὸ', 'o'), ('ὺ', 'u'), ('ὼ', 'ō'), ('ᾐ', 'ē'), ('ᾑ', 'hē'), ('ᾔ', 'ē'), ('ᾕ', 'hē'), ('ᾖ', 'ē'), ('ᾗ', 'hē'), ('ᾠ', 'ō

In [12]:
# Show the Romanizer's alternate_font_greek_to_roman table as a sorted list
# of (source, roman) pairs.
data2 = sorted(romanizer.alternate_font_greek_to_roman.items())
print(data2)

[(' ', ' '), ('I', 'i'), ('O', 'O'), ('R', 'R'), ('Yi', 'Yi'), ('o', 'o'), ('r', 'r'), ('yi', 'yi'), ('Α', 'A'), ('Αι', 'Ae'), ('Β', 'B'), ('Γ', 'G'), ('Δ', 'D'), ('Ε', 'E'), ('Ει', 'E'), ('Ζ', 'Z'), ('Η', 'Ē'), ('Θ', 'Th'), ('Ι', 'I'), ('Κ', 'K'), ('Λ', 'L'), ('Μ', 'M'), ('Ν', 'N'), ('Ξ', 'X'), ('Ο', 'O'), ('Οι', 'Oe'), ('Ου', 'Ou'), ('Π', 'P'), ('Ρ', 'R'), ('Σ', 'S'), ('Τ', 'T'), ('Υ', 'Y'), ('Υι', 'Ui'), ('Φ', 'Ph'), ('Χ', 'Kh'), ('Ψ', 'Ps'), ('Ω', 'Ō'), ('α', 'a'), ('αι', 'ae'), ('β', 'b'), ('γ', 'g'), ('γγ', 'ng'), ('γκ', 'nc'), ('γξ', 'nx'), ('γχ', 'nch'), ('δ', 'd'), ('ε', 'e'), ('ει', 'ei'), ('ζ', 'z'), ('η', 'ē'), ('θ', 'th'), ('ι', 'i'), ('κ', 'k'), ('λ', 'l'), ('μ', 'm'), ('ν', 'n'), ('ξ', 'x'), ('ο', 'o'), ('οι', 'oe'), ('ου', 'ou'), ('π', 'p'), ('ρ', 'r'), ('ς', 's'), ('σ', 's'), ('τ', 't'), ('υ', 'u'), ('υι', 'ui'), ('φ', 'ph'), ('χ', 'kh'), ('ψ', 'ps'), ('ω', 'ō')]


In [13]:
# Show the Romanizer's diphthong table as a sorted list of (greek, roman) pairs.
print(sorted(romanizer.greek_to_roman_dipthongs.items()))

[(' Ἥ', 'Hē'), ('ΑΥ̓͂', 'AU'), ('ΑἹ', 'HAI'), ('ΑὙ', 'HAU'), ('ΑὝ', 'HAU'), ('ΑὟ', 'HAU'), ('ΓΓ', 'NG'), ('ΕΙ', 'EI'), ('ΕΥ̓͂', 'EU'), ('ΕἽ', 'HEI'), ('ΕἾ', 'EI'), ('ΕἿ', 'HEI'), ('Εὔ', 'Eu'), ('ΕὙ', 'HEU'), ('ΟΙ', 'OI'), ('ΟΥ', 'OU'), ('ΟἹ', 'HOI'), ('ΟἻ', 'HOI'), ('ΟἽ', 'HOI'), ('ΟἿ', 'HOI'), ('ΟὙ', 'HOU'), ('ΟὛ', 'HOU'), ('ΟὝ', 'HOU'), ('ΟὟ', 'HOU'), ('ΥἹ', 'HUI'), ('αἱ', 'hai'), ('αὑ', 'hau'), ('αὕ', 'hau'), ('αὖ', 'au'), ('αὗ', 'hau'), ('γγ', 'ng'), ('ει', 'ei'), ('εἵ', 'hei'), ('εἶ', 'ei'), ('εἷ', 'hei'), ('εὑ', 'heu'), ('εὔ', 'eu'), ('εὖ', 'eu'), ('οι', 'oi'), ('ου', 'ou'), ('οἱ', 'hoi'), ('οἳ', 'hoi'), ('οἵ', 'hoi'), ('οἷ', 'hoi'), ('οὑ', 'hou'), ('οὓ', 'hou'), ('οὕ', 'hou'), ('οὗ', 'hou'), ('υἱ', 'hui')]


### The following lines were used in tuning the Romanizer; they are left here as a curiosity

In [107]:
# Disabled tuning aid: collects every distinct character found in
# plato_our_transliteration that is not one of the listed ASCII letters, then
# prints a sample, the count and the full list.
# NOTE(review): the uppercase run in the alphabet literal below omits 'Q'
# ('...OPRST...'), so a 'Q' in any transliteration would be reported as
# unmapped — confirm whether that is intentional before re-enabling.
# unmapped_greek= list(set([
#     letter for word in plato_our_transliteration 
#     for letter in word 
#     if letter not in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPRSTUVWXYZ']))

# print(unmapped_greek[:10])
# print(len(unmapped_greek))
# print(unmapped_greek)