# Boosting Training Data
Once you have a classifier, it's usually important to gather more training data so that you can rebuild the classifier for better results. Unfortunately, it's usually undesirable to use the classifier itself to harvest additional training data. Let's compare using the classifier against using word probability distributions.

In [1]:
import json
import site
import pickle
import os
import logging 
import site

import numpy as np
from joblib import dump, load
import sklearn
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer

## Add our candidate training data
We've selected a lemmatization dictionary; its keys are distinct forms, which map to many inflected values.

In [2]:
# Register the lemmata directory under the user's cltk_data folder as a site
# directory so that the `latin_lemmata_cltk` module can be imported from it.
lemmata_dir = os.path.expanduser('~/cltk_data/latin/lemma/latin_pos_lemmata_cltk')
site.addsitedir(lemmata_dir)
from latin_lemmata_cltk import LEMMATA

In [3]:
# Configure root logging at INFO level, then create this notebook's named
# logger and attach a NullHandler to it.
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger('make_model')
LOG.addHandler(logging.NullHandler())

In [4]:
def word_to_features(word, max_word_length=20):
    """
    Convert a single word into a fixed-length list of character ordinals.

    The word is first truncated to its leading ``max_word_length`` characters
    (a warning is logged when that happens), then reversed, and finally
    right-padded with spaces (ordinal 32) up to ``max_word_length``.

    :param word: a single word
    :param max_word_length: the maximum word length for the feature array
    :return: a list of integers padded to the max word length

    >>> word_to_features('far', 20)
    [114, 97, 102, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]
    """
    if len(word) > max_word_length:
        # Lazy %-style arguments: the message is only rendered if the record is emitted.
        LOG.warning('Excessive word length %s for %s, truncating to %s',
                    len(word), word, max_word_length)
        word = word[:max_word_length]
    # Reverse the characters to encourage aligning on word endings if possible.
    reversed_word = word[::-1]
    return [ord(char) for char in reversed_word.ljust(max_word_length, ' ')]

In [5]:
# Every key of the lemmatization dictionary is a candidate word.
candidate_words = [*LEMMATA.keys()]
n_candidate_words = len(candidate_words)
print(f'{n_candidate_words:,} words')

270,227 words


### Some minor text processing must occur, such as JV Transformation and taking the first tokenized form, which will drop enclitic endings

In [6]:
# Helpers for the text processing below: a Latin word tokenizer and a J/V replacer.
tokenizer = WordTokenizer('latin')
jv_transform = JVReplacer()

In [7]:
def first_normalized_token(word):
    """Apply the JV replacement to ``word``, tokenize it, and return the first token."""
    return tokenizer.tokenize(jv_transform.replace(word))[0]


cans = [first_normalized_token(word) for word in candidate_words]
cans[:10]

['cerycia',
 'decimo',
 'Mutycenses',
 'enauiganda',
 'quadriiugo',
 'colocynthide',
 'aspicientibus',
 'Dulichius',
 'euangelizauerit',
 'patrocinatur']

In [9]:
# Drop duplicate candidates. Sorting gives a stable order: plain set iteration
# order for strings can differ between interpreter runs (hash randomization),
# which would make the downstream lists and written file non-reproducible.
cans = sorted(set(cans))
print(f'Candidate words: {len(cans):,}')

Candidate words: 224,576


In [10]:
# The model file name embeds the installed scikit-learn version.
model_output_file = 'is_transliterated_greek.mdl.{}.joblib'.format(sklearn.__version__)
model_path = os.path.join('./', model_output_file)
classifier = load(model_path)

In [11]:
# Read the provenance JSON stored next to the model and pull out the
# maximum word length recorded in it.
prov_path = '{}.prov.json'.format(model_output_file)
with open(prov_path, 'rt') as reader:
    prov = json.load(reader)
max_len = prov['max_word_length']

### Let's also try to separate the lemmatization keys by using the probability distributions of the two languages in question

In [12]:
# Load the two pickled word-probability objects.
# No empty-dict placeholders are assigned first: if a file is missing the name
# stays undefined, so later cells fail loudly instead of silently comparing
# against an empty distribution.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('freq_dist.greek.transliterated.pkl', 'rb') as reader:
    greek_transliterated_word_probs = pickle.load(reader)

with open(os.path.join('../building_language_model', 'freq_dist.latin.pkl'), 'rb') as reader:
    latin_word_probs = pickle.load(reader)

In [13]:
# Keep the words whose Latin probability is at least their transliterated-Greek
# probability. A word missing from a distribution gets the same small floor
# value, so a word missing from both ties and is kept.
unseen_prob = 0.0000001
greater_prob_latin = []
for word in cans:
    latin_prob = latin_word_probs.get(word, unseen_prob)
    greek_prob = greek_transliterated_word_probs.get(word, unseen_prob)
    if latin_prob >= greek_prob:
        greater_prob_latin.append(word)
print(len(greater_prob_latin))

175969


In [14]:
# Write the selected words to a text file, one word per line.
with open('latin.lemma.forms.txt', 'wt') as writer:
    writer.writelines(word + '\n' for word in greater_prob_latin)

In [15]:
# Split the candidates using the classifier: a truthy prediction files the word
# under `probable_latin`, a falsy one under `maybe_greek`.
# NOTE(review): the model file is named 'is_transliterated_greek'; if a truthy
# prediction actually means "Greek", these two lists are swapped -- confirm the
# label convention used when the model was trained.
probable_latin = []
maybe_greek = []

if cans:  # predict() needs a 2-D array, so skip it entirely for empty input
    # Build the feature matrix once and predict in a single batched call
    # instead of calling predict() separately for every word.
    feature_matrix = np.array([word_to_features(word, max_len) for word in cans])
    predictions = classifier.predict(feature_matrix)
    for word, label in zip(cans, predictions):
        if label:
            probable_latin.append(word)
        else:
            maybe_greek.append(word)
print(len(probable_latin))
len(maybe_greek)



32463


192113

## That's all for now folks!