# Loanword Feature Engineering

### In this notebook you will:
1. Analyze and optimize the featurization matrix from the previous notebook
2. Prove that words aligned by suffixes in reverse order are stronger than natural order alignment
3. Reduce the size of the feature matrix to increase the efficiency of our classifier without greatly sacrificing precision scores.



In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Imports

In [2]:
import datetime
import glob
import json
import logging
import multiprocessing
import os
import site
from copy import deepcopy
import random
from collections import defaultdict
import pickle

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from scipy import sparse
 
from tqdm import tqdm
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.extmath import density
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from joblib import dump, load
import sklearn
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from cltk.stem.latin.j_v import JVReplacer
from cltk.prosody.latin.scansion_constants import ScansionConstants
from cltk.prosody.latin.string_utils import remove_punctuation_dict
from cltk.tokenize.word import WordTokenizer
from cltk.corpus.readers import get_corpus_reader
from cltk.utils.featurization import word_to_features
from cltk.utils.file_operations import md5
from cltk.utils.matrix_corpus_fun import (
    distinct_words,
    separate_camel_cases,
    drop_empty_lists,
    drop_non_lower,
    drop_arabic_numeric,
    drop_all_caps,
    drop_empty_strings,
    jv_transform,
    splice_hyphens,
    accept_editorial,
    profile_chars,
    demacronize,
    drop_enclitics,
    drop_fringe_punctuation,
    divide_separate_words,
    drop_all_punctuation)
plt.style.use('fivethirtyeight')

### Add parent directory to path so we can access our common code

In [3]:
import os
import sys
import inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 

In [43]:
from mlyoucanuse.romanizer import Romanizer, romanizer_transform  
from mlyoucanuse.aeoe_replacer import aeoe_transform
from mlyoucanuse.matrix_fun import (run_length_encoding,
                                    extract_words,                                     
                                    patch_cluster_holes)
from mlyoucanuse.featurize_text_fun import word_to_features

### Turn on logging, primarily so that library methods may report warnings, if any

In [5]:
LOG = logging.getLogger('make_model')
LOG.addHandler(logging.NullHandler())
logging.basicConfig(level=logging.INFO)

### Define a CorpusReader and select only the text files of Prudentius, Caesar and Eutropius. 
#### As shown in the appendix of this notebook, the authors of this seed set have a low incidence of using transliterated Greek words.

In [6]:
latin_reader = get_corpus_reader('latin_text_latin_library', language='latin')
ALL_FILE_IDS = list(latin_reader.fileids())
good_files = [file for file in ALL_FILE_IDS
              if 'prudentius' in file or
              'caesar' in file or
              'eutropius' in file]
LOG.info('available good files %s', len(good_files))
latin_reader._fileids = good_files
# good_files

INFO:make_model:available good files 41


In [7]:
# remove some unfamiliar entries
questionable = ['caesar/alex.txt',
                'caesar/hisp.txt',
                'prudentius/prud.psycho.txt',
                'suetonius/suet.caesar.txt',
                'xylander/caesar.txt']
for file in questionable:
    good_files.remove(file)

### Define a custom Scikit-learn Pipeline, and call the CorpusReader `words()` method to process the texts
#### The functions used in the pipelines are doctest documented in the `corpus_cleaning` module
#### The functions used and their order was developed iteratively by running the pipelines on actual data and carefully inspecting the results prior to runnning it through featurization. Always know your data!

#### Lastly, we use the joblib library to save/pickle the pipeline so that it can be reloaded and reused.

In [8]:
# process_latin_text_pipeline = Pipeline([
#     ('separate_camel_cases', FunctionTransformer(separate_camel_cases, validate=False)),
#     ('splice_hyphens', FunctionTransformer(splice_hyphens, validate=False)),
#     ('jv_transform', FunctionTransformer(jv_transform, validate=False)),
#     ('aeoe_transform', FunctionTransformer(aeoe_transform, validate=False)),
#     ('accept_editorial', FunctionTransformer(accept_editorial, validate=False)),
#     ('drop_enclitics', FunctionTransformer(drop_enclitics, validate=False)),
#     ('drop_fringe_punctuation', FunctionTransformer(drop_fringe_punctuation, validate=False)),
#     ('drop_all_punctuation', FunctionTransformer(drop_all_punctuation, validate=False)),
#     ('drop_non_lower', FunctionTransformer(drop_non_lower, validate=False)),
#     ('drop_arabic_numeric', FunctionTransformer(drop_arabic_numeric, validate=False)),
#     ('drop_all_caps', FunctionTransformer(drop_all_caps, validate=False)),
#     ('divide_separate_words', FunctionTransformer(divide_separate_words, validate=False)),
#     ('drop_empty_lists', FunctionTransformer(drop_empty_lists, validate=False)),
#     ('drop_empty_strings', FunctionTransformer(drop_empty_strings, validate=False))])

process_latin_text_pipeline_file = 'process_latin_text_pipeline.{}.joblib'.format(
    sklearn.__version__)
process_latin_text_pipeline = load(process_latin_text_pipeline_file)


In [9]:
X = process_latin_text_pipeline.fit_transform(tqdm([list(latin_reader.words())]))

100%|██████████| 1/1 [00:00<00:00,  4.98it/s]


* Analyze the resulting matrix, by profiling the character occurences
* Go back and adjust the pipeline as necessary
* Turn the output into a distinct set of words

In [11]:
char_count = profile_chars(X)
# print('Character distribution profile, total chars:', sum(char_count.values()))
# print(char_count)
distinct_good_latin = distinct_words(X)
# print(f'Number of distinct words in Eutropius/Prudentius/Caesar sample: {len(distinct_good_latin):,}')

### After running this notebook several times, we've decided to load in more training data, which is provide by the notebook:
* `boosting_training_data.ipynb`

In [12]:
additional_latin_words = []
with open('latin.lemma.forms.txt', 'rt') as reader:
    additional_latin_words = reader.read().split('\n')
# random.sample(additional_latin_words, 5)

In [13]:
print(f'additional_latin_words: {len(additional_latin_words):,}')
distinct_good_latin= list(set(distinct_good_latin) | set(additional_latin_words))
print(f'distinct_good_latin now: {len(distinct_good_latin):,}')

additional_latin_words: 175,970
distinct_good_latin now: 180,068


* Load the Greek texts of Homer and Plato (two of the most commonly quoted Greek authors)
* Preprocess the text
* Transliterate into Classical Latin

#### We save this pipeline for reuse too.

In [14]:
perseus_greek = get_corpus_reader(language='greek', corpus_name='greek_text_perseus')
plato = [tmp for tmp in perseus_greek.fileids() if 'plato' in tmp]
homer = [tmp for tmp in perseus_greek.fileids() if 'homer' in tmp]
greek_texts = plato + homer

# process_greek_pipeline = Pipeline([
#     ('accept_editorial', FunctionTransformer(accept_editorial, validate=False)),  # problematic
#     ('romanizer', FunctionTransformer(romanizer_transform, validate=False)),
#     ('drop_fringe_punctuation', FunctionTransformer(drop_fringe_punctuation, validate=False)),
#     ('drop_all_punctuation', FunctionTransformer(drop_all_punctuation, validate=False)),
#     ('drop_arabic_numeric', FunctionTransformer(drop_arabic_numeric, validate=False)),  #ok
#     ('drop_empty_lists', FunctionTransformer(drop_empty_lists, validate=False)),  # problem?
#     ('drop_empty_strings', FunctionTransformer(drop_empty_strings, validate=False))  # problem?
# ])

process_greek_text_pipeline_file = 'process_greek_text_pipeline.{}.joblib'.format(
    sklearn.__version__)

process_greek_pipeline = load(   process_greek_text_pipeline_file)

In [15]:
X_greek_transliterated = process_greek_pipeline.fit_transform(tqdm([list(perseus_greek.words(greek_texts))]))

100%|██████████| 1/1 [00:35<00:00, 35.99s/it]


* Analyze the transliterated Greek examples
* Check character profiles for tuning
* Create a set distinct words, with and without macrons

In [16]:
# print('Character distribution profile of transliterated Greek: ', profile_chars(X_greek_transliterated))
distinct_transliterated_greek_examples = distinct_words(X_greek_transliterated)
print(f'{len(distinct_transliterated_greek_examples):,} distinct_transliterated_greek_examples')
distinct_demacronized_greek = distinct_words(demacronize(X_greek_transliterated))
# print(f'{len(distinct_demacronized_greek):,} distinct_demacronized_greek')

48,478 distinct_transliterated_greek_examples


### See how many words from the transliterated Greek words which have also appear in the Latin corpus

In [17]:
shared_words = distinct_demacronized_greek & set(distinct_good_latin)
# print(f'Shared_words: {len(shared_words)} : {shared_words}')

#### These shared words appear in both language corpora; however, intuitively, we know each word will have a different probability of occurrence in each language. So, rather than arbitrarily excluding some or all of the words from one language or the other, we should split them into the most common probable groups. We can do this by loading the probability distribution pickle objects we have created in the notebooks:
* `building_language_model/make_frequency_distribution.ipynb` 
* `detecting_loanwords/make_frequency_distribution_greek_transliterated.ipynb`

### Load Frequency Distributions for Latin and transliterated Greek

In [18]:
greek_transliterated_word_probs = {}
with open('freq_dist.greek.transliterated.pkl', 'rb') as reader:
    greek_transliterated_word_probs = pickle.load(reader)
    
latin_word_probs = {}
with open(os.path.join('../building_language_model', 'freq_dist.latin.pkl'), 'rb') as reader:
    latin_word_probs = pickle.load(reader)

#### We'll create a list of tuples containing (the word, the words probability in Latin, the words probability in Greek)

In [19]:
shared_latin_greek = [(word, 
                       latin_word_probs.get(word, 0.000001),
                       greek_transliterated_word_probs.get(word, 0.000001)) 
                      for word in shared_words]

shared_latin_greek.sort(key=lambda a: a[1], reverse=True)
# for item in shared_latin_greek:
#     print(item)

`kai` is the most common word in the Greek corpus, so we could also divide the shared words by the threshold of this probability

In [20]:
latin_word_probs.get('kai')

0.0011142407253193212

In [21]:
likely_latin = [word for word, latin_prob, greek_prob 
                in shared_latin_greek 
                if latin_prob >= latin_word_probs['kai']]
print(len(likely_latin))
# likely_latin

94


In [22]:
greater_prob_latin = [word for word, latin_prob, greek_prob
                     in shared_latin_greek
                     if latin_prob >= greek_prob]
# print(len(greater_prob_latin))
# greater_prob_latin

In [23]:
# We'll remove the words that have a high probability of being Latin 
# from the collection of demacronized transliterated Greek words
only_greek_transliterated = distinct_demacronized_greek - set(likely_latin)
likely_greek = shared_words - set(likely_latin)

# Likewise, let's remove the transliterated words that are likely Greek 
# from the collection of good Latin words
distinct_good_latin = set(distinct_good_latin) - likely_greek

print(f'{len(only_greek_transliterated):,} distinct transliterated Greek words without matches in the Latin corpus')

# NOTE: we are toggling this on to see the difference
# only_greek_transliterated=  distinct_demacronized_greek
# distinct_latin_wo_greek_matches = distinct_good_latin - shared_words
# The following had low precision high recall:
# only_greek_transliterated = distinct_demacronized_greek - shared_words

46,929 distinct transliterated Greek words without matches in the Latin corpus


### Normally these gaps might concern us, but since we are more interested in groups of loanwords in phrases, we can rely on smoothing over clusters of loanwords to screen out misses.

### Create a simple data matrix of the single words, transliterated Greek examples followed by the Latin words

In [24]:
X = [list(only_greek_transliterated) + list(distinct_good_latin)]
len(X[0])

226601

### Before we featurize our data matrix, let's check on the max word lengths

In [27]:
print(f'{sorted([len(tmp) for tmp in distinct_good_latin])[-1]} Max word length in distinct good Latin sample')
print(f'{sorted([len(tmp) for tmp in only_greek_transliterated])[-1] } Max word length in transliterated Greek sample')
max_len = sorted([len(tmp) for tmp in only_greek_transliterated])[-1]  

28 Max word length in distinct good Latin sample
25 Max word length in transliterated Greek sample


## Featurization
#### We'll use a generic character to integer transform so that we can reuse our encoding process for unseen character data combinations (as opposed to building a dictionary mapping for a discrete sample space)

#### Let's create the X,y feature matrix and labels

In [31]:
all_y = np.array([1] * len(only_greek_transliterated) + [0] * len(distinct_good_latin), dtype=float)
print(f'y shape: {all_y.shape}')
# We use a label encoder to automatically capture the range of values for provenance
# Although it's true we're only doing binary classification, 
# it's a good practice to use this so that we can automate recording our model's provenance
label_encoder = LabelEncoder()
label_encoder.fit(all_y)
all_words = list(only_greek_transliterated) + list(distinct_good_latin)
all_X = np.array([word_to_features(word, max_len) for word in all_words])
print(f'X shape: {all_X.shape}')
# all_X = sparse.csr_matrix(all_X)
num_samples = all_y.shape[0] # to be used later by model provenance
num_features = all_X.shape[1] # to be used later by model provenance

y shape: (226601,)




X shape: (226601, 25)


In [32]:
from sklearn.decomposition import PCA

In [33]:
pca = PCA(n_components=0.99)
features_pca = pca.fit_transform(all_X)

In [34]:
print(f'Original features:{all_X.shape[1]}')
print(f'Reduced features: {features_pca.shape[1]}')

Original features:25
Reduced features: 14


In [35]:
import warnings
from sklearn.feature_selection import RFECV
# warnings.filterwarnings(action='ignore', module='scipy')

In [36]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
rfecv = RFECV(estimator=rfc,
              step=1, 
              scoring='neg_mean_squared_error', cv=5, n_jobs = multiprocessing.cpu_count()-1,
             verbose=1)
rfecv.fit(all_X, all_y)
rfecv.transform(all_X)

Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.


array([[105, 111, 105, ..., 112,   0,   0],
       [ 97, 108, 108, ..., 105,  97, 116],
       [110,  97, 115, ..., 111, 100, 105],
       ...,
       [115, 105, 116, ..., 116, 105, 109],
       [105, 115, 112, ..., 108,  99, 101],
       [109, 117, 105, ..., 110, 101, 114]])

In [39]:
rfecv.n_features_

7

In [40]:
rfecv.support_

array([ True,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [41]:
rfecv.ranking_

array([ 1,  1,  1,  1,  1,  1,  1,  2,  3,  4,  6,  7,  5,  8,  9, 10, 11,
       12, 13, 15, 14, 18, 19, 17, 16])

In [45]:
all_X_non_reversed = np.array([word_to_features(word, max_len, reverse=False) for word in all_words])



In [None]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
rfecv_non_reverse = RFECV(estimator=rfc,
              step=1, 
              scoring='neg_mean_squared_error', cv=3, n_jobs = multiprocessing.cpu_count()-1,
             verbose=1)
rfecv_non_reverse.fit(all_X_non_reversed, all_y)
rfecv_non_reverse.transform(all_X_non_reversed)

Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.


array([[112, 111, 105, 111, 105,   0],
       [104, 101, 108, 101, 116,  97],
       [105, 100, 111, 117, 115,  97],
       ...,
       [112, 114, 111, 120, 105, 109],
       [101,  99, 108, 105, 112, 115],
       [ 97, 100, 104,  97, 101, 114]])

In [47]:
rfecv_non_reverse.ranking_

array([ 1,  1,  1,  1,  1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
       13, 14, 15, 20, 17, 16, 19, 18])

## Comparing the two ranking scores, we see that the word's natural sequence yields a straight gradient with best fit for the first six letters and then decaying linearly.
## The reverse order, or suffix aligned feature matrix yields best fit for seven features, before going into a near linear decay.

In [48]:
def prepare(word):
    # craft a vector: aligning reverse suffix and prefixes
    letters =list(word)
    letters.reverse()
    return ''.join(letters[:10] + [' '] + letters[-3:])

prepare('abcdefghijklmno')


'onmlkjihgf cba'

In [75]:
len('onmlkjihgf cba')

14

In [76]:
all_X_shortened = np.array([word_to_features(prepare(word), max_word_length=14, reverse=False) for word in all_words])

In [51]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
rfecv_shortened = RFECV(estimator=rfc,
              step=1, 
              scoring='neg_mean_squared_error', cv=3, n_jobs = multiprocessing.cpu_count()-1,
             verbose=0)
rfecv_shortened.fit(all_X_shortened, all_y)
rfecv_shortened.transform(all_X_shortened)

array([[105, 111, 105, ...,   0, 105, 111],
       [ 97, 108, 108, ...,  97, 116, 101],
       [110,  97, 115, ..., 100, 105,   0],
       ...,
       [115, 105, 116, ..., 105, 109, 105],
       [105, 115, 112, ...,  99, 101,   0],
       [109, 117, 105, ..., 101, 114, 101]])

In [53]:
rfecv_shortened.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 6, 4, 7, 5])

In [58]:
def shorten(word):
    # craft a vector: aligning reverse suffix 
    letters =list(word)
    letters.reverse()
    return ''.join(letters[:10])

shorten('abcdefghijklmno')

'onmlkjihgf'

In [64]:
all_X_short = np.array([word_to_features(shorten(word), max_word_length=10, reverse=False) for word in all_words])

In [None]:
## Remove duplicates from the shortened feature matrix
since we have removed features from the matrix, some rows will not be unique, let's make the data distinct.

In [65]:
all_X_short.shape

(226601, 10)

In [66]:
all_y.shape

(226601,)

In [67]:
uniqs, indices = np.unique(all_X_short, return_index=True, axis=0)

In [68]:
all_X_shorter = all_X_short[ indices ]

In [69]:
all_X_shorter.shape

(216189, 10)

In [70]:
all_y_shorter = all_y[indices]

In [71]:
all_y_shorter.shape

(216189,)

In [74]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
scores = cross_val_score(rfc, all_X_shorter, all_y_shorter,
                             scoring='accuracy',
                             n_jobs=multiprocessing.cpu_count()-1,
                             cv=5)
print(f'{str(rfc)} {scores.mean()} {scores}')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 0.47923783949821946 [0.43370568 0.55012836 0.45331545 0.16860559 0.79043412]


## That is quite a drop in expected performance!

In [77]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
scores = cross_val_score(rfc, all_X_shortened, all_y,
                             scoring='accuracy',
                             n_jobs=multiprocessing.cpu_count()-1,
                             cv=5)
print(f'{str(rfc)} {scores.mean()} {scores}')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 0.9621184307382344 [0.96211469 0.96341652 0.96167255 0.9614519  0.96193649]


# So, the moderately trimmed feature matrix retained high f scores, whereas the aggressively trimmed version was lost too much performance.

In [78]:
classifier =RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
classifier.fit(all_X_shorter, all_y_shorter)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Save the classifier, so it can be loaded without training

In [79]:
model_output_file = 'is_transliterated_greek.lw.mdl.{}.joblib'.format(sklearn.__version__)
dump(classifier, model_output_file)

['is_transliterated_greek.lw.mdl.0.20.2.joblib']

# That's all for now folks!