# Preprocessing a corpus for vectorization
One of the best ways to speed up your research and development is to checkpoint your work so that you can reuse the portions of your pipeline that don't change. Fortunately, as long as you don't use lambda expressions in your scikit-learn pipelines, you should be able to pickle and reload your pipelines; furthermore, the resulting matrix output can also be pickled and reloaded. However, before we can demonstrate impressive reuse, we typically have to deal with dirty data.
## Dealing with dirty data
A good rule of thumb is: if you don't think your data is dirty, you're probably not looking at it.
Let's preprocess the Latin Library corpus and show some transformations that can be done to auto correct some data quality issues.

### In this notebook we will:
1. Create a reusable text processing pipeline
1. Assess the pipeline processing
1. Divide and conquer - multiprocess a corpus in sections

In [22]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
import os
import datetime
import logging
import multiprocessing
from datetime import datetime
from scipy import sparse
import json
import joblib
import pickle
from tqdm import tqdm
import numpy as np
from cltk.corpus.readers import get_corpus_reader, assemble_corpus
from cltk.corpus.latin.latin_library_corpus_types import corpus_directories_by_type, corpus_texts_by_type
from cltk.prosody.latin.string_utils import punctuation_for_spaces_dict
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.prosody.latin.scansion_constants import ScansionConstants
from cltk.tokenize.word import WordTokenizer
from sklearn.pipeline import Pipeline
from cltk.corpus.latin.latin_library_corpus_types import corpus_directories_by_type, corpus_texts_by_type
from sklearn.preprocessing import FunctionTransformer
from cltk.utils.matrix_corpus_fun import (
    separate_camel_cases,
    splice_hyphens,
    drop_empty_lists,
    drop_non_lower,
    drop_probable_entities,
    drop_editorial,
    drop_arabic_numeric,
    drop_all_caps,
    jv_transform,
    accept_editorial,    
    drop_enclitics ,
    drop_fringe_punctuation, 
    divide_separate_words,
    drop_all_punctuation,
    drop_short_sentences)

## Add our common library to the path and load functions

In [24]:
# Put the notebook's parent directory at the front of sys.path, then import the
# project helpers.  The `mlyoucanuse` imports must stay below the sys.path change.
import sys
import inspect
from pathlib import Path 
currentdir = Path.cwd()  # directory the notebook is executed from
parentdir = os.path.dirname(currentdir)  # one level up, as a plain str
sys.path.insert(0, parentdir)  # presumably where the `mlyoucanuse` package lives -- confirm
from mlyoucanuse.featurize_text_fun import featurize, vectorize_features
from mlyoucanuse.smart_lower_transformer import SmartLowerTransformer
from mlyoucanuse.trie_transformer import TrieTransformer
from mlyoucanuse.featurize_text_fun import word_to_features
from mlyoucanuse.matrix_fun import run_length_encoding, extract_words, patch_cluster_holes, merge_words

In [25]:
# Configure root logging at INFO with a "<LEVEL> : <message>" layout, then
# fetch the named logger used by the helper functions in this notebook.
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')
LOG = logging.getLogger('preprocess_corpus')

## Configure the Smart Lower transformer and show how it's used

In [27]:
# Build the transformer from the word list stored next to this notebook.
smart_lower_transformer = SmartLowerTransformer(
    lower_only_file=os.path.join(currentdir, 'latin.words.always.lower.txt'))
# Demonstration on two short sentences; the result is shown as the cell output.
smart_lower_transformer.transform([
    ['Leuissima', 'virum', 'cano'],
    ['perlucent', 'Arenas', 'stuff'],
])

[['leuissima', 'virum', 'cano'], ['perlucent', 'Arenas', 'stuff']]

## Configure the Trie Transformer and show how it's used to autocorrect corrupt text

In [28]:
filename = 'latin_word_trie.pkl'
trie_transformer = TrieTransformer(trie_file=filename)
# Each sample becomes a one-word sentence; all of these run-together tokens
# were actually found in the Latin Library.
trie_transformer.transform([[fused_token] for fused_token in (
    'maturitatemperueniunt',
    'radicibussubministres',
    'peregrinationeshabere',
    'uersibusdisertissimis',
    'crudelitatisconsuetudinem',
    'adiciebatcontrahendam',
    'translationesinprobas',
)])

[['maturitatem', 'perueniunt'],
 ['radicibus', 'subministres'],
 ['peregrinationes', 'habere'],
 ['uersibus', 'disertissimis'],
 ['crudelitatis', 'consuetudinem'],
 ['adiciebat', 'contrahendam'],
 ['translationes', 'inprobas']]

## Load our classifier from another notebook

In [29]:
# Deserialize the transliterated-Greek classifier saved by the
# detecting_loanwords notebook.
greek_cls = None
with open(os.path.join('../detecting_loanwords',
                       'is_transliterated_greek.mdl.0.20.2.joblib'),
          mode='rb') as reader:
    greek_cls = joblib.load(reader)

## Load our text processing pipeline

In [30]:
# Deserialize the word-level text pipeline saved by the detecting_loanwords
# notebook; drop_greek() applies it to each sentence before classification.
process_latin_text_pipeline = None
with open(os.path.join('../detecting_loanwords',
                       'process_latin_text_pipeline.0.20.2.joblib'),
          mode='rb') as reader:
    process_latin_text_pipeline = joblib.load(reader)

## Create a custom function for a transformer that uses our classifier

In [31]:
def drop_greek(string_matrix, max_len=25):
    """Strip the words that `greek_cls` labels as non-zero from every sentence.

    Args:
        string_matrix: iterable of sentences, each a list of word strings.
        max_len: value forwarded to `word_to_features` for every word.

    Returns:
        A list of sentences holding only the words labelled 0.  A sentence
        that comes back from `process_latin_text_pipeline` empty, or with
        fewer than two words, is left out of the result altogether.
    """
    cleaned_sentences = []
    for sentence in string_matrix:
        processed = process_latin_text_pipeline.fit_transform([(sentence)])
        if not processed or len(processed[0]) <= 1:
            continue
        tokens = processed[0]
        feature_rows = [word_to_features(word, max_len)
                        for sent in processed
                        for word in sent]
        labels = greek_cls.predict(sparse.csr_matrix(np.array(feature_rows)))
        labels = patch_cluster_holes(labels)
        kept_words = [word
                      for position, word in enumerate(tokens)
                      if labels[position] == 0]
        # Reassemble the flagged runs purely for the debug log below.
        greek_runs = merge_words(
            extract_words(tokens, *run_length_encoding(labels)))
        if greek_runs:
            LOG.debug(greek_runs)
            LOG.debug('purified words %s', kept_words)
        cleaned_sentences.append(kept_words)
    return cleaned_sentences

## Create a helper transformer

In [32]:
def verify_sentence_matrix(string_matrix):
    """Pass-through sanity check for a matrix of sentences.

    Logs a warning for every truthy word that is not a `str`, and logs the
    number of sentences seen at INFO level.  Nothing is dropped or changed.

    Args:
        string_matrix: iterable of sentences, each an iterable of words.

    Returns:
        A new list holding the same sentence objects, in the same order.
    """
    results = []
    # Single pass, so a one-shot iterable (e.g. a tqdm wrapper) is consumed only once.
    for sentence in string_matrix:
        for word in sentence:
            if word and not isinstance(word, str):
                LOG.warning('fail, expected word as string: %s', word)
        results.append(sentence)
    LOG.info('X size: %s', len(results))
    return results

In [56]:
# Ordered text-cleaning pipeline.  Step order is significant: every step receives
# the output of the step before it.  validate=False is passed so FunctionTransformer
# hands the nested lists to each function without array validation.
# The three 'verify*' steps change nothing; each logs the current sentence count
# ("X size: ..."), which is how the shrinkage between stages is observed.
pipeline_process_text = Pipeline([
    ('verify', FunctionTransformer(verify_sentence_matrix, validate=False)),
    ('correct_camel_cases', FunctionTransformer(separate_camel_cases, validate=False)), 
    ('splice_hyphens', FunctionTransformer(splice_hyphens, validate=False)), 
    ('jv_transform', FunctionTransformer(jv_transform, validate=False)),
    ('drop_editorial', FunctionTransformer(drop_editorial, validate=False)),    
    ('drop_enclitics', FunctionTransformer(drop_enclitics, validate=False)),
    ('drop_fringe_punctuation', FunctionTransformer(drop_fringe_punctuation, validate=False)),
    # The two transformer instances configured earlier in this notebook.
    ('smart_lower', smart_lower_transformer),
    ('trier', trie_transformer),
    ('drop_non_lower', FunctionTransformer(drop_non_lower, validate=False)),
    ('drop_arabic_numeric', FunctionTransformer(drop_arabic_numeric, validate=False)),
    ('drop_all_punctuation', FunctionTransformer(drop_all_punctuation, validate=False)),   
    ('divide_separate_words', FunctionTransformer(divide_separate_words, validate=False)),   
    # Classifier-backed step defined in this notebook.
    ('drop_greek', FunctionTransformer(drop_greek, validate=False)),
    ('verify2', FunctionTransformer(verify_sentence_matrix, validate=False)),
    ('drop_all_caps', FunctionTransformer(drop_all_caps, validate=False)),
    ('drop_probable_entities', FunctionTransformer(drop_probable_entities, validate=False)),
    ('drop_empty_lists', FunctionTransformer(drop_empty_lists, validate=False)),
    ('drop_short_sentences', FunctionTransformer(drop_short_sentences, validate=False)) ,
    ('verify3', FunctionTransformer(verify_sentence_matrix, validate=False))
])
# Other function transformations worth considering:
#  ('accept_editorial', FunctionTransformer(accept_editorial, validate=False)),    

In [44]:
corpus_reader = get_corpus_reader(corpus_name='latin_text_latin_library',
                                  language='latin')
# Narrow the reader to one file for a quick trial run (alternative: ['catullus.txt']).
corpus_reader._fileids = ['pliny.ep1.txt']

In [45]:
# Run every sentence from the selected file(s) through the cleaning pipeline
# defined above; tqdm wraps the sentence list to display a progress bar.
X = pipeline_process_text.fit_transform(tqdm(list(corpus_reader.sents())))

100%|██████████| 501/501 [00:00<00:00, 149200.96it/s]
INFO : X size: 501
INFO : X size: 422
INFO : X size: 422
INFO : X size: 404


404

## Inspecting the Output
The pipeline log statements show a few items that were not handled properly, such as improperly joined words that the Trie Transformer couldn't correct. However, one of them, "perseuerantiamtantasustinentem", is due to several words being improperly joined ("perseuerant", "iam", "tanta", "sustinentem"), and trying to automatically recover from that level of textual corruption would strain credulity.

In [53]:
# To inspect the corpus processing output, uncomment the lines below
# X[:10]
# block=10 
# X[block:block + 10]
# block +=100 
# X[:100]

In [52]:
def get_unique_words(X):
    """Collect the distinct words that occur anywhere in a matrix of sentences.

    Args:
        X: iterable of sentences, each an iterable of hashable words.

    Returns:
        A set containing every word that appears at least once.
    """
    return {word for sentence in X for word in sentence}

corpus_words = get_unique_words(X)
print(len(corpus_words))
# Figures noted from earlier runs, for comparison:
#   catullus, before dropping Greek: 5235
#   pliny epistles 1: 2686 words, 400 sentences
#   pliny epistles 1: 2692 words, 404 sentences

2692


### Divide and conquer: finding discrete sections of a corpus

In [57]:
# Print how many files the corpus reader ends up with for each era key.
for key in corpus_directories_by_type:
    # A fresh reader is built for every era before it is narrowed.
    reader = assemble_corpus(
        get_corpus_reader(language='latin', corpus_name='latin_text_latin_library'),
        [key],
        corpus_directories_by_type,
        corpus_texts_by_type)
    print(f'Era: {key} files: {len(reader._fileids)} ')

Era: republican files: 208 
Era: augustan files: 302 
Era: early_silver files: 247 
Era: late_silver files: 118 
Era: old files: 29 
Era: christian files: 347 
Era: medieval files: 12 
Era: renaissance files: 3 
Era: neo_latin files: 75 
Era: misc files: 405 
Era: early files: 1 


In [58]:
# Persist the assembled pipeline to disk; process_corpus_section() reloads it
# from a file of this same name.
pipeline_filename_latin = 'pipeline_process_text_latin.pkl'

with open(pipeline_filename_latin, mode='wb') as dumper:
    pickle.dump(pipeline_process_text, dumper)

## Creating a reusable function for processing the corpus sections

In [74]:
def process_corpus_section(section):
    """Clean one section of the Latin Library corpus and pickle the result.

    Loads the pickled pipeline from 'pipeline_process_text_latin.pkl', runs
    every sentence of the requested section through it, and writes the output
    to 'latin_library.corpus.<section>.processed.pkl' in the working directory.

    Args:
        section: key into `corpus_directories_by_type` / `corpus_texts_by_type`
            (for example 'early' or 'medieval').

    Returns:
        Tuple of (section, elapsed wall-clock seconds).
    """
    from time import time
    start = time()
    reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
    reader = assemble_corpus(reader, [section],
                             corpus_directories_by_type,
                             corpus_texts_by_type)
    pipeline_file = 'pipeline_process_text_latin.pkl'
    # NOTE: unpickling runs arbitrary code; only load pipeline files written by this notebook.
    with open(pipeline_file, 'rb') as loader:
        pipeline = pickle.load(loader)
    X = pipeline.fit_transform(list(reader.sents()))
    output_file = 'latin_library.corpus.{}.processed.pkl'.format(section)
    # Context manager guarantees the output is flushed and the handle closed.
    with open(output_file, 'wb') as dumper:
        pickle.dump(X, dumper)
    # Drop the large objects before the elapsed time is computed and returned.
    del X
    del reader
    del pipeline
    return (section, time() - start)

In [71]:
process_corpus_section('early')

INFO : X size: 191
INFO : X size: 88
INFO : X size: 74


In [72]:
process_corpus_section('renaissance')

INFO : X size: 219
INFO : X size: 153
INFO : X size: 149


In [73]:
process_corpus_section('medieval')

INFO : X size: 4164
INFO : X size: 3060
INFO : X size: 2952


In [75]:
eras_to_do =['early', 'renaissance', 'medieval']

In [81]:
from multiprocessing import Pool
# pool = Pool(processes=len(eras_to_do)+1)

In [None]:
# results = pool.apply_async(process_corpus_section, eras_to_do)
# results
# pool.close()
# pool.join()

In [83]:
# Sections handled in this run; 'late_silver' and 'neo_latin' are commented out.
eras_to_do = [
    'misc',
    'republican',
    'augustan',
    'early_silver',
    # 'late_silver',
    'christian',
    # 'neo_latin',
    'old',
]
# One worker per section plus one extra.  map() blocks until every section is
# finished and returns the (section, seconds) tuples as the cell output.
pool = Pool(processes=1 + len(eras_to_do))
pool.map(process_corpus_section, eras_to_do)


INFO : X size: 47994
INFO : X size: 25302
INFO : X size: 29540
INFO : X size: 29540
INFO : X size: 60027
INFO : X size: 82644
INFO : X size: 121570
INFO : X size: 138657
INFO : X size: 324958
INFO : X size: 22196
INFO : X size: 21482
INFO : X size: 25265
INFO : X size: 24548
INFO : X size: 25265
INFO : X size: 24548
INFO : X size: 29145
INFO : X size: 24765
INFO : X size: 56983
INFO : X size: 55485
INFO : X size: 74269
INFO : X size: 70308
INFO : X size: 107449
INFO : X size: 102628
INFO : X size: 123475
INFO : X size: 118063
INFO : X size: 220449
INFO : X size: 173140


[('misc', 236195.42858600616),
 ('republican', 100088.08493995667),
 ('augustan', 82857.92074799538),
 ('early_silver', 125404.99969887733),
 ('late_silver', 44229.48902797699),
 ('christian', 135963.50766682625),
 ('neo_latin', 50776.54932188988),
 ('old', 70243.92168569565),
 ('neo_latin', 50763.268376111984)]

In [84]:
pool.close()

In [85]:
pool.join()