# Preprocessing a corpus for vectorization
One of the best ways to speed up your research and development is to checkpoint your work so that you can reuse the portions of your pipeline that don't change. Fortunately, as long as you don't use lambda expressions in your scikit-learn pipelines, you should be able to pickle and reload your pipelines; furthermore, the resulting matrix output can be pickled and reloaded as well. However, before we get to demonstrate impressive reuse, we typically have to deal with dirty data.
## Dealing with dirty data
A good rule of thumb is: if you don't think your data is dirty, you're probably not looking at it.
Let's preprocess the Latin Library corpus and show some transformations that can be done to auto correct some data quality issues.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import datetime
import logging
import multiprocessing
from datetime import datetime
from scipy import sparse
import json
import joblib
import pickle
from tqdm import tqdm
import numpy as np
from cltk.corpus.readers import get_corpus_reader, assemble_corpus
from cltk.prosody.latin.string_utils import punctuation_for_spaces_dict
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.prosody.latin.scansion_constants import ScansionConstants
from cltk.tokenize.word import WordTokenizer
from sklearn.pipeline import Pipeline
from cltk.corpus.latin.latin_library_corpus_types import corpus_directories_by_type, corpus_texts_by_type
from sklearn.preprocessing import FunctionTransformer
from cltk.utils.matrix_corpus_fun import (
    separate_camel_cases,
    splice_hyphens,
    drop_empty_lists,
    drop_non_lower,
    drop_probable_entities,
    drop_editorial,
    drop_arabic_numeric,
    drop_all_caps,
    jv_transform,
    accept_editorial,    
    drop_enclitics ,
    drop_fringe_punctuation, 
    divide_separate_words,
    drop_all_punctuation,
    drop_short_sentences)

## Add our common library to the path and load functions

In [3]:

import sys
import inspect
from pathlib import Path 
currentdir = Path.cwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 
from mlyoucanuse.featurize_text_fun import featurize, vectorize_features
from mlyoucanuse.smart_lower_transformer import SmartLowerTransformer
from mlyoucanuse.trie_transformer import TrieTransformer
from mlyoucanuse.featurize_text_fun import word_to_features
from mlyoucanuse.matrix_fun import run_length_encoding, extract_words, patch_cluster_holes, merge_words


In [4]:
LOG = logging.getLogger('preprocess_corpus')
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

In [None]:
work_dir = currentdir

## Configure the Smart Lower transformer and show how it's used

In [None]:
smart_lower_transformer = SmartLowerTransformer(lower_only_file=os.path.join(work_dir, 'latin.words.always.lower.txt' ))
smart_lower_transformer.transform([['Leuissima', 'virum', 'cano'],
                                        ['perlucent', 'Arenas', 'stuff']])

[['leuissima', 'virum', 'cano'], ['perlucent', 'Arenas', 'stuff']]

## Configure the Trie Transformer and show how it's used to autocorrect corrupt text

In [None]:

filename = 'latin_word_trie.pkl'
trie_transformer = TrieTransformer(trie_file=filename)
trie_transformer.transform([
    # actually found in latin library
    ['maturitatemperueniunt'],
    ['radicibussubministres'],
    ['peregrinationeshabere'],
    ['uersibusdisertissimis'],
    ['crudelitatisconsuetudinem'],
    ['adiciebatcontrahendam'],
    ['translationesinprobas']     
])


[['maturitatem', 'perueniunt'],
 ['radicibus', 'subministres'],
 ['peregrinationes', 'habere'],
 ['uersibus', 'disertissimis'],
 ['crudelitatis', 'consuetudinem'],
 ['adiciebat', 'contrahendam'],
 ['translationes', 'inprobas']]

## Load our classifier from another notebook

In [None]:
# Load the classifier stored next to the sibling 'detecting_loanwords' notebook.
# It is used by drop_greek() below to predict a label for each word.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load files you trust.
# NOTE(review): the '0.20.2' in the filename presumably records the scikit-learn version
# the model was saved with; confirm before loading it under a different version.
greek_cls = None
with open(os.path.join('../detecting_loanwords', 'is_transliterated_greek.mdl.0.20.2.joblib'), 'rb') as reader:
    greek_cls = joblib.load(reader)

## Load our text processing pipeline

In [None]:
# Load the text processing pipeline stored next to the sibling 'detecting_loanwords' notebook.
# drop_greek() below runs every sentence through it before featurizing the words.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load files you trust.
# NOTE(review): the '0.20.2' in the filename presumably records the scikit-learn version
# the pipeline was saved with; confirm before loading it under a different version.
process_latin_text_pipeline = None
with open(os.path.join('../detecting_loanwords', 'process_latin_text_pipeline.0.20.2.joblib'), 'rb') as reader:
    process_latin_text_pipeline = joblib.load(reader)

## Create a custom function for a transformer that uses our classifier

In [None]:

def drop_greek(string_matrix, max_len=25):
    """
    Keep only the words of each sentence that the loaded classifier labels 0.

    Every sentence is passed through ``process_latin_text_pipeline``; each
    resulting word is featurized with ``word_to_features`` and labelled by
    ``greek_cls``. The label array is post-processed by ``patch_cluster_holes``
    and the words whose label is 0 are retained (presumably 0 means
    "not transliterated Greek" -- confirm against the classifier's training).

    :param string_matrix: iterable of sentences, each a sequence of words
    :param max_len: forwarded to ``word_to_features`` for every word
    :return: list of filtered sentences; a sentence whose processed form is
        empty or has fewer than two words is omitted from the result entirely
    """
    results = []
    # The detected-Greek report is only ever emitted at DEBUG level, so skip
    # building it (run-length encoding + extraction + merge) when DEBUG is off.
    debug_enabled = LOG.isEnabledFor(logging.DEBUG)
    for sentence in string_matrix:
        # NOTE(review): fit_transform re-fits the loaded pipeline on every single
        # sentence; if any of its steps is stateful, transform() is probably what
        # is wanted here -- confirm against the pipeline's definition.
        unseen_X = process_latin_text_pipeline.fit_transform([sentence])
        if not (unseen_X and len(unseen_X[0]) > 1):
            continue
        features = np.array([word_to_features(word, max_len)
                             for sent in unseen_X
                             for word in sent])
        arr = greek_cls.predict(sparse.csr_matrix(features))
        arr = patch_cluster_holes(arr)
        purified_words = [word for idx, word in enumerate(unseen_X[0]) if arr[idx] == 0]
        if debug_enabled:
            found_greek = merge_words(extract_words(unseen_X[0], *run_length_encoding(arr)))  # works with sent
            if found_greek:
                LOG.debug(found_greek)
                LOG.debug('purified words %s', purified_words)
        results.append(purified_words)
    return results

## Create a helper transformer

In [None]:
def verify_sentence_matrix(string_matrix):
    """
    Pass-through pipeline step that sanity-checks a sentence matrix.

    Logs a warning for every truthy word that is not a ``str`` and, once all
    sentences have been consumed, logs the number of sentences at INFO level.
    Nothing is filtered or modified.

    :param string_matrix: iterable of sentences, each an iterable of words
    :return: list containing the same sentence objects, in the same order
    """
    results = []
    for sentence in string_matrix:
        for word in sentence:
            # Falsy entries (e.g. empty strings) are deliberately not reported.
            if word and not isinstance(word, str):
                LOG.warning('fail, expected word as string: %s' , word)
        results.append(sentence)
    LOG.info('X size: %s', len(results))
    return results
    

In [None]:
# Cleaning pipeline that is applied to the corpus sentences further down
# (model.fit_transform). Steps run in the order listed.
# validate=False is passed to every FunctionTransformer -- presumably so that
# scikit-learn does not try to coerce the nested lists of strings into a
# numeric array (confirm against the FunctionTransformer documentation).
model = Pipeline([
    # verify / verify2 / verify3 are pass-through checkpoints: they log a warning
    # for non-string words and the sentence count at that point in the pipeline.
    ('verify', FunctionTransformer(verify_sentence_matrix, validate=False)),
    ('jv_transform', FunctionTransformer(jv_transform, validate=False)),
    ('drop_editorial', FunctionTransformer(drop_editorial, validate=False)),    
    ('drop_enclitics', FunctionTransformer(drop_enclitics, validate=False)),
    ('drop_fringe_punctuation', FunctionTransformer(drop_fringe_punctuation, validate=False)),
    # The two transformer instances configured and demonstrated in earlier cells.
    ('smart_lower', smart_lower_transformer),
    ('trier', trie_transformer),
    ('drop_non_lower', FunctionTransformer(drop_non_lower, validate=False)),
    ('drop_arabic_numeric', FunctionTransformer(drop_arabic_numeric, validate=False)),
    # Classifier-based filter defined in this notebook; it omits sentences whose
    # processed form has fewer than two words, so the count can shrink here.
    ('drop_greek', FunctionTransformer(drop_greek, validate=False)),
    ('verify2', FunctionTransformer(verify_sentence_matrix, validate=False)),
    ('drop_all_caps', FunctionTransformer(drop_all_caps, validate=False)),
    ('verify3', FunctionTransformer(verify_sentence_matrix, validate=False)),
    ('drop_probable_entities', FunctionTransformer(drop_probable_entities, validate=False)),
    ('drop_empty_lists', FunctionTransformer(drop_empty_lists, validate=False)),
    ('drop_short_sentences', FunctionTransformer(drop_short_sentences, validate=False)) 
])

# TODO incorporate above into the more refined pipeline below

# Alternative pipeline built only from the cltk matrix_corpus_fun helpers.
# Compared with `model` it has no verify, smart_lower, trier or drop_greek
# steps; it adds separate_camel_cases, splice_hyphens, drop_all_punctuation and
# divide_separate_words, and uses accept_editorial where `model` uses
# drop_editorial. It is defined here but not applied in the cells shown.
process_text_model = Pipeline([
#     ('fix_text', FunctionTransformer(fix_text, validate=False)),
    ('correct_camel_cases', FunctionTransformer(separate_camel_cases, validate=False)),
    ('splice_hyphens', FunctionTransformer(splice_hyphens, validate=False)),
    ('jv_transform', FunctionTransformer(jv_transform, validate=False)),  
    ('accept_editorial', FunctionTransformer(accept_editorial, validate=False)),    
    ('drop_enclitics', FunctionTransformer(drop_enclitics, validate=False)),
    ('drop_fringe_punctuation', FunctionTransformer(drop_fringe_punctuation, validate=False)),
    ('drop_all_punctuation', FunctionTransformer(drop_all_punctuation, validate=False)),    
    ('drop_non_lower', FunctionTransformer(drop_non_lower, validate=False)),
    ('drop_arabic_numeric', FunctionTransformer(drop_arabic_numeric, validate=False)),
    ('drop_all_caps', FunctionTransformer(drop_all_caps, validate=False)),
    ('divide_separate_words', FunctionTransformer(divide_separate_words, validate=False)),    
    # Normally, for word vector building we would want to do the next step
#     ('drop_probable_entities', FunctionTransformer(drop_probable_entities, validate=False)),
    ('drop_empty_lists', FunctionTransformer(drop_empty_lists, validate=False)),
    ('drop_short_sentences', FunctionTransformer(drop_short_sentences, validate=False)) 
])


In [None]:
corpus_reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
# corpus_reader._fileids = ['pliny.ep1.txt']   # ['catullus.txt']

In [None]:
#X = model.fit_transform(tqdm(list(corrected_reader.sents())))
X = model.fit_transform(tqdm(list(corpus_reader.sents())))
len(X)

100%|██████████| 1038668/1038668 [00:02<00:00, 484467.19it/s]
INFO : X size: 1038668


In [None]:
len(X)

In [None]:
X[:10]

## Inspecting the Output
The pipeline log statements show a few items that were not handled properly, such as improperly joined words that the Trie Transformer couldn't correct. However, one of them, "perseuerantiamtantasustinentem", is the result of several words being improperly joined ("perseuerant", "iam", "tanta", "sustinentem"), and trying to automatically recover from that level of textual corruption would strain credulity.

In [None]:
block=10 

In [None]:
X[block:block+10 ]


In [None]:
block +=100 

In [None]:
# X[:100]

In [None]:
def get_unique_words(X):
    """
    Gather the distinct words that occur anywhere in a sentence matrix.

    :param X: iterable of sentences, each an iterable of hashable words
    :return: set holding every word seen at least once
    """
    return {token for row in X for token in row}

corpus_words = get_unique_words(X)
print(len(corpus_words))
# catullus before dropping greek: 5235

 ## Save the preprocessed corpus so it can be assessed, and reused

In [None]:
# Write the processed corpus inside a context manager so the file handle is
# flushed and closed deterministically, even if pickling raises part-way through.
with open('latin.corpus.X.processed.pkl', 'wb') as writer:
    pickle.dump(X, writer)