# Boostrapping Document Classification

In [1]:
%load_ext autoreload
%autoreload 2
%doctest_mode on
%matplotlib inline
import warnings
warnings.simplefilter('ignore') # quiet warnings for presentation purposes only

Exception reporting mode: Plain
Doctest mode is: ON


In [None]:
import os
import pickle
import json
import time
from collections import defaultdict
from datetime import datetime

import joblib
from tqdm import tqdm

from cltk.corpus.readers import FilteredPlaintextCorpusReader, get_corpus_reader
# from cltk.corpus.latin.latin_library_corpus_types import  corpus_directories_by_type, corpus_texts_by_type  

from cltk.prosody.latin.string_utils import punctuation_for_spaces_dict
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.prosody.latin.scansion_constants import ScansionConstants
from cltk.tokenize.word import WordTokenizer
import numpy as np
from sklearn.preprocessing.label import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

### Add parent directory to path so we can access our common code

In [None]:
import os, sys, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [2]:
from mlyoucanuse.doc2tokens_transformer import Doc2TokensTransformer
from mlyoucanuse.corpus_fun import get_file_type_list 

In [4]:
reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
ALL_FILE_IDS = list(reader.fileids() )
print(f'All file ids: {len(ALL_FILE_IDS)} e.g.: {ALL_FILE_IDS[:5]}')
print(f'Random sample: {list(reader.docs(ALL_FILE_IDS[2]))[0][200:300]}')

Random sample: opo Dunstano, vere moribus et aetate maturo, Abbo Floriacensis monachus levita, etsi indignus, a Chr


# Label some data, but not all 
#### requires some expertise, such as consulting:
#### https://en.wikipedia.org/wiki/Classical_Latin 
#### https://en.wikipedia.org/wiki/Latin_literature
### Create a dictionary of Category Types and a List of Instances, e.g.:

In [5]:

corpus_directories_by_type = {

    'republican': [
        './caesar',
        './lucretius',
        './nepos',
        './cicero'
    ],
    'augustan': [
        './livy',
        './ovid',
        './horace',
        './vergil',
        './hyginus',
    ],
    'early_silver': [
        './martial',
        './juvenal',
        './tacitus',
        './lucan',
        './quintilian',
        './sen',
        './statius',
        './silius',
        './columella'
    ],
    'late_silver': [
        './suetonius',
        './gellius',
        './apuleius'
        './justin',
        './apicius',
        './fulgentius',
        './orosius',
    ],
    'old': [
        './plautus'
    ],
    'christian': [
        './ambrose',
        './abelard',
        './alcuin',
        './augustine',
        './bede',
        './bible',
        './cassiodorus',
        './commodianus',
        './gregorytours',
        './hugo',
        './isidore',
        './jerome',
        './prudentius',
        './tertullian',
        './kempis',
        './leothegreat',
    ],
    'medieval': [
        './boethiusdacia',
        './dante',
    ],
    'renaissance': [
    ],
    'neo_latin': [
        './addison',
        './bacon',
        './bultelius',
        './descartes',
        './erasmus',
        './galileo',
        './kepler',
        './may',
        './melanchthon',
        './xylander',
        './campion',
    ]
}


#### by text


corpus_texts_by_type = {
    'republican': [
        'sall.1.txt',
        'sall.2.txt',
        'sall.cotta.txt',
        'sall.ep1.txt',
        'sall.ep2.txt',
        'sall.frag.txt',
        'sall.invectiva.txt',
        'sall.lep.txt',
        'sall.macer.txt',
        'sall.mithr.txt',
        'sall.phil.txt',
        'sall.pomp.txt',
        'varro.frag.txt',
        'varro.ll10.txt',
        'varro.ll5.txt',
        'varro.ll6.txt',
        'varro.ll7.txt',
        'varro.ll8.txt',
        'varro.ll9.txt',
        'varro.rr1.txt',
        'varro.rr2.txt',
        'varro.rr3.txt',
        'sulpicia.txt',
    ],
    'augustan': [
        'resgestae.txt',
        'resgestae1.txt',
        'manilius1.txt',
        'manilius2.txt',
        'manilius3.txt',
        'manilius4.txt',
        'manilius5.txt',
        'catullus.txt',
        'vitruvius1.txt',
        'vitruvius10.txt',
        'vitruvius2.txt',
        'vitruvius3.txt',
        'vitruvius4.txt',
        'vitruvius5.txt',
        'vitruvius6.txt',
        'vitruvius7.txt',
        'vitruvius8.txt',
        'vitruvius9.txt',
        'propertius1.txt',
        'tibullus1.txt',
        'tibullus2.txt',
        'tibullus3.txt',
    ],
    'early_silver': [
        'pliny.ep1.txt',
        'pliny.ep10.txt',
        'pliny.ep2.txt',
        'pliny.ep3.txt',
        'pliny.ep4.txt',
        'pliny.ep5.txt',
        'pliny.ep6.txt',
        'pliny.ep7.txt',
        'pliny.ep8.txt',
        'pliny.ep9.txt',
        'pliny.nh1.txt',
        'pliny.nh2.txt',
        'pliny.nh3.txt',
        'pliny.nh4.txt',
        'pliny.nh5.txt',
        'pliny.nhpr.txt',
        'pliny.panegyricus.txt',
        'petronius1.txt',
        'petroniusfrag.txt',
        'persius.txt',
        'phaedr1.txt',
        'phaedr2.txt',
        'phaedr3.txt',
        'phaedr4.txt',
        'phaedr5.txt',
        'phaedrapp.txt',
        'seneca.contr1.txt',
        'seneca.contr10.txt',
        'seneca.contr2.txt',
        'seneca.contr3.txt',
        'seneca.contr4.txt',
        'seneca.contr5.txt',
        'seneca.contr6.txt',
        'seneca.contr7.txt',
        'seneca.contr8.txt',
        'seneca.contr9.txt',
        'seneca.fragmenta.txt',
        'seneca.suasoriae.txt',
        'valeriusflaccus1.txt',
        'valeriusflaccus2.txt',
        'valeriusflaccus3.txt',
        'valeriusflaccus4.txt',
        'valeriusflaccus5.txt',
        'valeriusflaccus6.txt',
        'valeriusflaccus7.txt',
        'valeriusflaccus8.txt',
        'valmax1.txt',
        'valmax2.txt',
        'valmax3.txt',
        'valmax4.txt',
        'valmax5.txt',
        'valmax6.txt',
        'valmax7.txt',
        'valmax8.txt',
        'valmax9.txt',
        'vell1.txt',
        'vell2.txt',
    ],
    'late_silver': [
    ],
    'old': [
        '12tables.txt',
        'ter.adel.txt',
        'ter.andria.txt',
        'ter.eunuchus.txt',
        'ter.heauton.txt',
        'ter.hecyra.txt',
        'ter.phormio.txt',
        'andronicus.txt',
        'enn.txt',
    ],
    'medieval': [
        'anselmepistula.txt',
        'anselmproslogion.txt',
        'carm.bur.txt',
    ],
    'christian': [
        'anon.martyrio.txt',
        'benedict.txt',
        'berengar.txt',
        'bernardclairvaux.txt',
        'bernardcluny.txt',
        'bonaventura.itinerarium.txt',
        'creeds.txt',
        'decretum.txt',
        'diesirae.txt',
        'egeria.txt',
        'ennodius.txt',
        'eucherius.txt',
        'eugippius.txt',
        'greg.txt',
        'gregory.txt',
        'gregory7.txt',
        'hydatius.txt',
        'hymni.txt',
        'innocent.txt',
        'hydatius.txt',
        'junillus.txt',
        'lactantius.txt',
        'liberpontificalis.txt',
        'macarius.txt',
        'macarius1.txt',
        'novatian.txt',
        'papal.txt',
        'paulinus.poemata.txt',
        'perp.txt',
        'professio.txt',
        'prosperus.txt',
        'regula.txt',
        'sedulius.txt',
        'sulpiciusseverus.txt',
        'vorag.txt',
    ],
    'renaissance': [
        'petrarch.ep1.txt',
        'petrarch.numa.txt',
        'petrarch.rom.txt',
    ],
    'neo_latin': [
        'spinoza.ethica1.txt',
        'spinoza.ethica2.txt',
        'spinoza.ethica3.txt',
        'spinoza.ethica4.txt',
        'spinoza.ethica5.txt'
    ]
}



#### The following were directories that weren't obviously divisible into periods based on the web site/file directory layout.
 
	./alanus',
	'./albertanus',
	'./albertofaix',
	'./aquinas',
	'./ammianus',
	'./arnobius',
	'./capellanus',
	'./cato',
	'./claudian',
	'./curtius',
	'./eutropius',
	'./frontinus',
	'./gestafrancorum',
	'./justinian',
	'./lactantius',
	'./martinbraga',
	'./mirandola',
	'./ottofreising',
	'./pauldeacon',
	'./sha',
	'./theodosius',
	'./voragine',
	'./walter',
	'./williamtyre',
 

In [7]:
from cltk.corpus.readers import assemble_corpus
reader = assemble_corpus(reader, corpus_texts_by_type.keys(), corpus_directories_by_type, corpus_texts_by_type )


In [9]:

files_types = get_file_type_list(reader.fileids(), corpus_texts_by_type, corpus_directories_by_type)
fileid_names, categories = zip(*files_types)

# reader.fileids(fileid_names)
reader._fileids = fileid_names

DOCS_TO_CLASSIFY = list(set(ALL_FILE_IDS) ^ set(fileid_names))
print(f'Docs to classify: {len(DOCS_TO_CLASSIFY):,}')


print(f'Original file list: {len(ALL_FILE_IDS):,}')
print(f'Corrected file list: {len(fileid_names):,}')



Docs to classify: 863
Original file list: 2,141
Corrected file list: 1,278


In [12]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform( np.array(categories).ravel())
print(f'Y shape: {y.shape} e.g.: {y}')
print(f'Label encoder classes: {label_encoder.classes_}')

Y shape: (1278,) e.g.: [6 1 1 ... 0 0 5]
Label encoder classes: ['augustan' 'christian' 'early_silver' 'late_silver' 'medieval'
 'neo_latin' 'old' 'renaissance' 'republican']


In [13]:
def identity (data):
    """Identity, as in math; you know, do nothing, just be yourself."""
    return data

In [14]:
dummyX = [
    ['The quick brown fox. The lazy dog.'],
    ['Waiting for godot. Looking for the sunshine.']
]

newX = Doc2TokensTransformer().transform(dummyX)
print(list(newX))

[['The', 'quick', 'brow', 'fox', 'The', 'lazy', 'dog'], ['Waiting', 'for', 'godot', 'Looking', 'for', 'the', 'sunshine']]


In [15]:

X = list(reader.docs(reader.fileids()))
print(len(X))

1278


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# With a larger labelled data set our test size would be higher, but lower here is okay
# because we just want to see some differences among classifiers

In [17]:
print('X_train size: {}, X_test size: {}, y_train size: {}, y_test size: {}'.format(len(X_train),
                                                                                    len(X_test),
                                                                                    len(y_train),
                                                                                    len(y_test)))


X_train size: 1150, X_test size: 128, y_train size: 1150, y_test size: 128


In [18]:
def train(classifier, X_train, X_test, y_train, y_test):
    classifier.fit(X_train, y_train)
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier

trial1 = Pipeline([
    ('normalizer', Doc2TokensTransformer()),
    ('vectorizer', TfidfVectorizer(
        analyzer='word',
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None)),
    ('classifier', MultinomialNB())
])

train(trial1, X_train, X_test, y_train, y_test)

Accuracy: 0.6953125


Pipeline(memory=None,
     steps=[('normalizer', Doc2TokensTransformer(drop_regexes=[re.compile('[0-9]+[a-zA-Z]'), re.compile('\\s+')],
           language=None, valid_chars=None)), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...True, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
def model_selection(X_train, y_train, X_test, y_test, estimator):
    """
    Test various estimators.
    """

    model = Pipeline([
        ('normalizer', Doc2TokensTransformer()),
        ('vectorizer', TfidfVectorizer(
            analyzer='word',
            tokenizer=identity,
            preprocessor=identity,
            token_pattern=None)),
        ('classifier', estimator())
    ])
    model.fit(X_train, y_train)
    return ("Accuracy: %s" % model.score(X_test, y_test))


In [20]:
classifiers = [LinearSVC, SVC, KNeighborsClassifier, LogisticRegressionCV,
        LogisticRegression, SGDClassifier, BaggingClassifier,
        ExtraTreesClassifier, RandomForestClassifier]

for cls in tqdm(classifiers):
    print(f'Testing {str(cls)}')
    print(model_selection(X_train, y_train, X_test, y_test, cls))


  0%|          | 0/9 [00:00<?, ?it/s]

Testing <class 'sklearn.svm.classes.LinearSVC'>


 11%|█         | 1/9 [00:02<00:22,  2.76s/it]

Accuracy: 0.6796875
Testing <class 'sklearn.svm.classes.SVC'>


 22%|██▏       | 2/9 [00:05<00:19,  2.77s/it]

Accuracy: 0.6796875
Testing <class 'sklearn.neighbors.classification.KNeighborsClassifier'>


 33%|███▎      | 3/9 [00:08<00:16,  2.77s/it]

Accuracy: 0.6640625
Testing <class 'sklearn.linear_model.logistic.LogisticRegressionCV'>


 44%|████▍     | 4/9 [00:12<00:16,  3.30s/it]

Accuracy: 0.6796875
Testing <class 'sklearn.linear_model.logistic.LogisticRegression'>


 56%|█████▌    | 5/9 [00:15<00:12,  3.14s/it]

Accuracy: 0.6953125
Testing <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>


 67%|██████▋   | 6/9 [00:18<00:09,  3.02s/it]

Accuracy: 0.46875
Testing <class 'sklearn.ensemble.bagging.BaggingClassifier'>


 78%|███████▊  | 7/9 [00:21<00:05,  2.95s/it]

Accuracy: 0.6796875
Testing <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>


 89%|████████▉ | 8/9 [00:23<00:02,  2.91s/it]

Accuracy: 0.6796875
Testing <class 'sklearn.ensemble.forest.RandomForestClassifier'>


100%|██████████| 9/9 [00:26<00:00,  2.88s/it]

Accuracy: 0.6796875





In [21]:
# Train all the data using a reasonable winner
model = Pipeline([
    ('normalizer', Doc2TokensTransformer()),
    ('vectorizer', TfidfVectorizer(
        analyzer='word',
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None)),
    ('classifier', SGDClassifier())
])

model.fit(X, y)

Pipeline(memory=None,
     steps=[('normalizer', Doc2TokensTransformer(drop_regexes=[re.compile('[0-9]+[a-zA-Z]'), re.compile('\\s+')],
           language=None, valid_chars=None)), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [46]:
unclassified_reader =get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
unclassified_reader.skip_keywords = None
to_classify_X = list(unclassified_reader.docs(DOCS_TO_CLASSIFY ))
new_labels = model.predict(to_classify_X)
print (f'Shape new_labels: {new_labels.shape} e.g.: {new_labels[:5]}...')


Shape new_labels: (863,) e.g.: [1 2 0 1 2]...


In [44]:
new_cats = defaultdict(list)
for idx, filename in enumerate(DOCS_TO_CLASSIFY):
    new_cats[label_encoder.classes_[new_labels[idx]]].append(filename)
print(new_cats)


defaultdict(<class 'list'>, {'christian': ['malaterra3.txt', 'justin/27.txt', 'zonaras.txt', 'richerus1.txt', 'vegetius1.txt', 'justin/20.txt', 'sha/aurel.txt', 'gaud.txt', 'holberg.txt', 'dumdiane.txt', 'albertofaix/hist1.txt', 'gaius3.txt', 'justin/40.txt', 'williamtyre/18.txt', 'gestafrancorum/gestafrancorum6.txt', 'arnobius/arnobius6.txt', 'albertofaix/hist6.txt', 'justin/10.txt', 'thesauro.txt', 'ammianus/21.txt', 'owen.txt', 'wmconchesdogma.txt', 'sha/alexsev.txt', 'justin/23.txt', 'apuleius/apuleius.apol.txt', 'appverg.catalepton.txt', 'justin/16.txt', 'voragine/nic.txt', 'ammianus/20.txt', 'apuleius/apuleius.mundo.txt', 'gregdecretals3.txt', 'capellanus/capellanus2.txt', 'grattius.txt', 'lucernarium.txt', 'gestafrancorum/gestafrancorum7.txt', 'aus.sept.sent.txt', 'justin/15.txt', 'gestafrancorum/gestafrancorum3.txt', 'mirabilia1.txt', 'withof4.txt', 'williamtyre/1.txt', 'debury.txt', 'walter7.txt', 'walter/pastourelles.txt', 'justin/9.txt', 'williamtyre/8.txt', 'aus.mos.txt', '

In [40]:
with open('new_cats.json', mode='w', encoding='utf8') as writer:
    json.dump(new_cats, writer, indent=2)


In [41]:
with open('latin_text_classifier.mdl.pkl', 'wb') as writer:
    joblib.dump(new_cats, writer)


# Vetting the data

In [None]:
# Load the corrected labellings
from cltk.corpus.latin.latin_library_corpus_types import corpus_directories_by_type as new_corpus_type_dirs 
from cltk.corpus.latin.latin_library_corpus_types import corpus_texts_by_type as new_corpus_text_types
# diff the old and new


