In [74]:
import os
import pickle
import json
from collections import defaultdict
from datetime import datetime


from cltk.corpus.readers import FilteredPlaintextCorpusReader
from cltk.corpus.readers import  get_corpus_reader
from cltk.corpus.latin.latin_library_corpus_types import  corpus_directories_by_type, corpus_texts_by_type  

from cltk.prosody.latin.string_utils import punctuation_for_spaces_dict
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.prosody.latin.scansion_constants import ScansionConstants
from cltk.tokenize.word import WordTokenizer
import numpy as np
from sklearn.preprocessing.label import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier


from doc2tokens_transformer import Doc2TokensTransformer



In [75]:
reader =get_corpus_reader('latin_text_latin_library')
#root, FILEIDS, word_tokenizer=word_tokenizer, sent_tokenizer=sent_tokenizer)

ALL_FILE_IDS = list(reader.fileids() )

print(ALL_FILE_IDS[:5])

['12tables.txt', '1644.txt', 'abbofloracensis.txt', 'abelard/dialogus.txt', 'abelard/epistola.txt']


In [76]:
print(list(reader.docs(ALL_FILE_IDS[2]))[0][200:300])


opo Dunstano, vere moribus et aetate maturo, Abbo Floriacensis monachus levita, etsi indignus, a Chr


# Label some data, but not all
 


In [77]:

corpus_directories_by_type = {

    'republican': [
        './caesar',
        './lucretius',
        './nepos',
        './cicero'
    ],
    'augustan': [
        './livy',
        './ovid',
        './horace',
        './vergil',
        './hyginus',
    ],
    'early_silver': [
        './martial',
        './juvenal',
        './tacitus',
        './lucan',
        './quintilian',
        './sen',
        './statius',
        './silius',
        './columella'
    ],
    'late_silver': [
        './suetonius',
        './gellius',
        './apuleius'
        './justin',
        './apicius',
        './fulgentius',
        './orosius',
    ],
    'old': [
        './plautus'
    ],
    'christian': [
        './ambrose',
        './abelard',
        './alcuin',
        './augustine',
        './bede',
        './bible',
        './cassiodorus',
        './commodianus',
        './gregorytours',
        './hugo',
        './isidore',
        './jerome',
        './prudentius',
        './tertullian',
        './kempis',
        './leothegreat',
    ],
    'medieval': [
        './boethiusdacia',
        './dante',
    ],
    'renaissance': [
    ],
    'neo_latin': [
        './addison',
        './bacon',
        './bultelius',
        './descartes',
        './erasmus',
        './galileo',
        './kepler',
        './may',
        './melanchthon',
        './xylander',
        './campion',
    ]
}


#### by text


corpus_texts_by_type = {
    'republican': [
        'sall.1.txt',
        'sall.2.txt',
        'sall.cotta.txt',
        'sall.ep1.txt',
        'sall.ep2.txt',
        'sall.frag.txt',
        'sall.invectiva.txt',
        'sall.lep.txt',
        'sall.macer.txt',
        'sall.mithr.txt',
        'sall.phil.txt',
        'sall.pomp.txt',
        'varro.frag.txt',
        'varro.ll10.txt',
        'varro.ll5.txt',
        'varro.ll6.txt',
        'varro.ll7.txt',
        'varro.ll8.txt',
        'varro.ll9.txt',
        'varro.rr1.txt',
        'varro.rr2.txt',
        'varro.rr3.txt',
        'sulpicia.txt',
    ],
    'augustan': [
        'resgestae.txt',
        'resgestae1.txt',
        'manilius1.txt',
        'manilius2.txt',
        'manilius3.txt',
        'manilius4.txt',
        'manilius5.txt',
        'catullus.txt',
        'vitruvius1.txt',
        'vitruvius10.txt',
        'vitruvius2.txt',
        'vitruvius3.txt',
        'vitruvius4.txt',
        'vitruvius5.txt',
        'vitruvius6.txt',
        'vitruvius7.txt',
        'vitruvius8.txt',
        'vitruvius9.txt',
        'propertius1.txt',
        'tibullus1.txt',
        'tibullus2.txt',
        'tibullus3.txt',
    ],
    'early_silver': [
        'pliny.ep1.txt',
        'pliny.ep10.txt',
        'pliny.ep2.txt',
        'pliny.ep3.txt',
        'pliny.ep4.txt',
        'pliny.ep5.txt',
        'pliny.ep6.txt',
        'pliny.ep7.txt',
        'pliny.ep8.txt',
        'pliny.ep9.txt',
        'pliny.nh1.txt',
        'pliny.nh2.txt',
        'pliny.nh3.txt',
        'pliny.nh4.txt',
        'pliny.nh5.txt',
        'pliny.nhpr.txt',
        'pliny.panegyricus.txt',
        'petronius1.txt',
        'petroniusfrag.txt',
        'persius.txt',
        'phaedr1.txt',
        'phaedr2.txt',
        'phaedr3.txt',
        'phaedr4.txt',
        'phaedr5.txt',
        'phaedrapp.txt',
        'seneca.contr1.txt',
        'seneca.contr10.txt',
        'seneca.contr2.txt',
        'seneca.contr3.txt',
        'seneca.contr4.txt',
        'seneca.contr5.txt',
        'seneca.contr6.txt',
        'seneca.contr7.txt',
        'seneca.contr8.txt',
        'seneca.contr9.txt',
        'seneca.fragmenta.txt',
        'seneca.suasoriae.txt',
        'valeriusflaccus1.txt',
        'valeriusflaccus2.txt',
        'valeriusflaccus3.txt',
        'valeriusflaccus4.txt',
        'valeriusflaccus5.txt',
        'valeriusflaccus6.txt',
        'valeriusflaccus7.txt',
        'valeriusflaccus8.txt',
        'valmax1.txt',
        'valmax2.txt',
        'valmax3.txt',
        'valmax4.txt',
        'valmax5.txt',
        'valmax6.txt',
        'valmax7.txt',
        'valmax8.txt',
        'valmax9.txt',
        'vell1.txt',
        'vell2.txt',
    ],
    'late_silver': [
    ],
    'old': [
        '12tables.txt',
        'ter.adel.txt',
        'ter.andria.txt',
        'ter.eunuchus.txt',
        'ter.heauton.txt',
        'ter.hecyra.txt',
        'ter.phormio.txt',
        'andronicus.txt',
        'enn.txt',
    ],
    'medieval': [
        'anselmepistula.txt',
        'anselmproslogion.txt',
        'carm.bur.txt',
    ],
    'christian': [
        'anon.martyrio.txt',
        'benedict.txt',
        'berengar.txt',
        'bernardclairvaux.txt',
        'bernardcluny.txt',
        'bonaventura.itinerarium.txt',
        'creeds.txt',
        'decretum.txt',
        'diesirae.txt',
        'egeria.txt',
        'ennodius.txt',
        'eucherius.txt',
        'eugippius.txt',
        'greg.txt',
        'gregory.txt',
        'gregory7.txt',
        'hydatius.txt',
        'hymni.txt',
        'innocent.txt',
        'hydatius.txt',
        'junillus.txt',
        'lactantius.txt',
        'liberpontificalis.txt',
        'macarius.txt',
        'macarius1.txt',
        'novatian.txt',
        'papal.txt',
        'paulinus.poemata.txt',
        'perp.txt',
        'professio.txt',
        'prosperus.txt',
        'regula.txt',
        'sedulius.txt',
        'sulpiciusseverus.txt',
        'vorag.txt',
    ],
    'renaissance': [
        'petrarch.ep1.txt',
        'petrarch.numa.txt',
        'petrarch.rom.txt',
    ],
    'neo_latin': [
        'spinoza.ethica1.txt',
        'spinoza.ethica2.txt',
        'spinoza.ethica3.txt',
        'spinoza.ethica4.txt',
        'spinoza.ethica5.txt'
    ]
}



#### The following were directories that weren't obviously divisible into periods based on the web site/file directory layout.
 
	./alanus',
	'./albertanus',
	'./albertofaix',
	'./aquinas',
	'./ammianus',
	'./arnobius',
	'./capellanus',
	'./cato',
	'./claudian',
	'./curtius',
	'./eutropius',
	'./frontinus',
	'./gestafrancorum',
	'./justinian',
	'./lactantius',
	'./martinbraga',
	'./mirandola',
	'./ottofreising',
	'./pauldeacon',
	'./sha',
	'./theodosius',
	'./voragine',
	'./walter',
	'./williamtyre',
 

In [78]:


CLEAN_IDS_TYPES = []

for key, valuelist in corpus_texts_by_type.items():
    for value in valuelist:
        if value in ALL_FILE_IDS:
            CLEAN_IDS_TYPES.append((value, key))

for key, valuelist in corpus_directories_by_type.items():
    for value in valuelist:
        corrected_dir = value.replace('./', '')
        corrected_dir = '{}/'.format(corrected_dir)
        for name in ALL_FILE_IDS:
            if name.startswith(corrected_dir):
                CLEAN_IDS_TYPES.append((name, key))

CLEAN_IDS_TYPES.sort(key=lambda x: x[0])
fileid_names, categories = zip(*CLEAN_IDS_TYPES)

reader._fileids = fileid_names

print('Original file list: %s ' % len(ALL_FILE_IDS))
print('Corrected file list: %s' % len(fileid_names))


Original file list: 2141 
Corrected file list: 1278


In [79]:
DOCS_TO_CLASSIFY = list(set(ALL_FILE_IDS) ^ set(fileid_names))
print('Docs to classify: %s ' % len(DOCS_TO_CLASSIFY))


Docs to classify: 863 


In [80]:

label_encoder = LabelEncoder()
y = label_encoder.fit_transform( np.array(categories).ravel())
print(y)

[6 1 1 ... 0 0 5]


In [81]:
print(label_encoder.classes_)

['augustan' 'christian' 'early_silver' 'late_silver' 'medieval'
 'neo_latin' 'old' 'renaissance' 'republican']


In [82]:
def identity (data):
    """Identity, as in math; you know, do nothing, just be yourself."""
    return data


In [83]:
dummyX = [
    ['The quick brown fox. The lazy dog.'],
    ['Waiting for godot. Looking for the sunshine.']
]

newX = Doc2TokensTransformer().transform(dummyX)
print(list(newX))

[['The', 'quick', 'brow', 'fox', 'The', 'lazy', 'dog'], ['Waiting', 'for', 'godot', 'Looking', 'for', 'the', 'sunshine']]


In [None]:

X = list(reader.docs(reader.fileids()))
print(len(X))

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# With a larger labelled data set our test size would be higher, but lower here is okay
# because we just want to see some differences among classifiers

In [65]:
print('X_train size: {}, X_test size: {}, y_train size: {}, y_test size: {}'.format(len(X_train),
                                                                                    len(X_test),
                                                                                    len(y_train),
                                                                                    len(y_test)))


856
422
856
422


In [68]:
def train(classifier, X_train, X_test, y_train, y_test):
    classifier.fit(X_train, y_train)
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier

trial1 = Pipeline([
    ('normalizer', Doc2TokensTransformer()),
    ('vectorizer', TfidfVectorizer(
        analyzer='word',
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None)),
    ('classifier', MultinomialNB())
])

train(trial1, X_train, X_test, y_train, y_test)

Accuracy: 0.7109004739336493


Pipeline(memory=None,
     steps=[('normalizer', TextNormalizer(language=None, valid_chars=None)), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...True, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [69]:
def model_selection(X_train, y_train, X_test, y_test, estimator):
    """
    Test various estimators.
    """

    model = Pipeline([
        ('normalizer', Doc2TokensTransformer()),
        ('vectorizer', TfidfVectorizer(
            analyzer='word',
            tokenizer=identity,
            preprocessor=identity,
            token_pattern=None)),
        ('classifier', estimator())
    ])
    model.fit(X_train, y_train)
    return ("Accuracy: %s" % model.score(X_test, y_test))


In [71]:
classifiers = [LinearSVC, SVC, KNeighborsClassifier, LogisticRegressionCV,
        LogisticRegression, SGDClassifier, BaggingClassifier,
        ExtraTreesClassifier, RandomForestClassifier]

for cls in classifiers:
    print('testing %s %s' % (str(cls), str(datetime.now())))
    print(model_selection(X_train, y_train, X_test, y_test, cls))
print('done %s' %  str(datetime.now()))

testing <class 'sklearn.svm.classes.LinearSVC'> 2018-08-31 23:37:25.833036


Accuracy: 0.7109004739336493
testing <class 'sklearn.svm.classes.SVC'> 2018-09-01 00:00:07.021422


Accuracy: 0.6824644549763034
testing <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 2018-09-01 00:22:11.119004


Accuracy: 0.5924170616113744
testing <class 'sklearn.linear_model.logistic.LogisticRegressionCV'> 2018-09-01 00:43:54.351014




Accuracy: 0.7061611374407583
testing <class 'sklearn.linear_model.logistic.LogisticRegression'> 2018-09-01 01:05:26.698582


Accuracy: 0.7109004739336493
testing <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> 2018-09-01 01:26:56.212475




Accuracy: 0.7014218009478673
testing <class 'sklearn.ensemble.bagging.BaggingClassifier'> 2018-09-01 01:48:23.356373


Accuracy: 0.7061611374407583
testing <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> 2018-09-01 02:10:04.528566


Accuracy: 0.7109004739336493
testing <class 'sklearn.ensemble.forest.RandomForestClassifier'> 2018-09-01 02:31:35.841899


Accuracy: 0.7061611374407583
done 2018-09-01 02:53:18.508641


In [39]:
# Train all the data using a reasonable winner
model = Pipeline([
    ('normalizer', Doc2TokensTransformer()),
    ('vectorizer', TfidfVectorizer(
        analyzer='word',
        tokenizer=identity,
        preprocessor=identity,
        token_pattern=None)),
    ('classifier', SGDClassifier())
])

model.fit(X, y)



Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function i...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [40]:
unclassified_reader = FilteredPlaintextCorpusReader(root, DOCS_TO_CLASSIFY, word_tokenizer=None,
                                                    sent_tokenizer=None)
to_classify_X = list(unclassified_reader.docs(unclassified_reader.fileids()))
print('done %s ' % datetime.now())


done 2018-08-31 19:21:17.391585 


In [None]:
new_labels = model.predict(to_classify_X)
print (new_labels)

In [47]:
new_cats = defaultdict(list)
for idx, filename in enumerate(DOCS_TO_CLASSIFY):
    new_cats[label_encoder.classes_[new_labels[idx]]].append(filename)

print(new_cats)


defaultdict(<class 'list'>, {'christian': ['martinbraga/superbia.txt', 'aquinas/q1.14.txt', 'innocent1.txt', 'aquinas/q1.19.txt', 'justinian/codex12.txt', 'foedusaeternum.txt', 'theodosius/theod16.txt', 'williamtyre/9.txt', 'sedulius3.txt', 'ottofreising/2.txt', 'aquinas/q1.38.txt', 'theodosius/theod14.txt', 'justinian/institutes2.txt', 'aquinas/q1.64.txt', 'capellanus/capellanus2.txt', 'mirabilia.txt', 'fulbert.txt', 'albertanus/albertanus2.txt', 'aquinas/q1.55.txt', 'martinbraga/pascha.txt', 'albertanus/albertanus.sermo3.txt', 'thomasedessa.txt', 'adso.txt', 'aquinas/q1.69.txt', 'albertanus/albertanus.arsloquendi.txt', 'voragine/chris.txt', 'baldo.txt', 'sidonius7.txt', 'voragine/alexio.txt', 'vegetius4.txt', 'gestafrancorum/gestafrancorum2.txt', 'newton.capita.txt', 'theodosius/theod05.txt', 'williamtyre/19.txt', 'egeria2.txt', 'apuleius/apuleius9.txt', 'ammianus/26.txt', 'justinian/institutes1.txt', 'aquinas/q1.51.txt', 'pauldeacon/hist3.txt', 'voragine/andrea.txt', 'aquinas/q1.32.

In [50]:
with open(os.path.expanduser('~/projects/mycltk/mycltk_experiments/new_cats.json'), mode='w',
          encoding='utf8') as writer:
    json.dump(new_cats, writer, indent=2)


In [52]:
with open(os.path.expanduser('~/projects/mycltk/mycltk_experiments/latin_text_classifier.mdl.pkl')
        , 'wb') as writer:
    pickle.dump(new_cats, writer)
