In [1]:
import pickle
import re
import string

import cltk
import numpy as np
import pandas as pd
import unidecode
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.latin.stem import Stemmer
from cltk.stem.lemma import LemmaReplacer
from cltk.tokenize.line import LineTokenizer
from nltk.tokenize.punkt import PunktLanguageVars
from pymongo import MongoClient

Effort not shown here: I had to edit several `.py` files under `stem/` and `lemmatize/` in the CLTK Latin toolkit to customize and improve the algorithms, so results from a stock CLTK install may differ from the outputs below.

In [17]:
# Build a CLTK corpus importer for Latin and display the corpus names it offers.
corpus_importer = CorpusImporter('latin')
corpus_importer.list_corpora

['latin_text_perseus',
 'latin_treebank_perseus',
 'latin_text_latin_library',
 'phi5',
 'phi7',
 'latin_proper_names_cltk',
 'latin_models_cltk',
 'latin_pos_lemmata_cltk',
 'latin_treebank_index_thomisticus',
 'latin_lexica_perseus',
 'latin_training_set_sentence_cltk',
 'latin_word2vec_cltk',
 'latin_text_antique_digiliblt',
 'latin_text_corpus_grammaticorum_latinorum',
 'latin_text_poeti_ditalia']

In [3]:
# Import every corpus the importer lists. This stays best-effort (one failing
# corpus must not abort the rest), but each failure is reported instead of being
# silently discarded.
for corpus in corpus_importer.list_corpora:
    try:
        corpus_importer.import_corpus(corpus)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that a kernel interrupt
        # (KeyboardInterrupt) still stops a long-running download.
        print('Skipped corpus {!r}: {}'.format(corpus, exc))

In [2]:
# Rebind `corpus_importer` to a Greek importer and display the corpus names it offers.
corpus_importer = CorpusImporter('greek')
corpus_importer.list_corpora

['greek_software_tlgu',
 'greek_text_perseus',
 'phi7',
 'tlg',
 'greek_proper_names_cltk',
 'greek_models_cltk',
 'greek_treebank_perseus',
 'greek_lexica_perseus',
 'greek_training_set_sentence_cltk',
 'greek_word2vec_cltk',
 'greek_text_lacus_curtius',
 'greek_text_first1kgreek']

In [3]:
# Import every corpus the importer lists. This stays best-effort (one failing
# corpus must not abort the rest), but each failure is reported instead of being
# silently discarded.
for corpus in corpus_importer.list_corpora:
    try:
        corpus_importer.import_corpus(corpus)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that a kernel interrupt
        # (KeyboardInterrupt) still stops a long-running download.
        print('Skipped corpus {!r}: {}'.format(corpus, exc))

Downloaded 100% 163.52 MiB | 6.22 MiB/s 

In [2]:
def clean_text(text):
    """Reduce raw text to lowercase ASCII letters, whitespace and ``. ! ? : ;``.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode`` (drops macrons and accents);
      2. run ``JVReplacer`` (J -> I and V -> U, for consistent spelling);
      3. lowercase and turn newlines into spaces;
      4. delete every character that is not a letter, whitespace or one of
         ``. ! ? : ;``;
      5. delete whole tokens made only of the letters ``ivxlcdm``;
      6. strip leading and trailing whitespace.

    NOTE(review): step 5 is meant for Roman numerals, but it also deletes real
    words spelled only with those letters (e.g. "id"). If JVReplacer has
    already turned v into u, numerals that contained a "v" no longer match.
    Interior runs of spaces left behind by the deletions are not collapsed.
    """
    ascii_text = unidecode.unidecode(text)
    normalised = JVReplacer().replace(ascii_text).lower()

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'[\n]', ' '),
        (r'[^a-zA-Z\s\.\!\?:;]*', ''),
        (r'\b[ivxlcdm]+\b', ''),
    )
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)

    return normalised.strip()

# Line tokenizer for Latin text. The original line read `def tokenizer = ...`,
# which is a SyntaxError and stopped the whole cell (including `clean_text`)
# from being defined.
tokenizer = LineTokenizer('latin')

# TODO(review): the original cell went on to run
#     tokenized_data = tokenizer.tokenize(data)
# but no `data` is bound at this point in the notebook; the only `data` defined
# later is a Mongo collection handle, not a string. `tokenized_data` is not
# used by any visible cell, so the call is left disabled until the intended
# input text is identified.

In [4]:
# Load the previously scraped corpus. The `with` block closes the file handle,
# which the bare `open(...)` call never did.
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this project.
with open('pickles/scraped_data.pkl', 'rb') as pickle_file:
    scraped_data = pickle.load(pickle_file)

In [7]:
# Open a MongoDB client with pymongo's default connection settings, then bind
# the `latinlit` database and the two collections used below:
#   data       -> `latinlit.data`
#   split_data -> `latinlit.splitdata` (written to by gen_doc_chunks)
client = MongoClient()
db = client.latinlit
data = db.data
split_data = db.splitdata

In [15]:
def gen_doc_chunks(data, sentences_per_chunk=5, min_words=5, collection=None):
    """Split each document into fixed-size sentence chunks and store them in Mongo.

    Each document's text is cleaned with ``clean_text`` and split on the
    characters ``. ? ! : ;``. Sentences with fewer than ``min_words`` words are
    skipped. Every ``sentences_per_chunk`` kept sentences are joined with
    ``'. '`` (plus a final ``'.'``) and inserted as one
    ``{'author', 'work', 'text'}`` record. A trailing group of fewer than
    ``sentences_per_chunk`` sentences at the end of a document is dropped, so
    every stored chunk has the same number of sentences.

    Args:
        data: iterable of mappings with 'author', 'work' and 'text' keys.
        sentences_per_chunk: number of sentences per stored chunk (default 5).
        min_words: minimum number of words a sentence needs to be kept
            (default 5).
        collection: object with an ``insert_one`` method that receives the
            chunks. Defaults to the module-level ``split_data`` collection
            handle, which must be defined before calling.

    Returns:
        None. The chunks are written to the collection as a side effect.

    Raises:
        ValueError: if ``sentences_per_chunk`` is smaller than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError('sentences_per_chunk must be at least 1')

    target = split_data if collection is None else collection

    for doc in data:
        author_name = doc['author']
        work_name = doc['work']

        sentences = re.split(r'[\.\?\!:;]', clean_text(doc['text']))

        text_chunk = []
        for sentence in sentences:
            sentence = sentence.strip()
            # split() with no argument ignores runs of spaces. The previous
            # split(' ') counted the empty strings between repeated spaces as
            # words, letting too-short sentences through the filter.
            if len(sentence.split()) < min_words:
                continue
            text_chunk.append(sentence)
            if len(text_chunk) == sentences_per_chunk:
                text_chunk_str = '. '.join(text_chunk) + '.'
                target.insert_one({'author': author_name, 'work': work_name, 'text': text_chunk_str})
                text_chunk = []

In [None]:
# Chunk every document stored in the `data` collection. A pymongo Collection is
# not itself iterable (looping over it raises TypeError), so pass a cursor over
# its documents instead.
gen_doc_chunks(data.find())

In [6]:
# Preview the first five chunk documents stored in `split_data`.
list(split_data.find({}).limit(5))

[{'_id': ObjectId('5b0e0fac42f3cb02d6ebe0f4'),
  'author': 'AA VV',
  'work': 'Epistolae Confessorum Romanorum et Carthaginensium',
  'text': 'argumentum huius et sequentis epistolae habes istud in epistola cypriani. exempla quoque inquit epistolae celerini boni et robusti confessoris quam ad lucianum eumdem confessorem scripserit item quid lucianus ei rescripserit misi uobis ut scire tis et laborem circa omnia et diligentiam nostram et ueritatem ipsam disceretis. celerinus confessor quam sit timoratus et cautus et humilitate ac ti more sectae nostrae uerecundus. lucianus uero circa intelligentiam dominicae lectionis ut  minus peritus et circa inuidiam uerecundiae nostrae relinquendam facilitate sua molestus nam cum dominus dixerit in nomine patris et filii et spi ritus sancti gentes tingui et in baptismo peccata dimitti. hic praecepti et legis ignarus mandat pa cem dari et peccata dimitti in pauli nomine et hoc sibi dicit ab illo esse mandatum.'},
 {'_id': ObjectId('5b0e0fac42f3cb02d6

In [11]:
def _latin_root_tools():
    """Return the shared (lemmatizer, stemmer) pair, building it on first use."""
    tools = getattr(_latin_root_tools, '_cache', None)
    if tools is None:
        tools = (LemmaReplacer('latin'), Stemmer())
        _latin_root_tools._cache = tools
    return tools


def root_text(text):
    """Lemmatize, stem and re-clean a chunk of Latin text.

    The text is passed through the CLTK ``LemmaReplacer`` (returning a string),
    then through the CLTK ``Stemmer``, and finally through ``clean_text``.

    The lemmatizer and stemmer are created once and reused across calls instead
    of being rebuilt for every document, which matters because this function is
    called once per stored chunk.

    Args:
        text: the text to reduce to root forms.

    Returns:
        The lemmatized, stemmed and cleaned text as a string.
    """
    lemmatizer, stemmer = _latin_root_tools()
    lem_text = lemmatizer.lemmatize(text, return_string=True)
    stem_text = stemmer.stem(lem_text)
    # clean_text keeps only letters, whitespace and sentence punctuation, which
    # removes the conjugation numbers before tf-idf is run on the output.
    return clean_text(stem_text)

In [22]:
root_data = db.rootdata

# Fetch all chunks with a single query. The previous loop evaluated
# `split_data.find()[n]` for every n, i.e. a fresh skip-n query per document
# (quadratic overall), and relied on `Cursor.count()`, which PyMongo 4 removed.
split_docs = list(split_data.find({}))
num_docs = len(split_docs)

# NOTE: this cell only appends. Re-running it without emptying `rootdata` first
# stores every rooted document a second time.
for doc in split_docs:
    output_text = root_text(doc['text'])
    root_data.insert_one({'author': doc['author'], 'work': doc['work'], 'text': output_text})

In [23]:
# Show the `text` field of the first rooted document, for comparison with the
# un-rooted chunk in `split_data`.
list(root_data.find({},{'text':1}).limit(1))

[{'_id': ObjectId('5b10596542f3cb0d6ba0a3a5'),
  'text': 'argument hic et sequo epistul habe ist in epistul cypriani. exempl quoque inqu epistul celerin bon et robust confessor qui ad lucian is confessor scrib it quis lucian is rescrib mitt tu ut sci t et labor circ omn et diligent noste et uerit ipse disceretis. celerin confesso qui sum timorat et caue et humilit atque t morio sec noste uerecundus. lucian uer circ intellegent dominic lecti ut paru pere et circ inuid uerecund noste relinqu facilit su molest nam cum domin dico in nomen pate et fil et sp rit sanci ge tingu et in baptism pecc dimitto. hic praecipi et lego ignar mando p   et pecc dimitt in paulus nomen et hic sui dico ab ille edo mando.'}]

In [24]:
# Show the `text` field of the first un-rooted chunk document in `split_data`.
list(split_data.find({},{'text':1}).limit(1))

[{'_id': ObjectId('5b0e0fac42f3cb02d6ebe0f4'),
  'text': 'argumentum huius et sequentis epistolae habes istud in epistola cypriani. exempla quoque inquit epistolae celerini boni et robusti confessoris quam ad lucianum eumdem confessorem scripserit item quid lucianus ei rescripserit misi uobis ut scire tis et laborem circa omnia et diligentiam nostram et ueritatem ipsam disceretis. celerinus confessor quam sit timoratus et cautus et humilitate ac ti more sectae nostrae uerecundus. lucianus uero circa intelligentiam dominicae lectionis ut  minus peritus et circa inuidiam uerecundiae nostrae relinquendam facilitate sua molestus nam cum dominus dixerit in nomine patris et filii et spi ritus sancti gentes tingui et in baptismo peccata dimitti. hic praecepti et legis ignarus mandat pa cem dari et peccata dimitti in pauli nomine et hoc sibi dicit ab illo esse mandatum.'}]

In [25]:
# Random 90% / 10% train/test split of the rooted documents.
# A fixed seed makes the split repeatable for the same document order.
SPLIT_SEED = 42

root_data_local = list(root_data.find({}))
indices = np.arange(len(root_data_local))
split_rng = np.random.RandomState(SPLIT_SEED)
train_indices = split_rng.choice(indices, size=int(0.9*len(indices)), replace=False)

# Membership tests against a set are O(1); `i not in train_indices` scanned the
# whole numpy array for every candidate index.
train_index_set = set(train_indices.tolist())
train_data = [root_data_local[i] for i in train_indices]
test_data = [root_data_local[i] for i in range(len(root_data_local)) if i not in train_index_set]

In [26]:
# Store the train and test documents in their own collections
# (`latinlit.roottrain` and `latinlit.roottest`).
# NOTE(review): the documents were read from `rootdata` and presumably still
# carry their original `_id`, so re-running this cell against non-empty
# collections is expected to fail with duplicate-key errors. Confirm before
# re-running.
root_train = db.roottrain
root_test = db.roottest
root_train.insert_many(train_data)
root_test.insert_many(test_data)

<pymongo.results.InsertManyResult at 0x7fb883225908>

In [33]:
def _text_field(docs):
    """Return the 'text' value of every document in ``docs``, in order."""
    return list(map(lambda record: record['text'], docs))


# Plain lists of text for the full set and for each side of the split.
texts = _text_field(root_data_local)
texts_train = _text_field(train_data)
texts_test = _text_field(test_data)

In [None]:
# Persist every artefact of the split. Each file is opened in a `with` block so
# it is flushed and closed as soon as it is written; the bare `open(...)` calls
# left seven handles open.
pickle_targets = [
    (texts, 'pickles/texts.pkl'),
    (texts_train, 'pickles/texts_train.pkl'),
    (texts_test, 'pickles/texts_test.pkl'),
    (root_data_local, 'pickles/root_data.pkl'),
    (train_data, 'pickles/root_train.pkl'),
    (test_data, 'pickles/root_test.pkl'),
    (train_indices, 'pickles/train_indices.pkl'),
]
for artefact, pickle_path in pickle_targets:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artefact, pickle_file)