In [1]:
import os
import gensim
from gensim.utils import simple_preprocess
import pandas as pd

In [2]:
def preproc_corpus(path):
    """Read every ``.txt`` file directly inside ``path`` and return the contents.

    Args:
        path: Directory to scan. Sub-directories are not descended into.

    Returns:
        list[str]: One string per ``.txt`` file, ordered by file name.
    """
    texts = []
    # os.listdir() yields names in arbitrary order; sorting makes positional
    # indexing of the returned list reproducible across runs and machines.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith('.txt'):
            continue
        full_path = os.path.join(path, fname)
        # A directory whose name happens to end in '.txt' cannot be read as text.
        if not os.path.isfile(full_path):
            continue
        # Decode explicitly instead of relying on the platform default encoding,
        # which would mangle non-ASCII letters on non-UTF-8 systems.
        # NOTE(review): assumes the corpus files are stored as UTF-8 -- confirm.
        with open(full_path, encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Load every .txt file found in the current working directory ('.').
texts = preproc_corpus('.')
# Bare last expression: shows the loaded texts as the cell output.
texts

['þis his sy forewearde þe Brichtric 7 his gebedde habbað wið þa canonicas, þæt his þæt heo sculan habban heora beira dæi anes canonikes gerihte on mete 7 on eale 7 heo habbað geunnen hyra land þam canonike scær 7 saccleas, 7 æfter heora bera dæige beon þa canonikes eruename of ealre æhte 7 þis beoð þa gewitnisse, þæt is Ægelwine Brihtmeres sune 7 Leofstan 7 Hearding 7 Bruning.',
 'þis is sy forewearde þe Ægelward hafð gemacad wið þonne decanus 7 wið ealle þa gebroðre of sce Paules mynstre þæt is of ane healfe hyde landes æt Sandune þæt he sceal æfrice geare gyuen .viii. horen for ealle þinc 7 hi sculan baþa habban he 7 hys wif þa hwile þa hy lyfieð 7 æfter hyre begre dæge habba scs Paulus eall þæt hy þær æfter belæfeð þa him mid rihte to gebyrige oppan þan ilcan lande 7 þæt he habba þæt land swa swa hit nu gelogod is. 7 þis synd þa gewitnysse, þæt is se decanus 7 þa fuwer arcediacones 7 þysre gewrite syndon twa, an haben þa gebroðre 7 þæt oðer Ægelward, 7 hær wið wæs Colswegen 7 Ægelm

In [3]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Each item is coerced to ``str`` before tokenizing, and one token list is
    yielded per input item, in input order.
    """
    for raw in sentences:
        # deacc=True asks simple_preprocess to strip accent marks from tokens.
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens

In [4]:
# Materialize the tokenizer generator: one token list per loaded text.
clean_texts = [tokens for tokens in sent_to_words(texts)]
# One-column token tables for the second and the first text respectively.
clean_text_df_2 = pd.DataFrame(data=clean_texts[1])
clean_text_df_1 = pd.DataFrame(data=clean_texts[0])

In [5]:
# def load_lemm(path):
#     # https://github.com/unimorph/ang
#     lemmas = {}
#     with open(path) as f:
#         for line in f:
#             cells = line.strip().split('\t')
#             if len(cells) == 3:
#                 lemma, wordform, grammar = cells
#                 lemmas[wordform.lower()] = lemma.lower()
#     return lemmas

In [6]:
# lemm_dict = load_lemm('../calgary/Lena_test/ang.txt')

In [7]:
# def lemmatize(w, lemm_dict):
#     if w in lemm_dict:
#         return lemm_dict[w]
#     else:
#         return w

In [8]:
# lemmas = [[lemmatize(w, lemm_dict) for w in s] for s in clean_texts]
# lemmas_df_1 = pd.DataFrame(lemmas[0])
# lemmas_df_2 = pd.DataFrame(lemmas[1])


In [9]:
# result_table_1 = pd.concat([clean_text_df_1, lemmas_df_1], axis=1, sort=False)
# result_table_2 = pd.concat([clean_text_df_2, lemmas_df_2], axis=1, sort=False)
# result_table_1=result_table_1.rename(columns={'0': '1','0': '2'})
# result_table_1

In [10]:
# Join all loaded texts into one space-separated string; the CLTK cells
# further down take this single string as their input.
texts_str = ' '.join(texts)
# Bare last expression: shows the joined string as the cell output.
texts_str

'þis his sy forewearde þe Brichtric 7 his gebedde habbað wið þa canonicas, þæt his þæt heo sculan habban heora beira dæi anes canonikes gerihte on mete 7 on eale 7 heo habbað geunnen hyra land þam canonike scær 7 saccleas, 7 æfter heora bera dæige beon þa canonikes eruename of ealre æhte 7 þis beoð þa gewitnisse, þæt is Ægelwine Brihtmeres sune 7 Leofstan 7 Hearding 7 Bruning. þis is sy forewearde þe Ægelward hafð gemacad wið þonne decanus 7 wið ealle þa gebroðre of sce Paules mynstre þæt is of ane healfe hyde landes æt Sandune þæt he sceal æfrice geare gyuen .viii. horen for ealle þinc 7 hi sculan baþa habban he 7 hys wif þa hwile þa hy lyfieð 7 æfter hyre begre dæge habba scs Paulus eall þæt hy þær æfter belæfeð þa him mid rihte to gebyrige oppan þan ilcan lande 7 þæt he habba þæt land swa swa hit nu gelogod is. 7 þis synd þa gewitnysse, þæt is se decanus 7 þa fuwer arcediacones 7 þysre gewrite syndon twa, an haben þa gebroðre 7 þæt oðer Ægelward, 7 hær wið wæs Colswegen 7 Ægelmær 7 

CLTK documentation for Old English (used by the cells below): http://docs.cltk.org/en/latest/old_english.html


In [11]:
# CLTK's Old English lemmatizer module.
import cltk.lemmatize.old_english.lemma as oe_l
lemmatizer = oe_l.OldEnglishDictionaryLemmatizer()
# Lemmatize the whole joined corpus string in a single call; the result is
# loaded into a two-column (form, lemma) DataFrame in the next cell.
lemmas = lemmatizer.lemmatize(texts_str)

In [12]:
# Tabulate the lemmatizer result: one row per (form, lemma) pair.
lemmas_df = pd.DataFrame(lemmas, columns = ['form', 'lemma'])
# Bare last expression: shows the table as the cell output.
lemmas_df

Unnamed: 0,form,lemma
0,þis,þis
1,his,he
2,sy,wesan
3,forewearde,forewearde
4,þe,se
...,...,...
200,7,7
201,Ægelmær,Ægelmær
202,7,7
203,Sexi,Sexi


In [13]:
# Write the lemma table to lemmas.csv (relative path), omitting the index column.
lemmas_df.to_csv("lemmas.csv", index=False)

In [14]:
from cltk.corpus.utils.importer import CorpusImporter
corpus_importer = CorpusImporter("old_english")
# NOTE(review): this bare attribute access is not the cell's last expression,
# so its value is never displayed -- looks like a leftover from exploration.
corpus_importer.list_corpora
# Import the 'old_english_models_cltk' corpus. Presumably this provides the
# models the POS tagger in the next cell needs -- TODO confirm.
corpus_importer.import_corpus('old_english_models_cltk')

In [15]:
from cltk.tag.pos import POSTag

# Build a CLTK POS tagger configured for 'old_english'.
tagger = POSTag('old_english')

# Tag the joined corpus string with the tagger's CRF method; the result is
# loaded into a two-column (word, POS) DataFrame in the next cell.
pos = tagger.tag_crf(texts_str)

In [16]:
# Tabulate the tagger result: one row per (word, POS tag) pair.
POS = pd.DataFrame(pos, columns = ['word', 'POS'])
# Bare last expression: shows the table as the cell output.
POS

Unnamed: 0,word,POS
0,þis,PD
1,his,PS
2,sy,NB
3,forewearde,V-
4,þe,G-
...,...,...
200,7,NB
201,Ægelmær,DF
202,7,R-
203,Sexi,NE


In [17]:
# Write the POS table to POS.csv (relative path), omitting the index column.
POS.to_csv("POS.csv", index=False)