In [11]:

from cltk import NLP
from cltk.languages.utils import get_lang

from unidecode import unidecode

from pathlib import Path

import pandas as pd


# Load files

## Latin text

In [7]:

fn = Path('..') / 'files' / 'pliny-6-16.txt'

In [8]:
with open(fn, 'r', encoding='utf-8') as file:
    text = file.read()

## Vocab

In [27]:
fn_list = Path('..') / 'files' / 'ap-latin-draft-course-framework-vocab-list'

In [28]:
df_vocab = pd.read_csv(fn_list.with_suffix('.csv'))


In [29]:
df_vocab.head()

Unnamed: 0,Required Vocabulary,Definition,Suggested Reading,Base Word
0,Required Vocabulary,Definition,Suggested \nReading,Required
1,"—, suī, sibi, sē, sē","himself, herself, itself, themselves",1.1,—
2,"ā, ab, abs","from, away from, out of, by (with abl.)",1.1,a
3,"abeō, -īre, iī, itum","to go from, go away, go off, go forth, go, depart",2.2,abeo
4,"absum, abesse, āfuī","to be away from, be absent",5.3,absum


# Lemma text

In [3]:

# Initialize the Latin pipeline
nlp = NLP(language="lat")

‎𐤀 CLTK version '1.4.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.

⸖ ``LatinStanzaProcess`` using Stanza model from the Stanford NLP Group: https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/
⸖ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [9]:
# Your Latin text
latin_text = "Arma virumque cano, Troiae qui primus ab oris"

# Process the text
doc = nlp.analyze(text=text)

# Get lemmas
lemmas = [word.lemma for word in doc.words]

# Print original words and their lemmas
for word, lemma in zip([word.string for word in doc.words], lemmas):
    print(f"Word: {word:<20} Lemma: {lemma}")

Word: C                    Lemma: C
Word: .                    Lemma: .
Word: PLINII               Lemma: plinium
Word: CAECILII             Lemma: caecilium
Word: SECVNDI              Lemma: secvndum
Word: EPISTVLARVM          Lemma: epistvla
Word: LIBER                Lemma: liber
Word: SEXTV                Lemma: sextus
Word: S                    Lemma: S
Word: 1                    Lemma: 1
Word: C                    Lemma: C
Word: .                    Lemma: .
Word: PLINIUS              Lemma: plinus
Word: TIRONI               Lemma: tiro
Word: SUO                  Lemma: suus
Word: S                    Lemma: S
Word: .                    Lemma: .
Word: 1                    Lemma: 1
Word: Quamdiu              Lemma: quamdiu
Word: ego                  Lemma: ego
Word: trans                Lemma: traho
Word: Padum                Lemma: padus
Word: tu                   Lemma: tu
Word: in                   Lemma: in
Word: Piceno               Lemma: picenus
Word: ,                    L

In [12]:
# Create the dataframe directly from the two lists
df_words = pd.DataFrame({
    'Original': [word.string for word in doc.words],
    'Lemma': [word.lemma for word in doc.words]
})

In [17]:
df_lemma = df_words.groupby('Lemma', as_index=False).agg(cnt_lemma=('Lemma', 'count'),
                              cnt_forms=('Original', 'nunique')).sort_values('Lemma')

In [20]:
df_lemma.sort_values('cnt_lemma', ascending=False).head(20)

Unnamed: 0,Lemma,cnt_lemma,cnt_forms
26,",",613,1
28,.,459,1
1669,qui,210,20
727,et,190,2
1941,sum,172,32
691,ego,124,10
63,;,119,1
2067,ut,101,1
972,in,99,2
1330,non,97,2


In [None]:
# Match

In [34]:
df_compare = df_lemma.merge(df_vocab, left_on='Lemma', right_on='Base Word', how='left').fillna('')

In [40]:
df_compare.sort_values('cnt_lemma', ascending=False).head(40)

Unnamed: 0,Lemma,cnt_lemma,cnt_forms,Required Vocabulary,Definition,Suggested Reading,Base Word
26,",",613,1,,,,
28,.,459,1,,,,
1677,qui,210,20,"quī, quae, quod","who, which, that",1.1,qui
731,et,190,2,et,"and; also, too, besides, likewise, as well, even",1.1,et
1950,sum,172,32,"sum, esse, fuī",to be,1.1,sum
694,ego,124,10,"ego, meī, mihi, mē, mē","I, me",1.1,ego
63,;,119,1,,,,
2077,ut,101,1,ut or utī,"how, in what way; as, like; when; that, so that",1.1,ut
977,in,99,2,in,"in, on (with abl.); into, to, onto (with acc.)",1.1,in
1337,non,97,2,nōn,"not, by no means, not at all",1.1,non


In [41]:
df_words[df_words['Lemma']=='plinus']

Unnamed: 0,Original,Lemma
12,PLINIUS,plinus
124,PLINIUS,plinus
558,PLINIUS,plinus
639,PLINIUS,plinus
789,PLINIUS,plinus
1042,PLINIUS,plinus
1416,PLINIUS,plinus
1513,PLINIUS,plinus
1903,PLINIUS,plinus
1969,PLINIUS,plinus


In [42]:
df_vocab_check = df_lemma.merge(df_vocab, left_on='Lemma', right_on='Base Word', how='right').fillna('')

In [44]:
df_vocab_check.head(20)


Unnamed: 0,Lemma,cnt_lemma,cnt_forms,Required Vocabulary,Definition,Suggested Reading,Base Word
0,,,,Required Vocabulary,Definition,Suggested \nReading,Required
1,—,42.0,1.0,"—, suī, sibi, sē, sē","himself, herself, itself, themselves",1.1,—
2,a,18.0,3.0,"ā, ab, abs","from, away from, out of, by (with abl.)",1.1,a
3,abeo,4.0,4.0,"abeō, -īre, iī, itum","to go from, go away, go off, go forth, go, depart",2.2,abeo
4,absum,3.0,3.0,"absum, abesse, āfuī","to be away from, be absent",5.3,absum
5,accedo,3.0,3.0,"accēdō (adc-), -ere, -cessī, -cessum","to go to, come to, come near, draw near, appro...",2.1,accedo
6,accendo,2.0,2.0,"accendō (adc-), -ere, -cendī, -cēnsum","to kindle, set on fire, inflame",3.6,accendo
7,accido,3.0,2.0,"accidō, -ere, -cidī","to fall upon, fall to, reach by falling, happen",3.1,accido
8,accipio,9.0,7.0,"accipiō (adc-), -ere, -cēpī, -ceptum","to take without effort, receive, get, accept",1.3,accipio
9,,,,"aciēs, -ēī","sharp edge, point, cutting part; the front of ...",6.1,acies


# Save

In [46]:
fn_matched_word_list = Path('..') / 'files' / 'matched-text-and-vocab-list'

In [47]:
df_compare.to_excel(fn_matched_word_list.with_suffix('.xlsx'), sheet_name='Pliny 6-16 Word List')