_syn_, _lnsyn_

In [1]:
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.wordnet.wordnet import WordNetCorpusReader
from cltk.core.exceptions import CLTKException
import string
import re
import numpy
from numpy import log as ln

def polysemy(sentence):
    """Return the average synset count and average log synset count per word.

    Punctuation (``string.punctuation``) is stripped, the sentence is split on
    whitespace and every word is lemmatized. For each lemma the Latin WordNet
    is queried and the number of synsets of its first matching entry is
    counted.

    Args:
        sentence: The Latin text to analyse, as a string.

    Returns:
        A tuple ``(avg_synonyms, log_synonyms)``: the sum of synset counts and
        the sum of their natural logarithms, each divided by the number of
        words in the sentence and rounded to three decimals. ``(0, 0)`` is
        returned when the sentence contains no words.
    """
    # Remove punctuation marks, then split the sentence into words.
    words = sentence.translate(str.maketrans('', '', string.punctuation)).split()

    # Nothing to measure: return before building the WordNet reader and the
    # lemmatizer, since the result would be (0, 0) anyway.
    if not words:
        return 0, 0
    word_count = len(words)

    # Create WordNet Corpus Reader for Latin
    LWN = WordNetCorpusReader(iso_code="lat")

    # The LatinBackoffLemmatizer gives us a tuple of the token and the lemma.
    # Because we only need the lemma, we extract the lemma from the tuple.
    # We also remove numerals 1 to 4 that may appear after a lemma.
    lemmatizer = LatinBackoffLemmatizer()
    lemmata = [re.sub(r'[1-4]$', '', pair[1]) for pair in lemmatizer.lemmatize(words)]

    total_synonyms = 0
    logfreq_sum = 0

    for lemma in lemmata:
        if not lemma:
            continue
        try:
            # Only the first WordNet entry found for the lemma is used.
            lemma_synsets = LWN.lemma(lemma)
            if not lemma_synsets:
                continue
            num_synonyms = len(list(lemma_synsets[0].synsets()))
        except Exception:
            # Best effort: a lemma whose lookup fails contributes nothing.
            continue

        total_synonyms += num_synonyms
        # log(0) is undefined, so only lemmas with at least one synset
        # contribute to the logarithmic sum.
        if num_synonyms >= 1:
            logfreq_sum += ln(num_synonyms)

    avg_synonyms = round(total_synonyms / word_count, 3)
    log_synonyms = round(logfreq_sum / word_count, 3)

    return avg_synonyms, log_synonyms

_synsw_, _lnsynsw_

In [2]:
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.wordnet.wordnet import WordNetCorpusReader
from cltk.stops.words import Stops
from cltk.core.exceptions import CLTKException
import string
import re
import numpy
from numpy import log as ln

def polysemy_cnt(sentence):
    """Return average synset statistics per word, ignoring Latin stop words.

    Punctuation (``string.punctuation``) is stripped, the sentence is split on
    whitespace and every word is lemmatized. Latin stop words are removed from
    the lemmas, and for each remaining lemma the Latin WordNet is queried and
    the number of synsets of its first matching entry is counted.

    Args:
        sentence: The Latin text to analyse, as a string.

    Returns:
        A tuple ``(avg_synonyms, log_synonyms)``: the sum of synset counts and
        the sum of their natural logarithms, each divided by the number of
        words in the sentence and rounded to three decimals. ``(0, 0)`` is
        returned when the sentence contains no words.
    """
    # Remove punctuation marks, then split the sentence into words.
    words = sentence.translate(str.maketrans('', '', string.punctuation)).split()

    # Nothing to measure: return before building the WordNet reader and the
    # lemmatizer, since the result would be (0, 0) anyway.
    if not words:
        return 0, 0
    # NOTE(review): the averages are taken over ALL words, including the stop
    # words that are filtered out below - confirm this denominator is intended.
    word_count = len(words)

    # Create WordNet Corpus Reader for Latin
    LWN = WordNetCorpusReader(iso_code="lat")

    # The LatinBackoffLemmatizer gives us a tuple of the token and the lemma.
    # Because we only need the lemma, we extract the lemma from the tuple.
    # We also remove numerals 1 to 4 that may appear after a lemma.
    lemmatizer = LatinBackoffLemmatizer()
    lemmata = [re.sub(r'[1-4]$', '', pair[1]) for pair in lemmatizer.lemmatize(words)]

    # We remove the Latin stop words.
    stops_obj = Stops(iso_code="lat")
    tokens_filtered = stops_obj.remove_stopwords(tokens=lemmata)

    total_synonyms = 0
    logfreq_sum = 0

    for lemma in tokens_filtered:
        if not lemma:
            continue
        try:
            # Only the first WordNet entry found for the lemma is used.
            lemma_synsets = LWN.lemma(lemma)
            if not lemma_synsets:
                continue
            num_synonyms = len(list(lemma_synsets[0].synsets()))
        except Exception:
            # Best effort: a lemma whose lookup fails contributes nothing.
            continue

        total_synonyms += num_synonyms
        # log(0) is undefined, so only lemmas with at least one synset
        # contribute to the logarithmic sum.
        if num_synonyms >= 1:
            logfreq_sum += ln(num_synonyms)

    avg_synonyms = round(total_synonyms / word_count, 3)
    log_synonyms = round(logfreq_sum / word_count, 3)

    return avg_synonyms, log_synonyms

_syn500_

In [3]:
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.wordnet.wordnet import WordNetCorpusReader
from cltk.stops.words import Stops
import string
import re
import pandas as pd

def polysemy_counter(sentence):
    """Return the average polysemy value per word, using an Excel lookup table.

    Punctuation (``string.punctuation``) is stripped, the sentence is split on
    whitespace and every word is lemmatized. Latin stop words are removed from
    the lemmas. Each remaining lemma contributes the ``POLYSEM`` value listed
    for it in ``Polysemien_OLD.xlsx`` (column ``LEMMA``), or 1 when the lemma
    is not listed.

    Args:
        sentence: The Latin text to analyse, as a string.

    Returns:
        The summed polysemy values divided by the number of words in the
        sentence, rounded to three decimals; 0 when the sentence contains no
        words (previously this case raised ``ZeroDivisionError``).
    """
    # Remove punctuation marks, then split the sentence into words.
    words = sentence.translate(str.maketrans('', '', string.punctuation)).split()

    # Guard against dividing by zero for empty or punctuation-only input.
    # Returning here also skips the lemmatizer and the Excel read.
    if not words:
        return 0

    # The LatinBackoffLemmatizer gives us a tuple of the token and the lemma.
    # Because we only need the lemma, we extract the lemma from the tuple.
    # We also remove numerals 1 to 4 that may appear after a lemma.
    lemmatizer = LatinBackoffLemmatizer()
    lemmata = [re.sub(r'[1-4]$', '', pair[1]) for pair in lemmatizer.lemmatize(words)]

    # We remove the Latin stop words.
    stops_obj = Stops(iso_code="lat")
    tokens_filtered = stops_obj.remove_stopwords(tokens=lemmata)

    # Read the Excel file into a DataFrame and build a lemma -> value mapping.
    df = pd.read_excel('Polysemien_OLD.xlsx')
    polysemy_dict = dict(zip(df['LEMMA'], df['POLYSEM']))

    # Count the polysems; a lemma missing from the table counts as 1.
    polysem_count = sum(polysemy_dict.get(lemma, 1) for lemma in tokens_filtered)

    # NOTE(review): the average is taken over ALL words, including the stop
    # words filtered out above - confirm this denominator is intended.
    avg_synonyms = round(polysem_count / len(words), 3)

    return avg_synonyms

In [None]:
# Sample Latin passage used as input for the three polysemy measures.
sentence = "maxime Teucrorum ductor, quo sospite numquam res equidem Troiae victas aut regna fatebor, nobis ad belli auxilium pro nomine tanto exiguae vires; hinc Tusco claudimur amni, hinc Rutulus premit et murum circumsonat armis. sed tibi ego ingentis populos opulentaque regnis iungere castra paro, quam fors inopina salutem ostentat: fatis huc te poscentibus adfers. haud procul hinc saxo incolitur fundata vetusto urbis Agyllinae sedes, ubi Lydia quondam gens, bello praeclara, iugis insedit Etruscis. hanc multos florentem annos rex deinde superbo imperio et saevis tenuit Mezentius armis. quid memorem infandas caedes, quid facta tyranni effera? di capiti ipsius generique reservent! mortua quin etiam iungebat corpora vivis componens manibusque manus atque oribus ora, tormenti genus, et sanie taboque fluentis complexu in misero longa sic morte necabat. at fessi tandem cives infanda furentem armati circumsistunt ipsumque domumque, obtruncant socios, ignem ad fastigia iactant. ille inter caedem Rutulorum elapsus in agros confugere et Turni defendier hospitis armis. ergo omnis furiis surrexit Etruria iustis, regem ad supplicium praesenti Marte reposcunt. his ego te, Aenea, ductorem milibus addam. toto namque fremunt condensae litore puppes signaque ferre iubent, retinet longaevus haruspex fata canens: o Maeoniae delecta iuventus, flos veterum virtusque virum, quos iustus in hostem fert dolor et merita accendit Mezentius ira, nulli fas Italo tantam subiungere gentem: externos optate duces. tum Etrusca resedit hoc acies campo monitis exterrita divum."

# Evaluate and print each measure in turn: the first two return a
# (average, log-average) tuple, the third a single average.
for measure in (polysemy, polysemy_cnt, polysemy_counter):
    print(measure(sentence))

(21.937, 1.425)
