In [None]:
from typing import List
import random
import math

import pandas as pd
import spacy 

# Load the spaCy pipeline 'la_core_web_lg' once at notebook start; extract_substructures()
# calls this shared `nlp` object to parse sentences.
nlp = spacy.load('la_core_web_lg')

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Local checkpoint directories (relative to this notebook).
LaTa = "../../../models/LaTa"
PhilTa = "../../../models/PhilTa"

# Checkpoint that backs the shared `tokenizer` / `model` objects used by the cells below.
checkpoint = PhilTa

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [10]:
# Infilling probe: a "Fill: "-prefixed prompt with a single <extra_id_0> slot.
text = "Fill: Ecce audiant hoc illi, qui maxime ecclesiarum localium, id <extra_id_0> optime."

inputs = tokenizer(text, return_tensors="pt")

# Sampling on top of beam search; five candidates of at most five new tokens each.
# do_sample=True makes the candidates vary between runs.
generation_kwargs = dict(
    max_new_tokens=5,
    num_beams=5,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    repetition_penalty=2.0,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
)
outputs = model.generate(**inputs, **generation_kwargs)

for rank, candidate in enumerate(outputs, start=1):
    decoded = tokenizer.decode(candidate, skip_special_tokens=True)
    print(f"Option {rank}: {decoded}")


Option 1: audiant illi, qui
Option 2: audiant, qui
Option 3: audiant:
Option 4: ecclesiarum:
Option 5: audiant, qui maxime


In [6]:
# Infilling probe on the full sentence: the <extra_id_0> slot replaces one word
# near the start of a long "Fill: "-prefixed prompt.
text = "Fill: Ecce <extra_id_0> hoc illi, qui maxime ecclesiarum localium, id est coenobiorum, archimandritis detrahunt, quoties gregis sui patiuntur detrimentum, et cum ipsi uacent otio, temere operarios Dei diiudicant, ubi aliquos ex eis, qui hortatu ipsorum conuersi sunt ad saeculum relabi conspiciunt."

inputs = tokenizer(text, return_tensors="pt")

# Same beam-sampling setup as a short probe would use, but with a much larger
# token budget and a tighter nucleus (top_p); results vary between runs.
sampling_options = {
    "max_new_tokens": 100,
    "num_beams": 5,
    "do_sample": True,
    "top_k": 100,
    "top_p": 0.75,
    "repetition_penalty": 2.0,
    "no_repeat_ngram_size": 2,
    "num_return_sequences": 5,
}
outputs = model.generate(**inputs, **sampling_options)

for position, sequence in enumerate(outputs, start=1):
    rendered = tokenizer.decode(sequence, skip_special_tokens=True)
    print(f"Option {position}: {rendered}")

Option 1: * *:om:.
Option 2: * *:om.
Option 3: * *:om:u:.
Option 4: * *:om: in.
Option 5: * *:om:u:a:.


In [None]:
def demask(sentence: str) -> str:
    """Fill the ``<mask>`` slots of ``sentence`` with the notebook's seq2seq model.

    Each ``<mask>`` is rewritten, left to right, to a numbered T5-style
    placeholder (``<extra_id_0>``, ``<extra_id_1>``, ...). The prompt
    ``"Fill <sentence>"`` is then tokenized and passed to ``model.generate``
    with beam search (5 beams, ``max_length=128``, early stopping).

    Args:
        sentence: Input text containing zero or more ``<mask>`` markers.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Splitting on the marker and re-joining with numbered placeholders is
    # equivalent to replacing the occurrences one at a time, left to right.
    pieces = sentence.split("<mask>")
    prompt_parts = [pieces[0]]
    for sentinel_id, tail in enumerate(pieces[1:]):
        prompt_parts.append(f"<extra_id_{sentinel_id}>")
        prompt_parts.append(tail)
    masked_sentence = "".join(prompt_parts)

    # NOTE(review): the prefix here is "Fill " while the exploratory cells in this
    # notebook prompt with "Fill: " -- confirm which form the checkpoint expects.
    encoded = tokenizer(f"Fill {masked_sentence}", return_tensors="pt")
    generated = model.generate(
        **encoded,
        max_length=128,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# demask() returns one decoded string, so print it whole; indexing it with [0]
# would display only the first character of the prediction.
print(demask("Undecim discipuli <mask> in scholam ierunt."))

In [None]:
# NOTE(review): choose_substructures is defined in a later cell of this notebook, so
# on a fresh kernel this cell only works after that definition has been executed.
example_sentence = "Ecce audiant hoc illi, qui maxime ecclesiarum localium, id est coenobiorum, archimandritis detrahunt, quoties gregis sui patiuntur detrimentum, et cum ipsi uacent otio, temere operarios Dei diiudicant, ubi aliquos ex eis, qui hortatu ipsorum conuersi sunt ad saeculum relabi conspiciunt."

# Print one randomly chosen substructure per line.
substructures = choose_substructures(example_sentence)
for struct in substructures:
    print(struct)

In [None]:
def choose_substructures(sentence: str) -> List[str]:
    """Randomly pick a non-redundant subset of a sentence's substructures.

    The minimum substructure length passed to ``extract_substructures`` scales
    with the sentence's word count: a third of it below 25 words, a seventh
    below 50 words, a fifteenth otherwise (always rounded up). Roughly one
    third of the extracted candidates are then sampled with ``random.sample``,
    and any sampled phrase whose word set is contained in the word set of an
    already kept, longer phrase is discarded.

    Args:
        sentence: The sentence to take substructures from.

    Returns:
        The kept phrases, ordered from longest to shortest (by character length).
    """
    # split() rather than split(' '): runs of spaces must not count as words.
    len_sentence = len(sentence.split())
    if len_sentence < 25:
        min_len_subtree = math.ceil(len_sentence / 3)
    elif len_sentence < 50:
        min_len_subtree = math.ceil(len_sentence / 7)
    else:
        min_len_subtree = math.ceil(len_sentence / 15)

    # Candidates originate from a set, whose iteration order is not stable between
    # interpreter sessions; sorting first makes a seeded random.sample reproducible.
    candidates = sorted(extract_substructures(sentence, min_len_subtree))
    num_subtrees = math.ceil(len(candidates) / 3)
    selection = random.sample(candidates, num_subtrees)

    # Walk longest-first so a shorter phrase is always compared against the longer
    # phrases that were already accepted.
    unique_selection: List[str] = []
    kept_word_sets = []
    for sub in sorted(selection, key=len, reverse=True):
        sub_words = set(sub.split())
        if not any(sub_words.issubset(kept) for kept in kept_word_sets):
            unique_selection.append(sub)
            kept_word_sets.append(sub_words)

    return unique_selection

In [None]:
def extract_substructures(sentence: str, min_len_subtree: int) -> List[str]:
    """Collect the distinct dependency-subtree phrases of a sentence.

    The sentence is parsed with the notebook-level spaCy pipeline ``nlp``. For
    every token, the texts of the tokens in ``token.subtree`` are joined with
    single spaces, commas are removed, and the phrase is kept when its word
    count is strictly greater than ``min_len_subtree`` and strictly smaller
    than the sentence's whitespace-separated word count.

    Args:
        sentence: The sentence to parse.
        min_len_subtree: Exclusive lower bound on a phrase's word count.

    Returns:
        The unique phrases in sorted order, so the result is deterministic.
    """
    doc = nlp(sentence)
    len_sentence = len(sentence.split())

    substructures = set()
    for token in doc:
        joined = " ".join(t.text for t in token.subtree)
        # Dropping commas can leave runs of several spaces (e.g. two consecutive
        # comma tokens); splitting on any whitespace normalises them so the word
        # count is not inflated by empty strings.
        words = joined.replace(',', '').split()
        if min_len_subtree < len(words) < len_sentence:
            substructures.add(" ".join(words))

    return sorted(substructures)

In [None]:
def read_sentences(file_path: str) -> List[str]:
    """Read the ``sentence`` column of a CSV file as a list of cleaned strings.

    Rows whose ``sentence`` cell is empty are skipped, so the result contains
    only strings. Each remaining value has surrounding whitespace removed and
    then any leading/trailing double-quote characters removed.

    Args:
        file_path: Path of a CSV file that has a ``sentence`` column.

    Returns:
        The cleaned sentences in file order.

    Raises:
        KeyError: If the file has no ``sentence`` column.
    """
    df = pd.read_csv(file_path)
    # Empty cells are read as NaN; drop them and cast to str so the .str accessor
    # works even when the column is entirely empty or numeric.
    sentences = df['sentence'].dropna().astype(str)
    return sentences.str.strip().str.strip('"').tolist()
    