In [None]:
from typing import List
import random
import math

import pandas as pd
import spacy 

# Load the spaCy pipeline 'la_core_web_lg'; later cells call `nlp(sentence)` to parse text.
nlp = spacy.load('la_core_web_lg')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Relative paths to two local model checkpoints.
LaTa = "../../../models/LaTa"
PhilTa = "../../../models/PhilTa"

# Only the PhilTa checkpoint is loaded; `LaTa` is defined above but not used in this cell.
# `tokenizer` and `model` are notebook-level globals used by the cells below.
tokenizer = AutoTokenizer.from_pretrained(PhilTa)
model = AutoModelForSeq2SeqLM.from_pretrained(PhilTa)

In [None]:
# Exploratory prompt: the "Fill: " prefix followed by a sentence in which the
# sentinel <extra_id_0> stands in for the missing span.
text = ("Fill: Ecce audiant hoc illi, qui maxime ecclesiarum localium, id <extra_id_0> optime.")


inputs = tokenizer(text, return_tensors="pt")

# Request five candidate sequences of at most five new tokens each.
# Sampling is enabled and no random seed is set in this cell, so the printed
# options may differ between runs.
outputs = model.generate(
    **inputs,
    max_new_tokens=5,
    num_beams=5,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    repetition_penalty=2.0,
    no_repeat_ngram_size=2,
    num_return_sequences=5  
)

# Decode every returned sequence (special tokens skipped) and number them from 1.
for i, output in enumerate(outputs, 1):
    print(f"Option {i}: {tokenizer.decode(output, skip_special_tokens=True)}")


In [None]:
# Same experiment as the previous cell, but on the full sentence with the sentinel
# <extra_id_0> placed near the start. Rebinds the globals `text`, `inputs`, `outputs`.
text = ("Fill: Ecce <extra_id_0> hoc illi, qui maxime ecclesiarum localium, id est coenobiorum, archimandritis detrahunt, quoties gregis sui patiuntur detrimentum, et cum ipsi uacent otio, temere operarios Dei diiudicant, ubi aliquos ex eis, qui hortatu ipsorum conuersi sunt ad saeculum relabi conspiciunt.")

inputs = tokenizer(text, return_tensors="pt")

# Compared with the previous cell: a much larger token budget (100 instead of 5),
# a wider top_k (100) and a tighter top_p (0.75). Sampling is still unseeded, so
# the output is not reproducible across runs.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    do_sample=True,
    top_k=100,
    top_p=0.75,
    repetition_penalty=2.0,
    no_repeat_ngram_size=2,
    num_return_sequences=5
)

# Decode every returned sequence (special tokens skipped) and number them from 1.
for i, output in enumerate(outputs, 1):
    print(f"Option {i}: {tokenizer.decode(output, skip_special_tokens=True)}")

In [None]:
def demask(sentence: str) -> str:
    """Generate a completion for the "<mask>" gaps of a sentence.

    Each "<mask>" is rewritten, left to right, to a numbered sentinel
    ("<extra_id_0>", "<extra_id_1>", ...). The rewritten sentence is prefixed
    with "Fill " and passed to the notebook-level `tokenizer` and `model`.

    Args:
        sentence: Text containing zero or more "<mask>" markers.

    Returns:
        The first sequence returned by `model.generate`, decoded with special
        tokens skipped.

    NOTE(review): the prompt prefix here is "Fill " (no colon), whereas the
    exploratory cells earlier in the notebook use "Fill: ". Confirm which form
    the checkpoint expects.
    """
    # Splitting on the marker and re-joining with numbered sentinels numbers the
    # gaps in order of appearance.
    pieces = sentence.split("<mask>")
    sentence = pieces[0]
    for mask_index, tail in enumerate(pieces[1:]):
        sentence += f"<extra_id_{mask_index}>{tail}"

    encoded = tokenizer(f"Fill {sentence}", return_tensors="pt")
    generated = model.generate(
        **encoded,
        max_length=128,
        num_beams=5,
        early_stopping=True
    )

    # Beam search without num_return_sequences: only the first sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# demask() returns a single string; print it whole (indexing it with [0] would
# show only its first character).
print(demask("Undecim discipuli <mask> in scholam ierunt."))

In [None]:
# NOTE(review): choose_substructures is defined in the cell that follows this one;
# on a fresh kernel that cell has to be executed first or this raises NameError.
substructures = choose_substructures("Ecce audiant hoc illi, qui maxime ecclesiarum localium, id est coenobiorum, archimandritis detrahunt, quoties gregis sui patiuntur detrimentum, et cum ipsi uacent otio, temere operarios Dei diiudicant, ubi aliquos ex eis, qui hortatu ipsorum conuersi sunt ad saeculum relabi conspiciunt.")
# Print one selected substructure per line.
for struct in substructures:
    print(struct)

In [None]:
def choose_substructures(sentence: str) -> List[str]:
    """Randomly pick about a third of the sentence's subtrees, dropping nested picks.

    The minimum subtree length passed to `extract_substructures` is a fraction
    of the sentence's space-separated word count; the divisor grows with the
    sentence length (3 below 25 words, 7 below 50 words, 15 otherwise).

    Args:
        sentence: The sentence to analyse.

    Returns:
        The sampled substructures, ordered by decreasing string length, with
        every substructure removed whose word set is contained in the word set
        of an already kept (longer or equal) one.

    NOTE(review): this calls the two-argument `extract_substructures(sentence,
    min_len_subtree)`. A later cell in this notebook rebinds that name to a
    function with a different signature, so this function only works while the
    two-argument version is the one in scope.
    """
    len_sentence = len(sentence.split(' '))
    if len_sentence < 25:
        min_len_subtree = math.ceil(len_sentence / 3)
    elif len_sentence < 50:
        min_len_subtree = math.ceil(len_sentence / 7)
    else:
        min_len_subtree = math.ceil(len_sentence / 15)

    substructures = extract_substructures(sentence, min_len_subtree)
    num_subtrees = math.ceil(len(substructures) / 3)
    selection = random.sample(substructures, num_subtrees)

    # Longest first, so a contained substructure is always compared against the
    # larger ones that were kept before it.
    selection.sort(key=len, reverse=True)

    unique_selection: List[str] = []
    kept_word_sets = []  # word set of each kept substructure, computed once
    for sub in selection:
        sub_words = set(sub.split())
        if not any(sub_words.issubset(kept) for kept in kept_word_sets):
            unique_selection.append(sub)
            kept_word_sets.append(sub_words)

    return unique_selection

def extract_substructures(sentence: str, min_len_subtree: int) -> List[str]:
    """Return the distinct dependency subtrees of a sentence within a length window.

    The sentence is parsed with the notebook-level `nlp` pipeline. For every
    token, the texts of its subtree are joined with spaces, commas are removed
    and all runs of whitespace are collapsed to a single space.

    Args:
        sentence: The sentence to parse.
        min_len_subtree: A subtree is kept only if it has strictly more words
            than this.

    Returns:
        The kept subtree strings, de-duplicated and sorted so the result does
        not depend on set iteration order. A subtree is also rejected when its
        word count is not strictly below the sentence's space-separated word
        count.
    """
    doc = nlp(sentence)
    len_sentence = len(sentence.split(' '))
    substructures = set()
    for token in doc:
        subtree = " ".join(t.text for t in token.subtree).replace(',', '')
        # Removing commas can leave runs of several spaces; collapse them all.
        subtree = " ".join(subtree.split())
        len_subtree = len(subtree.split(' '))
        if min_len_subtree < len_subtree < len_sentence:
            substructures.add(subtree)
    return sorted(substructures)

In [None]:
def read_sentences(file_path: str) -> List[str]:
    """Read the 'sentence' column of a CSV file as a list of cleaned strings.

    Args:
        file_path: Path of a CSV file that has a 'sentence' column.

    Returns:
        The column's values with surrounding whitespace removed first and any
        leading/trailing double quotes removed afterwards.
    """
    frame = pd.read_csv(file_path)
    raw_column = frame['sentence']
    without_padding = raw_column.str.strip()
    without_quotes = without_padding.str.strip('"')
    return without_quotes.tolist()
    

In [None]:
import math
import random
from typing import List, NamedTuple, Optional, Tuple

import spacy

# Load spaCy model (adjust as needed).
# NOTE(review): the notebook's first cell already loads this same pipeline; this
# line loads it again and rebinds the global `nlp`.
nlp = spacy.load("la_core_web_lg")

class Substructure(NamedTuple):
    """A candidate span for masking: a subtree's text and its position in the token list."""
    text: str         # subtree words joined by single spaces, commas removed
    start: int  # start index in the tokenized sentence
    end: int    # end index (inclusive)
    token_count: int  # number of whitespace-separated words in `text`

def find_sublist(lst: List[str], sublst: List[str]) -> Optional[int]:
    """Locate the first contiguous occurrence of `sublst` inside `lst`.

    Args:
        lst: The list to search in.
        sublst: The sequence of items to look for.

    Returns:
        The index in `lst` at which the first match starts, or None when
        `sublst` does not occur (including when it is longer than `lst`).
        An empty `sublst` matches at index 0.
    """
    width = len(sublst)
    # When sublst is longer than lst the range is empty and None is returned.
    for i in range(len(lst) - width + 1):
        if lst[i:i + width] == sublst:
            return i
    return None

def extract_substructures(sentence: str, tokens: List[str],
                          region: str = "middle") -> List[Substructure]:
    """
    Parse the sentence with spaCy and return the subtrees that can be located
    in `tokens`.

    For every parsed token, the texts of its subtree are joined, commas are
    removed and whitespace is normalised. A subtree is kept when it has more
    than one word, fewer words than `len(tokens)`, and its word sequence occurs
    contiguously in `tokens` (first occurrence, via `find_sublist`). Subtrees
    that map to the same (start, end) span are reported once, keeping the
    first one encountered.

    `region` is accepted but not read by this implementation; the minimum
    length is the same fixed value for every region.

    NOTE(review): commas are stripped from the subtree words but not from
    `tokens`. If `tokens` contains commas as separate items, a subtree that
    spans an interior comma will not be found - confirm this is intended.
    """
    min_len = 1
    upper_bound = len(tokens)
    found = {}

    for head in nlp(sentence):
        joined = " ".join(node.text for node in head.subtree)
        words = joined.replace(",", "").strip().split()
        n_words = len(words)

        # Skip subtrees that are too short or as long as the whole token list.
        if not (min_len < n_words < upper_bound):
            continue

        first = find_sublist(tokens, words)
        if first is None:
            continue

        last = first + n_words - 1
        # setdefault keeps the earliest subtree recorded for a given span.
        found.setdefault(
            (first, last),
            Substructure(
                text=" ".join(words), start=first,
                end=last, token_count=n_words
            ),
        )
    return list(found.values())

def choose_substructures_from_candidates(
    candidates: List[Substructure], target_mask: int
) -> List[Substructure]:
    """
    Greedily choose non-overlapping substructures whose total token count does
    not exceed target_mask.

    Candidates are considered from largest to smallest token count. A
    candidate is taken when it still fits into the remaining budget and its
    [start, end] span does not intersect any span already taken; nested
    subtrees would otherwise be selected together and break the
    no-overlap assumption of the masking step.

    Args:
        candidates: Candidate spans; the list itself is left unmodified.
        target_mask: Maximum total number of tokens to select.

    Returns:
        The selected substructures, in the order they were picked.
    """
    # sorted() instead of list.sort() so the caller's list is not reordered.
    ordered = sorted(candidates, key=lambda s: s.token_count, reverse=True)
    selected = []
    current_mask = 0
    for sub in ordered:
        if current_mask + sub.token_count > target_mask:
            continue
        # Two inclusive spans intersect when each starts before the other ends.
        if any(sub.start <= taken.end and taken.start <= sub.end
               for taken in selected):
            continue
        selected.append(sub)
        current_mask += sub.token_count
    return selected

def apply_masking(tokens: List[str], selected_subs: List[Substructure]
                 ) -> List[str]:
    """
    Replace each selected substructure with a single "[MASK]" token.

    Substructures are processed in order of their start index. A substructure
    that begins inside a span that has already been masked does not produce a
    second "[MASK]"; it only extends the masked span if it reaches further.
    Overlapping or nested selections are therefore merged instead of
    duplicating tokens.

    Args:
        tokens: The sentence's tokens.
        selected_subs: Spans to mask; `start` and `end` are inclusive indices
            into `tokens`.

    Returns:
        A new token list with every masked span replaced by one "[MASK]".
    """
    masked_tokens = []
    i = 0  # index of the next token that is neither emitted nor masked yet
    for sub in sorted(selected_subs, key=lambda s: s.start):
        if sub.start < i:
            # Overlaps the previous mask: widen it, never move the cursor back.
            i = max(i, sub.end + 1)
            continue
        # Copy the tokens between the previous span and this one.
        masked_tokens.extend(tokens[i:sub.start])
        masked_tokens.append("[MASK]")
        i = sub.end + 1
    # Copy whatever follows the last masked span.
    masked_tokens.extend(tokens[i:])
    return masked_tokens

def mask_sentence(
    sentence: str, region: str = "middle",
    mask_ratio: Tuple[float, float] = (0.1, 0.2)
) -> str:
    """
    Mask a share of the sentence's tokens by replacing entire substructures
    (as extracted by extract_substructures) with a mask token.

    Args:
        sentence: The sentence to mask.
        region: Forwarded to extract_substructures ('start', 'middle' or
            'end').
        mask_ratio: (lower, upper) fraction of the token count used to draw
            the target number of masked tokens; 10% to 20% by default.

    Returns:
        The tokens joined with single spaces, with each selected substructure
        replaced by "[MASK]". If no substructure fits the drawn target, the
        tokens are returned unmasked.
    """
    doc = nlp(sentence)
    tokens = [token.text for token in doc]
    total_tokens = len(tokens)

    # Determine a random target mask count within the given ratio.
    min_mask = math.ceil(total_tokens * mask_ratio[0])
    # For short sentences floor(upper) can fall below ceil(lower); clamp so
    # random.randint never receives an empty range.
    max_mask = max(min_mask, math.floor(total_tokens * mask_ratio[1]))
    target_mask = random.randint(min_mask, max_mask)

    # Extract candidate substructures (with fixed minimal length).
    candidates = extract_substructures(sentence, tokens, region)

    # If no candidates remain in the desired region, fallback to using all.
    if not candidates:
        candidates = extract_substructures(sentence, tokens)

    selected_subs = choose_substructures_from_candidates(candidates, target_mask)

    masked_tokens = apply_masking(tokens, selected_subs)

    return " ".join(masked_tokens)

# Example usage: print one masked version of the test sentence per region value.
# The target mask size is drawn at random inside mask_sentence and no seed is set
# here, so the three printed lines can change from run to run.
if __name__ == "__main__":
    test_sentence = (
        "Ecce audiant hoc illi, qui maxime ecclesiarum localium, id est coenobiorum, "
        "archimandritis detrahunt, quoties gregis sui patiuntur detrimentum, et cum ipsi "
        "uacent otio, temere operarios Dei diiudicant, ubi aliquos ex eis, qui hortatu ipsorum "
        "conuersi sunt ad saeculum relabi conspiciunt."
    )
    for region in ["start", "middle", "end"]:
        print(mask_sentence(test_sentence, region=region))


