<a href="https://colab.research.google.com/github/udutta143/ENG417A-FinalProjectGreenlandic/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
save_dir = '/content/drive/MyDrive/Colab Notebooks/Final Project.ipynb'

In [None]:
%%capture
# The above line prevents output from being displayed for the current cell.
# Installing packages produces a lot of output, and we don't care about it.

!pip3 install nltk
!pip3 install nltk pandas
!pip3 install torch torchvision
!pip3 install transformers accelerate
!pip3 install pyinflect
!pip3 install pandas

In [59]:
import numpy as np
import gc
import nltk
import torch
import ast
import os
import json
from tqdm import tqdm

gc.collect()

torch.cuda.empty_cache()

import pandas as pd
import transformers
from huggingface_hub import login
login(new_session=False)

from pyinflect import getAllInflections, getInflection

import time
import torch.nn as nn
from transformers import pipeline

from sklearn.metrics.pairwise import cosine_similarity

# CausalLM = "Causal Language Modeling" (i.e., next word prediction)
from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList


In [60]:
xls = pd.ExcelFile("/content/drive/MyDrive/GreenlandicDictionaryFilteredVerbs.xlsx")
df = xls.parse(xls.sheet_names[0])
dictionary = dict(zip(df["Greenlandic"], df["English"]))

class StopOnEOS(StoppingCriteria):
    def __init__(self, eos_token_id):
        self.eos_token_id = eos_token_id
    def __call__(self, input_ids, scores, **kwargs):
        return input_ids[0, -1] == self.eos_token_id

model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

system_prompt = """
You are a strict dictionary processor who converts Greenlandic dictionary entries into verb templates. For each given definition, do the following:

Use **exactly the words in the definition**. Do not paraphrase, replace, or interpret any words.
Only convert verbs to infinitive form and create templates. Do not invent new verbs or change adjectives or objects.
Replace subjects with {{subject}} and objects with {{object}} where appropriate. Do not change anything else.
Output must be a Python dictionary in the form {"verbs": [...], "templates": [...]}.
Append [EOS] at the end of the output. Stop generating after that.

Examples:

Definition = "irritates him; teases him; makes him angry"
Output = {'verbs':['irritate', 'tease', 'make angry'], 'templates':['{{subject}} irritate {{object}}', '{{subject}} tease {{object}}', '{{subject}} make {{object}} angry']}
----------------------
Definition = "trembles"
Output = {'verbs':['tremble'], 'templates':['{{subject}} tremble']}
----------------------
Definition = "looks at him or it fixedly; stares at him or it"
Output = {'verbs':['look at fixedly', 'stare at'], 'templates':['{{subject}} look at {{object}} fixedly', '{{subject}} stare at {{object}}']}
----------------------
Definition = "is angered"
Output = {'verbs':['be angered'], 'templates': ['{{subject}} be angered']}
----------------------
Definition = "joins it to something"
Output = {'verbs':['join to something'], 'templates':['{{subject}} join {{object}} to something']}
----------------------
Definition = "smokes"
Output = {'verbs':['smoke'], 'templates':['{{subject}} smoke']}
----------------------
Definition = "writes it; writes on it"
Output = {'verbs':['write','write on'], 'templates':['{{subject}} write {{object}}', '{{subject}} write on {{object}}']}
----------------------
Definition = "is raw; is unboiled; is not sufficiently boiled"
Output = {'verbs':['be raw', 'be unboiled', 'be not sufficiently boiled'], 'templates':['{{subject}} be raw', '{{subject}} be unboiled', '{{subject}} be not sufficiently boiled']}
----------------------
Definition = "has it in hand"
Output = {'verbs': ['have in hand'], 'templates':['{{subject}} have {{object}} in hand']}
----------------------
Definition = "catches walrus"
Output = {'verbs': ['catch walrus'], 'templates':['{{subject}} catch walrus']}
----------------------
Definition = "commands him; enjoins him; urges him"
Output = {'verbs':['command', 'enjoin', 'urge'], 'templates':['{{subject}} command {{object}}', '{{subject}} enjoin {{object}}', '{{subject}} urge {{object}}']}
----------------------
Definition = "throws it"
Output = {'verbs':['throw'], 'templates':['{{subject}} throw {{object}}']}
----------------------
Definition = "gives him a name"
Output = {'verbs':['give a name'], 'templates':['{{subject}} give {{object}} a name']}
----------------------
Definition = "serves him"
Output = {'verbs':['serve'], 'templates':['{{subject}} serve {{object}}']}
----------------------
Definition = "bounds it; marks it off"
Output = {'verbs':['bound', 'mark off'], 'templates':['{{subject}} bounds {{object}}', '{{subject}} marks off {{object}}']}
----------------------
"""

def generate_verb_template(input_word: str):
    definition = str(dictionary[input_word])

    # The quotes here seem to make it less likely to hallucinate for some reason
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content":
            f'Definition: "{definition}"\n'
            "Now output ONLY the Python dictionary and nothing else.\n"
            "Begin immediately after the marker: <<<TEMPLATE>>>\n"
            "<<<TEMPLATE>>>"
        }
    ]

    # Build chat-formatted prompt
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate output with EOS stopping criteria
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=100,
        temperature=0.0,       # deterministic output
        do_sample=False,
        stopping_criteria=StoppingCriteriaList([StopOnEOS(tokenizer.eos_token_id)])
    )

    # Decode and remove prompt prefix
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "<<<TEMPLATE>>>" in generated_text:
      raw_template = generated_text.split("<<<TEMPLATE>>>")[-1].strip()
    else:
      raw_template = generated_text.replace(prompt, "").strip()
    template_string = "{'verbs'" + raw_template.split("{'verbs'", 1)[1].split("']}", 1)[0] + "']}"
    template = ast.literal_eval(template_string)
    return template

In [61]:
dictionary = dict(zip(df["Greenlandic"], df["English"]))
template_storage = "/content/drive/MyDrive/TemplateWritingTest.txt"

idx = 0
errors = 0

templates = {}
completed_templates = set()

if os.path.isfile(template_storage):
    with open(template_storage, "r") as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            try:
                # These records are similar to the templates, but are also indexed by the Greenlandic verbs themselves.
                # This makes misalignments less likely
                record = json.loads(line)
                greenlandic_verb = record["greenlandic"]
                templates[greenlandic_verb] = {
                    "verbs": record["verbs"],
                    "templates": record["templates"]
                }
                completed_templates.add(greenlandic_verb)
            except (json.JSONDecodeError, KeyError):
                pass

total = len(dictionary)
completed = len(completed_templates)

# Progress bar
pbar = tqdm(
    total = total,
    initial = completed,
    desc = "Verb Template Generation"
)

#=======================================================================
# Transitivity
#=======================================================================

# This section filters the templates for transitivity. The template generator gets English input,
# and since English doesn't have a strong transitivity distinction for a lot of verbs (unlike Greenlandic),
# it overgenerates templates.
# Since (thanks to Greenlandic morphology) the filtering can be done algorithmically,
# we do it here, after template generation.

transitivity_markers = {
    "voq" : "intr",
    "ppoq" : "intr",
    "rpoq" : "intr",

    "vaa" : "tr",
    "ppaa" : "tr",
    "rpaa" : "tr",
    "uaa" : "tr"
}

def get_transitivity(greenlandic):
  for suffix, transitivity in transitivity_markers.items():
    if greenlandic.endswith(suffix):
      return transitivity
  return "Unknown Transitivity"

def filter_templates_by_transitivity(templates, transitivity):
  if transitivity == "intr":
    return [
        t for t in templates
        if "{{object}}" not in t
    ]
  elif transitivity == "tr":
    return [
        t for t in templates
        if "{{object}}" in t
    ]
  return templates

print(templates)

#========================================================================

for greenlandic, english in dictionary.items():
    if greenlandic in completed_templates:
      pbar.update(1)
      continue

    try:
        verb_template = generate_verb_template(greenlandic)
        transitivity = get_transitivity(greenlandic)

        filtered_templates = filter_templates_by_transitivity(
            verb_template["templates"],
            transitivity
        )

        # Jsut in case filtering removes the entire template
        if not filtered_templates:
          raise ValueError(
              f"Filtering by {transitivity} has removed all valid templates"
          )

        verb_template["templates"] = filtered_templates

        record = {
            "greenlandic": greenlandic,
            "verbs": verb_template["verbs"],
            "templates": verb_template["templates"]
        }

        with open(template_storage, "a") as out_file:
            json.dump(record, out_file)
            out_file.write("\n")

        templates[greenlandic] = verb_template
        completed_templates.add(greenlandic)

    except Exception as e:
        errors += 1
        templates[greenlandic] = None

    finally:
      pbar.update(1)

pbar.close()

print(templates)

Verb Template Generation:  31%|███       | 1954/6300 [19:38<8:34:20,  7.10s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'aggiuppaa': {'verbs': ['bring'], 'templates': ['{{subject}} bring {{object}}']}, 'agiorpoq': {'verbs': ['hovers', 'soars'], 'templates': ['{{subject}} {{verb}} {{']}, 'agiorpaa': {'verbs': ['hovers', 'soars'], 'templates': ['{{subject}} {{verb}} {{object}},', '{{subject}} {{verb}} {{object}}', '{{subject}} {{verb}} above {{object}}']}, 'allappaa': {'verbs': ['write', 'write on'], 'templates': ['{{subject}} write {{object}}', '{{subject}} write on {{object}}']}, 'allapalaarpaa': {'verbs': ['embroider', 'draw', 'provide', 'carve'], 'templates': ['{{subject}} embroider {{object}}', '{{subject}} draw {{object}}', '{{subject}} provide {{object}}', '{{subject}} carve {{object}}']}, 'alleruppaa': {'verbs': ['abstain from', 'abstain', 'abstain for'], 'templates': ['{{subject}} abstain from {{object}} for {{object}}', '{{subject}} abstain from {{object}}', '{{subject}} abstain for {{object}}']}, 'allisippaa': {'verbs': ['increase', 'raise'], 'templates': ['{{subject}} increase {{object}}', '{

Verb Template Generation:  28%|██▊       | 1789/6300 [00:01<1:26:12,  1.15s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Verb Template Generation:  28%|██▊       | 1790/6300 [00:02<1:23:36,  1.11s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Verb Template Generation:  28%|██▊       | 1791/6300 [00:02<1:05:30,  1.15it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Verb Template Generation:  29%|██▊       | 1797/6300 [00:04<27:09,  2.76it/s]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Verb Template Generation:  29%|██▊       | 1798/6300 [00:04<24:48,  3.02it/s]

KeyboardInterrupt: 

In [63]:
for key in list(templates.keys()):
  if templates[key] == None:
    del templates[key]

In [66]:
for key, template in templates.items():
  template["verbs"] = list(set(template["verbs"]))
  template["templates"] = list(set(template["templates"]))

print(templates)
print(len(templates))

{'aggiuppaa': {'verbs': ['bring'], 'templates': ['{{subject}} bring {{object}}']}, 'agiorpoq': {'verbs': ['soars', 'hovers'], 'templates': ['{{subject}} {{verb}} {{']}, 'agiorpaa': {'verbs': ['soars', 'hovers'], 'templates': ['{{subject}} {{verb}} {{object}},', '{{subject}} {{verb}} {{object}}', '{{subject}} {{verb}} above {{object}}']}, 'allappaa': {'verbs': ['write', 'write on'], 'templates': ['{{subject}} write on {{object}}', '{{subject}} write {{object}}']}, 'allapalaarpaa': {'verbs': ['carve', 'embroider', 'draw', 'provide'], 'templates': ['{{subject}} carve {{object}}', '{{subject}} draw {{object}}', '{{subject}} embroider {{object}}', '{{subject}} provide {{object}}']}, 'alleruppaa': {'verbs': ['abstain for', 'abstain from', 'abstain'], 'templates': ['{{subject}} abstain from {{object}}', '{{subject}} abstain from {{object}} for {{object}}', '{{subject}} abstain for {{object}}']}, 'allisippaa': {'verbs': ['raise', 'increase'], 'templates': ['{{subject}} increase {{object}}', '{

In [None]:
word = "aggiuppaa"


# Inflections currently supported (out of 1SG, 2SG, 3SG, 1PL, 2PL, 3PL):

# Indicative: Everything except 3PL
# Participial: Everything
# Iterative: Everything
# Optative: Everything except 2SG and 2PL
# Transitive: Only Indicative, singular subjects

def get_possible_inflections(word:str) -> dict[str]:
   return {
    "vunga" : {
        "dictionary_form" : word.replace("vunga", "voq"), # The replacements for the indicative mood are a bit redundant, but they're more consistent with other moods
        "inflection" : ["1SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "vutit" : {
        "dictionary_form" : word.replace("vutit", "voq"),
        "inflection" : ["2SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "voq" : {
    "dictionary_form" : word,
    "inflection" : ["3SG"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "vugut" : {
    "dictionary_form" : word.replace("vugut", "voq"),
    "inflection" : ["1PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "vusi" : {
    "dictionary_form" : word.replace("vusi", "voq"),
    "inflection" : ["2PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "ppunga" : {
        "dictionary_form" : word.replace("ppunga", "ppoq"),
        "inflection" : ["1SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "pputit" : {
        "dictionary_form" : word.replace("pputit", "ppoq"),
        "inflection" : ["2SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "ppoq" : {
    "dictionary_form" : word,
    "inflection" : ["3SG"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "ppugut" : {
    "dictionary_form" : word.replace("ppugut", "ppoq"),
    "inflection" : ["1PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "ppusi" : {
    "dictionary_form" : word.replace("ppusi", "ppoq"),
    "inflection" : ["2PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "rpunga" : {
        "dictionary_form" : word.replace("rpunga", "rpoq"),
        "inflection" : ["1SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "rputit" : {
        "dictionary_form" : word.replace("rputit", "rpoq"),
        "inflection" : ["2SG"],
        "mood" : "IND",
        "transitivity" : "intr"
    },

    "rpoq" : {
    "dictionary_form" : word,
    "inflection" : ["3SG"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "rpugut" : {
    "dictionary_form" : word.replace("rpugut", "rpoq"),
    "inflection" : ["1PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "rpusi" : {
    "dictionary_form" : word.replace("rpusi", "rpoq"),
    "inflection" : ["2PL"],
    "mood" : "IND",
    "transitivity" : "intr"
    },

    "sunga" : {
        "dictionary_form" : word.replace("sunga", "voq"),
        "inflection" : ["1SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "sutit" : {
        "dictionary_form" : word.replace("sutit", "voq"),
        "inflection" : ["2SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "soq" : {
    "dictionary_form" : word.replace("soq", "voq"),
    "inflection" : ["3SG"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "sugut" : {
    "dictionary_form" : word.replace("sugut", "voq"),
    "inflection" : ["1PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "susi" : {
    "dictionary_form" : word.replace("susi", "voq"),
    "inflection" : ["2PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "sut" : {
    "dictionary_form" : word.replace("sut", "voq"),
    "inflection" : ["3PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "ttunga" : {
        "dictionary_form" : word.replace("ttunga", "ppoq"),
        "inflection" : ["1SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "ttutit" : {
        "dictionary_form" : word.replace("ttutit", "ppoq"),
        "inflection" : ["2SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "ttoq" : {
    "dictionary_form" : word.replace("ttoq", "ppoq"),
    "inflection" : ["3SG"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "ttugut" : {
      "dictionary_form" : word.replace("ttugut", "ppoq"),
      "inflection" : ["1PL"],
      "mood" : "PTCP",
      "transitivity" : "intr"
    },

    "ttusi" : {
    "dictionary_form" : word.replace("ttusi", "ppoq"),
    "inflection" : ["2PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "ttut" : {
    "dictionary_form" : word.replace("ttut", "ppoq"),
    "inflection" : ["3PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "rtunga" : {
        "dictionary_form" : word.replace("rtunga", "rpoq"),
        "inflection" : ["1SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "rtutit" : {
        "dictionary_form" : word.replace("rtutit", "oq"),
        "inflection" : ["2SG"],
        "mood" : "PTCP",
        "transitivity" : "intr"
    },

    "rtoq" : {
    "dictionary_form" : word.replace("rtoq", "rpoq"),
    "inflection" : ["3SG"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "rtugut" : {
    "dictionary_form" : word.replace("rtugut", "rpoq"),
    "inflection" : ["1PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "rtusi" : {
    "dictionary_form" : word.replace("rtusi", "rpoq"),
    "inflection" : ["2PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "rtut" : {
    "dictionary_form" : word.replace("rtut", "rpoq"),
    "inflection" : ["3PL"],
    "mood" : "PTCP",
    "transitivity" : "intr"
    },

    "gaangama" : {
        "dictionary_form" : word.replace("gaangama", "voq"),
        "inflection" : ["1SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "gaangavit" : {
        "dictionary_form" : word.replace("gaangavit", "voq"),
        "inflection" : ["2SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "gaangat" : {
    "dictionary_form" : word.replace("gaangat", "voq"),
    "inflection" : ["3SG"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "gaangatta" : {
    "dictionary_form" : word.replace("gaangatta", "voq"),
    "inflection" : ["1PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "gaangassi" : {
    "dictionary_form" : word.replace("gaangassi", "voq"),
    "inflection" : ["2PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "gaangata" : {
    "dictionary_form" : word.replace("gaangata", "voq"),
    "inflection" : ["3PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "kkaangama" : {
        "dictionary_form" : word.replace("kkaangama", "ppoq"),
        "inflection" : ["1SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "kkaangavit" : {
        "dictionary_form" : word.replace("kkaangavit", "ppoq"),
        "inflection" : ["2SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "kkaangat" : {
    "dictionary_form" : word.replace("kkaangat", "ppoq"),
    "inflection" : ["3SG"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "kkaangatta" : {
    "dictionary_form" : word.replace("kkaangatta", "ppoq"),
    "inflection" : ["1PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "kkaangassi" : {
    "dictionary_form" : word.replace("kkaangassi", "ppoq"),
    "inflection" : ["2PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "kkaangata" : {
    "dictionary_form" : word.replace("kkaangata", "ppoq"),
    "inflection" : ["3PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "raangama" : {
        "dictionary_form" : word.replace("raangama", "rpoq"),
        "inflection" : ["1SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "raangavit" : {
        "dictionary_form" : word.replace("raangavit", "rpoq"),
        "inflection" : ["2SG"],
        "mood" : "ITER",
        "transitivity" : "intr"
    },

    "raangat" : {
    "dictionary_form" : word.replace("raangat", "rpoq"),
    "inflection" : ["3SG"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "raangatta" : {
    "dictionary_form" : word.replace("raangatta", "rpoq"),
    "inflection" : ["1PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "raangassi" : {
    "dictionary_form" : word.replace("raangassi", "rpoq"),
    "inflection" : ["2PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "raangata" : {
    "dictionary_form" : word.replace("raangata", "rpoq"),
    "inflection" : ["3PL"],
    "mood" : "ITER",
    "transitivity" : "intr"
    },

    "llanga" : {
        "dictionary_form" : word.replace("llanga", "ppoq"),
        "inflection" : ["1SG"],
        "mood" : "OPT",
        "transitivity" : "intr"
    },

    "lli" : {
    "dictionary_form" : word.replace("lli", "ppoq"),
    "inflection" : ["3SG"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "llata" : {
    "dictionary_form" : word.replace("llata", "ppoq"),
    "inflection" : ["1PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "llit" : {
    "dictionary_form" : word.replace("llit", "ppoq"),
    "inflection" : ["3PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "rlanga" : {
        "dictionary_form" : word.replace("rlanga", "rpoq"),
        "inflection" : ["1SG"],
        "mood" : "OPT",
        "transitivity" : "intr"
    },

    "rli" : {
    "dictionary_form" : word.replace("rli", "rpoq"),
    "inflection" : ["3SG"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "rlata" : {
    "dictionary_form" : word.replace("rlata", "rpoq"),
    "inflection" : ["1PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "rlit" : {
    "dictionary_form" : word.replace("rlit", "rpoq"),
    "inflection" : ["3PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "langa" : {
        "dictionary_form" : word.replace("langa", "voq"),
        "inflection" : ["1SG"],
        "mood" : "OPT",
        "transitivity" : "intr"
    },

    "li" : {
    "dictionary_form" : word.replace("li", "voq"),
    "inflection" : ["3SG"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "lata" : {
    "dictionary_form" : word.replace("lata", "voq"),
    "inflection" : ["1PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "lit" : {
    "dictionary_form" : word.replace("lit", "voq"),
    "inflection" : ["3PL"],
    "mood" : "OPT",
    "transitivity" : "intr"
    },

    "vakkit" : {
        "dictionary_form" : word.replace("vakkit", "vaa"),
        "inflection" : ["1SG", "2SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vara" : {
        "dictionary_form" : word.replace("vara", "vaa"),
        "inflection" : ["1SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vassi" : {
    "dictionary_form" : word.replace("vassi", "vaa"),
    "inflection" : ["1SG", "2PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "vakka" : {
    "dictionary_form" : word.replace("vakka", "vaa"),
    "inflection" : ["1SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "varma" : {
    "dictionary_form" : word.replace("varma", "vaa"),
    "inflection" : ["2SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "vat" : {
        "dictionary_form" : word.replace("vat", "vaa"),
        "inflection" : ["2SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vatsigut" : {
        "dictionary_form" : word.replace("vatsigut", "vaa"),
        "inflection" : ["2SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vatit" : {
    "dictionary_form" : word.replace("vatit", "vaa"),
    "inflection" : ["2SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "vaanga" : {
    "dictionary_form" : word.replace("vaanga", "vaa"),
    "inflection" : ["3SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "vaatit" : {
    "dictionary_form" : word.replace("vaatit", "vaa"),
    "inflection" : ["3SG", "2SGF"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "vaatigut" : {
        "dictionary_form" : word.replace("vaatigut", "vaa"),
        "inflection" : ["3SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vaasi" : {
        "dictionary_form" : word.replace("vaasi", "vaa"),
        "inflection" : ["3SG", "2PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "vai" : {
    "dictionary_form" : word.replace("vai", "vaa"),
    "inflection" : ["3SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppakkit" : {
        "dictionary_form" : word.replace("ppakkit", "ppaa"),
        "inflection" : ["1SG", "2SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppara" : {
        "dictionary_form" : word.replace("ppara", "ppaa"),
        "inflection" : ["1SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppassi" : {
    "dictionary_form" : word.replace("ppassi", "ppaa"),
    "inflection" : ["1SG", "2PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppakka" : {
    "dictionary_form" : word.replace("ppakka", "ppaa"),
    "inflection" : ["1SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "pparma" : {
    "dictionary_form" : word.replace("pparma", "ppaa"),
    "inflection" : ["2SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppat" : {
        "dictionary_form" : word.replace("ppat", "ppaa"),
        "inflection" : ["2SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppatsigut" : {
        "dictionary_form" : word.replace("ppatsigut", "ppaa"),
        "inflection" : ["2SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppatit" : {
    "dictionary_form" : word.replace("ppatit", "ppaa"),
    "inflection" : ["2SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppaanga" : {
    "dictionary_form" : word.replace("ppaanga", "ppaa"),
    "inflection" : ["3SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppaatit" : {
    "dictionary_form" : word.replace("ppaatit", "ppaa"),
    "inflection" : ["3SG", "2SGF"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "ppaatigut" : {
        "dictionary_form" : word.replace("ppaatigut", "ppaa"),
        "inflection" : ["3SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppaasi" : {
        "dictionary_form" : word.replace("ppaasi", "ppaa"),
        "inflection" : ["3SG", "2PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "ppai" : {
    "dictionary_form" : word.replace("ppai", "ppaa"),
    "inflection" : ["3SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpakkit" : {
      "dictionary_form" : word.replace("rpakkit", "rpaa"),
      "inflection" : ["1SG", "2SG"],
      "mood" : "IND",
      "transitivity" : "tr"
    },

    "rpara" : {
        "dictionary_form" : word.replace("rpara", "rpaa"),
        "inflection" : ["1SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "rpassi" : {
    "dictionary_form" : word.replace("rpassi", "rpaa"),
    "inflection" : ["1SG", "2PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpakka" : {
    "dictionary_form" : word.replace("rpakka", "rpaa"),
    "inflection" : ["1SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rparma" : {
    "dictionary_form" : word.replace("rparma", "rpaa"),
    "inflection" : ["2SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpat" : {
        "dictionary_form" : word.replace("rpat", "rpaa"),
        "inflection" : ["2SG", "3SG"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "rpatsigut" : {
        "dictionary_form" : word.replace("rpatsigut", "rpaa"),
        "inflection" : ["2SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "rpatit" : {
    "dictionary_form" : word.replace("rpatit", "rpaa"),
    "inflection" : ["2SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpaanga" : {
    "dictionary_form" : word.replace("rpaanga", "rpaa"),
    "inflection" : ["3SG", "1SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpaatit" : {
    "dictionary_form" : word.replace("rpaatit", "rpaa"),
    "inflection" : ["3SG", "2SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpaa" : {
    "dictionary_form" : word,
    "inflection" : ["3SG", "3SG"],
    "mood" : "IND",
    "transitivity" : "tr"
    },

    "rpaatigut" : {
        "dictionary_form" : word.replace("rpaatigut", "rpaa"),
        "inflection" : ["3SG", "1PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "rpaasi" : {
        "dictionary_form" : word.replace("rpaasi", "rpaa"),
        "inflection" : ["3SG", "2PL"],
        "mood" : "IND",
        "transitivity" : "tr"
    },

    "rpai" : {
    "dictionary_form" : word.replace("rpai", "rpaa"),
    "inflection" : ["3SG", "3PL"],
    "mood" : "IND",
    "transitivity" : "tr"
    },
  }

word = "allapalaarpaa"

print(get_possible_inflections(word))

inflections = get_possible_inflections(word)
for suffix, inflection in inflections.items():
  if word.endswith(suffix):
    print(inflection["dictionary_form"])

{'vunga': {'dictionary_form': 'allapalaarpaa', 'inflection': ['1SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'vutit': {'dictionary_form': 'allapalaarpaa', 'inflection': ['2SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'voq': {'dictionary_form': 'allapalaarpaa', 'inflection': ['3SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'vugut': {'dictionary_form': 'allapalaarpaa', 'inflection': ['1PL'], 'mood': 'IND', 'transitivity': 'intr'}, 'vusi': {'dictionary_form': 'allapalaarpaa', 'inflection': ['2PL'], 'mood': 'IND', 'transitivity': 'intr'}, 'ppunga': {'dictionary_form': 'allapalaarpaa', 'inflection': ['1SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'pputit': {'dictionary_form': 'allapalaarpaa', 'inflection': ['2SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'ppoq': {'dictionary_form': 'allapalaarpaa', 'inflection': ['3SG'], 'mood': 'IND', 'transitivity': 'intr'}, 'ppugut': {'dictionary_form': 'allapalaarpaa', 'inflection': ['1PL'], 'mood': 'IND', 'transitivity': 'intr'}, 'ppusi': {'dictiona

In [None]:
verb_templates = templates

In [119]:
english_mood_markers = {
    "IND": "",
    "PTCP": "that ",
    "ITER": "whenever ",
    "OPT": "may "
}

english_subject_pronouns = {
    "1SG" : "I ",
    "2SG" : "you ",
    "3SG" : "he/she ", # The most elegant solution to the 3SG pronoun is singular they with, say, "they all" for 3PL, but this is perhaps somewhat more easily readable
    "1PL" : "we ",
    "2PL" : "you all ",
    "3PL" : "they "
}

english_object_pronouns = {
    "1SG" : " me",
    "2SG" : " you",
    "3SG" : " him/her",
    "1PL" : " us",
    "2PL" : " you all",
    "3PL" : " them"
}

def analyze_greenlandic_verb(word: str) -> dict:
    """
    Uses the inflectional suffix of a Greenlandic verb to determine information about subject, object, mood, and dictionary form
    """
    inflections = get_possible_inflections(word)
    for suffix, inflection in inflections.items():
      if word.endswith(suffix):
        return inflection

def look_up_templates(dictionary_form: str, transitivity: str) -> list:
    """
    Returns a list of templates for the Greenlandic verb, keeping in mind transitivity.
    """
    templates = verb_templates[dictionary_form]["templates"]
    if transitivity == "intr": # Templates have already been filtered by transitivity; this is just a backup
        templates = [t for t in templates if "{{object}}" not in t]
    elif transitivity == "tr":
        templates = [t for t in templates if "{{object}}" in t]
    return templates

def fill_template(template: str, inflected_english_verb: str, subject_inflection: str, object_inflection: str) -> str:
    """
    Returns a filled-in English sentence from a template
    """
    sentence = template.replace("{{subject}}", english_subject_pronouns[subject_inflection])
    sentence = sentence.replace("{{object}}", english_object_pronouns[object_inflection])
    return sentence.replace("VERB", inflected_english_verb)

def inflect_english(verb_base: str, subject_inflection: str, mood: str) -> str:
    """
    Inflect English verb according to person/number
    """
    if mood == "OPT":
      tag = "VBP"
    elif subject_inflection == "3SG":
      tag = "VBZ"
    else:
      tag = "VBP"
    inflected = getInflection(verb_base, tag=tag)
    if not inflected:
      return None
    return inflected[0]

def translate_greenlandic_verb(word: str) -> list:
    """
    The full translation pipeline (tm)
    """
    morphological_analysis = analyze_greenlandic_verb(word)

    if not isinstance(morphological_analysis, dict):
      return None   # or: morphological_analysis = {}

    dictionary_form = morphological_analysis.get("dictionary_form")

    if not dictionary_form:
      dictionary_form = word

    templates = look_up_templates(
        morphological_analysis["dictionary_form"],
        morphological_analysis["transitivity"]
    )
    mood = morphological_analysis["mood"]
    translations = []
    if morphological_analysis["transitivity"] == "tr":
      subject_inflection = morphological_analysis["inflection"][0]
      object_inflection = morphological_analysis["inflection"][1]
      for t in templates:
        english_verb_uninflected = t.replace("{{subject}} ", "").replace(" {{object}}", "")
        inflected_verb = inflect_english(
          english_verb_uninflected, subject_inflection, mood
        )
        if inflected_verb is None:
          inflected_verb = english_verb_uninflected

        inflected_verb = inflected_verb.strip()

        translation = english_mood_markers[morphological_analysis["mood"]] + english_subject_pronouns[subject_inflection] + inflected_verb + english_object_pronouns[object_inflection]
        translations.append(translation)
    else:
      subject_inflection = morphological_analysis["inflection"][0]
      for t in templates:
        english_verb_uninflected = t.replace("{{subject}} ", "")
        inflected_verb = inflect_english(
          english_verb_uninflected, subject_inflection, mood
        )
        if inflected_verb is None:
          inflected_verb = english_verb_uninflected
        inflected_verb = inflected_verb.strip()
        translation = english_mood_markers[morphological_analysis["mood"]] + english_subject_pronouns[subject_inflection] + inflected_verb
        translations.append(translation)

    return translations

translate_greenlandic_verb("allapalaarpaa")


['he/she carves him/her',
 'he/she draws him/her',
 'he/she embroiders him/her',
 'he/she provides him/her']

In [109]:
def calculate_chrf(gold_translation: list, output: list) -> tuple:
  chrf_scores = []
  for correct_translation, generated_translation in zip(gold_translation, output):
    chrf_scores.append(nltk.translate.chrf_score.chrf_precision_recall_fscore_support(correct_translation, generated_translation, 1))
  return chrf_scores[0]

def measure_time(word: str) -> int:
  """
  Run this function to measure the speed of template generation
  """
  # Record time for template generation
  start_time = time.time()
  _ = translate_greenlandic_verb(word) # _ is a dummy variable
  end_time = time.time()
  elapsed = end_time - start_time
  print("Template generation took {:.3f} "
  "seconds.".format(elapsed))
  return elapsed


In [112]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def get_semantic_similarity_individual(sentence1: str, sentence2: str, tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')):
  # Tokenize the sentences
  tokens1 = tokenizer.tokenize(sentence1)
  tokens2 = tokenizer.tokenize(sentence2)

  # Add [CLS] and [SEP] tokens
  tokens = ['[CLS]'] + tokens1 + ['[SEP]'] + tokens2 + ['[SEP]']

  # Convert tokens to input IDs
  input_ids = tokenizer.convert_tokens_to_ids(tokens)

  # Load the BERT tokenizer and model
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertModel.from_pretrained('bert-base-uncased')

  # Convert tokens to input IDs
  input_ids1 = torch.tensor(tokenizer.convert_tokens_to_ids(tokens1)).unsqueeze(0)  # Batch size 1
  input_ids2 = torch.tensor(tokenizer.convert_tokens_to_ids(tokens2)).unsqueeze(0)  # Batch size 1

  # Obtain the BERT embeddings
  with torch.no_grad():
      outputs1 = model(input_ids1)
      outputs2 = model(input_ids2)
      embeddings1 = outputs1.last_hidden_state[:, 0, :]  # [CLS] token
      embeddings2 = outputs2.last_hidden_state[:, 0, :]  # [CLS] token

  # Calculate similarity
  similarity_score = cosine_similarity(embeddings1, embeddings2)
  return similarity_score

def get_semantic_similarity(sentence1: str, sentence2: str):
  for i, j in zip(sentence1, sentence2):
    return get_semantic_similarity_individual(i,j)


In [122]:
def chrf_fscore(ref, hyp):
    chrf = calculate_chrf([ref], [hyp])

    if isinstance(chrf, tuple):
        return chrf[-2]
    return float(chrf)

def test_translations(test_set: dict, model=None):
    """
    Evaluates Greenlandic verb translations.

    Args:
        test_set: dict in the form {greenlandic_verb: list_of_correct_translations}
        model: optional SentenceTransformer model for semantic similarity

    Returns:
        metrics: dict containing averages, coverage, and counts
    """
    from sentence_transformers import SentenceTransformer, util

    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")

    speeds = 0.0
    chrf_score = 0.0
    semantic_score = 0.0

    valid = 0       # number of successful translations
    missing = 0     # verbs skippde due to missing dictionary/templates/inflection
    errors = 0      # verbs where some other exception occurred

    for verb, translations in test_set.items():
        try:
            pred = translate_greenlandic_verb(verb)

            if pred is None:
                missing += 1
                continue

            if isinstance(pred, (list, tuple)):
                predicted_translation = ", ".join(pred)
            else:
                predicted_translation = str(pred)

            correct_translation = ", ".join(sorted(translations))

            chrf = calculate_chrf([correct_translation], [predicted_translation])
            if isinstance(chrf, tuple):
                chrf_value = chrf[-1]  # F-score
            else:
                chrf_value = float(chrf)
            chrf_score += chrf_value / 100.0

            emb_ref = model.encode(correct_translation, convert_to_tensor=True)
            emb_pred = model.encode(predicted_translation, convert_to_tensor=True)
            semsim = util.cos_sim(emb_ref, emb_pred).item()
            semantic_score += semsim

            # Measure translation time
            speeds += measure_time(verb)

            valid += 1

        except Exception as e:
            print(f"[ERROR] verb = {verb}: {e}")
            errors += 1

    if valid == 0:
        raise ValueError("No successful translations in the test set.")

    metrics = {
        "avg_speed": speeds / valid,
        "avg_chrf": chrf_score / valid,                # normalized 0–1
        "avg_semantic_similarity": semantic_score / valid,
        "valid": valid,
        "missing": missing,
        "errors": errors,
        "coverage": valid / len(test_set),
    }

    return metrics

test_translations(test_set)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
[ERROR] verb = qallunerivunga: 'qallunerivoq'
[ERROR] verb = qallunerisutit: 'qallunerivoq'
[ERROR] verb = qallunerisugut: 'qallunerivoq'
[ERROR] verb = akivalit: 'akivavoq'
Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
Template generation took 0.000 seconds.
[ERROR] verb = utoqqaavusi: 'utoqqaavoq'
[ERROR] verb = utoqqaasusi: 'utoqqaavoq'
[ERROR] verb = utoqqaalanga: 'utoqqaavoq'
[ERROR] verb = uppikaasoq: 'uppikaavoq'
[ERROR] verb = uppikaalata: 'uppikaavoq'
[ERROR] verb = uppikaavutit: 'uppikaavoq'
[ERROR] verb = ungialissigaangama: 'ungialissivoq'
[ERROR] verb = ungiallissigaangama: 'ungiallissivoq'
[ERROR] verb = ungiallissigaangat: 'ungiallissivoq'
[ERROR] verb = ungiallissilit: 'ungiallissivoq'
[ERROR] verb = aalajagaangassi: 'aalajavoq'
[ERROR] ver

{'avg_speed': 0.0001375760350908552,
 'avg_chrf': 0.25214285714285717,
 'avg_semantic_similarity': 0.8484260163136891,
 'valid': 28,
 'missing': 3,
 'errors': 91,
 'coverage': 0.22950819672131148}

In [123]:
print(len(test_set))

122
