In [None]:
from nllb_try.config import config
from nllb_try.downloadtatoeba import main_download
from nllb_try.corpus import main_corpus
# from nllb_try.evaluate import main_evaluate


# Step 1: download the Tatoeba data for the configured source languages.
tatoeba_langs = config["source_langs_tatoeba"]
main_download(tatoeba_langs)

# Step 2: build the parallel corpus from the same Tatoeba languages plus the
# configured NLLB language codes; later cells consume `corpus_objects`.
corpus_objects = main_corpus(tatoeba_langs, config["source_langs_nllb"])

In [None]:
from nllb_try.train import main_train
# Step 3: Train the model
# `corpus_objects` is produced by the main_corpus(...) call in the first cell,
# so that cell must have been executed in this kernel before this one.
main_train(corpus_objects)

In [None]:
from nllb_try.tryout import main_tryout
from nllb_try.config import config
# Path taken from the project config; printed so the reader can see which
# location this session is using. A later cell lists its sub-directories,
# so it is presumably the root folder of the saved model versions.
MODEL_SAVE_PATH = config["MODEL_SAVE_PATH"]
print(MODEL_SAVE_PATH)

# Hand-written Dutch test sentences for the translation try-out below.
# The list mixes capitalised and lower-case starts, single- and
# multi-sentence entries, questions and exclamations.
input_sentences_nld = [
    "Ik hou van kaas. Midden van de nacht, laat het razen.",
    "ik hou van chocolade.",
    "De zon schijnt helder. Ik heb mijn zonnebril op.",
    "de zon schijnt fel vandaag.",
    "Zij leest een boek.",
    "het boek ligt op de tafel.",
    "Wij drinken water, zei de kater.",
    "De kat slaapt zachtjes. Groetjes van oma.",
    "we gaan morgen naar oma.",
    "de straatkat is een kat die op straat leeft.",
    "de hond blaft naar de kat.",
    "ik heb een hekel aan regen.",
    "Het regent buiten. Neem een paraplu mee!",
    "ik ben benieuwd of dit werkt.",
    "Jij bent het zusje van mijn buurman!",
    "Onverwachts kwamen de ouders van Bert op bezoek.",
    "Hebben jullie dat gezegd?",
    "de molenaar ziet jullie.",
    "ik weet niet wat je bedoelt...",
    "wiens linkerarm is het sterkste? die van mij.",
    "Hoe graag wil je het hebben?",
    "het is vandaag mooi weer.",
    "die film was echt heel spannend.",
    "de vogel zingt een mooi liedje.",
    "het kind speelt in de tuin.",
]

def _run_tryout(sentences, src_lang, tgt_lang):
    """Call main_tryout on `sentences` with the notebook's model path and new-language setting."""
    return main_tryout(
        MODEL_SAVE_PATH,
        config["new_lang_nllb"],
        sentences,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )


# Forward pass: translate the Dutch input sentences to gos.
print("--- initial translation (nl to gos) ---")
translations_gos = _run_tryout(input_sentences_nld, "nld_Latn", "gos_Latn")

# Backward pass: feed the gos output back in and translate it to Dutch again.
print("\n--- backtranslation (gos to nl) ---")
back_translations_nld = _run_tryout(translations_gos, "gos_Latn", "nld_Latn")

# Print each original sentence next to its translation and back-translation.
print("\n--- paired results ---")
for idx in range(len(input_sentences_nld)):
    print(f"Original Dutch:     {input_sentences_nld[idx]}")
    print(f"Translated Gos:     {translations_gos[idx]}")
    print(f"Back-Translated Dutch: {back_translations_nld[idx]}\n")

In [None]:
# Evaluate the different training stages of the model.
# Relies on state from earlier cells: `corpus_objects` (corpus cell) and
# `MODEL_SAVE_PATH` / `config` (try-out cell) must already exist in the kernel.
# NOTE(review): the language code is hard-coded as "gos_Latn" here while other
# cells pass config["new_lang_nllb"] — confirm the two are meant to agree.
from nllb_try.evaluate import main_evaluate
main_evaluate(corpus_objects, MODEL_SAVE_PATH, "gos_Latn", config["timestamp"])

In [None]:
import os

from nllb_try.tokenizer_and_model_setup import setup_model_and_tokenizer

def _checkpoint_number(dir_name):
    """Return `dir_name` parsed as an int, or None if it is not an integer."""
    try:
        return int(dir_name)
    except ValueError:
        return None


# Collect the sub-directories of MODEL_SAVE_PATH whose name parses as an
# integer. Anything else (e.g. `.ipynb_checkpoints`, logs) is skipped, because
# the numeric sort below would otherwise raise ValueError on it.
model_versions = [
    d for d in os.listdir(MODEL_SAVE_PATH)
    if os.path.isdir(os.path.join(MODEL_SAVE_PATH, d))
    and _checkpoint_number(d) is not None
]
# Sort numerically so that e.g. "10" comes after "9".
model_versions.sort(key=int)
print(model_versions)

# Fail with an explicit message instead of an IndexError on `[-1]`.
if not model_versions:
    raise FileNotFoundError(
        f"No numerically named model directories found in {MODEL_SAVE_PATH!r}"
    )

# The highest-numbered directory is treated as the latest model version.
latest_model = model_versions[-1]
model_path = os.path.join(MODEL_SAVE_PATH, latest_model)
print(f"Loading model from {model_path}...")
# NOTE(review): device is hard-coded to 'cuda'; presumably this cell needs a
# CUDA-capable GPU — confirm against setup_model_and_tokenizer.
model, tokenizer = setup_model_and_tokenizer(model_path, new_lang=config["new_lang_nllb"], device='cuda')
print("Model loaded successfully.")

In [None]:
import huggingface_hub

In [None]:
# Never paste an access token into the notebook: it would be saved with the
# file and leak on commit/share. Read it from the HF_TOKEN environment variable
# instead. When the variable is unset or empty, pass None so that
# huggingface_hub falls back to its interactive login prompt.
import os

huggingface_hub.login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Repository id passed to the push_to_hub(...) calls in the next cell.
upload_repo = "Tom9358/nllb-tatoeba-gos-nld-v1"

In [None]:
# Push both the tokenizer and the model to the same repository id.
# Uses `model` / `tokenizer` from the model-loading cell and `upload_repo`
# from the previous cell; presumably requires the login cell to have
# succeeded first — confirm.
tokenizer.push_to_hub(upload_repo)
model.push_to_hub(upload_repo)

In [None]:
import os

def print_directory_structure(base_dir: str):
    """
    Print the contents of ``base_dir`` and of each of its direct sub-directories.

    The listing goes two levels deep: first the entries of ``base_dir`` itself,
    then, for every entry that is a directory, the entries inside it.

    Args:
        base_dir: Path of the directory to inspect.

    Returns:
        None. All output goes to stdout. Any exception raised while listing
        is caught and reported as a printed message instead of propagating.
    """
    try:
        top_entries = os.listdir(base_dir)
        print(f"Eerste niveau in '{base_dir}' bevat:")
        print(top_entries)

        for entry in top_entries:
            entry_path = os.path.join(base_dir, entry)
            # Plain files have no second level to show.
            if not os.path.isdir(entry_path):
                continue
            # List before printing the header so a failing listdir prints no header.
            nested_entries = os.listdir(entry_path)
            print(f"\nTweede niveau in '{entry}' bevat:")
            print(nested_entries)
    except Exception as err:
        print(f'Er is een fout opgetreden: {err}')

# Script-style entry point: list the contents of the relative 'models' folder.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): inside a notebook kernel this guard is presumably always true,
# so the listing runs every time the cell is executed — confirm that is intended.
if __name__ == "__main__":
    base_directory = 'models'
    print_directory_structure(base_directory)