In [None]:
# Prerequisite: install this project into the kernel's environment first,
# e.g. `uv pip install -e .` or `pip install -e .`

from nllb_try.config import config
from nllb_try.corpus import main_corpus

# A separate Tatoeba download is no longer needed; TatoebaCorpus does that itself
# from nllb_try.downloadtatoeba import main_download
# main_download(config["source_langs_tatoeba"])

# Step 2: Load and create parallel corpus.
# The result is kept in `corpus_objects`, which later cells iterate over and
# pass on to training and evaluation.
corpus_objects = main_corpus(
    config["source_langs_tatoeba"],
    config["source_langs_nllb"],
    variety_dir="data/RUG_data"
)

In [None]:
# Summarise every corpus in corpus_objects: labels, split sizes, longest target.

for i, corpus in enumerate(corpus_objects):
    # Header: position, class name and the NLLB language labels.
    print(f"\nCorpus {i}: {type(corpus).__name__}")
    print(f"  Source NLLB label: {corpus.source_lang_nllb}")
    print(f"  Target NLLB label: {corpus.target_lang_nllb}")

    # Row counts of the full frame and of its train/validate splits.
    df = corpus.df
    df_train = corpus.df_train
    df_validate = corpus.df_validate
    print(f"  Total rows:     {len(df)}")
    print(f"  Train rows:     {len(df_train)}")
    print(f"  Validate rows:  {len(df_validate)}")

    # Nothing to measure in an empty corpus: say so and move on to the next one.
    if len(df) == 0:
        print("  No sentences in this corpus.")
        continue

    # Character count per target sentence, plus the row labels of both extremes.
    lengths = df["target_sentence"].str.len()
    longest_idx = lengths.idxmax()
    shortest_idx = lengths.idxmin()
    print(f"  Longest target_sentence ({lengths[longest_idx]} chars):")
    print(f"    {df.loc[longest_idx, 'target_sentence']}")
    # Reporting the shortest sentence is currently switched off:
    # print(f"  Shortest target_sentence ({lengths[shortest_idx]} chars):")
    # print(f"    {df.loc[shortest_idx, 'target_sentence']}")

print("\nDone.")

In [None]:
from nllb_try.train import main_train
# Step 3: Train the model on the corpora held in `corpus_objects`
# (built by the corpus-loading cell, which therefore has to run first).
# NOTE(review): presumably long-running and writes checkpoints — confirm in nllb_try.train.
main_train(corpus_objects)

In [None]:
from nllb_try.tokenizer_and_model_setup import setup_model_and_tokenizer
from nllb_try.config import config

# Load the tokenizer and model (use params that your training uses):
# every argument, including the device, is read from the project config.
# The names `model` and `tokenizer` are rebound later in this notebook when a
# specific checkpoint is loaded.
model, tokenizer = setup_model_and_tokenizer(
    config["modelname"],
    config["modelpath"],
    config["new_lang_nllb"],
    config["similar_lang_nllb"],
    device=config['device']
)

import numpy as np
import matplotlib.pyplot as plt

def estimate_max_length_and_plot_outliers(tokenizer, sentences, percentile=98, sample_size=5000, n_outliers=5, title='Token Length Distribution', seed=None, show_plot=True):
    """
    Estimate max_length, plot distribution, and print the most extreme outliers.

    A random sample of ``sentences`` (drawn without replacement) is tokenized
    with ``tokenizer.tokenize``; the requested percentile of the resulting
    token counts is returned as the suggested max_length.

    Args:
        tokenizer: Object exposing ``tokenize(sentence)`` that returns a sized
            sequence of tokens.
        sentences: Sequence (or any iterable) of sentences to measure.
        percentile: Percentile (0-100) of the sampled token counts to suggest.
        sample_size: Maximum number of sentences to tokenize (at least 1).
        n_outliers: Number of longest sampled sentences to print; values <= 0
            print none.
        title: Title of the histogram.
        seed: Optional seed for reproducible sampling. ``None`` (default)
            draws from NumPy's global random state, as before.
        show_plot: Pass False to skip the matplotlib histogram, e.g. when no
            display is available.

    Returns:
        int: The chosen percentile of the sampled token counts, truncated to
        an integer.

    Raises:
        ValueError: If ``sentences`` is empty or ``sample_size`` is below 1.

    Note:
        Lengths are exactly what ``tokenizer.tokenize`` yields.
        NOTE(review): this presumably excludes special tokens that are added
        at encoding time — confirm before using the result as a hard limit.
    """
    # Materialise once so generators and other non-indexable iterables work too.
    sentences = list(sentences)
    n = len(sentences)
    if n == 0:
        raise ValueError("sentences is empty; cannot estimate max_length")
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")

    size = min(sample_size, n)
    # Without a seed keep using the global NumPy state (previous behaviour);
    # with a seed use a private generator so the sample is reproducible.
    rng = np.random if seed is None else np.random.default_rng(seed)
    sample_idxs = rng.choice(n, size=size, replace=False)
    sample = [sentences[i] for i in sample_idxs]
    lengths = np.array([len(tokenizer.tokenize(s)) for s in sample])
    suggested = int(np.percentile(lengths, percentile))
    longest = int(lengths.max())

    # Stats
    print(
        f"Estimated max_length at {percentile}th percentile: {suggested}\n"
        f"Sample size={size}, max={longest}, mean={np.mean(lengths):.1f}, median={np.median(lengths):.1f}"
    )

    # Plot
    if show_plot:
        plt.figure(figsize=(8,4))
        # Edges 0..longest+1 give every integer length its own bin. Stopping at
        # `longest` would merge the two largest lengths, because the last
        # histogram bin is closed on both sides.
        plt.hist(lengths, bins=range(0, longest + 2), alpha=0.7, color='dodgerblue', edgecolor="black")
        plt.axvline(suggested, color="crimson", linestyle="dashed", lw=2,
                    label=f"{percentile}th percentile ({suggested})")
        plt.xlabel("Tokenized Sentence Length")
        plt.ylabel("Count")
        plt.title(title)
        plt.legend()
        plt.tight_layout()
        plt.show()

    # Report outliers (show longest n_outliers)
    print(f"\nTop {n_outliers} outliers (longest in sample):")
    # Clamp explicitly: a plain `[-n_outliers:]` slice would return *every*
    # element for n_outliers == 0.
    k = max(0, min(n_outliers, len(lengths)))
    outlier_idxs = np.argsort(lengths)[::-1][:k]
    for i in outlier_idxs:
        print(f"\n[{lengths[i]} tokens]\n{sample[i][:500]}")  # Show up to 500 chars

    return suggested

# Pool the training sentences of all corpora into one flat list per side.
source_sentences, target_sentences = [], []
for corpus in corpus_objects:
    source_sentences += corpus.df_train['source_sentence'].tolist()
    target_sentences += corpus.df_train['target_sentence'].tolist()

# Estimate a length per side at the same percentile, then keep the larger one
# so that a single max_length covers both source and target.
percentile = 98
max_length_src = estimate_max_length_and_plot_outliers(
    tokenizer,
    source_sentences,
    percentile=percentile,
    title='Source Sentence Token Lengths',
)
max_length_tgt = estimate_max_length_and_plot_outliers(
    tokenizer,
    target_sentences,
    percentile=percentile,
    title='Target Sentence Token Lengths',
)
optimal_max_length = max_length_src if max_length_src >= max_length_tgt else max_length_tgt
print(f"\nSuggested max_length for training: {optimal_max_length}")

In [None]:
from nllb_try.tryout import main_tryout
from nllb_try.config import config
# Model save location as recorded in the project config; printed so the path
# in use is visible in the cell output.
MODEL_SAVE_PATH = config["MODEL_SAVE_PATH"]
print(MODEL_SAVE_PATH)

# Dutch sentences for a qualitative look at the translations. The set mixes
# capitalised and lower-case starts, single- and multi-sentence inputs,
# questions, exclamations and trailing ellipses.
input_sentences_nld = [
    "Ik hou van kaas. Midden van de nacht, laat het razen.",
    "ik hou van chocolade.",
    "De zon schijnt helder. Ik heb mijn zonnebril op.",
    "de zon schijnt fel vandaag.",
    "Zij leest een boek.",
    "het boek ligt op de tafel.",
    "Wij drinken water, zei de kater.",
    "De kat slaapt zachtjes. Groetjes van oma.",
    "we gaan morgen naar oma.",
    "de straatkat is een kat die op straat leeft.",
    "de hond blaft naar de kat.",
    "ik heb een hekel aan regen.",
    "Het regent buiten. Neem een paraplu mee!",
    "ik ben benieuwd of dit werkt.",
    "Jij bent het zusje van mijn buurman!",
    "Onverwachts kwamen de ouders van Bert op bezoek.",
    "Hebben jullie dat gezegd?",
    "de molenaar ziet jullie.",
    "ik weet niet wat je bedoelt...",
    "wiens linkerarm is het sterkste? die van mij.",
    "Hoe graag wil je het hebben?",
    "het is vandaag mooi weer.",
    "die film was echt heel spannend.",
    "de vogel zingt een mooi liedje.",
    "het kind speelt in de tuin.",
    "wat had jij daarop kunnen zeggen?"
]

# Round trip: translate the Dutch sentences to Gos, translate that output back
# to Dutch, then print all three versions side by side.
print("--- initial translation (nl to gos) ---")

translations_gos = main_tryout(
    MODEL_SAVE_PATH,
    config["new_lang_nllb"],
    input_sentences_nld,
    src_lang='nld_Latn',
    tgt_lang='gos_Latn'
)

print("\n--- backtranslation (gos to nl) ---")
# Feed the output of the first translation back in for back-translation (Gos to Dutch)
back_translations_nld = main_tryout(
    MODEL_SAVE_PATH,
    config["new_lang_nllb"],
    translations_gos,
    src_lang='gos_Latn',
    tgt_lang='nld_Latn'
)

print("\n--- paired results ---")
# The results are paired by position.
# assumes main_tryout returns one translation per input, in input order — TODO confirm
for i, original_nld in enumerate(input_sentences_nld):
    print(f"Original Dutch:     {original_nld}")
    print(f"Translated Gos:     {translations_gos[i]}")
    print(f"Back-Translated Dutch: {back_translations_nld[i]}\n")

In [None]:
# evaluate the different training stages of the model
# Relies on `corpus_objects`, `MODEL_SAVE_PATH` and `config` defined by earlier cells.
# MODEL_SAVE_PATH = config["MODEL_SAVE_PATH"]
from nllb_try.evaluate import main_evaluate
# NOTE(review): "gos_Latn" is hard-coded here while other cells pass
# config["new_lang_nllb"] — confirm both refer to the same language code.
main_evaluate(corpus_objects, MODEL_SAVE_PATH, "gos_Latn", config["timestamp"])

In [None]:
# from nllb_try.config import config
# MODEL_SAVE_PATH = "/home/tom.brand/Offline/nllb-tryout/notebooks/checkpoints/nllb-200-distilled-600M-nld-gos-20251207-220722/"

import os

from nllb_try.tokenizer_and_model_setup import setup_model_and_tokenizer
# Sub-directory of MODEL_SAVE_PATH holding the checkpoint to load:
# select the epoch that scores best in the evaluation.
BEST_EPOCH_DIR = "epoch12"
model_path = os.path.join(MODEL_SAVE_PATH, BEST_EPOCH_DIR)
print(f"Loading model from {model_path}...")
# Use the configured device, like the initial model load in this notebook,
# instead of a hard-coded 'cuda' that fails on machines without a GPU.
# This rebinds `model` and `tokenizer` to the selected checkpoint.
model, tokenizer = setup_model_and_tokenizer(model_path, new_lang=config["new_lang_nllb"], device=config['device'])
print("Model loaded successfully.")

In [None]:
from nllb_try.tryout import translate
# Single-sentence check (Dutch to Gos) using the `model` and `tokenizer`
# currently bound in the kernel.
print(translate(
    'De zon schijnt helder. Ik heb mijn zonnebril op.',
    src_lang='nld_Latn',
    tgt_lang='gos_Latn',
    model=model,
    tokenizer=tokenizer
))

In [None]:
import huggingface_hub

In [None]:
import os

# Never paste the access token into this cell: it would be stored in the
# notebook file and could end up in version control. Read it from the HF_TOKEN
# environment variable instead; when that is unset or empty, token=None makes
# login() ask for the token interactively.
huggingface_hub.login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Hub repository id that the upload cell pushes the tokenizer and model to.
upload_repo = "Tom9358/nllb-tatoeba-gos-nld-v1"

In [None]:
# Upload whatever `tokenizer` and `model` are currently bound in the kernel to
# `upload_repo`. Make sure the intended checkpoint was the last one loaded.
tokenizer.push_to_hub(upload_repo)
model.push_to_hub(upload_repo)

In [None]:
import os

def print_directory_structure(base_dir: str) -> None:
    """
    Print the structure of the directories inside the base directory, two levels deep.

    The entries of ``base_dir`` are printed first, followed by the entries of
    each of its immediate sub-directories. Entries are sorted so the output
    does not depend on the order the file system returns them in.

    Args:
        base_dir: Path of the directory to inspect.

    Returns:
        None. All results, including error messages, are printed.

    Listing failures (missing path, no permission, ...) are reported with a
    printed message instead of raising. A sub-directory that cannot be read is
    reported and skipped, so the remaining sub-directories are still listed.
    """
    try:
        first_level = sorted(os.listdir(base_dir))
    except OSError as e:
        print(f'Er is een fout opgetreden: {e}')
        return

    print(f"Eerste niveau in '{base_dir}' bevat:")
    print(first_level)

    for first_level_dir in first_level:
        first_level_path = os.path.join(base_dir, first_level_dir)
        # Plain files have no second level to show.
        if not os.path.isdir(first_level_path):
            continue
        try:
            second_level = sorted(os.listdir(first_level_path))
        except OSError as e:
            # One unreadable directory must not hide the others.
            print(f'Er is een fout opgetreden: {e}')
            continue
        print(f"\nTweede niveau in '{first_level_dir}' bevat:")
        print(second_level)

# Show the layout of the 'models' directory (path relative to the current
# working directory). The guard is true when the code runs as the main module.
if __name__ == "__main__":
    base_directory = 'models'
    print_directory_structure(base_directory)