In [1]:
import os

# 1. Install the Kaggle API client
!pip install kaggle

# 2. Upload your kaggle.json file
#    - Go to Kaggle, click on your profile picture -> Account -> Create New API Token.
#    - This will download `kaggle.json`.
#    - In Colab, you would typically upload this file to your working directory
#      (e.g., using files.upload() or by mounting Google Drive).
#    - Ensure the file is placed in `~/.kaggle/kaggle.json` (or `/root/.kaggle/kaggle.json` in Colab) and has correct permissions.

# Example of creating the directory and moving the file if uploaded to /content
import shutil
if not os.path.exists('/root/.kaggle'):
    os.makedirs('/root/.kaggle')
# Assuming kaggle.json was uploaded to /content/
if os.path.exists('/content/kaggle.json'):
    shutil.move('/content/kaggle.json', '/root/.kaggle/kaggle.json')
    os.chmod('/root/.kaggle/kaggle.json', 0o600) # Set read/write permissions for owner only
    print("Kaggle API key configured.")
else:
    print("kaggle.json not found in /content/. Please upload your API key.")

# 3. Download a dataset
# Replace 'dataset-owner/dataset-name' with the actual dataset you want.
# For example, to download the dataset referenced in your existing code:
!kaggle datasets download -d deep-past-initiative/deep-past-initiative-machine-translation -p ./datasets

# Or for a competition dataset (requires competition accept rules):
!kaggle competitions download -c deep-past-initiative-machine-translation -p ./datasets

# After downloading, you might need to unzip it
!unzip -q ./datasets/deep-past-initiative-machine-translation.zip -d ./datasets

print("Kaggle download instructions provided. Uncomment and run the relevant lines after setting up your API key.")


Kaggle API key configured.
403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/datasets/metadata/deep-past-initiative/deep-past-initiative-machine-translation
Downloading deep-past-initiative-machine-translation.zip to ./datasets
 86% 187M/218M [00:00<00:00, 399MB/s]
100% 218M/218M [00:00<00:00, 395MB/s]
Kaggle download instructions provided. Uncomment and run the relevant lines after setting up your API key.


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m100.8/100.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.6.0


In [None]:
import pandas as pd
import torch
import os
import numpy as np
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType

# --- 1. SETUP ---
# Fix memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
MODEL_NAME = "google/byt5-base"
TRAIN_PATH = "/content/datasets/train.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. DATA LOADING ---
df = pd.read_csv(TRAIN_PATH)
df = df.dropna(subset=["transliteration", "translation"])
df = df[df["transliteration"].str.len() > 0]

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

# --- 3. METRICS ---
metric = evaluate.load("sacrebleu")

# --- 4. MODEL ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

# --- 5. LORA ---
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=32,
    lora_alpha=32,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 6. PREPROCESSING ---
def preprocess_function(examples):
    inputs = [f"translate Akkadian to English: {str(x)}" for x in examples["transliteration"]]
    targets = [str(x) for x in examples["translation"]]

    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=512,
        truncation=True
    )
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# --- 7. METRICS LOGIC ---
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": result["score"],
        "gen_len": np.mean([len(t) for t in decoded_preds])
    }

# --- 8. TRAINING ARGUMENTS (FINAL STABLE CONFIG) ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./akkadian_byt5_final_stable",

    # --- MEMORY & SPEED STRATEGY ---
    per_device_train_batch_size=16,   # High Batch Size (Speed)
    per_device_eval_batch_size=8,     # Lower Eval Batch (Safety)
    gradient_accumulation_steps=1,    # Fast updates
    gradient_checkpointing=True,      # ‚úÖ ENABLED: Essential to fit Batch 16
    dataloader_num_workers=4,
    # -------------------------------

    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,

    num_train_epochs=15,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,

    learning_rate=3e-4,
    warmup_steps=100,

    # Stability
    fp16=False,                       # Keep False (Stable)
    max_grad_norm=1.0,

    # Generation
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=1,

    logging_steps=50,
    report_to="none"
)

# --- 9. TRAIN ---
torch.cuda.empty_cache() # Clear any leftover memory
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("üõ°Ô∏è Starting Robust Training (Batch 16 + Checkpointing)...")
trainer.train()

# --- 10. SAVE ---
model.save_pretrained("./akkadian_byt5_final_model")
tokenizer.save_pretrained("./akkadian_byt5_final_model")
print("‚úÖ Done.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/254 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 4,423,680 || all params: 587,256,576 || trainable%: 0.7533


Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

üõ°Ô∏è Starting Robust Training (Batch 16 + Checkpointing)...




Step,Training Loss,Validation Loss,Bleu,Gen Len
200,4.135363,3.810008,0.034351,125.44586
400,3.934226,3.612483,0.04011,125.694268
600,3.845555,3.51942,0.021184,126.0


