In [1]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install SentencePiece
!pip install 'accelerate>=0.26.0'
!pip install transformers[torch]



ERROR: Invalid requirement: "'accelerate": Expected package name at the start of dependency specifier
    'accelerate
    ^




In [10]:

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load text files
import re

def clean_text(text):
    text = text.lower().strip()  # Lowercase and strip spaces
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text

def load_text(filename):
    with open(filename, "r", encoding="utf-8") as file:
        return [clean_text(line) for line in file.read().splitlines() if line.strip()]

# Load data
en_sentences = load_text("en.txt")
gloss_sentences = load_text("gloss.txt")
en_sentences = [line.strip() for line in en_sentences if line.strip()]
gloss_sentences = [line.strip() for line in gloss_sentences if line.strip()]
# Use T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [11]:
# Prepare dataset with task prefix
def tokenize_function(examples):
    inputs = ["translate English to Gloss: " + sentence for sentence in examples["en"]]
    targets = examples["gloss"]
    
    model_inputs = tokenizer(inputs, padding=True, truncation=True, max_length=128)
    labels = tokenizer(targets, padding=True, truncation=True, max_length=128)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
print(len(en_sentences), len(gloss_sentences))
# Create dataset
data = Dataset.from_dict({"en": en_sentences, "gloss": gloss_sentences})
tokenized_datasets = data.map(tokenize_function, batched=True)

100 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
# Load model (T5 for translation)
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Train model
trainer.train()

# Save model
model.save_pretrained("english_to_gloss_t5")
tokenizer.save_pretrained("english_to_gloss_t5")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,9.806201
2,No log,9.029267
3,No log,8.441807
4,No log,8.036497
5,No log,7.893742


('english_to_gloss_t5\\tokenizer_config.json',
 'english_to_gloss_t5\\special_tokens_map.json',
 'english_to_gloss_t5\\spiece.model',
 'english_to_gloss_t5\\added_tokens.json')

In [13]:
test_en_sentences = load_text("test_en.txt")
test_gloss_sentences = load_text("test_gloss.txt")

In [14]:
!pip install sacrebleu
import sacrebleu



In [18]:
# Evaluate model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# BLEU Score Evaluation
def compute_bleu(model, tokenizer, dataset):
    references = []
    predictions = []
    
    for example in dataset:
        input_text = "translate English to Gloss: " + example["en"]
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids
        output_ids = model.generate(input_ids)
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        
        references.append([example["gloss"]])
        predictions.append(prediction)
    print ('reference:', references[:5], 'prediction: ', predictions[:5])
    bleu = sacrebleu.corpus_bleu(predictions, references)
    return bleu.score

# Create test dataset
test_data = Dataset.from_dict({"en": test_en_sentences, "gloss": test_gloss_sentences})
bleu_score = compute_bleu(model, tokenizer, test_data)
print("BLEU Score on Test Set:", bleu_score)


Evaluation Results: {'eval_loss': 7.893741607666016, 'eval_runtime': 14.2882, 'eval_samples_per_second': 6.999, 'eval_steps_per_second': 0.49, 'epoch': 5.0}
reference: [['now condition'], ['alack for pitty pro1 not remembringwhqhowpro1 cride out then will cry pro3it ore againe pro3itbea hint that wrings mine eyes too t'], ['heare little further and then pro1 le bring thee to present businesse which now vpon without which this storybemost impertinent'], ['wherefore did pro3they not that howre destroy vs'], ['well demanded wench poss1 tale prouokes that question deare pro3they durst not so deare loue poss1 people bore pro1me nor set marke so bloudy on businesse but with colours fairer painted poss3they foule ends']] prediction:  ['jetzt die Lage', 'ich schreie es ere es ist ein Hinweis,', '', 'Darum zerstörten sie nicht diesen Thron', 'es hat mir erfordert, daß ich es er']
BLEU Score on Test Set: 0.0
