In [1]:
import sentencepiece

In [2]:
from datasets import load_dataset

dataset = load_dataset('grammarly/coedit')

In [4]:
from transformers import T5Tokenizer

# Define constants
model_id = "t5-small"
MAX_LENGTH = 128

# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_id)

# Preprocess the dataset
def preprocess_data(examples):
    inputs = [sentence for sentence in examples['src']]
    targets = [sentence for sentence in examples['tgt']]
    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=MAX_LENGTH, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained(model_id)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1686,0.245516
2,0.1583,0.234669
3,0.1556,0.232516


TrainOutput(global_step=25902, training_loss=0.19142372173173539, metrics={'train_runtime': 4999.5952, 'train_samples_per_second': 41.446, 'train_steps_per_second': 5.181, 'total_flos': 7011145177104384.0, 'train_loss': 0.19142372173173539, 'epoch': 3.0})

In [5]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.33.0


In [18]:
model.save_pretrained('gennie')
tokenizer.save_pretrained('gennie')

('gennie/tokenizer_config.json',
 'gennie/special_tokens_map.json',
 'gennie/spiece.model',
 'gennie/added_tokens.json')

In [13]:
from huggingface_hub import login

# Replace 'your_token_here' with your actual token
login(token='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')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/shreeyutm/.cache/huggingface/token
Login successful


In [17]:
trainer.push_to_hub()

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1721845277.LAPTOP-N36F6CKI.6531.0:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Shreeyut/results/commit/201e12692a2e0128c71e81e2bd070b034e2a0acb', commit_message='End of training', commit_description='', oid='201e12692a2e0128c71e81e2bd070b034e2a0acb', pr_url=None, pr_revision=None, pr_num=None)