In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

In [None]:
# Fetch the Python configuration of CodeSearchNet and keep handles to
# the two splits used below (training and held-out test).
dataset = load_dataset("code_search_net", "python")
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
# The tokenizer and the seq2seq model are both initialised from the same
# pretrained checkpoint, so name it once.
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Place the model's parameters on the selected device.
model.to(device)

In [None]:
def preprocess_data(batch, max_length=128):
    """Turn a batch of code snippets into (unformatted -> formatted) training pairs.

    The model input is each snippet with every space character removed; the
    target is the original snippet. Both sides are tokenized with the
    module-level ``tokenizer``, truncated and padded to ``max_length``.

    Args:
        batch: Mapping with a ``"func_code_string"`` entry holding a list of
            source-code strings (the shape ``Dataset.map(..., batched=True)``
            passes in).
        max_length: Fixed token length for both inputs and labels.
            Defaults to 128, the value previously hard-coded.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list with one fixed-length sequence per snippet. Padding
        positions in ``"labels"`` are set to ``-100``.
    """
    formatted_code = batch["func_code_string"]

    # Tokenize formatted code as target
    target_tokens = tokenizer(
        formatted_code, padding="max_length", truncation=True, max_length=max_length
    )

    # Create unformatted version by removing spaces from each code snippet in the batch
    unformatted_code = [code.replace(" ", "") for code in formatted_code]
    input_tokens = tokenizer(
        unformatted_code, padding="max_length", truncation=True, max_length=max_length
    )

    # Labels are padded to a fixed length; without masking, the loss would also
    # be computed on pad tokens. -100 is the label value the loss ignores.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_tokens["input_ids"]
    ]

    # NOTE(review): the T5 SentencePiece tokenizer may normalize whitespace and
    # newlines, which would erase part of the formatting signal in the targets.
    # Verify by decoding a tokenized sample.
    return {
        "input_ids": input_tokens["input_ids"],
        "attention_mask": input_tokens["attention_mask"],
        "labels": labels,
    }

In [None]:
# Run the batched preprocessing over each split (train first, then test)
# and rebind the split names to the tokenized datasets.
train_data, test_data = (
    split.map(preprocess_data, batched=True)
    for split in (train_data, test_data)
)

In [None]:
# Training configuration followed by the Trainer that consumes it.
training_args = TrainingArguments(
    # Where run artifacts are written
    output_dir="./results",
    report_to="none",  # This disables W&B logging
    # Schedule: ten passes over the training split, evaluating once per epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    num_train_epochs=10,
    evaluation_strategy="epoch",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    # Same per-device batch size for training and evaluation
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

# Wire the model, the arguments above and the two tokenized splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_data,
    train_dataset=train_data,
)

In [None]:
# Train the model: start fine-tuning with the `trainer` built in the previous
# cell. As the last expression of the cell, the value returned by `train()`
# is shown as the cell output.
trainer.train()