In [1]:
# train_codet5_colab.ipynb
# Step 1: Install dependencies
!pip install transformers datasets



In [2]:
# Step 2: Upload your local `.jsonl` file
from google.colab import files

#!rm -f python_articles.jsonl
uploaded = files.upload()  # Upload `python_articles.jsonl`
# Get the uploaded filename (automatically)
input_file = next(iter(uploaded))  # Gets the name of the uploaded file
# Optional: Preview file content
# print(uploaded[input_file].decode("utf-8")[:500])  # First 500 characters

Saving python_articles.jsonl to python_articles.jsonl


In [3]:
# Step 3: Convert JSONL to formatted plain text for model

import json
import os

output_json_file = "train_data.json"

examples = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        try:
            item = json.loads(line.strip())
            instruction = item.get("instruction", "").strip()
            code = item.get("code", "").strip()
            if instruction and code:
                examples.append({
                    "input": instruction,
                    "output": code
                })
        except json.JSONDecodeError:
            continue

with open(output_json_file, "w", encoding="utf-8") as out_json:
    json.dump(examples, out_json, indent=2)

print(f"✅ Converted to {len(examples)} structured instruction-code pairs")
os.remove(input_file)
print(f"🗑️ Deleted original upload: {input_file}")


✅ Generated formatted train_data.txt from python_articles.jsonl
🗑️ Deleted file: python_articles.jsonl


In [4]:
# Step 4: Load dataset
from datasets import Dataset

with open("train_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

dataset = Dataset.from_list(data)
print(dataset)





Dataset({
    features: ['text'],
    num_rows: 300
})


In [6]:
# Step 5: Load model and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "Salesforce/codeT5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Fix for potential config issues
if hasattr(model.config, "loss_type"):
    delattr(model.config, "loss_type")

tokenizer.pad_token = tokenizer.eos_token


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [7]:
# Step 6: Tokenize dataset
def tokenize(example):
    inputs = tokenizer(example["input"], truncation=True, padding="max_length", max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]



In [8]:
# Step 7: Define training args
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    disable_tqdm=False,
    report_to="none",
    fp16=True,  # Only if running on GPU
)



In [9]:
# Step 8: Train
from transformers import Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()
print("✅ Training complete.")



Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,1.7565
20,0.1921
30,0.0275
40,0.17
50,0.0395
60,0.0011
70,0.002
80,0.0009
90,0.0278
100,0.0409


✅ Training complete.


In [10]:
# Step 9: Save model
model.save_pretrained("./trained-model")
tokenizer.save_pretrained("./trained-model")
print("✅ Model saved to ./trained-model")


('./trained-model/tokenizer_config.json',
 './trained-model/special_tokens_map.json',
 './trained-model/vocab.json',
 './trained-model/merges.txt',
 './trained-model/added_tokens.json',
 './trained-model/tokenizer.json')

In [12]:
# Step 10: Download model as zip
# Step 10: Download trained model
!zip -r trained-model.zip ./trained-model
from google.colab import files
files.download("trained-model.zip")

!ls -lh


  adding: trained-model/ (stored 0%)
  adding: trained-model/special_tokens_map.json (deflated 97%)
  adding: trained-model/tokenizer.json (deflated 82%)
  adding: trained-model/model.safetensors (deflated 7%)
  adding: trained-model/merges.txt (deflated 54%)
  adding: trained-model/generation_config.json (deflated 33%)
  adding: trained-model/vocab.json (deflated 59%)
  adding: trained-model/config.json (deflated 62%)
  adding: trained-model/tokenizer_config.json (deflated 94%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>