<a href="https://colab.research.google.com/github/vichruth/Low-Parameter-ai-editor/blob/main/model_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# STEP 1: Install dependencies (transformers and sympy are version-pinned).
# The leading "!" is an IPython shell escape, so this line only works when the
# file is executed as a notebook cell, not as a plain Python script.
!pip install -q transformers==4.57.1 accelerate datasets torch torchvision sympy==1.13.1

# STEP 2: Upload your dataset
# `uploaded` is treated below as a mapping whose keys are the uploaded file names.
from google.colab import files
print(" Please upload your 'bug_fix_dataset.jsonl' file:")
uploaded = files.upload()

import os
uploaded_filenames = list(uploaded.keys())
# Stop early when nothing was uploaded; the rest of the script needs a dataset path.
if not uploaded_filenames:
    raise ValueError(" No file uploaded.")
# Only the first uploaded file is used. `data_file` is a module-level global
# that main() reads when it loads the dataset.
data_file = uploaded_filenames[0]
print(f" Uploaded: {data_file}")

# STEP 3: Imports
import torch
import transformers
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)

def main():
    """Fine-tune CodeT5 on the uploaded bug-fix dataset, then save, zip and download it.

    Reads two module-level names: ``data_file`` (path of the uploaded JSON-lines
    dataset) and ``files`` (``google.colab.files``, used for the download).
    Every dataset record must provide ``buggy_code`` and ``fixed_code`` fields;
    the model input is ``"fix bug: " + buggy_code`` and the target is
    ``fixed_code``.

    Side effects:
        * intermediate checkpoints are written to ``./lowparam-bugfixer-checkpoints``;
        * the final model and tokenizer are written to ``./lowparam-bugfixer-model``;
        * ``lowparam-bugfixer-model.zip`` is created in the working directory and
          a browser download is attempted (failure is reported, not raised).
    """
    import shutil  # local import: only the archive step needs it

    # --- Load model ---
    model_name = "Salesforce/codet5-base"
    print(f" Loading model: {model_name}")
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    model.gradient_checkpointing_enable()

    # --- Load dataset ---
    dataset = load_dataset("json", data_files=data_file)
    split = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_data, val_data = split["train"], split["test"]
    print(f" Train={len(train_data)}, Val={len(val_data)}")

    prefix = "fix bug: "
    max_length = 256

    def preprocess_function(batch):
        """Tokenize one batch of buggy/fixed pairs into model inputs and labels."""
        inputs = [prefix + code for code in batch["buggy_code"]]
        targets = batch["fixed_code"]
        # No padding here: DataCollatorForSeq2Seq pads each batch dynamically and
        # fills the label padding with -100 so the loss ignores it. Padding the
        # labels to max_length up front would leave real pad-token ids in
        # `labels`, and the model would be trained to predict padding.
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
        labels = tokenizer(targets, max_length=max_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print(" Tokenizing data...")
    # Drop the raw string columns so only tensor-convertible features remain.
    train_dataset = train_data.map(
        preprocess_function,
        batched=True,
        remove_columns=train_data.column_names,
    )
    val_dataset = val_data.map(
        preprocess_function,
        batched=True,
        remove_columns=val_data.column_names,
    )

    collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # --- Training setup ---
    use_fp16 = torch.cuda.is_available()
    print(f" GPU available: {use_fp16}")
    print("Transformers version:", transformers.__version__)

    # Checkpoints live in their own directory so that the directory which gets
    # zipped below contains only the final model, not the intermediate
    # checkpoint-* folders.
    checkpoint_dir = "./lowparam-bugfixer-checkpoints"
    final_path = "./lowparam-bugfixer-model"

    args = TrainingArguments(
        output_dir=checkpoint_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=10,
        weight_decay=0.01,
        save_total_limit=2,
        logging_steps=50,
        # NOTE(review): mixed precision is enabled whenever a GPU is present;
        # confirm fp16 is numerically stable for this checkpoint.
        fp16=use_fp16,
        load_best_model_at_end=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `tokenizer=` is deprecated in the pinned transformers release;
        # `processing_class=` is its replacement.
        processing_class=tokenizer,
        data_collator=collator,
    )

    print("\n Starting fine-tuning …")
    trainer.train()
    print("\n Fine-tuning complete!")

    # --- Save model ---
    trainer.save_model(final_path)
    tokenizer.save_pretrained(final_path)
    print(f" Model saved to {final_path}")

    # --- Zip & download ---
    print("Zipping model for download …")
    # Pure-Python replacement for `!zip -r`: produces lowparam-bugfixer-model.zip
    # with the model folder as the archive's top-level entry.
    shutil.make_archive(
        "lowparam-bugfixer-model",
        "zip",
        root_dir=".",
        base_dir="lowparam-bugfixer-model",
    )
    # Best effort: the browser download can fail, but the archive stays on disk.
    try:
        files.download("lowparam-bugfixer-model.zip")
        print("Download started.")
    except Exception as e:
        print(f" Auto-download failed: {e}")
        print("Please download manually from the left panel.")

# Run the whole pipeline: load the model, tokenize the uploaded dataset,
# fine-tune, save, zip, and offer the result for download.
main()
