In [None]:
# train_gpt2_colab.ipynb
# Step 1: Install dependencies
!pip install transformers datasets



In [None]:
# Step 2: Upload your local `.jsonl` file
from google.colab import files

#!rm -f python_articles.jsonl
uploaded = files.upload()  # Upload `python_articles.jsonl`

# If nothing was uploaded (for example the dialog was dismissed), `uploaded`
# is empty and next(iter(...)) would raise a bare StopIteration. Fail with an
# actionable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select a .jsonl file.")

# Get the uploaded filename (automatically). Only the first uploaded name is
# used; any additional files are ignored by the following steps.
input_file = next(iter(uploaded))
# Optional: Preview file content
# print(uploaded[input_file].decode("utf-8")[:500])  # First 500 characters

Saving python_articles.jsonl to python_articles.jsonl


In [None]:
# Step 3: Convert JSONL to formatted plain text for GPT2

import json
import os

# Path of the plain-text training file that the conversion loop in this cell writes.
output_txt_file = "train_data.txt"

# Detect file type based on known schema or filename
def detect_and_format(item, file_name):
    """Render one decoded JSONL record as a block of training text.

    Two schemas are recognised and checked in this order:

    * ``instruction`` + ``code``: rendered as an "### Instruction" section
      followed by a fenced Python "### Code" section. Both fields must be
      non-empty after stripping.
    * ``url`` + ``content``: rendered as a "### URL" section followed by a
      "### Content" section. Only ``content`` must be non-empty.

    A record that has the instruction/code keys is never re-tried against the
    url/content schema, even when its instruction or code is empty.

    Args:
        item: The object decoded from one JSONL line. Anything that is not a
            dict is rejected.
        file_name: Name of the source file. Not used by the current logic;
            kept so existing callers continue to work.

    Returns:
        The formatted text, or ``None`` when the record matches no schema or
        its required fields are empty or not strings.
    """
    # A JSONL line may legally decode to a list, number, string or null;
    # none of those can carry the expected keys.
    if not isinstance(item, dict):
        return None

    def _text(key):
        # JSON null / numbers / nested objects are treated as missing text
        # instead of crashing on .strip().
        value = item.get(key)
        return value.strip() if isinstance(value, str) else ""

    if "instruction" in item and "code" in item:
        instruction = _text("instruction")
        code = _text("code")
        if instruction and code:
            return f"### Instruction:\n{instruction}\n\n### Code:\n```python\n{code}\n```"
    elif "url" in item and "content" in item:
        url = _text("url")
        content = _text("content")
        if content:
            return f"### URL:\n{url}\n\n### Content:\n{content}"
    return None

# Stream the uploaded file one line at a time (one JSON object per line) and
# write every record that could be formatted, each followed by a blank line.
# Lines that cannot be used are counted so data loss is visible instead of silent.
written_count = 0
malformed_count = 0
unformatted_count = 0

with open(input_file, "r", encoding="utf-8") as infile, open(output_txt_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        stripped_line = line.strip()
        if not stripped_line:
            # Blank lines carry no record.
            continue
        # Keep the try block limited to the parse so unrelated errors are not hidden.
        try:
            item = json.loads(stripped_line)
        except json.JSONDecodeError:
            malformed_count += 1
            continue
        formatted = detect_and_format(item, input_file)
        if formatted:
            outfile.write(formatted + "\n\n")
            written_count += 1
        else:
            unformatted_count += 1

print(f"✅ Generated formatted {output_txt_file} from {input_file}")
print(
    f"ℹ️ Records written: {written_count}, "
    f"malformed JSON lines skipped: {malformed_count}, "
    f"unrecognised or empty records skipped: {unformatted_count}"
)

# Optionally delete the uploaded file
# (after this, re-running the cell requires uploading the file again)
os.remove(input_file)
print(f"🗑️ Deleted file: {input_file}")


✅ Generated formatted train_data.txt from python_articles.jsonl
🗑️ Deleted file: python_articles.jsonl


In [None]:
# Step 4: Load dataset from formatted paragraphs
from datasets import Dataset

with open("train_data.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Cut the text at every blank line, strip each piece and drop the empty ones.
# NOTE(review): the record template written in Step 3 itself contains a blank
# line (between the Instruction/URL section and the Code/Content section), so
# this yields one row per paragraph rather than one row per record — confirm
# that paragraph-level rows are what is wanted for training.
entries = list(filter(None, map(str.strip, content.split("\n\n"))))
data = [dict(text=entry) for entry in entries]

# Build an in-memory dataset with a single "text" column and show its summary.
dataset = Dataset.from_list(data)
print(dataset)




Dataset({
    features: ['text'],
    num_rows: 300
})


In [None]:
# Step 5: Load model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub id of the checkpoint to fine-tune. This must be assigned
# before the from_pretrained() calls below, otherwise the cell fails with
# NameError on a fresh kernel.
model_name = "gpt2"
# NOTE(review): "Salesforce/codeT5-base" was listed as an alternative, but it
# looks like a T5-style encoder-decoder checkpoint, which AutoModelForCausalLM
# would presumably refuse to load — confirm before switching.
#model_name = "Salesforce/codeT5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Fix for warning: remove or set valid loss_type
if hasattr(model.config, "loss_type"):
    delattr(model.config, "loss_type")  # or: model.config.loss_type = "ForCausalLMLoss"

# Reuse the end-of-sequence token for padding so the tokenization step, which
# pads every example to a fixed length, has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Step 6: Tokenize dataset
def tokenize_function(example):
    """Tokenize the "text" field of `example` with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        example: Mapping with a "text" entry (a batch of strings when called
            through a batched map).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(example["text"], **tokenizer_options)

# Apply tokenize_function to the whole dataset; batched=True hands it groups of rows per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Step 7: Define training args
import torch
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",            # checkpoints are written here
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,                 # report the training loss every 10 steps
    save_strategy="epoch",            # one checkpoint per epoch
    disable_tqdm=False,
    report_to="none",                 # no external experiment tracker
    # Mixed precision requires a CUDA device; fall back to full precision so
    # the cell still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)


In [None]:
# Step 8: Train
from transformers import DataCollatorForLanguageModeling, Trainer

# Batch builder for language modelling; mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Wire together the model, the arguments from Step 7, the tokenized data and the collator.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# Run the training loop; the loss is logged according to training_args.
trainer.train()
print("✅ Training complete.")


Step,Training Loss
10,0.1334
20,0.1498
30,0.2237
40,0.2982
50,0.224
60,0.1139
70,0.0996
80,0.2363
90,0.1051
100,0.377


✅ Training complete.


In [None]:
# Step 9: Save model
# Write the model and the tokenizer files into the same directory so the
# folder can be loaded back as one unit. The return value of the last call
# is what the cell displays.
model.save_pretrained("./trained-model")
tokenizer.save_pretrained("./trained-model")

('./trained-gpt2/tokenizer_config.json',
 './trained-gpt2/special_tokens_map.json',
 './trained-gpt2/vocab.json',
 './trained-gpt2/merges.txt',
 './trained-gpt2/added_tokens.json',
 './trained-gpt2/tokenizer.json')

In [None]:
# Step 10: Download model as zip
!zip -r trained-gpt2.zip ./trained-gpt2
from google.colab import files
files.download("trained-model.zip")

  adding: trained-gpt2/ (stored 0%)
  adding: trained-gpt2/special_tokens_map.json (deflated 60%)
  adding: trained-gpt2/tokenizer.json (deflated 82%)
  adding: trained-gpt2/model.safetensors (deflated 7%)
  adding: trained-gpt2/merges.txt (deflated 53%)
  adding: trained-gpt2/generation_config.json (deflated 24%)
  adding: trained-gpt2/vocab.json (deflated 59%)
  adding: trained-gpt2/config.json (deflated 51%)
  adding: trained-gpt2/tokenizer_config.json (deflated 54%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List the working directory with human-readable sizes to check the generated files.
!ls -lh


total 443M
drwxr-xr-x 12 root root 4.0K Jul 11 23:57 output
drwxr-xr-x  1 root root 4.0K Jul 10 13:35 sample_data
-rw-r--r--  1 root root  29K Jul 11 23:18 train_data.txt
drwxr-xr-x  2 root root 4.0K Jul 11 23:58 trained-gpt2
-rw-r--r--  1 root root 443M Jul 11 23:58 trained-gpt2.zip
