# 🧠 Fine-Tune GPT2 untuk Chatbot Kampus
Notebook ini melakukan fine-tuning model GPT-2 dengan dataset tanya-jawab kampus.

In [1]:
!pip install transformers datasets

^C


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl (25.8 MB)
   -----------------------------------


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 📁 Upload File Dataset `qa_kampus.jsonl`

In [None]:
from pathlib import Path

# File name that the dataset-loading cell reads.
DATA_FILE = "qa_kampus.jsonl"

try:
    from google.colab import files
except ImportError:
    # Not running on Google Colab: there is no upload widget, so the dataset
    # must already sit next to the notebook. Fail early with a clear message.
    uploaded = {}
    if not Path(DATA_FILE).is_file():
        raise FileNotFoundError(
            f"{DATA_FILE} not found in {Path.cwd()}; place the dataset file there and re-run this cell."
        )
else:
    # On Colab: open the browser upload dialog.
    uploaded = files.upload()

In [None]:
from datasets import load_dataset

# Fixed seed so the 90/10 split is identical on every Restart & Run All;
# without it each run evaluates on a different random subset.
SPLIT_SEED = 42

# Loading a single JSON Lines file yields a DatasetDict whose only split is "train".
raw_dataset = load_dataset("json", data_files="qa_kampus.jsonl")["train"]

# Hold out 10% of the examples as the evaluation ("test") split.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset

## 🔧 Load Tokenizer & Model GPT2

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Both the weights and the vocabulary come from the same pretrained checkpoint.
checkpoint = "gpt2"

model = GPT2LMHeadModel.from_pretrained(checkpoint)

tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
# The tokenization step pads every example to a fixed length, which needs a
# pad token; the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

## 🔠 Tokenisasi Dataset

In [None]:
# Fixed sequence length: longer texts are truncated, shorter ones padded.
MAX_LENGTH = 128


def tokenize(example):
    """Tokenize a batch of texts and attach causal-LM training labels.

    Args:
        example: Batch passed by ``Dataset.map(batched=True)``; only its
            "text" entry is read.
            NOTE(review): assumes every record in qa_kampus.jsonl has a
            "text" field -- confirm against the dataset file.

    Returns:
        The tokenizer output with an added "labels" entry. Labels are a copy
        of ``input_ids`` in which padded positions are replaced by -100 so
        they are ignored by the loss.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    # Without "labels" the model returns no loss and Trainer cannot train.
    # Mask by attention_mask rather than by token id: the pad token is the
    # EOS token here, so masking by id would also hide genuine EOS tokens.
    encoded["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


tokenized_dataset = dataset.map(tokenize, batched=True)

## 🏋️ Fine-Tune GPT2

In [None]:
import inspect

import torch
from transformers import Trainer, TrainingArguments

# Two keyword arguments were renamed in newer transformers releases
# (`evaluation_strategy` -> `eval_strategy`, `tokenizer` -> `processing_class`).
# Pick whichever name the installed version accepts so the cell runs on both.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"

_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    logging_steps=10,
    # Mixed precision needs a CUDA GPU; fall back to full precision otherwise
    # instead of raising when the notebook runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    # Evaluate once per epoch, matching the per-epoch checkpoint schedule.
    **{eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    **{tokenizer_key: tokenizer},
)

trainer.train()

## 💾 Simpan Model & Tokenizer

In [None]:
# Write the fine-tuned model and the tokenizer into the same folder so the
# directory can be loaded later as one self-contained checkpoint.
export_dir = "gpt2-kampus"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

## ⬇️ Download Model ke Laptop

In [None]:
import shutil

# Build gpt2-kampus.zip with the standard library instead of the `zip` CLI,
# which is not available on every platform (e.g. a stock Windows install).
# base_dir keeps the "gpt2-kampus/" folder prefix inside the archive.
archive_path = shutil.make_archive("gpt2-kampus", "zip", root_dir=".", base_dir="gpt2-kampus")

try:
    from google.colab import files
except ImportError:
    # Not on Colab: the archive is already on the local disk, nothing to download.
    print(f"Model archive written to {archive_path}")
else:
    # On Colab: trigger a browser download of the archive.
    files.download("gpt2-kampus.zip")