In [None]:
import requests

url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
response = requests.get(url)

with open("alpaca_data.json", "wb") as file:
    file.write(response.content)

In [2]:
from datasets import load_dataset

# Load Alpaca dataset
dataset = load_dataset("json", data_files="alpaca_data.json")

# Sample data exploration
print(dataset['train'][0])


{'instruction': 'Give three tips for staying healthy.', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'input': ''}


In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token


In [6]:
def preprocess_function(examples):
    inputs = [instr + inp for instr, inp in zip(examples['instruction'], examples['input'])]
    targets = examples['output']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids
    model_inputs['labels'] = labels
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

KeyError: 'validation'

In [None]:
trainer.train()

In [None]:
results = trainer.evaluate()
print(results)

In [None]:
model.save_pretrained("./saved/fine-tuned-model")
tokenizer.save_pretrained("./saved/fine-tuned-model")