In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

# Split the dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.05)  # 5% for test, 95% for train
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


In [2]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokenize and preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = inputs.input_ids.copy()
    return inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1711282 [00:00<?, ? examples/s]

Map:   0%|          | 0/90068 [00:00<?, ? examples/s]

In [4]:
tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1711282
})

In [None]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./data/results",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    logging_dir="./data/logs",
    fp16=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxiaoxinyin[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1000,0.968
2000,0.4524
3000,0.4405
