In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

# Split the dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.05)  # 5% for test, 95% for train
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


In [2]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")

# Tokenize and preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    inputs["labels"] = inputs.input_ids.copy()
    return inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1711282 [00:00<?, ? examples/s]

Map:   0%|          | 0/90068 [00:00<?, ? examples/s]

In [4]:
tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1711282
})

In [None]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./data/results",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    logging_dir="./data/logs",
    fp16=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Train the model
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
