In [15]:
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    Pipeline
)
# We need DatasetDict to organize our splits
from datasets import load_dataset, DatasetDict

# Define the path to your text file
text_file_path = "pg31100.txt"

print(f"Loading and processing custom dataset from: {text_file_path}")

# Load the dataset from the text file
# The 'text' loader reads one line per example.
raw_dataset = load_dataset('text', data_files={'train': text_file_path})

# Create a validation split (e.g., 90% train, 10% validation)
# The split is done on the 'train' key we just loaded
split = raw_dataset['train'].train_test_split(test_size=0.1)

# Re-organize the splits into a DatasetDict
dataset = DatasetDict({
    'train': split['train'],
    'validation': split['test']
})

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

def tokenize_function(examples):
    # We set truncation=False here because we will handle it in the group_texts function
    return tokenizer(examples["text"])

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])





Loading and processing custom dataset from: pg31100.txt


In [21]:
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=256,
    n_embd=256,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)
print(f"Model created with {model.num_parameters():,} parameters.")

args = TrainingArguments(
    output_dir="./austen-gpt",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

Model created with 16,090,880 parameters.


In [22]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss,Validation Loss
500,6.4695,5.418536
1000,4.7989,4.91153
1500,4.4425,4.710504
2000,4.2999,4.595109
2500,4.153,4.514796
3000,4.0584,4.460492
3500,4.0019,4.428689
4000,3.8861,4.399166
4500,3.8825,4.355317
5000,3.7333,4.356992


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [20]:
pipe = Pipeline('text-generation', model='./austen-gpt2/checkpoint-2830')
prompt = "Miss Bennet was a woman of"
output = pipe(prompt, max_length=50, num_return_sequences=2)
print(output)

TypeError: Can't instantiate abstract class Pipeline without an implementation for abstract methods '_forward', '_sanitize_parameters', 'postprocess', 'preprocess'