In [1]:
import torch
from transformers import (
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    pipeline)

In [2]:
# Load the tokenizer published under the "gpt2" checkpoint name; it is reused
# below to tokenize both text files and to build the data collator.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [3]:
# Batch builder for the Trainer. mlm=False selects causal (non-masked)
# language modeling, which is the objective GPT-2 is trained with.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [4]:
# Tokenize the training transcript and split it into examples using block_size=128.
train_dataset = TextDataset(
    file_path="tv_game_of_thrones_training.txt", tokenizer=tokenizer, block_size=128
)

# Held-out transcript, processed with the same tokenizer and block size so the
# two splits are directly comparable.
test_dataset = TextDataset(
    file_path="tv_game_of_thrones_test.txt", tokenizer=tokenizer, block_size=128
)



In [5]:
# Start from the pretrained "gpt2" weights (with the language-modeling head);
# this is the model object the Trainer below fine-tunes.
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [6]:
# Hyperparameters and output location for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="tv_data/out",        # checkpoints and the saved model are written here
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=3,              # total number of training epochs to perform
    per_device_train_batch_size=16,  # the batch size for training
    per_device_eval_batch_size=16,   # the batch size for evaluation
    learning_rate=5e-5,              # defaults to 5e-5
)

# NOTE(review): no evaluation strategy is configured above, so test_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [7]:
# `torch` is imported in the notebook's first (imports) cell so that every
# dependency is visible up front and this cell only does work.
# Release cached, unused CUDA memory left over from earlier runs in this kernel
# before starting the fine-tuning loop.
torch.cuda.empty_cache()

# Run fine-tuning; as the cell's last expression, the returned TrainOutput
# (global step, mean training loss, runtime metrics) is displayed.
trainer.train()

***** Running training *****
  Num examples = 11832
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4437
 11%|█▏        | 500/4437 [01:24<10:55,  6.01it/s]Saving model checkpoint to data/out\checkpoint-500
Configuration saved in data/out\checkpoint-500\config.json


{'loss': 3.3184, 'learning_rate': 4.4365562316880776e-05, 'epoch': 0.34}


Model weights saved in data/out\checkpoint-500\pytorch_model.bin
 23%|██▎       | 1000/4437 [02:52<09:18,  6.15it/s] Saving model checkpoint to data/out\checkpoint-1000
Configuration saved in data/out\checkpoint-1000\config.json


{'loss': 3.1629, 'learning_rate': 3.873112463376155e-05, 'epoch': 0.68}


Model weights saved in data/out\checkpoint-1000\pytorch_model.bin
 34%|███▍      | 1500/4437 [04:22<08:04,  6.06it/s]  Saving model checkpoint to data/out\checkpoint-1500
Configuration saved in data/out\checkpoint-1500\config.json


{'loss': 3.0714, 'learning_rate': 3.309668695064232e-05, 'epoch': 1.01}


Model weights saved in data/out\checkpoint-1500\pytorch_model.bin
 45%|████▌     | 2000/4437 [05:50<06:37,  6.13it/s]  Saving model checkpoint to data/out\checkpoint-2000
Configuration saved in data/out\checkpoint-2000\config.json


{'loss': 2.9427, 'learning_rate': 2.7462249267523105e-05, 'epoch': 1.35}


Model weights saved in data/out\checkpoint-2000\pytorch_model.bin
 56%|█████▋    | 2500/4437 [07:18<05:16,  6.11it/s]  Saving model checkpoint to data/out\checkpoint-2500
Configuration saved in data/out\checkpoint-2500\config.json


{'loss': 2.9221, 'learning_rate': 2.1827811584403878e-05, 'epoch': 1.69}


Model weights saved in data/out\checkpoint-2500\pytorch_model.bin
 68%|██████▊   | 3000/4437 [08:47<03:56,  6.07it/s]  Saving model checkpoint to data/out\checkpoint-3000
Configuration saved in data/out\checkpoint-3000\config.json


{'loss': 2.9081, 'learning_rate': 1.6193373901284655e-05, 'epoch': 2.03}


Model weights saved in data/out\checkpoint-3000\pytorch_model.bin
 79%|███████▉  | 3500/4437 [10:13<02:33,  6.09it/s]Saving model checkpoint to data/out\checkpoint-3500
Configuration saved in data/out\checkpoint-3500\config.json


{'loss': 2.8268, 'learning_rate': 1.0558936218165428e-05, 'epoch': 2.37}


Model weights saved in data/out\checkpoint-3500\pytorch_model.bin
 90%|█████████ | 4000/4437 [11:44<01:11,  6.08it/s]Saving model checkpoint to data/out\checkpoint-4000
Configuration saved in data/out\checkpoint-4000\config.json


{'loss': 2.8272, 'learning_rate': 4.924498535046203e-06, 'epoch': 2.7}


Model weights saved in data/out\checkpoint-4000\pytorch_model.bin
100%|██████████| 4437/4437 [13:03<00:00,  6.10it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 4437/4437 [13:03<00:00,  5.66it/s]

{'train_runtime': 783.9849, 'train_samples_per_second': 45.276, 'train_steps_per_second': 5.66, 'train_loss': 2.9804630825480336, 'epoch': 3.0}





TrainOutput(global_step=4437, training_loss=2.9804630825480336, metrics={'train_runtime': 783.9849, 'train_samples_per_second': 45.276, 'train_steps_per_second': 5.66, 'train_loss': 2.9804630825480336, 'epoch': 3.0})

In [8]:
# Persist the fine-tuned weights and config to the run's output directory
# (the same directory the Trainer was configured with).
trainer.save_model(training_args.output_dir)

Saving model checkpoint to data/out
Configuration saved in data/out\config.json
Model weights saved in data/out\pytorch_model.bin


In [9]:
# Build a text-generation pipeline from the fine-tuned model saved by the Trainer.
# The model path comes from training_args.output_dir instead of a second
# hard-coded string, so it cannot drift from the directory the Trainer wrote to.
# The tokenizer already loaded in this notebook is reused rather than being
# resolved again from the "gpt2" name (fine-tuning did not change the vocabulary).
generator = pipeline(
    'text-generation',
    model=training_args.output_dir,
    tokenizer=tokenizer,
)

loading configuration file data/out\config.json
Model config GPT2Config {
  "_name_or_path": "data/out",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 50257
}

loading configur

In [16]:
# Sample a continuation of the prompt 'Nights' (up to 100 tokens in total).
# No pad token id is configured for this model, so generate() would otherwise
# fall back to the EOS id and emit a warning on every call; passing it
# explicitly makes that choice deliberate and keeps the output clean.
# print() is used so the newlines in the generated text render as line breaks.
print(
    generator(
        'Nights',
        max_length=100,
        pad_token_id=generator.tokenizer.eos_token_id,
    )[0]['generated_text']
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Nights of the Night’s Watch in the east:

-Lord Catelyn’s ravens. They circled the north shore of the Neck, circling the shoreline of Blackmont and Yunkai and Manderly Islands before descending toward Vulture Bay, where men were gathering dead leaves to hunt wolves.

-Ser Jaime’s direwolf, the Red Painted Grey with a pelt of blood on it. His black fur looked as pink as
