# GPT 2

In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer


train_path = 'final.txt'

tokenizer= GPT2Tokenizer.from_pretrained('gpt2')

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=20)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )


gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./gpt2-textgen", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer_gpt2 = Trainer(
    model=gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)
trainer_gpt2.train()






<transformers.data.datasets.language_modeling.TextDataset at 0x1bfbf7a8490>

In [31]:
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [19]:
trainer_gpt2.save_model()

Saving model checkpoint to ./gpt2-textgen
Configuration saved in ./gpt2-textgen\config.json
Model weights saved in ./gpt2-textgen\pytorch_model.bin


In [21]:
from transformers import pipeline

text = pipeline('text-generation',model='./gpt2-textgen', tokenizer='gpt2')

loading configuration file ./gpt2-textgen\config.json
Model config GPT2Config {
  "_name_or_path": "./gpt2-textgen",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "use_cache": true,
  "vocab_size": 50257
}

load

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/tokenizer.json in cache at C:\Users\m_usa/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
creating metadata file for C:\Users\m_usa/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\m_usa/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\m_usa/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingfac

In [43]:
text('Visit')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Visit your boss.\nVisit your coworker. \nVisit your aunt.\nVisit your uncle.\nVisit your cousin.\nVisit your family.\nVisit your niece.\nVisit your nephew.\nVisit your grandson.\nVisit your granddaughter'}]

In [39]:
result = text('Visit')[0]['generated_text']
result.split('.')[0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Visit a hospital every day for a year'

# Distiled GPT 2

In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer


train_path = 'final.txt'

tokenizer_distilgpt2 = GPT2Tokenizer.from_pretrained('distilgpt2')
train_dataset = TextDataset(
    tokenizer=tokenizer_distilgpt2,
    file_path=train_path,
    block_size=20)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer_distilgpt2, mlm=False,
    )


distilgpt2 = AutoModelForCausalLM.from_pretrained("distilgpt2")

training_args = TrainingArguments(
    output_dir="./distilgpt2-textgen", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=distilgpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)
trainer.train()

***** Running training *****
  Num examples = 140159
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 13140


Step,Training Loss
500,2.3609
1000,1.6351
1500,1.4293
2000,1.3197
2500,1.215
3000,1.1608
3500,1.091
4000,1.0567
4500,0.9927
5000,0.9513


Saving model checkpoint to ./distilgpt2-textgen\checkpoint-800
Configuration saved in ./distilgpt2-textgen\checkpoint-800\config.json
Model weights saved in ./distilgpt2-textgen\checkpoint-800\pytorch_model.bin
Saving model checkpoint to ./distilgpt2-textgen\checkpoint-1600
Configuration saved in ./distilgpt2-textgen\checkpoint-1600\config.json
Model weights saved in ./distilgpt2-textgen\checkpoint-1600\pytorch_model.bin
Saving model checkpoint to ./distilgpt2-textgen\checkpoint-2400
Configuration saved in ./distilgpt2-textgen\checkpoint-2400\config.json
Model weights saved in ./distilgpt2-textgen\checkpoint-2400\pytorch_model.bin
Saving model checkpoint to ./distilgpt2-textgen\checkpoint-3200
Configuration saved in ./distilgpt2-textgen\checkpoint-3200\config.json
Model weights saved in ./distilgpt2-textgen\checkpoint-3200\pytorch_model.bin
Saving model checkpoint to ./distilgpt2-textgen\checkpoint-4000
Configuration saved in ./distilgpt2-textgen\checkpoint-4000\config.json
Model weigh

TrainOutput(global_step=13140, training_loss=1.0022829859949864, metrics={'train_runtime': 779.5778, 'train_samples_per_second': 539.365, 'train_steps_per_second': 16.855, 'total_flos': 2145884253880320.0, 'train_loss': 1.0022829859949864, 'epoch': 3.0})

In [44]:
distilgpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [7]:
tokenizer_distilgpt2.save_pretrained('./distilgpt2-textgen')

tokenizer config file saved in ./distilgpt2-textgen\tokenizer_config.json
Special tokens file saved in ./distilgpt2-textgen\special_tokens_map.json


('./distilgpt2-textgen\\tokenizer_config.json',
 './distilgpt2-textgen\\special_tokens_map.json',
 './distilgpt2-textgen\\vocab.json',
 './distilgpt2-textgen\\merges.txt',
 './distilgpt2-textgen\\added_tokens.json')

In [11]:
trainer.save_model()

Saving model checkpoint to ./distilgpt2-textgen
Configuration saved in ./distilgpt2-textgen\config.json
Model weights saved in ./distilgpt2-textgen\pytorch_model.bin


In [13]:
from transformers import pipeline

chef = pipeline('text-generation',model='./distilgpt2-textgen', tokenizer='distilgpt2')

loading configuration file ./distilgpt2-textgen\config.json
Model config GPT2Config {
  "_name_or_path": "./distilgpt2-textgen",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "to

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/tokenizer.json in cache at C:\Users\m_usa/.cache\huggingface\transformers\accb287b5a5396b2597382916b6cc939fdab1366e89475a92338d3971b3d02b7.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
creating metadata file for C:\Users\m_usa/.cache\huggingface\transformers\accb287b5a5396b2597382916b6cc939fdab1366e89475a92338d3971b3d02b7.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/distilgpt2/resolve/main/vocab.json from cache at C:\Users\m_usa/.cache\huggingface\transformers\55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/distilgpt2/resolve/main/merges.txt from cache at C:\Users\m_usa/.cache\huggingface\transformers\9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file 

In [30]:
result = chef('Visit')[0]['generated_text']
result.split('.')[0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Visit your son'