In [1]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

In [2]:
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

In [3]:
prompt = '''Enter two Sentinels-[first,] Francisco, [who paces up and down
at his post; then] Bernardo, [who approaches him].

  Ber. Who's there.?
'''

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100)
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Enter two Sentinels-[first,] Francisco, [who paces up and down
at his post; then] Bernardo, [who approaches him].

  Ber. Who's there.?
       (Ber.) Who's there, [in the dark,] Francisco?

  Ber. (M. Ber.) What are you doing there?
       (Ber.) Oh! what's wrong there?

  Ber. [


In [4]:
from sklearn.model_selection import train_test_split

with open('datasets/raw/hamlet.txt', 'r') as f:
    data = f.read()
    
train, test = train_test_split(data, test_size=0.15, shuffle=False)

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))

train_path = 'datasets/train_dataset.txt'
test_path = 'datasets/test_dataset.txt'

with open(train_path, 'w') as f:
    f.write(''.join(train))
    
with open(test_path, 'w') as f:
    f.write(''.join(test))

Train dataset length: 162967
Test dataset length: 28759


In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128
    )

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [6]:
from transformers import Trainer, TrainingArguments

model_path = "./models/gpt-neo-hamlet"

training_args = TrainingArguments(
    output_dir=model_path,         # The output directory
    overwrite_output_dir=True,     # overwrite the content of the output directory
    num_train_epochs=3,            # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=16, # batch size for evaluation
    eval_steps=400,                # Number of update steps between two evaluations.
    save_steps=800,                # after # steps model is saved
    warmup_steps=500,              # number of warmup steps for learning rate scheduler
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [7]:
import torch
import gc

In [8]:
torch.cuda.empty_cache()
gc.collect()

0

In [9]:
trainer.train()

***** Running training *****
  Num examples = 432
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 162


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=162, training_loss=3.1887267312885803, metrics={'train_runtime': 23.4342, 'train_samples_per_second': 55.304, 'train_steps_per_second': 6.913, 'total_flos': 84631099539456.0, 'train_loss': 3.1887267312885803, 'epoch': 3.0})

In [10]:
trainer.save_model()

Saving model checkpoint to ./models/gpt-neo-hamlet
Configuration saved in ./models/gpt-neo-hamlet\config.json
Model weights saved in ./models/gpt-neo-hamlet\pytorch_model.bin


In [11]:
from transformers import pipeline

generator = pipeline('text-generation', model=model_path, tokenizer='EleutherAI/gpt-neo-125M')
result = generator(prompt, do_sample=True, temperature=0.9, max_length=100)[0]['generated_text']

loading configuration file ./models/gpt-neo-hamlet\config.json
Model config GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout": 0,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_i

In [12]:
print(result)

Enter two Sentinels-[first,] Francisco, [who paces up and down
at his post; then] Bernardo, [who approaches him].

  Ber. Who's there.?
  Francisco. No, sir.
  Ber. A room, sir?
  Francisco. That's my chamber-room.
  Ber. That's your chamber-room, sir.?
  Francisco. Yes, yes, sir; and, sir, it's not so
