In [1]:
# Install a pinned release of Hugging Face transformers; every later cell imports from it.
!pip install transformers==4.2.2

Collecting transformers==4.2.2
  Downloading transformers-4.2.2-py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.9 MB/s 
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 47.7 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.46 tokenizers-0.9.4 transformers-4.2.2


In [12]:
from transformers import AutoTokenizer

# Plain-text corpora: one file for fine-tuning, one used as the evaluation set.
train_path, test_path = (
    'dataset/Sherlock_Holmes_Train.txt',
    'dataset/Sherlock_Holmes_Test.txt',
)

# Tokenizer of the pretrained "gpt2" checkpoint; it is passed on to the
# dataset-loading step further down.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the data collator for fine-tuning.

    Args:
        train_path: Path of the text file used for training.
        test_path: Path of the text file used for evaluation.
        tokenizer: Tokenizer handed to both ``TextDataset`` objects and to the
            collator.
        block_size: ``block_size`` forwarded to both ``TextDataset`` objects.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _text_dataset(file_path):
        # Single construction site so both splits always share the same
        # tokenizer and block size.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False switches the collator's masked-language-modeling option off
    # (see the DataCollatorForLanguageModeling documentation).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Materialise both splits and the collator from the paths and tokenizer
# defined in the earlier cell.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (101177 > 1024). Running this sequence through the model will result in indexing errors


In [4]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# AutoModelWithLMHead is deprecated in transformers 4.x; AutoModelForCausalLM
# is the documented replacement for causal language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# NOTE(review): the output recorded under the training cell reports
# global_step=75. If that is still representative, eval_steps=400,
# save_steps=800 and warmup_steps=500 are never reached (no mid-training
# evaluation or checkpoint, and the run ends inside the warmup phase).
# Values are left unchanged here; consider scaling them to the dataset size.
training_args = TrainingArguments(
    output_dir="./Sherlock_Holmes_fine_tuned",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    # eval_steps only takes effect when the strategy is "steps"; with the
    # default ("no") the eval_dataset passed to the Trainer is never evaluated
    # during training.
    evaluation_strategy="steps",
    eval_steps=400,  # number of update steps between two evaluations
    save_steps=800,  # a checkpoint is saved every this many steps
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
)

# The Trainer ties together the model, the hyper-parameters, the collator and
# the train/eval splits built in the dataset cell.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [5]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=75, training_loss=3.9111503092447917, metrics={'train_runtime': 4688.2749, 'train_samples_per_second': 0.016, 'total_flos': 226500360929280, 'epoch': 3.0})

In [6]:
# Persist the fine-tuned model. No path is passed, so the Trainer's default
# location is used — presumably training_args.output_dir, which is the
# directory the generation cell loads from; confirm against the Trainer docs.
trainer.save_model()

In [7]:
from transformers import pipeline

In [9]:
from transformers import set_seed

# The text-generation pipeline samples tokens for GPT-2 by default, so the RNGs
# are seeded here to make re-running this cell give the same continuation.
set_seed(42)

text = "It was the best of times, it was the worst of times"
# NOTE: this rebinds `model` from the fine-tuned network to a text-generation
# pipeline. The weights come from the fine-tuned output directory, while the
# tokenizer is loaded from the base "gpt2" checkpoint.
model = pipeline('text-generation', model="./Sherlock_Holmes_fine_tuned", tokenizer='gpt2')
# The pipeline returns a list of dicts; keep the text of the first generation.
output = model(text)[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
# Show the generated text (the prompt followed by the model's continuation).
print(output)

It was the best of times, it was the worst of times. And he didn't have the heart to say it. It was all he had done, and he didn't know what would come in his future. It was too great a risk
