In [1]:
!pip install transformers==4.2.2

Collecting transformers==4.2.2
  Downloading transformers-4.2.2-py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 4.2 MB/s 
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 40.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.2 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.46 tokenizers-0.9.4 transformers-4.2.2


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

train_path = 'Alice_in_Wonderland_Train.txt'
test_path = 'Alice_in_Wonderland_Test.txt'

In [5]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [6]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")


training_args = TrainingArguments(
    output_dir="./Alice_fine_tuned", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



Downloading:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [7]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=18, training_loss=1.0132509867350261, metrics={'train_runtime': 964.9181, 'train_samples_per_second': 0.019, 'total_flos': 45459778830336, 'epoch': 3.0})

In [8]:
trainer.save_model()

In [9]:
from transformers import pipeline

In [10]:
text = "It was the best of times, it was the worst of times"
model = pipeline('text-generation', model="./Alice_fine_tuned" , tokenizer='xlnet-base-cased')
output = model(text)[0]['generated_text']

In [11]:
print(output)

It was the best of times, it was the worst of times. "Some people did great and other people did not. I'm not sure that the greatest thing that can happen is that the average person really has nothing to do with it, except have nothing. Some people did not but were blessed by the very end of that day. Some people did and were blessed by the very end of that day.
