In [1]:
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.5 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [2]:
import transformers # transformers library
import torch # PyTorch, we are using PyTorch as our library

In [3]:
# We are going to load in GPT-2 using the transformers library
gpt_tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-large')
# Loading in model now...
gpt_model = transformers.GPT2LMHeadModel.from_pretrained('gpt2-large')
# Takes a while to run...

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=764.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3247202234.0, style=ProgressStyle(descr…




In [4]:
## Making a function that will generate text for us ##
def gen_text(prompt_text, tokenizer, model, n_seqs=1, max_length=25):
  # n_seqs is the number of sentences to generate
  # max_length is the maximum length of the sentence
  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
  # We are encoding the text using the gpt tokenizer. The return tensors are of type "pt"
  # since we are using PyTorch, not tensorflow
  output_sequences = model.generate(
      input_ids=encoded_prompt,
      max_length=max_length+len(encoded_prompt), # The model has to generate something, 
      # so we add the length of the original sequence to max_length
      temperature=1.0,
      top_k=0,
      top_p=0.9,
      repetition_penalty=1.2, # To ensure that we dont get repeated phrases
      do_sample=True,
      num_return_sequences=n_seqs
  ) # We feed the encoded input into the model.
  ## Getting the output ##
  if len(output_sequences.shape) > 2:
    output_sequences.squeeze_() # the _ indicates that the operation will be done in-place
  generated_sequences = []
  for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence)
    total_sequence = (
        prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True, )) :]
    )
    generated_sequences.append(total_sequence)
  return generated_sequences

In [5]:
# Lots of syntax errors, but now we can test our model
## One important note: in our function, on line 5, make sure that
# return_tensor is return_tensors, otherwise you will get an error like
# this:
#####
# Another important note: on line 27 of the function, instead of
# clear_up_tokenization_spaces, write clean_up_tokenization_spaces
####
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",gpt_tokenizer,gpt_model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. They were about to stop them when']

In [6]:
# Sequence length was too small, lets increase it
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",
         gpt_tokenizer,
         gpt_model,
         max_length=100)
# Will take some time......

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. They were met by orc leaders that sliced up enemies as quickly as they reeled them in; then they engaged the Orcs' General within their walls of light, his shields strong enough to make all but one very small circle.\n\nThe general reached for an arrow-pointed shard. The keen point pierced Tirion's flesh as if it had been heat-baked through into cold steel."]

In [7]:
# We can demostrate n_seqs here
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",
         gpt_tokenizer,
         gpt_model,
         max_length=40,
         n_seqs=3) # Will take even longer....

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry.\n\n\n"What will I do?" Carridin suddenly asked as he halted in front of them.\n\n',
 "Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. There was no time for words to escape Aragorn's lips; he saw the orc warriors' keen eyes",
 'Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. Bolg had also done the same to them but before Bolgi could respond, he was blocked by Saruman']

In [8]:
# There are now 3 different outputs
# thats it