In [35]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer

In [37]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [38]:
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='tf')

### Gready Search

In [39]:
greedy_output = model.generate(input_ids, max_length=50, min_length=10)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure if I'll


### Beam Search

In [15]:
beam_outputs = model.generate(
    input_ids, 
    max_length=20, 
    num_beams=5, 
    early_stopping=True, 
    no_repeat_ngram_size=2, 
    num_return_sequences=5
)

print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
    print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to
1: I enjoy walking with my cute dog, but I don't think I'll ever be able to walk
2: I enjoy walking with my cute dog, but I don't think I'll ever be able to do
3: I enjoy walking with my cute dog, but I don't think I'll be able to walk with
4: I enjoy walking with my cute dog, but I don't think I'll ever be able to get


### Sampling

In [17]:
tf.random.set_seed(0)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0, 
    temperature=.7
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I don't like to be at home too much. I also find it a bit weird when I'm out shopping. I am always away from my house a lot, but I do have a few friends


### In this part, we use the top-k scheme for probability limitation

In [18]:
tf.random.set_seed(0)

# set top_k to 50
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog. It's so good to have an environment where your dog is available to share with you and we'll be taking care of you.

We hope you'll find this story interesting!

I am from


### top-p sampling to account for the differences in sampling distributions

In [19]:
# set seed to reproduce results. Feel free to change the seed though to get different results
tf.random.set_seed(0)

# deactivate top_k sampling and sample only from 92% most likely words
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_p=0.92, 
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog. He will never be the same. I watch him play.


Guys, my dog needs a name. Especially if he is found with wings.


What was that? I had a lot of


In [20]:
# set seed to reproduce results. Feel free to change the seed though to get different results
tf.random.set_seed(0)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=50, 
    top_k=50, 
    top_p=0.95, 
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog. It's so good to have the chance to walk with a dog. But I have this problem with the dog and how he's always looking at us and always trying to make me see that I can do something
1: I enjoy walking with my cute dog. I like seeing him, I don't like having my dog go through me, but when we walk together that makes for a wonderful bonding moment. I appreciate the interaction, I just don't understand how it would
2: I enjoy walking with my cute dog and playing with our kids," said David J. Smith, director of the Humane Society of the US.

"So as a result, I've got more work in my time," he said.




### Using a constrained Language Model

In [62]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [63]:
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [123]:
# src: https://huggingface.co/transformers/v4.1.1/_modules/transformers/generation_logits_process.html

def set_scores_to_inf_for_banned_tokens(scores, banned_tokens):
    """
    Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a
    list of list of banned tokens to ban in the format [[batch index, vocabulary position],...

    Args:
        scores: logits distribution of shape (batch size, vocabulary size)
        banned_tokens: list of list of tokens to ban of length (batch_size)
    """
    banned_mask_list = []
    for idx, batch_banned_tokens in enumerate(banned_tokens):
        for token in batch_banned_tokens:
            banned_mask_list.append([idx, token])
    if not banned_mask_list:
        return scores

    banned_mask = torch.LongTensor(banned_mask_list)
    # print(banned_mask)
    indices = torch.ones(len(banned_mask))
    # print(indices)

    banned_mask = (
        torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()).to(scores.device).to_dense().bool()
    )
    scores = scores.masked_fill(banned_mask, -float("inf"))
    return scores

In [149]:
import pandas as pd
from termcolor import colored

In [125]:
hed_df = pd.read_csv('../datasets/Hedonometer(1).csv')
hed_df.head(3)

Unnamed: 0,Rank,Word,Word in English,Happiness Score,Standard Deviation of Ratings
0,0,laughter,laughter,8.5,0.93
1,1,happiness,happiness,8.44,0.97
2,2,love,love,8.42,1.11


In [126]:
hed_df['Happiness Score'].describe(include='all')

count    10187.000000
mean         5.373276
std          1.092155
min          1.300000
25%          4.900000
50%          5.440000
75%          6.020000
max          8.500000
Name: Happiness Score, dtype: float64

In [160]:
from transformers import LogitsProcessor
import numpy as np

class ABCLogits(LogitsProcessor):
  def __init__(self, vocab):
    """
    vocab is a dictionary where the keys are tokens
    and the values are the corresponding ids.
    """
    # create an array of tokens
    # remove the 'Ġ' token (used to represent a blank space in the tokenizer)
    self.keys = list(tokenizer.vocab.keys())
    index_to_pop = self.keys.index('Ġ') 
    self.keys.pop(index_to_pop)
    self.keys = np.array(self.keys)

    # create an array of ids
    # also remove the 'Ġ' token
    self.values = list(tokenizer.vocab.values())
    self.values.pop(index_to_pop)
    self.values = np.array(self.values)

    happy_words = hed_df[hed_df['Happiness Score'] > 4]['Word in English'].tolist()
    all_words = hed_df['Word in English'].tolist()
    
    is_happy = lambda x: (x in happy_words) or (x not in all_words)
    self.is_happy = np.vectorize(is_happy)
    # vectorized function used to get the first character of a token
    # ignores leading whitespaces and 'Ġ' tokens

    happy_indices = np.where(self.is_happy(self.keys) == True)
    # print(happy_indices)
    
    self.happy_words_values = self.values[happy_indices]
    print()

  def __call__(self, input_ids, scores):
    # print(scores)
    # print(np.mean(scores), np.min(scores), np.max(scores))
    banned_tokens = []
    banned_tokens.append(self.happy_words_values)
    print('Number of banned words: ', colored(len(self.happy_words_values), 'red'))
    # for every beam (partially generated sentence)
    # for beam_index, (beam_input_ids, beam_scores) in enumerate(zip(input_ids, scores)):
    #     if 
    #   # get the last token of this beam
    #   last_word = tokenizer.decode(beam_input_ids[-1])
    #   # get the first character of this last token
    #   starting_char = self.first_char(last_word)
    #   # if the last token starts with 'a',
    #   # ban all words that do not start with 'b', etc.
    #   if starting_char == 'a':
    #     banned_tokens.append(self.not_b_values)
    #   elif starting_char == 'b':
    #     banned_tokens.append(self.not_c_values)
    #   elif starting_char == 'c':
    #     banned_tokens.append(self.not_a_values)
    #   else:
    #     banned_tokens.append(self.not_a_values)
    # set the scores of all banned tokens over the beams to -inf
    scores = set_scores_to_inf_for_banned_tokens(scores, banned_tokens)
    return scores

In [161]:
from transformers import (
    BeamSearchScorer,
    LogitsProcessorList,
    StoppingCriteriaList,
    MaxLengthCriteria
)
import torch

# how many beams to track during the Viterbi algorithm
num_beams = 10
# how many beams to return after the algorithm
num_return_beams = 10

# the prompt to continue
prompt = 'studying is'

# tokenizing the prompt
prompt_tokenized = tokenizer(prompt, return_tensors='pt' )
prompt_tokenized = prompt_tokenized['input_ids']

# instantiating a BeamSearchScorer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

beam_scorer = BeamSearchScorer(
    batch_size = prompt_tokenized.shape[0],
    num_beams = num_beams,
    num_beam_hyps_to_keep = num_return_beams,
    device=device
)

# instantiating a list of LogitsProcessor instances
# using our custom ABCLogits class
logits_processor = LogitsProcessorList([ABCLogits(tokenizer.vocab)])

# running beam search using our custom LogitsProcessor
generated = model.beam_search(
    torch.cat([prompt_tokenized] * num_beams),
    beam_scorer,
    logits_processor = logits_processor,
    stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=12)])
)

# printing the output beams
for index, output_tokenized in enumerate(generated):
  output = tokenizer.decode(output_tokenized)
  print(f'beam {index}: {output}')


Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
Number of banned words:  [31m50033[0m
beam 0: studying isnot the only thing that can be done to
beam 1: studying isnot the only thing that can be done.
beam 2: studying isnot the only thing that can be done in
beam 3: studying isnot an easy task, but it can be
beam 4: studying isnot an easy task, but it is a
beam 5: studying isnot the only thing that can be done with
beam 6: studying isnot the only thing that can be done,
beam 7: studying isnot the only thing that can be done for
beam 8: studying isnot an easy task, but it is an
beam 9: studying isnot the only thing that can be done by
