In [15]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view
import torch
import pandas as pd
set_seed(42)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
generator = pipeline('text-generation', model='gpt2')

In [16]:
phrase = 'My friend was right about this class. It is so fun!'
encoded_phrase = tokenizer(phrase, return_tensors='pt')
response = model(**encoded_phrase, output_attentions=True, output_hidden_states=True)

len(response.attentions)

12

In [17]:
response.attentions[0].shape

torch.Size([1, 12, 13, 13])

In [18]:
encoded_phrase['input_ids'].shape

torch.Size([1, 13])

In [19]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])
tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġclass',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġfun',
 '!']

In [20]:
arr = response.attentions[9][0][0]
#we grab the attention score from the 10th encoder (index nine) and the first head (0) and the extra
# zero is because we have a batch size of 1
n_digits = 3
#we round it to the third decimal place

attention_df = pd.DataFrame((torch.round(arr*10**n_digits)/(10**n_digits)).detach()).applymap(float)
attention_df.columns = tokens
attention_df.index=tokens
attention_df

  attention_df = pd.DataFrame((torch.round(arr*10**n_digits)/(10**n_digits)).detach()).applymap(float)


Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġclass,.,ĠIt,Ġis,Ġso,Ġfun,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġclass,0.946,0.005,0.001,0.001,0.001,0.002,0.044,0.0,0.0,0.0,0.0,0.0,0.0
.,0.691,0.013,0.003,0.003,0.002,0.006,0.269,0.013,0.0,0.0,0.0,0.0,0.0
ĠIt,0.318,0.003,0.003,0.003,0.006,0.018,0.599,0.018,0.032,0.0,0.0,0.0,0.0
Ġis,0.331,0.006,0.002,0.002,0.003,0.018,0.533,0.013,0.062,0.03,0.0,0.0,0.0


In [21]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])
model_view(response.attentions, tokens)
"""
we are only able to look backwards!
"""

<IPython.core.display.Javascript object>

'\nwe are only able to look backwards!\n'

In [22]:
#
response.hidden_states[-1].shape

torch.Size([1, 13, 768])

In [23]:
response.logits.shape

torch.Size([1, 13, 50257])

In [24]:
pd.DataFrame(zip(tokens, tokenizer.convert_ids_to_tokens(response.logits.argmax(2)[0])),
                columns=['Sequence up until', 'Next token with highest probability'] 
                 )

Unnamed: 0,Sequence up until,Next token with highest probability
0,My,Ċ
1,Ġfriend,","
2,Ġwas,Ġa
3,Ġright,.
4,Ġabout,Ġthat
5,Ġthis,.
6,Ġclass,.
7,.,ĠI
8,ĠIt,'s
9,Ġis,Ġa


In [25]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=False) #greedy search

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so fun! I love it! I love the'}]

In [26]:
generator(phrase, max_length = 20, num_return_sequences=1, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so fun!\n\nIt was a pleasure to'}]