In [1]:
import torch
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary from wikitext 103)
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

# Tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)

# Convert token to vocabulary indices
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

INFO:pytorch_pretrained_bert.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin not found in cache, downloading to /var/folders/fy/f2vfpth16ys0j7sbslk2lx300000gn/T/tmp5llz26ju
100%|██████████| 9143613/9143613 [00:01<00:00, 5301343.79B/s]
INFO:pytorch_pretrained_bert.file_utils:copying /var/folders/fy/f2vfpth16ys0j7sbslk2lx300000gn/T/tmp5llz26ju to cache at /Users/anto/.pytorch_pretrained_bert/b24cb708726fd43cbf1a382da9ed3908263e4fb8a156f9e0a4f45b7540c69caa.a6a9c41b856e5c31c9f125dd6a7ed4b833fbcefda148b627871d4171b25cffd1
INFO:pytorch_pretrained_bert.file_utils:creating metadata file for /Users/anto/.pytorch_pretrained_bert/b24cb708726fd43cbf1a382da9ed3908263e4fb8a156f9e0a4f45b7540c69caa.a6a9c41b856e5c31c9f125dd6a7ed4b833fbcefda148b627871d4171b25cffd1
INFO:pytorch_pretrained_bert.file_utils:removing temp file /var/folders/fy/f2vfpth16ys0j7sbslk2lx300000gn/T/tmp5llz26ju
INFO:pytorch_pretrained_bert.tokenization_transfo_xl:loading vocabulary file http

Let's see how to use TransfoXLModel to get hidden states

In [3]:
# Load pre-trained model (weights)
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
model.eval()

# If you have a GPU, put everything on cuda
# tokens_tensor_1 = tokens_tensor_1.to('cuda')
# tokens_tensor_2 = tokens_tensor_2.to('cuda')
# model.to('cuda')

with torch.no_grad():
    # Predict hidden states features for each layer
    hidden_states_1, mems_1 = model(tokens_tensor_1)
    # We can re-use the memory cells in a subsequent call to attend a longer context
    hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)

INFO:pytorch_pretrained_bert.modeling_transfo_xl:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin from cache at /Users/anto/.pytorch_pretrained_bert/12642ff7d0279757d8356bfd86a729d9697018a0c93ad042de1d0d2cc17fd57b.e9704971f27275ec067a00a67e6a5f0b05b4306b3f714a96e9f763d8fb612671
INFO:pytorch_pretrained_bert.modeling_transfo_xl:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json from cache at /Users/anto/.pytorch_pretrained_bert/a6dfd6a3896b3ae4c1a3c5f26ff1f1827c26c15b679de9212a04060eaf1237df.aef76fb1064c932cd6a2a2be3f23ebbfa5f9b6e29e8e87b571c45b4a5d5d1b90
INFO:pytorch_pretrained_bert.modeling_transfo_xl:Model config {
  "adaptive": true,
  "attn_type": 0,
  "clamp_len": 1000,
  "cutoffs": [
    20000,
    40000,
    200000
  ],
  "d_embed": 1024,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "div_val": 4,
  "dropatt": 0.0,
  "dropout": 0.1,
  "ext_len": 0,
  "init

And how to use TransfoXLLMHeadModel

In [45]:
# Tokenized input
text_1 = "In the field of NLP, language modeling involves predicting the next word in a sentence."
text_2 = "Applying this model recursively, will yield a computer-generated text. "
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)

# Convert token to vocabulary indices
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

In [46]:
# Load pre-trained model (weights)
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model.eval()

# If you have a GPU, put everything on cuda
# tokens_tensor_1 = tokens_tensor_1.to('cuda')
# tokens_tensor_2 = tokens_tensor_2.to('cuda')
# model.to('cuda')

with torch.no_grad():
    # Predict all tokens
    predictions_1, mems_1 = model(tokens_tensor_1)
    # We can re-use the memory cells in a subsequent call to attend a longer context
    predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)

# get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
#assert predicted_token == 'who'
predicted_token

INFO:pytorch_pretrained_bert.modeling_transfo_xl:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin from cache at /Users/anto/.pytorch_pretrained_bert/12642ff7d0279757d8356bfd86a729d9697018a0c93ad042de1d0d2cc17fd57b.e9704971f27275ec067a00a67e6a5f0b05b4306b3f714a96e9f763d8fb612671
INFO:pytorch_pretrained_bert.modeling_transfo_xl:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json from cache at /Users/anto/.pytorch_pretrained_bert/a6dfd6a3896b3ae4c1a3c5f26ff1f1827c26c15b679de9212a04060eaf1237df.aef76fb1064c932cd6a2a2be3f23ebbfa5f9b6e29e8e87b571c45b4a5d5d1b90
INFO:pytorch_pretrained_bert.modeling_transfo_xl:Model config {
  "adaptive": true,
  "attn_type": 0,
  "clamp_len": 1000,
  "cutoffs": [
    20000,
    40000,
    200000
  ],
  "d_embed": 1024,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "div_val": 4,
  "dropatt": 0.0,
  "dropout": 0.1,
  "ext_len": 0,
  "init

'.'

In [47]:
predictions_2.shape, predictions_1.shape

(torch.Size([1, 9, 267735]), torch.Size([1, 15, 267735]))

In [48]:
text_1, text_2

('In the field of NLP, language modeling involves predicting the next word in a sentence.',
 'Applying this model recursively, will yield a computer-generated text. ')

In [49]:
for i in range(5):
    predicted_index = torch.argmax(predictions_2[0, i, :]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token, end=' ')

the to to to be 

In [50]:
mems_next = mems_2
tokens_tensor_next = tokens_tensor_2

for i in range(20):    
    with torch.no_grad():
        # We can re-use the memory cells in a subsequent call to attend a longer context
        predictions_next, mems_next = model(tokens_tensor_next, mems=mems_next)

    predicted_index_tensor = torch.argmax(predictions_next[0, -1, :])
    tokens_tensor_next = torch.cat(
        [tokens_tensor_2[:, 1:], predicted_index_tensor.reshape(1, 1)], 1)
        
    predicted_index = predicted_index_tensor.item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token, end=' ')

<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 

In [25]:
tokenizer.convert_ids_to_tokens([predicted_index])

['who']

In [29]:
tokens_tensor_2.shape

torch.Size([1, 5])

In [27]:
tokens_tensor_2, predicted_index

(tensor([[ 1666, 12034,    11,     8, 56081]]), 52)

In [36]:
predicted_index_tensor = torch.argmax(predictions_next[0, -1, :]).reshape(1, 1)
torch.cat([tokens_tensor_2[:, 1:], predicted_index_tensor], 1).shape

torch.Size([1, 5])

In [33]:
torch.argmax(predictions_next[0, -1, :]).reshape(1, 1)

tensor([[52]])

In [23]:
tokens_tensor_2.shape

torch.Size([1, 5])

In [None]:
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

In [79]:
line = "This car was produced in China"
line_tokenized = tokenizer.tokenize(line)
line_indexed = tokenizer.convert_tokens_to_ids(line_tokenized)
tokens_tensor = torch.tensor([line_indexed])
#tokens_tensor = tokens_tensor.to(device)

In [80]:
max_predictions = 50
mems = None
for i in range(max_predictions):
    predictions, mems = model(tokens_tensor, mems=mems)
    predicted_index_tensor = torch.topk(predictions[0, -1, :],5)[1][1] 
    predicted_index = predicted_index_tensor.item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)
    
    tokens_tensor = torch.cat((tokens_tensor, predicted_index_tensor.reshape(1, 1)), dim=1)

,
the
most
the
Chinese
used
.
<eos>
The
two
other
vehicles
produced
were
:
the
A
car
and
a
T
@-@
vehicle
,
which
was
the
only
car
to
have
been
built
by
China
,
the
least
used
and
most
widely
available
,
and
this
vehicle
had
the
least


In [60]:
valu, idx = torch.topk(predictions[0, -1, :],5, sorted=True)
valu, idx

(tensor([-2.3047, -2.7053, -2.8323, -3.0960, -3.2882], grad_fn=<TopkBackward>),
 tensor([ 5,  1, 19,  7,  8]))

In [54]:
predictions.shape

torch.Size([1, 27, 267735])