In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

# Test Drive
[Quick Tour of Transformers](https://github.com/huggingface/transformers#quick-tour)

In [3]:
from transformers import *

In [4]:
# Transformers has a unified API
# for 10 transformer architectures and 30 pretrained weights.
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          #(OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          #(GPT2Model,       GPT2Tokenizer,       'gpt2'),
          #(CTRLModel,       CTRLTokenizer,       'ctrl'),
          #(TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          #(XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          #(XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          #(RobertaModel,    RobertaTokenizer,    'roberta-base'),
          #(XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

# Let's encode some text in a sequence of hidden-states using each model:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])
    print('model_class',model_class)
    with torch.no_grad():
        print('hidden state')
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

model_class <class 'transformers.modeling_bert.BertModel'>
hidden state


In [5]:
last_hidden_states, last_hidden_states.shape

(tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3550,  0.0686,  0.6506],
          [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
          [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
          ...,
          [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
          [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
          [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]]),
 torch.Size([1, 9, 768]))

In [20]:
# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]

BERT_MODEL_CLASSES = [BertModel, BertForQuestionAnswering]


# All the classes for an architecture can be initiated from pretrained weights for this architecture
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained(pretrained_weights)

    # Models can return full list of hidden-states & attentions weights at each layer
    model = model_class.from_pretrained(pretrained_weights,
                                        output_hidden_states=True,
                                        output_attentions=True)
    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
    all_hidden_states, all_attentions = model(input_ids)[-2:]

    # Models are compatible with Torchscript
    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
    traced_model = torch.jit.trace(model, (input_ids,))

    # Simple serialization for models and tokenizers
    model.save_pretrained('./models/')  # save
    model = model_class.from_pretrained('./models/')  # re-load
    tokenizer.save_pretrained('./models/')  # save
    tokenizer = BertTokenizer.from_pretrained('./models/')  # re-load

    # SOTA examples for GLUE, SQUAD, text generation...

## Try pipelines

In [1]:
from transformers import pipeline

story = 'The girls measured the garage door, then they went to Home Depot and ' + \
        'ordered a new garage door. They said it cost more than they expected, and will be here in a month.' + \
        'On the way home they stopped at Masa for a drink.'


In [2]:
nlp = pipeline('question-answering')
nlp({
    'question': 'Where did the girls eat?',
    'context': story
})

HBox(children=(IntProgress(value=0, description='Downloading', max=546, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=230, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=555, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=265481570, style=ProgressStyle(description_…




convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 263.68it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1829.18it/s]


{'answer': 'the garage door,',
 'end': 35,
 'score': 0.16736820226754645,
 'start': 19}

In [3]:
story = 'The girls measured the garage door, then they went to Home Depot and ' + \
        'ordered a new garage door. They said it cost more than they expected, and will be here in a month.' + \
        'On the way home they stopped at Masa for snacks.'

In [4]:
nlp({
    'question': 'Where did the girls eat?',
    'context': story
})

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 219.07it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4132.32it/s]


{'answer': 'Masa', 'end': 203, 'score': 0.3451716326275722, 'start': 199}

In [5]:
def answer(story, question):
    return nlp({'question': question, 'context': story })

In [8]:
answer('Most important of all is that I never use a notebook with rippable paper; these notebooks are important documents I keep with me forever. Every page gets a page number and the date at the top. I reserve the first page of each notebook for a table of contents. Whenever something happens that I might need to refer to later I put it and its page number in the table of contents. Many table of contents entries end up with many page numbers after them, such as pages with T-shirt ideas, game ideas, and haikus. Because of this system I can say “see #3 on 2016-05-24” and go right to something I noted years ago. I go through about one notebook per year and have them going back a long time.',
      'What goes in the table of contents?')

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 74.37it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4826.59it/s]


{'answer': 'page number',
 'end': 351,
 'score': 0.5145550246995185,
 'start': 340}

In [9]:
pipeline('fill-mask')

TypeError: 'NoneType' object is not subscriptable

Setting this aside for now and trying to read and work through the documentation.

## Quick Tour
[Transformers Quickstart](https://huggingface.co/transformers/quickstart.html)

In [10]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
#import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

Use ```BertModel``` to encode inputs into hidden-states:

In [11]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)

In [12]:
encoded_layers.shape

torch.Size([1, 14, 768])

In [35]:
model.config.hidden_size

768

In [36]:
len(indexed_tokens)

14

Use ```BertForMaskedLM``` to predict a masked token:

In [37]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

predicted_token # Should be 'henson'

'henson'

In [39]:
predictions.shape

torch.Size([1, 14, 30522])

## GPT2

Predicting the next token from a prompt:

In [41]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "Who was Jim Henson ? Jim Henson was a"
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/rlack/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/rlack/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71


How to use ```GPT2LMHeadModel``` to generate the next token following our text:

In [43]:
# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# get the predicted next sub-word (in our case, the word 'man')
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

predicted_text

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/rlack/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.699bbd1c449e9861456f359d6daa51bd523ac085b4b531ab0aad5a55d091e942
INFO:transformers.configuration_utils:Model config GPT2Config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "embd_pdrop": 0.1,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_beams": 1,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_atte

'Who was Jim Henson? Jim Henson was a man'

In [18]:
torch.version.git_version

'8554416a199c4cec01c60c7015d8301d2bb39b64'