https://github.com/huggingface/transformers#quick-tour

In [1]:
import torch
from transformers import *


In [2]:
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2')
      #    (CTRLModel,       CTRLTokenizer,       'ctrl'),
      #    (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
      #    (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
      #    (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
      #    (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
      #    (RobertaModel,    RobertaTokenizer,    'roberta-base'),
       #   (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

In [3]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples


In [45]:
model_class, tokenizer_class, pretrained_weights  = MODELS[0]

# how to extract all hidden states and all attentions 

https://huggingface.co/transformers/main_classes/tokenizer.html
　
tokenizer.encodeは
```python
run_phrase = ["This is the first sentence." , "And a next sentence follows"]
```
みたいなのは食えない模様

model.forwardのアウトプットがどういうtupleかは
https://huggingface.co/transformers/model_doc/bert.html
を参照
- last hidden states
- pooler output
- hidden states
- attentions

In [32]:
run_phrase = "Periods are also tokenized"

In [53]:
run_phrase = "Ski"

In [6]:
run_phrase = "because BERT uses sentencepiece, sometimes the number of tokens is larger than word counts"

In [54]:
# Encode text
tokenized = tokenizer.encode(run_phrase, add_special_tokens=True)
input_ids = torch.tensor([tokenized])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

In [55]:
tokenized

[50, 4106]

In [51]:
input_ids

tensor([[20545]])

## tokenizer 

https://huggingface.co/transformers/main_classes/tokenizer.html

https://huggingface.co/transformers/model_doc/bert.html

### どのように分割されたかのチェック

In [12]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("internationalization"))

['international', 'ization']

In [13]:
tokenizer.encode_plus("internationalization")

{'input_ids': [45609, 1634],
 'token_type_ids': [0, 0],
 'attention_mask': [1, 1]}

# encoding

## last hiddenベクトルの性質チェック

In [27]:

model = model_class.from_pretrained(pretrained_weights, torchscript=True)

In [28]:
def my_last_hidden_dict(run_phrase):

    tokenized = tokenizer.encode(run_phrase, add_special_tokens=True)
    input_ids = torch.tensor([tokenized])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
        
    tokens = tokenizer.convert_ids_to_tokens(tokenized)
    run_dict = {tokens[i] : last_hidden_states[:,i,:].squeeze().numpy() for i in range(len(tokens))}
    return run_dict
    

多分語頭と語尾に相当するtokenが存在する

In [44]:
my_last_hidden_dict("ski").keys()

dict_keys(['ski'])

In [40]:
tmp = my_last_hidden_dict("Ski is fun.")['[CLS]']

KeyError: '[CLS]'

In [None]:
tmp.shape

In [17]:
import scipy as sp

In [18]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] ,my_last_hidden_dict("Snow")['snow'])

KeyError: 'ski'

In [None]:
# snowboardは2語に分割される
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] ,my_last_hidden_dict("Snowboard")['snowboard'])

In [None]:
snowboard = my_last_hidden_dict("Snowboard")

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] ,snowboard["snow"] + snowboard["##board"])

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] ,snowboard["##board"])

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] , my_last_hidden_dict("world ski championship")['ski'])

In [None]:
sp.spatial.distance.cosine(1,-1)

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] , my_last_hidden_dict("soccer")['soccer'])

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] , my_last_hidden_dict("sugar")['sugar'])

In [None]:
sp.spatial.distance.cosine(my_last_hidden_dict("Ski")['ski'] , my_last_hidden_dict("freestyle ski")['ski'])

wordpieceにより分割されていると思われる例：

In [None]:
my_last_hidden_dict("internationalization").keys()

## その他のレイヤーのベクトルの性質チェック
ゼロから順番に深いレイヤーなことに注意。
https://github.com/huggingface/transformers/issues/1950

In [41]:
# Models can return full list of hidden-states & attentions weights at each layer
model = model_class.from_pretrained(pretrained_weights,
                                    output_hidden_states=True,
                                    output_attentions=True)

In [42]:
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
all_hidden_states, all_attentions = model(input_ids)[-2:]

RuntimeError: index out of range at /opt/conda/conda-bld/pytorch_1556653114079/work/aten/src/TH/generic/THTensorEvenMoreMath.cpp:193

In [None]:
def my_hidden_dict(run_phrase , layer = 0):

    tokenized = tokenizer.encode(run_phrase, add_special_tokens=True)
    input_ids = torch.tensor([tokenized])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        all_hidden_states, all_attentions = model(input_ids)[-2:]
        #last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
        
    tokens = tokenizer.convert_ids_to_tokens(tokenized)
    run_dict = {tokens[i] : all_hidden_states[layer][:,i,:].squeeze().numpy() for i in range(len(tokens))}
    return run_dict
    

In [None]:
# 一番浅いレイヤーでもかなり文脈依存になってしまっている
l = -1
sp.spatial.distance.cosine(my_hidden_dict('ski' , l)['ski'] ,my_hidden_dict('freestyle ski', l)['ski'])

In [129]:
# 
l = 0
sp.spatial.distance.cosine(my_hidden_dict('ski' , l)['ski'] ,my_hidden_dict('freestyle ski', l)['ski'])

0.03874468803405762

## 最初のembedding処理後のベクトルの性質チェック


In [121]:
def my_embedding_dict(run_phrase , layer = 0):

    tokenized = tokenizer.encode(run_phrase, add_special_tokens=True)
    input_ids = torch.tensor([tokenized])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        all_hidden_states, all_attentions = model(input_ids)[-2:]
        #last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
    embed_matrix = model.get_input_embeddings().weight
    tokens = tokenizer.convert_ids_to_tokens(tokenized)
    run_dict = {tokens[i] : embed_matrix[tokenized[i],:].detach().squeeze().numpy() for i in range(len(tokens))}
    return run_dict
    

In [124]:
sp.spatial.distance.cosine(my_embedding_dict('ski' , l)['ski'] ,my_embedding_dict('freestyle ski', l)['ski'])

0.0

In [123]:
my_embedding_dict("Ski")["ski"]

array([-5.36629464e-03, -8.62949193e-02, -5.18712401e-03, -2.91774143e-02,
       -9.84724239e-02,  1.92878749e-02, -6.35578185e-02,  2.71245721e-03,
       -4.84481268e-02, -6.00291006e-02, -2.70059016e-02, -1.24797113e-01,
        6.52698847e-03,  8.33721459e-02,  5.17042279e-02, -4.73624393e-02,
       -1.08939119e-01, -1.66100613e-03,  3.23905535e-02, -2.55150665e-02,
       -8.46075267e-02, -3.65849547e-02, -2.20447723e-02, -6.00817576e-02,
       -1.01680659e-01,  6.35808706e-03, -5.35597503e-02, -8.54655206e-02,
       -1.04875557e-01, -3.55551876e-02, -3.13782953e-02,  4.86183092e-02,
       -2.48831343e-02,  4.24876101e-02, -9.63180698e-03,  6.49224967e-03,
       -2.72310916e-02,  4.13084356e-03, -1.12589009e-01,  4.29117419e-02,
        9.25005879e-03, -3.73848341e-02, -7.25976937e-03, -1.18009374e-01,
       -7.50806406e-02,  6.07025996e-03, -3.97465825e-02,  1.08596459e-02,
       -1.86708961e-02, -5.58486348e-03, -2.33931001e-02,  7.82739837e-03,
       -1.80814657e-02, -

In [112]:
type(model.get_input_embeddings())

torch.nn.modules.sparse.Embedding

In [115]:
model.get_input_embeddings().weight

torch.Size([30522, 768])

# 未分類


In [12]:
my_last_hidden("baseball").shape

torch.Size([1, 3, 768])

In [25]:
# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]

In [26]:
model_class = BERT_MODEL_CLASSES[0]

In [27]:
# Models can return full list of hidden-states & attentions weights at each layer
model = model_class.from_pretrained(pretrained_weights,
                                    output_hidden_states=True,
                                    output_attentions=True)

In [31]:
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
all_hidden_states, all_attentions = model(input_ids)[-2:]

In [35]:
dir(model)

['__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backend', '_backward_hooks', '_buffers', '_do_output_past', '_forward_hooks', '_forward_pre_hooks', '_generate_beam_search', '_generate_no_beam_search', '_get_name', '_get_resized_embeddings', '_init_weights', '_load_from_state_dict', '_load_state_dict_pre_hooks', '_modules', '_named_members', '_parameters', '_prune_heads', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_resize_token_embeddings', '_slow_forward', '_state_dict_hooks', '_tie_or_clone_weights', '_tracing_name', '_version', 'add_module', 'apply', 'base_model', 'base_model_prefix', 'buffers', 'children', 'confi

In [40]:
len(model(input_ids))

4

In [44]:
print(type(all_hidden_states))
print(len(all_hidden_states))
print(all_hidden_states[0].shape)

<class 'tuple'>
13
torch.Size([1, 16, 768])


In [45]:
print(type(all_attentions))
print(len(all_attentions))

<class 'tuple'>
12


In [22]:
print(all_attentions[0].shape)

torch.Size([1, 12, 16, 16])


In [None]:
# Models are compatible with Torchscript
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
traced_model = torch.jit.trace(model, (input_ids,))

# Simple serialization for models and tokenizers
model.save_pretrained('./directory/to/save/')  # save
model = model_class.from_pretrained('./directory/to/save/')  # re-load
tokenizer.save_pretrained('./directory/to/save/')  # save
tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load

# sandbox

In [8]:
# Let's encode some text in a sequence of hidden-states using each model:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

In [None]:
# All the classes for an architecture can be initiated from pretrained weights for this architecture
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

In [None]:
for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained(pretrained_weights)

    # Models can return full list of hidden-states & attentions weights at each layer
    model = model_class.from_pretrained(pretrained_weights,
                                        output_hidden_states=True,
                                        output_attentions=True)
    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
    all_hidden_states, all_attentions = model(input_ids)[-2:]

    # Models are compatible with Torchscript
    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
    traced_model = torch.jit.trace(model, (input_ids,))

    # Simple serialization for models and tokenizers
    model.save_pretrained('./directory/to/save/')  # save
    model = model_class.from_pretrained('./directory/to/save/')  # re-load
    tokenizer.save_pretrained('./directory/to/save/')  # save
    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load

In [None]:
model