## The Masked Language Modeling Task

In [2]:
from transformers import BertForMaskedLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# `transformers` ships several task-specific "heads" that sit on top of the base BERT model;
# BertForMaskedLM is the one used for masked language modeling.
bert_lm = BertForMaskedLM.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Display the module tree of the masked-LM model (embeddings, encoder layers, ...).
bert_lm  # inspect the model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [5]:
# Pipelines in `transformers` bundle a model with its tokenizer and are an easy way
# to run a task in a single call.
# "fill-mask" is the auto-encoding (masked) language-modeling task.
# NOTE(review): the already-loaded `bert_lm` could presumably be passed as `model=` instead;
# a tokenizer may then have to be supplied explicitly — verify.
nlp = pipeline(task="fill-mask", model="bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Confirm which model class the pipeline loaded for the fill-mask task.
type(nlp.model)

transformers.models.bert.modeling_bert.BertForMaskedLM

In [7]:
# The pipeline also carries a tokenizer; its repr lists the special tokens, including [MASK].
nlp.tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
# Keep the sentence in one place so the model input and the printed echo cannot drift apart.
prompt_template = "If you don’t {} at the sign, you will get a ticket."

print(type(nlp.model))

# The model sees the tokenizer's own mask token in the blank.
preds = nlp(prompt_template.format(nlp.tokenizer.mask_token))

# The reader sees "***" in the same position.
print(prompt_template.format("***"))

# Each prediction is a dict; report the candidate token and its score as a percentage.
for pred in preds:
    print(f"Token:{pred['token_str']}. Score: {100*pred['score']:,.2f}%")

<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>
If you don’t *** at the sign, you will get a ticket.
Token:look. Score: 48.00%
Token:stop. Score: 42.63%
Token:glance. Score: 1.39%
Token:arrive. Score: 0.88%
Token:turn. Score: 0.62%


## The Next Sentence Prediction Task

In [9]:
from transformers import BertForNextSentencePrediction, BertTokenizer
import torch

In [10]:
# Load the tokenizer and the next-sentence-prediction model from the same uncased checkpoint.
nsp_checkpoint = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(nsp_checkpoint)
bert_nsp = BertForNextSentencePrediction.from_pretrained(nsp_checkpoint)

In [11]:
# Display the module tree of the next-sentence-prediction model.
bert_nsp

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
# Sentence A and sentence B for the next-sentence-prediction example.
text = (
    "Deliver huge improvements to your machine learning pipelines "
    "without spending hours fine-tuning parameters!"
)
text2 = (
    "This book’s practical case-studies reveal feature engineering techniques "
    "that upgrade your data wrangling—and your ML results."
)

In [13]:
# Encode the two sentences together as one pair and return PyTorch tensors.
inputs = tokenizer(text, text_pair=text2, return_tensors="pt")

In [14]:
# Show the full encoding: input_ids, token_type_ids and attention_mask.
inputs

{'input_ids': tensor([[  101,  8116,  4121,  8377,  2000,  2115,  3698,  4083, 13117,  2015,
          2302,  5938,  2847,  2986,  1011, 17372, 11709,   999,   102,  2023,
          2338,  1521,  1055,  6742,  2553,  1011,  2913,  7487,  3444,  3330,
          5461,  2008, 12200,  2115,  2951, 23277,  5654,  2989,  1517,  1998,
          2115, 19875,  3463,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# Token ids for sentence A followed by sentence B.
inputs["input_ids"]

tensor([[  101,  8116,  4121,  8377,  2000,  2115,  3698,  4083, 13117,  2015,
          2302,  5938,  2847,  2986,  1011, 17372, 11709,   999,   102,  2023,
          2338,  1521,  1055,  6742,  2553,  1011,  2913,  7487,  3444,  3330,
          5461,  2008, 12200,  2115,  2951, 23277,  5654,  2989,  1517,  1998,
          2115, 19875,  3463,  1012,   102]])

In [16]:
# Segment ids: 0 marks sentence A, 1 marks sentence B.
inputs["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [17]:
# The mask is all ones here, i.e. pay attention to every token.
inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [18]:
# Forward pass without labels: only logits are produced.
# Class 0 == "isNextSentence", class 1 == "notNextSentence".
outputs = bert_nsp(
    input_ids=inputs["input_ids"],
    token_type_ids=inputs["token_type_ids"],
    attention_mask=inputs["attention_mask"],
)

outputs

NextSentencePredictorOutput(loss=None, logits=tensor([[ 6.0295, -5.5733]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
# Passing a label makes the model return a loss as well.
# Label 0 == "isNextSentence".
outputs = bert_nsp(**inputs, labels=torch.tensor([0], dtype=torch.long))
outputs

NextSentencePredictorOutput(loss=tensor(9.1791e-06, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.0295, -5.5733]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [20]:
# Same inputs, but scored against the opposite label.
# Label 1 == "notNextSentence".
outputs = bert_nsp(**inputs, labels=torch.tensor([1], dtype=torch.long))
outputs

NextSentencePredictorOutput(loss=tensor(11.6028, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.0295, -5.5733]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## Fine-tuning BERT to solve NLP tasks

In [21]:
from transformers import pipeline, BertForQuestionAnswering, BertForTokenClassification, BertForSequenceClassification


In [22]:
# Sequence-classification model with a 3-label head on the pretrained checkpoint.
# The head weights are newly initialized (see the warning in the output), so the
# model needs training before its predictions mean anything.
bert_sq = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)
bert_sq

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [23]:
# The classification head: its out_features matches the num_labels=3 requested above.
bert_sq.classifier

Linear(in_features=768, out_features=3, bias=True)

In [24]:
# Finding a classifier on the Hugging Face model repository

In [25]:
# Text-classification pipeline; the same repository id supplies both model and tokenizer.
finbert_repo = "ProsusAI/finbert"
finbert = pipeline(task="text-classification", model=finbert_repo, tokenizer=finbert_repo)

In [26]:
# Classify a headline about a market rally.
finbert("Stocks rallied and the British pound gained")

[{'label': 'positive', 'score': 0.6949425339698792}]

In [27]:
# Classify a lukewarm statement for comparison.
finbert("The stock did ok")

[{'label': 'neutral', 'score': 0.8065245747566223}]

In [28]:
# Inspect the model wrapped by the pipeline (a BertForSequenceClassification, per the output).
finbert.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [29]:
# Token-classification model on the pretrained checkpoint; no num_labels is passed.
# The head weights are newly initialized (see the warning in the output), so the
# model needs training before use.
bert_tc = BertForTokenClassification.from_pretrained("bert-base-uncased")
bert_tc

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [30]:
# No num_labels was passed when loading; the head shown in the output has out_features=2.
bert_tc.classifier

Linear(in_features=768, out_features=2, bias=True)

In [31]:
# Finding a token classifier on the Hugging Face model repository

In [32]:
# Model card: https://huggingface.co/savasy/bert-base-turkish-ner-cased
custom_module = "savasy/bert-base-turkish-ner-cased"
ner = pipeline(task="ner", model=custom_module, tokenizer=custom_module)

# English: "Hi! I'm Sinan. I come from San Francisco"
sequence = "Merhaba! Benim adım Sinan. San Francisco'dan geliyorum"
ner(sequence)

Some weights of the model checkpoint at savasy/bert-base-turkish-ner-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER',
  'score': 0.7242466,
  'index': 5,
  'word': 'Sinan',
  'start': 20,
  'end': 25},
 {'entity': 'B-LOC',
  'score': 0.99879956,
  'index': 7,
  'word': 'San',
  'start': 27,
  'end': 30},
 {'entity': 'I-LOC',
  'score': 0.99770975,
  'index': 8,
  'word': 'Francisco',
  'start': 31,
  'end': 40}]

In [33]:
# Question-answering model on the pretrained checkpoint.
# The qa_outputs weights are newly initialized (see the warning in the output), so the
# model needs training before use.
bert_qa = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
bert_qa

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [34]:
# The QA head has out_features=2 — presumably the span-start and span-end logits
# (TODO confirm against the model documentation).
bert_qa.qa_outputs

Linear(in_features=768, out_features=2, bias=True)

In [35]:
# Finding a QA model on the Hugging Face model repository

In [36]:
# Question-answering pipeline; model and tokenizer come from the same repository,
# pinned to a specific revision.
model_name = "deepset/roberta-base-squad2"
qa = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
    revision="v1.0",
)

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
# (question, context) pair; the result reports the answer text and its character offsets.
sequence = (
    "Where is Sinan living these days?",
    "Sinan lives in California but Matt lives in Boston.",
)
qa(question=sequence[0], context=sequence[1])

{'score': 0.9808393716812134, 'start': 15, 'end': 25, 'answer': 'California'}

In [38]:
# Same context, but the question now asks about the other person.
sequence = (
    "Where is Matt living these days?",
    "Sinan lives in California but Matt lives in Boston.",
)
qa(question=sequence[0], context=sequence[1])

{'score': 0.8637669086456299, 'start': 44, 'end': 50, 'answer': 'Boston'}

In [39]:
# Question-answering pipeline backed by a larger BERT checkpoint.
squad_pipe = pipeline(
    task="question-answering",
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
)

config.json: 100%|██████████| 443/443 [00:00<00:00, 839kB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:23<00:00, 56.3MB/s]
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 250kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 6.85MB/s]
tokenizer.json: 100%|██████████| 466k/466k

In [40]:
# Ask the question again with the larger model, naming the two arguments explicitly.
squad_pipe(
    question="where is Sinan living these days?",
    context="Sinan lives in California but Matt lives in Boston.",
)

{'score': 0.9924461245536804, 'start': 15, 'end': 25, 'answer': 'California'}

In [41]:
# visualize logits
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# AutoTokenizer is a catch-all loader: it works out which tokenizer class fits the
# given checkpoint name.
large_tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Call the tokenizer directly (the same way the NSP inputs were built with `tokenizer(...)`)
# instead of the deprecated `encode_plus`; the question is segment A and the context is
# segment B of the pair.
qa_input = large_tokenizer(
    "where is Sinan living these days?",
    "Sinan lives in California but Matt lives in Boston.",
    return_tensors='pt',
)
print(qa_input)


{'input_ids': tensor([[ 101, 2073, 2003, 8254, 2319, 2542, 2122, 2420, 1029,  102, 8254, 2319,
         3268, 1999, 2662, 2021, 4717, 3268, 1999, 3731, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [42]:
# Load the QA model for the same checkpoint as `large_tokenizer` and run the encoded
# question/context pair through it.
large_qa_bert = AutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
output = large_qa_bert(
    input_ids=qa_input["input_ids"],
    token_type_ids=qa_input["token_type_ids"],
    attention_mask=qa_input["attention_mask"],
)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# FIXME(review): incomplete statement — there is no attribute name after `sns.` and no value
# after `rc=`, so this cell is a SyntaxError; `sns` is also not imported in the visible cells.
sns.(rc=)