In [7]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

model_name = "bert-base-cased"

model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

sentence = "when will you be back? when life gives you lemons, don't make lemonade"
tokens = tokenizer.tokenize(sentence)
print(tokens)




['when', 'will', 'you', 'be', 'back', '?', 'when', 'life', 'gives', 'you', 'lemon', '##s', ',', 'don', "'", 't', 'make', 'lemon', '##ade']


In [8]:
vocab = tokenizer.vocab
print(vocab)

vocab_df = pd.DataFrame({
    "token": list(vocab.keys()),
    "token_id": list(vocab.values())
})

print(vocab_df)
vocab_df = vocab_df.sort_values(by="token_id").set_index("token_id")
print(vocab_df)




          token  token_id
0         marks      6216
1          curb     15786
2      implying     22203
3      Theology     15673
4         ##aya     12057
...         ...       ...
28991  Courtney     14675
28992     Lands     17854
28993  Turnpike     23303
28994  Rhodesia     22228
28995   whereby     13949

[28996 rows x 2 columns]
              token
token_id           
0             [PAD]
1         [unused1]
2         [unused2]
3         [unused3]
4         [unused4]
...             ...
28991           ##）
28992           ##，
28993           ##－
28994           ##／
28995           ##：

[28996 rows x 1 columns]


In [9]:
token_ids = tokenizer.encode(sentence)
print(token_ids)

len(tokens)
len(token_ids)

list(zip(tokens, token_ids[1:-1]))

[101, 1165, 1209, 1128, 1129, 1171, 136, 1165, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 102]


[('when', 1165),
 ('will', 1209),
 ('you', 1128),
 ('be', 1129),
 ('back', 1171),
 ('?', 136),
 ('when', 1165),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397)]

In [10]:
tokenizer.encode(sentence)

[101,
 1165,
 1209,
 1128,
 1129,
 1171,
 136,
 1165,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 102]

In [11]:
tokenizer.decode(token_ids)

"[CLS] when will you be back? when life gives you lemons, don ' t make lemonade [SEP]"

In [14]:
tokenizer_output = tokenizer(sentence)
print(tokenizer_output)


{'input_ids': [101, 1165, 1209, 1128, 1129, 1171, 136, 1165, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [19]:
encoded_inputs = tokenizer(sentence, return_tensors="pt")
print(encoded_inputs)


output = model(**encoded_inputs)





{'input_ids': tensor([[  101,  1165,  1209,  1128,  1129,  1171,   136,  1165,  1297,  3114,
          1128, 22782,  1116,   117,  1274,   112,   189,  1294, 22782,  6397,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.3240,  0.0843,  0.0786,  ..., -0.2520,  0.4724,  0.0283],
         [ 0.3792, -0.4676,  0.6445,  ..., -0.4751,  0.2385,  0.0642],
         [-0.1406,  0.6235, -0.3435,  ...,  0.2255,  0.5630,  0.3713],
         ...,
         [ 0.4398,  0.2633,  0.4809,  ...,  0.0820, -0.0455,  0.2710],
         [ 0.0796,  0.0406, -0.2211,  ..., -0.6643, -0.2406,  0.2005],
         [ 0.6601, -0.1696, -0.1890,  ..., -0.3122,  1.0518, -0.2274]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-6.2714e-01,  4.3541e-01,  9.9984e-01, -9.9315e-01,  9.6283e-01,
          9.2376e-01,  9.8607e-01, -9.9308e-01, -9.6858e-01, -5.4289e-01,
          9.7674e-01,  9.9833e-01, -9.9846e-01, -9.9978e-01,  8.3659e-01,
         -9.7506e-01,  9.8548e-01, -5.3422e-01, -9.9995e-01, -7.9513e-01,
         -4.3034e-01, -9.9987e-01,  2.5510e-01,  9.7719e-01,  9.7221e-01,
          5.8487e-02,  9.8240e-01,  9.9995e-01,  7.8937e-01, -2.491