In [1]:
# Install the two third-party packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
%pip install transformers pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint name passed to from_pretrained() for both the model and the tokenizer.
model_name = "bert-base-cased"

In [4]:
# Load the pretrained BERT model and the tokenizer from the same checkpoint name
# so that both share one vocabulary. Files are downloaded here (see progress bars below).
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading: 100%|██████████| 570/570 [00:00<00:00, 2.35MB/s]
Downloading: 100%|██████████| 436M/436M [00:11<00:00, 37.4MB/s] 
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 3.50MB/s]
Downloading: 100%|██████████| 436k/436k [00:00<00:00, 2.45MB/s]


In [5]:
# Example sentence used by every cell below.
# Note: there is no space after the comma ("lemons,don't"); the tokenizer still
# emits "," as a separate token.
sentence = "When life gives you lemons,don't make lemonade."

## Tokenizers

In [7]:
# Split the sentence into token strings (no ids, no special tokens).
# Pieces that start with "##" continue the preceding piece,
# e.g. "lemons" -> "lemon" + "##s" and "lemonade" -> "lemon" + "##ade".
tokens = tokenizer.tokenize(sentence)
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade',
 '.']

In [10]:
# Build a lookup table of the tokenizer's vocabulary, indexed by token id.
# get_vocab() is the documented accessor and returns a {token: id} dict;
# the keys/values are materialised as lists before being handed to pandas.
vocab = tokenizer.get_vocab()
vocab_df = (
    pd.DataFrame({"token": list(vocab.keys()), "token_id": list(vocab.values())})
    .sort_values(by="token_id")  # order rows by ascending id
    .set_index("token_id")       # so that vocab_df.loc[<id>] returns the token
)
vocab_df

Unnamed: 0_level_0,token
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]
...,...
28991,##）
28992,##，
28993,##－
28994,##／


In [11]:
# Convert the sentence straight to vocabulary ids.
# Unlike tokenize(), encode() also wraps the sequence in two extra ids:
# 101 at the start and 102 at the end (looked up in the cells below).
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 119,
 102]

In [13]:
# Show the token count next to the id count; the id list is longer because
# encode() adds one id at each end of the sequence.
print(len(tokens), len(token_ids), sep=" - ")

14 - 16


In [17]:
# Look up id 101 (the first element of token_ids).
# vocab_df is indexed by token_id, so use label-based .loc; .iloc selects by
# row position and only gives the same answer while ids are contiguous from 0.
vocab_df.loc[101]

token    [CLS]
Name: 101, dtype: object

In [16]:
# Look up id 102 (the last element of token_ids).
# vocab_df is indexed by token_id, so use label-based .loc; .iloc selects by
# row position and only gives the same answer while ids are contiguous from 0.
vocab_df.loc[102]


token    [SEP]
Name: 102, dtype: object

In [18]:
# Pair each token string with its id; the [1:-1] slice skips the first and
# last ids, which have no counterpart in `tokens`.
[(token, token_id) for token, token_id in zip(tokens, token_ids[1:-1])]

[('When', 1332),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397),
 ('.', 119)]

In [19]:
# Turn the ids back into text. The [1:-1] slice drops the first and last ids
# (101 and 102) so [CLS]/[SEP] do not appear in the decoded string.
# The result is not identical to `sentence`: a space now follows the comma.
tokenizer.decode(token_ids[1:-1])

"When life gives you lemons, don't make lemonade."

In [20]:
# Calling the tokenizer directly returns a mapping with three entries:
# input_ids (same values as encode() above), token_type_ids and attention_mask.
tokenizer_out = tokenizer(sentence)
tokenizer_out

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
# Second, shorter input: the same sentence with "don't" removed, so the two
# sentences encode to different numbers of ids.
sentence2 = sentence.replace("don't","")

In [22]:
# Encode both sentences as one batch.
# padding=True appends padding ids (0) to the shorter sequence so that every
# sequence in the batch has the same length; this is needed whenever the
# sentences differ in length. The padded positions get attention_mask 0.
tokenizer_out2 = tokenizer([sentence,sentence2],padding = True)
tokenizer_out2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [23]:
# Decode the first (longer) sequence of the batch; no slicing here, so the
# special tokens [CLS] and [SEP] show up in the text.
tokenizer.decode(tokenizer_out2["input_ids"][0])


"[CLS] When life gives you lemons, don't make lemonade. [SEP]"

In [24]:
# Decode the second (shorter) sequence; the trailing 0 ids added by padding
# decode to [PAD] tokens after [SEP].
tokenizer.decode(tokenizer_out2["input_ids"][1])

'[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]'

## Word embeddings

# NOTE(review): nothing visible in this section uses these two statements —
# confirm whether they are leftovers that should be removed.
var = 1

# NOTE(review): `{var}` is a set literal (here {1}), not a dict, and binding it
# to the name `dict` shadows the builtin `dict` for every later cell — confirm intent.
dict = {var}