In [12]:
from transformers import BertTokenizer, BertModel

In [13]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
# Tokenize a single sentence.
# return_tensors picks the container type of the result:
#   "pt" = PyTorch tensors, "tf" = TensorFlow tensors, "np" = NumPy arrays.
# The call returns the token ids, the token type ids and the attention mask.
tokens = tokenizer("I love Bangladesh", return_tensors="pt")

In [15]:
tokens  # in input_ids, 101 is the [CLS] token and 102 is the [SEP] (separator) token

{'input_ids': tensor([[ 101, 1045, 2293, 7269,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [17]:
# padding=True pads the shorter sentence with 0s so that every row of the
# batch has the same length; the attention mask is 0 at the padded positions.
tokens = tokenizer(
    ["I love Bangladesh", "But hardly this country loves us"],
    return_tensors="pt",
    padding=True,
)
tokens

{'input_ids': tensor([[ 101, 1045, 2293, 7269,  102,    0,    0,    0],
        [ 101, 2021, 6684, 2023, 2406, 7459, 2149,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
# padding="max_length" together with max_length gives more control over the
# padding: every sequence is padded with 0s up to exactly max_length (15) tokens.
tokens = tokenizer(
    ["I love Bangladesh", "But hardly this country loves us"],
    return_tensors="pt",
    max_length=15,
    padding="max_length",
)
tokens

{'input_ids': tensor([[ 101, 1045, 2293, 7269,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 101, 2021, 6684, 2023, 2406, 7459, 2149,  102,    0,    0,    0,    0,
            0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [19]:
# truncation=True cuts any sequence that is longer than max_length down to
# max_length tokens, so every row fits the requested size.
# Here the second sentence is shortened to 5 tokens and still starts with
# 101 ([CLS]) and ends with 102 ([SEP]).
tokens = tokenizer(
    ["I love Bangladesh", "But hardly this country loves us"],
    return_tensors="pt",
    max_length=5,
    padding="max_length",
    truncation=True,
)
tokens


{'input_ids': tensor([[ 101, 1045, 2293, 7269,  102],
        [ 101, 2021, 6684, 2023,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

In [20]:
# Re-tokenize the single sentence so that `tokens` holds one unpadded sequence again.
tokens = tokenizer("I love Bangladesh", return_tensors="pt")

In [23]:
# Load the pretrained BERT model and feed it the tokenizer output;
# **tokens passes each entry of `tokens` as a keyword argument.
model = BertModel.from_pretrained("bert-base-uncased")
output = model(**tokens)

# Creating contextual embeddings: `output` holds the model's result for the sentence.

In [26]:
output['last_hidden_state']
# last_hidden_state shows one contextual embedding for each token of "[CLS] I love Bangladesh [SEP]"

tensor([[[-0.0224,  0.2553, -0.2280,  ..., -0.2571,  0.1985,  0.3660],
         [ 0.2632,  0.5703,  0.0819,  ..., -0.4229,  0.5411, -0.5982],
         [ 1.0946,  0.9915,  0.5911,  ..., -0.3156,  0.3700, -0.2465],
         [ 0.1894, -0.2037, -0.4778,  ...,  0.4723,  0.0961, -0.5052],
         [ 0.7222,  0.2883, -0.2963,  ..., -0.2011, -0.4999, -0.4110]]],
       grad_fn=<NativeLayerNormBackward0>)

In [28]:
output['last_hidden_state'].shape
# Shape is (1, 5, 768): 1 sentence, 5 tokens, and a 768-value embedding per token

torch.Size([1, 5, 768])