Based on `model.py`.

# Bert Model

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch.nn as nn

class policy_network(nn.Module):
    """Sentence encoder built on a pretrained transformer, with an optional linear head.

    The module tokenizes raw text, runs it through the pretrained model, and
    uses the last-layer hidden state of the first token ([CLS]) as the sentence
    embedding. When ``add_linear`` is set, a small linear projection is applied
    on top of that embedding.

    Args:
        model_config: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
        add_linear: If True, create ``self.linear`` projecting to
            ``embedding_size``; otherwise ``self.linear`` is None.
        embedding_size: Output width of the optional linear layer.
        freeze_encoder: If True, every parameter of the pretrained model has
            ``requires_grad`` turned off, so only the linear layer is trained.
        context_net: If True, the linear layer expects inputs of width
            ``2 * hidden_size``. NOTE(review): the BERT path in ``forward``
            yields ``hidden_size``-wide embeddings, so this mode presumably
            relies on ``bert_forward=False`` with precomputed, concatenated
            embeddings -- confirm against the caller.
    """

    def __init__(self, model_config="bert-base-uncased", add_linear=False, embedding_size=128, freeze_encoder=True, context_net=False):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_config)
        print("model_config:", model_config)
        self.model = AutoModelForTokenClassification.from_pretrained(model_config)

        # Freeze transformer encoder and only train the linear layer
        if freeze_encoder:
            for param in self.model.parameters():
                param.requires_grad = False

        if add_linear:
            # Add an additional small, adjustable linear layer on top of BERT tuned through RL
            self.embedding_size = embedding_size
            hidden_size = self.model.config.hidden_size  # 768 for bert-base-uncased, distilbert-base-uncased
            input_dim = hidden_size * 2 if context_net else hidden_size
            self.linear = nn.Linear(input_dim, embedding_size)
        else:
            self.linear = None

    def forward(self, input_list, bert_forward=True, linear_forward=True):
        """Embed ``input_list`` and optionally project it with the linear layer.

        Args:
            input_list: Text (or a list of texts) to tokenize when
                ``bert_forward`` is True; otherwise a tensor of precomputed
                embeddings that is fed to the linear layer directly.
            bert_forward: Run the tokenizer and the pretrained model.
            linear_forward: Apply ``self.linear`` when it exists.

        Returns:
            The [CLS] embeddings (``len(input_list) x hidden_size``), their
            projection (``len(input_list) x embedding_size``), or -- when
            neither stage runs -- ``input_list`` unchanged.
        """
        if bert_forward:
            encoded = self.tokenizer(input_list, truncation=True, padding=True, return_tensors="pt").to(self.model.device)
            output = self.model(**encoded, output_hidden_states=True)
            # Last layer hidden states, restricted to the [CLS] position.
            sentence_embedding = output.hidden_states[-1][:, 0, :]  # len(input_list) x hidden_size
        else:
            # The caller supplies embeddings directly. Binding the name here
            # avoids an UnboundLocalError when the linear stage is skipped too.
            sentence_embedding = input_list

        # Explicit None check: do not rely on the truthiness of an nn.Module.
        if linear_forward and self.linear is not None:
            sentence_embedding = self.linear(sentence_embedding)  # len(input_list) x embedding_size
        return sentence_embedding

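Reproducing the paper's implementation is not straightforward, because **AutoModelForTokenClassification** already ships with its own linear classification layer on top of the encoder (the `classifier.weight` / `classifier.bias` parameters reported as newly initialized in the warning below).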

In [None]:

# Instantiation is deliberately switched off: building policy_network loads the
# pretrained tokenizer and model. Change the guard to run it.
if False:
    model = policy_network(add_linear=True, freeze_encoder=True)

model_config: bert-base-uncased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenizer

In [4]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the "bert-base-uncased" checkpoint; the cells below reuse it.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)



In [5]:
import pandas as pd

# Keep only the first 10 characters ("The unbeli") so the truncated word has to
# be broken into sub-word pieces.
sentence = "The unbelievable quick brown fox jumps over the lazy dog."[:10]

# Encode the text, then pull out the first (and only) sequence of the batch.
encoded = tokenizer(sentence, return_tensors="pt", return_attention_mask=True)
attention_mask = encoded["attention_mask"][0]
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# One row per token: its text, vocabulary id and attention-mask value.
df = pd.DataFrame(dict(token=tokens, token_id=token_ids, attention_mask=attention_mask))

# Show the table
print(df)


   token  token_id  attention_mask
0  [CLS]       101               1
1    the      1996               1
2     un      4895               1
3  ##bel      8671               1
4    ##i      2072               1
5  [SEP]       102               1


In [6]:
# A made-up string that is not a vocabulary entry, to show how unknown text is handled.
word = "albja;fdj;ldfj"
print(
    word in tokenizer.vocab,  # False: the whole string is not in the vocabulary
    tokenizer.tokenize(word),  # list of sub-word pieces the string is split into
    tokenizer.convert_tokens_to_ids(word),  # a single id (100 in the recorded run; presumably [UNK])
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word)),  # one id per sub-word piece
    sep="\n",
)

False
['al', '##b', '##ja', ';', 'f', '##d', '##j', ';', 'ld', '##f', '##j']
100
[2632, 2497, 3900, 1025, 1042, 2094, 3501, 1025, 25510, 2546, 3501]


In [9]:
import pandas as pd

# Two sentences encoded together as a pair, to show how segment ids separate them.
sentence = "The unbelievable quick brown fox jumps over the lazy dog."
sentence2 = "The orange cat jumps over the dog."

# Encode the pair, then pull out the first (and only) sequence of the batch.
encoded = tokenizer(sentence, sentence2, return_tensors="pt", return_attention_mask=True)
segment_ids = encoded["token_type_ids"][0]
attention_mask = encoded["attention_mask"][0]
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# One row per token: text, vocabulary id, segment id and attention-mask value.
df = pd.DataFrame(
    dict(
        token=tokens,
        token_id=token_ids,
        segment_id=segment_ids,
        attention_mask=attention_mask,
    )
)

# Show the table
print(df)


           token  token_id  segment_id  attention_mask
0          [CLS]       101           0               1
1            the      1996           0               1
2   unbelievable     23653           0               1
3          quick      4248           0               1
4          brown      2829           0               1
5            fox      4419           0               1
6          jumps     14523           0               1
7           over      2058           0               1
8            the      1996           0               1
9           lazy     13971           0               1
10           dog      3899           0               1
11             .      1012           0               1
12         [SEP]       102           0               1
13           the      1996           1               1
14        orange      4589           1               1
15           cat      4937           1               1
16         jumps     14523           1               1
17        

# Bert Output

In [6]:
from transformers import AutoModel, AutoTokenizer


In [11]:
# Load the tokenizer and the plain encoder (no task head) for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# NOTE: the name `input` shadows Python's builtin input() for the rest of the session.
input = tokenizer("Hello, world!", return_tensors="pt")
print(f"input: {input}")

output = model(**input)

# Fields returned by the model; in the recorded run: last_hidden_state, pooler_output.
print(f"output: {output.keys()}")

# Per-token hidden states; torch.Size([1, 6, 768]) in the recorded run.
print(f"output.last_hidden_state.shape: {output.last_hidden_state.shape}")

# Pooled output, one vector per sequence; torch.Size([1, 768]) in the recorded run.
print(f"output.pooler_output.shape: {output.pooler_output.shape}")

# Uncomment to inspect the full model configuration:
# print(model.config)

input: {'input_ids': tensor([[ 101, 7592, 1010, 2088,  999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
output: odict_keys(['last_hidden_state', 'pooler_output'])
output.last_hidden_state.shape: torch.Size([1, 6, 768])
output.pooler_output.shape: torch.Size([1, 768])
