In [1]:
!git clone https://github.com/wzwzeyal/bert_classifier_repo.git

fatal: destination path 'bert_classifier_repo' already exists and is not an empty directory.


In [2]:
from bert_classifier_repo.module import BertClassifierModule
import torch
import pandas as pd


In [3]:
# Checkpoint name used to build the model here; later cells reuse it for the tokenizer.
bert_model_name = 'avichr/heBERT_sentiment_analysis'

# Build the classifier wrapper around the pretrained checkpoint.
model = BertClassifierModule(bert_model_name)

# Read the saved weights onto the CPU, then copy them into the freshly built model.
state_dict = torch.load('../saved_models/bert_token.pt', map_location='cpu')
model.load_state_dict(state_dict)

Some weights of the model checkpoint at avichr/heBERT_sentiment_analysis were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [4]:
from bert_classifier_repo.module_trainer import bert_classifier_trainer

In [5]:
import transformers

# Fixed sequence length (in tokens) that every encoded text is padded to.
max_len = 130

# Tokenizer loaded from the same checkpoint name that was used to build the model.
tokenizer = transformers.BertTokenizer.from_pretrained(bert_model_name)


In [6]:
def preprocessing_for_bert(data):
    """Tokenize texts into fixed-length inputs for a pretrained BERT model.

    Every text is encoded with the module-level ``tokenizer``: the special
    tokens are added, and the sequence is then truncated or padded so that it
    is exactly ``max_len`` token ids long.

    @param    data (iterable of str): Texts to be processed (e.g. a list, a
                numpy array or a pandas Series of strings).
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model,
                one row per text.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                tokens should be attended to by the model, one row per text.
    """
    input_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,      # Add `[CLS]` and `[SEP]`
            max_length=max_len,           # Target length for truncation/padding
            padding='max_length',         # Pad short texts up to `max_length`
            # Truncation has to be requested explicitly: without it, texts
            # longer than `max_length` keep their full length, the rows end
            # up with different sizes and the tensor conversion below fails.
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # All rows now share the same length, so they stack into 2-D tensors.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    return input_ids, attention_masks

In [7]:
# Directory holding the sentiment data files.
ROOT_PATH = '../data/for_sentiment'

# Validation split; pandas infers the gzip compression from the file extension.
X_val = pd.read_csv(f'{ROOT_PATH}/val_token_df.gz')

# Encode the cleaned comments into fixed-length token ids and attention masks.
val_inputs, val_masks = preprocessing_for_bert(X_val.comment_clean)

# Inference only: switch train-mode layers (e.g. dropout) to evaluation
# behaviour and disable autograd so no computation graph is kept in memory
# for the forward pass.
model.eval()
with torch.no_grad():
    res = model(val_inputs, val_masks)
print(res)