In [1]:
import os
import torch
import torch.nn.functional as F
from pytorch_transformers import BertTokenizer, cached_path
from utils.transformers import TransformerWithClfHeadAndAdapters

In [2]:
# Directory containing the saved training arguments and model weights
model_path = "transformer_results"
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Saved training arguments; the "config" and "config_ft" entries are used below
args_file = cached_path(os.path.join(model_path, "model_training_args.bin"))
config = torch.load(args_file)

# Rebuild the architecture from the saved configs, then move it to the target device
model = TransformerWithClfHeadAndAdapters(config["config"], config["config_ft"])
model = model.to(device)

# Load the trained weights, mapping storages onto the selected device
weights_file = cached_path(os.path.join(model_path, "model_weights.pth"))
state_dict = torch.load(weights_file, map_location=device)
model.load_state_dict(state_dict)

# Cased BERT tokenizer (no lower-casing)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [8]:

# Vocabulary IDs of the special tokens: classifier token first, pad token second
clf_token, pad_token = (tokenizer.vocab[special] for special in ('[CLS]', '[PAD]'))

In [3]:

def encode(inputs):
    """Convert every element of ``inputs`` to an ID with the notebook's tokenizer.

    Each element is passed individually to ``tokenizer.convert_tokens_to_ids``
    and the results are collected, in order, into a new list.
    """
    return [tokenizer.convert_tokens_to_ids(token) for token in inputs]

In [11]:
# Maximum sequence length, read from the `num_max_positions` field of the saved config.
max_length = config['config'].num_max_positions  # Max length from trained model
max_length

256

In [18]:
# Example sentence to classify.
text = "This movie showcased the true ability of The Rock as a phenomenal actor."
text

'This movie showcased the true ability of The Rock as a phenomenal actor.'

In [19]:
# Tokenize and keep at most max_length - 1 tokens so the classifier token still fits
inputs = tokenizer.tokenize(text)[:max_length - 1]
# Token IDs followed by the classifier token as the final position
ids = [*encode(inputs), clf_token]
print(inputs)
print(ids)

['This', 'movie', 'showcased', 'the', 'true', 'ability', 'of', 'The', 'Rock', 'as', 'a', 'phenomena', '##l', 'actor', '.']
[1188, 2523, 24980, 1103, 2276, 2912, 1104, 1109, 2977, 1112, 170, 14343, 1233, 2811, 119, 101]


In [20]:
# Switch the model to evaluation mode before running inference;
# the trailing semicolon suppresses the cell's output.
model.eval();

In [21]:
with torch.no_grad():   # inference only, so gradient tracking is switched off
    # Build the ID tensor directly on the target device
    tensor = torch.tensor(ids, dtype=torch.long, device=device)
    tensor_reshaped = tensor.unsqueeze(0)            # shape [1, seq length]
    tensor_in = tensor_reshaped.t().contiguous()     # shape [seq length, 1]
    # The classifier mask follows the model-input layout; the padding mask uses
    # the [1, seq length] layout.
    logits = model(tensor_in,
                   clf_tokens_mask=tensor_in.eq(clf_token),
                   padding_mask=tensor_reshaped.eq(pad_token))

In [22]:
# Reduce over dim 0 of the logits and keep only the max values. The indices are
# dropped by indexing the result instead of unpacking into `_`: assigning to `_`
# at notebook top level shadows IPython's "last output" variable.
# NOTE(review): for the single example above this presumably just removes a
# size-1 leading axis — confirm the shape of `logits`.
val = torch.max(logits, 0)[0]
# Softmax over the remaining axis gives class probabilities; move to CPU and
# convert to a NumPy array.
val = F.softmax(val, dim=0).detach().cpu().numpy()

In [23]:
# Show the class probabilities together with the type of the container holding them.
print("Class probabilities: ", val, type(val))

Class probabilities:  [6.9524435e-04 5.4976675e-03 2.4256413e-01 7.5099933e-01 2.4359378e-04] <class 'numpy.ndarray'>


In [24]:
# argmax gives a 0-based position; add one to obtain the reported class number
pred = 1 + int(val.argmax())
print("Class prediction for text example: ", pred)

Class prediction for text example:  4


In [27]:
from torchsummary import summary

In [31]:
# Inspect the model-input layout: [seq length, 1].
tensor_in.shape

torch.Size([16, 1])

In [33]:
# Print the module hierarchy of the loaded model alongside the input shape.
print(model,tensor_in.shape)

TransformerWithClfHeadAndAdapters(
  (transformer): Transformer(
    (tokens_embeddings): Embedding(28996, 410)
    (position_embeddings): Embedding(256, 410)
    (dropout): Dropout(p=0.1, inplace=False)
    (attentions): ModuleList(
      (0): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=410, out_features=410, bias=True)
      )
      (1): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=410, out_features=410, bias=True)
      )
      (2): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=410, out_features=410, bias=True)
      )
      (3): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=410, out_features=410, bias=True)
      )
      (4): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=410, out_features=410, bias=True)
      )
      (5): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinea