In [1]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification

In [2]:
from torch import cuda

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


### Initialize parameters, tokenizer, and model

In [3]:
# Maximum sequence length handed to the tokenizer (inputs are padded/truncated to it).
MAX_LEN = 128
# Presumably the gradient-clipping threshold carried over from training;
# it is not used by the inference cells visible in this notebook -- TODO confirm.
MAX_GRAD_NORM = 10

# Earlier 3-class label set, kept for reference: ['M', 'F', 'O']
temp_labels = ['O', 'Generic she', 'Generic he',
               'Behavioural Stereotypes', 'i-Behavioural Stereotypes',
               'Societal Stereotypes', 'i-Societal Stereotypes',
               'Explicit Marking of Sex', 'i-Explicit Marking of Sex']

# Two-way lookup between label strings and the integer class ids the model emits.
# The position in `temp_labels` is the class id.
label2id = {label: idx for idx, label in enumerate(temp_labels)}
id2label = dict(enumerate(temp_labels))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained("Models/model_annotated")
# The inputs are moved to `device` before the forward pass, so the model has to
# live there too; otherwise the forward pass fails on any CUDA machine.
# eval() makes inference mode (dropout disabled) explicit.
model = model.to(device).eval()

### Load test datasets

In [4]:
# Annotated test split. Each row pairs a `sentence` with its comma-separated
# `word_labels`.
# Other test sets that can be loaded the same way:
#   test_custom_labeled = pd.read_csv('Data/test_custom_labeled.csv')
#   test_hugging_face = pd.read_csv('Data/test_hugging_face.csv')
test_annotated = pd.read_csv(
    'Data/test_data_annotated.csv',
)

In [5]:
# Peek at one raw row (labels string + sentence) as a NumPy object array.
test_annotated.to_numpy()[13]

array(['O,O,O,O,Societal Stereotypes,O,O,O,Behavioural Stereotypes,O,O,O,O,O,O,O',
       'The post involves supporting individuals to live as independently as possible in their own homes.'],
      dtype=object)

In [6]:
# Number of rows (sentences) in the annotated test split.
test_annotated.shape[0]

161

### Predict single sentence

In [19]:
# Sentence to analyse. Only the active assignment below is used; swap in one of
# these alternatives to probe other cases:
#   sentence = test_annotated.sentence[13]
#   sentence = "We are committed to building better neighborhoods wherever we are, not only for our residents, but for the greater community"
#   sentence = "Ideal candidates will be caring, compassionate, adaptable, flexible, committed to high standards of care and possess excellent communication skills both verbal and written."
#   sentence = "the post involves supporting people to live as independently as possible in their own homes ."
#   sentence = "A programmer must carry his laptop with him to work."
#   sentence = "A nurse should ensure that she  gets adequate rest"
#   sentence = "He / She is suitable for this position."
#   sentence = "Must be an extrovert who has innate quality of easily connecting with people!"
sentence = "Are you the master of technology and passionate person we are looking for?"

# Tokens that carry no word of the input sentence and are excluded from the report.
special_tokens = ('[CLS]', '[SEP]', '[PAD]')

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# Put the inputs wherever the model actually lives so the forward pass never
# hits a CPU/GPU device mismatch.
ids = inputs["input_ids"].to(model.device)
mask = inputs["attention_mask"].to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,) - predictions at the token level

# Softmax confidence of the winning class for every wordpiece.
m = torch.nn.Softmax(dim=1)
probabilities = m(logits[0])
prediction_probability = ['%.3f' % elem for elem in torch.max(probabilities, dim=1)[0].tolist()]

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.tolist()]
# list of tuples; each tuple = (wordpiece, predicted label, formatted probability)
wp_preds = list(zip(tokens, token_predictions, prediction_probability))

# Collapse wordpiece predictions to one per word by keeping the prediction of
# the first piece only. Continuation pieces are prefixed with "##" (no leading
# space), so that is the prefix to test for.
word_level_predictions = []
word_level_probabilities = []
for token, label, probability in wp_preds:
    if token in special_tokens or token.startswith("##"):
        continue
    word_level_predictions.append(label)
    word_level_probabilities.append(probability)

# Rebuild a readable sentence: drop special tokens, then glue continuation
# pieces back onto their word (after the join they appear as " ##piece").
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in special_tokens]).replace(" ##", "")
print(str_rep)
print()
print('PRED:' + str(word_level_predictions))
#print(word_level_probabilities)
#print('TRUE:' + str(test_annotated.word_labels[13].split(',')))

are you the master of technology and passionate person we are looking for ?

PRED:['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Behavioural Stereotypes', 'O', 'O', 'O', 'O', 'O']


### Use softmax for probabilities

In [9]:
# Confidence of the predicted label for every non-padding token of the sentence
# scored in the "Predict single sentence" cell. This cell reads `logits` and
# `mask` from that cell, so it must be run after it.
m = torch.nn.Softmax(dim=1)
probabilities = m(logits[0])
max_probability = torch.max(probabilities, dim=1)  # (values, indices) pair
# Convert the attention mask in place on its own device. Building the mask
# through a Python list would always create a CPU tensor, which masked_select
# rejects when the probabilities live on the GPU.
boolean_mask = mask[0].bool()
filtered_probabilities = torch.masked_select(max_probability[0], boolean_mask).tolist()
filtered_probabilities = ['%.2f' % elem for elem in filtered_probabilities]

In [10]:
# Show the formatted confidences of the non-padding tokens computed in the previous cell.
# NOTE(review): the saved output lists 18 values, while the last sentence run in
# the prediction cell appears to have fewer non-padding tokens -- the output
# looks stale; re-run the notebook top to bottom to refresh it.
print(filtered_probabilities)

['1.00', '1.00', '1.00', '1.00', '1.00', '0.50', '1.00', '1.00', '1.00', '0.92', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '0.98']


In [10]:
# Display the annotated test DataFrame (the notebook truncates the rendering).
test_annotated

Unnamed: 0,word_labels,sentence
0,"O,O,O,O,O,Behavioural Stereotypes,O,O,O,O,O,O,...","Athletes receive their training plans, connect..."
1,"O,O,O,O,O,Behavioural Stereotypes,O,O,O,O,O,O,...","Athletes receive their training plans, connect..."
2,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O",Analyze Telecom (wireless and wireline) and Ne...
3,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","In return, you can expect an attractive salary..."
4,"O,O,O,O,O,O,O,O,O,O,Behavioural Stereotypes,O,...",Probing as to what they want and present what ...
...,...,...
156,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...",The department is very people-oriented both in...
157,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","More than 4,000 of the world’s most demanding ..."
158,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O",Our drivers average 10+ years on the job and a...
159,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...",We offer you a chance to join an agile team of...
