# Unsupervised NER

In [1]:
from transformers import RobertaTokenizer, RobertaModel, BertTokenizer, BertModel
from kmeans_pytorch import kmeans, kmeans_predict
import json
from pprint import pprint
import spacy
from tqdm import tqdm
import torch

In [2]:
model_name = 'bert-base-cased'

# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained("bert-base-cased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


![image](basic_idea.png)

## Import Raw Text Data

Import the raw textual data and tag it with spacy NER tags

In [3]:
with open('./data/data_text.json', 'r') as file:
    data = json.load(file)

nlp = spacy.load("en_core_web_sm")

As a crude first principle, take every `PROPN`, `NOUN`, `ADJ`, and `NUM` get the tokenized id for each

In [4]:
# Split entire corpus into sentences
sentences = [s for text in data.values() for s in text.strip().split('. ')]

# Tag every sentence with POS tags
tagged_sentences = [nlp(s) for s in sentences]

tagged_ids = []
for sent in tqdm(tagged_sentences, desc='Collecting IDs'):
    propns = [tok.text for tok in sent if tok.pos_ in ['PROPN']]
    if propns:
        # Collects all word and sub-word token ids which have the desired POS tags
        # [1:-1] to remove [CLS] and [SEP] tokens
        tagged_ids.append([ids_i for ids in [tokenizer.encode(p)[1:-1] for p in propns] for ids_i in ids])
    else:
        tagged_ids.append([])

Collecting IDs: 100%|██████████| 50138/50138 [00:12<00:00, 4008.62it/s]


The extraction of embedding layers is using code from [here](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/). This algorithm fails if there are multiple instances of the same token but only one is considered a named entity by SpaCy as all tokens with the same tokenised id are found in the encoded text and the contexts derived.

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.eval()
context_vectors = torch.empty(0, 768).to(device)

model.to(device)

for sent, ids in tqdm(list(zip(sentences, tagged_ids))):
    ids_t = torch.tensor(ids)
    enc_sent = tokenizer.encode_plus(sent, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(enc_sent.input_ids.to(device), enc_sent.attention_mask.to(device))
        hidden_states = outputs[2]

    # Extract model embeddings layer activations
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove batches dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap layer and token dimensions
    token_embeddings = token_embeddings.permute(1, 0, 2)
    
    # Identify indices within encoded text to calculate context embeddings
    target_indices = (enc_sent.input_ids.T == ids_t).nonzero()

    # Use the sum of the last 4 embedding layers as an aggregation of context for the selected indices
    stacked_token_embeddings = token_embeddings.repeat(ids_t.size(0), 1, 1, 1)
    embedding_aggregate = torch.sum(stacked_token_embeddings[target_indices[:, 1], target_indices[:, 0], -4:], dim=1)

    context_vectors = torch.cat((context_vectors, embedding_aggregate))

print(context_vectors.shape)
print(len([i for id in tagged_ids for i in id]))

100%|██████████| 50138/50138 [09:54<00:00, 84.34it/s]

torch.Size([251097, 768])
218617





Cluster context vectors into 4 clusters representing `PERSON`, `LOCATION`, `ORGANISATION`, `MISC`

In [6]:
cluster_ids, cluster_centres = kmeans(context_vectors, 3, distance='cosine', device=device)

running k-means on cuda..


[running kmeans]: 84it [01:15,  1.12it/s, center_shift=0.000057, iteration=84, tol=0.000100]  


In [7]:
i = 7
print(tagged_sentences[i])
print([tok.text for tok in tagged_sentences[i] if tok.pos_ in ['PROPN']])
print(cluster_ids[:10])

In the centre of the composition an exquisite goblet is held out by the steward, into which a black servant pours a golden liquid
[]
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])


In [20]:
i = 14

test_sent = sentences[i]
tagged_test_sent = nlp(test_sent)
propns = [tok.text for tok in tagged_test_sent if tok.pos_ in ['PROPN']]
test_sent_ids = [ids_i for ids in [tokenizer.encode(p)[1:-1] for p in propns] for ids_i in ids]

enc_sent = tokenizer.encode_plus(test_sent, return_tensors='pt')

ids_t = torch.tensor(test_sent_ids)
model.to(device)
with torch.no_grad():
    outputs = model(enc_sent.input_ids.to(device), enc_sent.attention_mask.to(device))
    hidden_states = outputs[2]

# Extract model embeddings layer activations
token_embeddings = torch.stack(hidden_states, dim=0)

# Remove batches dimension
token_embeddings = torch.squeeze(token_embeddings, dim=1)

# Swap layer and token dimensions
token_embeddings = token_embeddings.permute(1, 0, 2)

# Identify indices within encoded text to calculate context embeddings
target_indices = (enc_sent.input_ids.T == ids_t).nonzero()

# Use the sum of the last 4 embedding layers as an aggregation of context for the selected indices
stacked_token_embeddings = token_embeddings.repeat(ids_t.size(0), 1, 1, 1)
embedding_aggregate = torch.sum(stacked_token_embeddings[target_indices[:, 1], target_indices[:, 0], -4:], dim=1)

# Hacky solution to allow single entity tensors to be classified
if embedding_aggregate.size(0) == 1:
    embedding_aggregate = embedding_aggregate.repeat(2, 1)

clusters = kmeans_predict(embedding_aggregate, cluster_centres, device=device, distance='cosine')

tok_thing = [tok.text for tok in tagged_test_sent if tok.pos_ in ['PROPN']]
indices = [len(tokenizer.encode(tok.text)[1:-1]) for tok in tagged_test_sent if tok.pos_ in ['PROPN']]

print(tok_thing)
print(clusters)
clusters_list =  clusters.tolist()
print(clusters_list)
thing = [(tok_thing[i], [clusters_list.pop(0) for _ in range(length)]) for i, length in enumerate(indices)]

pprint(thing)

The group on the left – the bride and groom, the dog, the servant with his back to the viewer and the boy holding the glass – are all inspired by Veronese’s large-scale Marriage at Cana (Louvre, Paris), though transformed under the influence of Caravaggio and Ribera
[159, 28032, 6420, 2825, 1161, 10111, 9699, 2123, 16185, 2497, 21541, 155, 24851, 1161]
predicting on cuda..
['Veronese', 'Cana', 'Louvre', 'Paris', 'Caravaggio', 'Ribera']
tensor([1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1])
[1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1]
[('Veronese', [1, 1, 1]),
 ('Cana', [1, 1]),
 ('Louvre', [1, 2]),
 ('Paris', [0]),
 ('Caravaggio', [2, 1, 1]),
 ('Ribera', [1, 1, 1])]


# Next Steps

1) Subword-token aggregation when calculating context vectors
2) Getting generic context vector instead of word based context vector
3) Multi-word token aggregation for single entity context vector
4) Different aggregation techniques