# Unsupervised NER

In [125]:
from transformers import RobertaTokenizer, RobertaModel, BertTokenizer, BertModel
import kmeans_pytorch
import json
from pprint import pprint
import spacy
from tqdm import tqdm
import torch

from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy


In [94]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
model_name = 'bert-base-cased'

# RoBERTa alternative, kept for reference:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

# Load both objects from `model_name` so the vocabulary always matches the weights.
tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass also return the hidden states,
# which the later cells read via `outputs[2]`.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


![image](basic_idea.png)

## Import Raw Text Data

Load the raw text data and run every document through the spaCy pipeline (`en_core_web_sm`); the resulting part-of-speech tags are used below.

In [11]:
# Read the raw corpus; it is iterated below as key -> text pairs.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform's default encoding (the texts contain characters such as ’ and –).
with open('./data/data_text.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

nlp = spacy.load("en_core_web_sm")

# nlp.pipe processes the texts as a batched stream and yields one Doc per text in
# input order, so zipping with the dict's keys rebuilds the key -> Doc mapping.
# `total` is passed because tqdm cannot infer a generator's length.
tagged_data = dict(zip(data, tqdm(nlp.pipe(data.values()), total=len(data))))

100%|██████████| 5100/5100 [03:24<00:00, 24.93it/s]


As a crude first pass, take every `PROPN`, `NOUN`, `ADJ`, and `NUM` token and look up its contextual embedding vector.

In [115]:
# Candidate tokens of document '0' as (token index, text, coarse POS tag).
# NOTE(review): the accompanying text also lists `ADJ`, which is not selected here --
# confirm which of the two is intended.
thing = [(i, tok.text, tok.pos_) for i, tok in enumerate(tagged_data['0']) if tok.pos_ in ['PROPN', 'NOUN', 'NUM']]

# Crudely split into sentences
sentences = data['0'].strip().split('. ')

enc_text = tokenizer.encode_plus(sentences[0], return_tensors='pt')
input_id_list = enc_text.input_ids.squeeze(0).tolist()
wordpieces = tokenizer.convert_ids_to_tokens(input_id_list)

print(sentences[0])
print([tok.pos_ for tok in nlp(sentences[0])])
print(input_id_list)
print(wordpieces)

model.eval()
with torch.no_grad():
    outputs = model(enc_text.input_ids, enc_text.attention_mask)
    # Third output element holds the hidden states (enabled by output_hidden_states=True).
    hidden_states = outputs[2]

# Stack the per-layer states, drop the size-1 batch axis, then put the token axis
# first so that token_embeddings[i] holds every layer's vector for wordpiece i.
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1, 0, 2)

# Look the positions up instead of hard-coding them: the positions include the
# leading [CLS], and the previous literal 13 pointed at the wordpiece '2' rather
# than 'John' (which sits at position 12 in the printed token list).
christ_pos = wordpieces.index('Christ')
john_pos = wordpieces.index('John')

# Represent each word by the sum of its last four hidden-state vectors.
christ_vec = torch.sum(token_embeddings[christ_pos][-4:], dim=0)
john_vec = torch.sum(token_embeddings[john_pos][-4:], dim=0)

This crowded banquet scene shows Christ’s first miracle (John 2: 1–10)
['DET', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'PROPN', 'PART', 'ADJ', 'NOUN', 'PUNCT', 'PROPN', 'NUM', 'PUNCT', 'NUM', 'PUNCT']
[101, 1188, 11090, 25234, 2741, 2196, 4028, 787, 188, 1148, 14173, 113, 1287, 123, 131, 122, 782, 1275, 114, 102]
['[CLS]', 'This', 'crowded', 'banquet', 'scene', 'shows', 'Christ', '’', 's', 'first', 'miracle', '(', 'John', '2', ':', '1', '–', '10', ')', '[SEP]']


In [201]:
sentences = data['0'].strip().split('. ')
tagged_sentences = [nlp(s) for s in sentences]

# Known limitation: spaCy tokens are looked up as whole entries in BERT's vocabulary.
# Any spaCy token that is not itself a vocabulary entry is converted to the [UNK] id,
# so the tokenisation used by spaCy does not always line up with BERT's wordpieces.
propn_ids = [tokenizer.convert_tokens_to_ids([tok.text for tok in doc if tok.pos_ in ['PROPN']]) for doc in tagged_sentences]
print(propn_ids)

print(len(sentences))

enc_text = tokenizer.encode_plus(sentences[0], return_tensors='pt')

print(enc_text.input_ids)

if not propn_ids[0]:
    raise ValueError("the first sentence contains no PROPN token")

# Positions at which the first proper noun's id occurs in the encoded sentence.
# flatten() always yields a 1-D tensor, whether there are zero, one or many matches.
matches = (enc_text.input_ids.squeeze(0) == propn_ids[0][0]).nonzero().flatten()
if matches.numel() == 0:
    raise ValueError("the first PROPN id does not occur in the encoded sentence")
# Use the first occurrence so the index is a scalar; a multi-element index would make
# the `[-4:]` below slice occurrences instead of layers.
id_thing = matches[0]

model.eval()
with torch.no_grad():
    outputs = model(enc_text.input_ids, enc_text.attention_mask)
    # Third output element holds the hidden states (enabled by output_hidden_states=True).
    hidden_states = outputs[2]

# Stack the per-layer states, drop the size-1 batch axis, then put the token axis first.
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1, 0, 2)

# Sum of the last four hidden-state vectors of the selected wordpiece.
thing1 = torch.sum(token_embeddings[id_thing][-4:], dim=0)

[[4028, 1287], [100, 100], [2090], [], [4028], [100, 100], [2090], [], [100], [10431, 100, 1260, 100, 23333, 100, 100, 100], [100, 16574, 26416, 100], [100, 10431, 100, 2413, 100, 100, 10431], [100], [100], [100, 100, 100, 2123, 100, 100], [100, 24140, 100, 1305, 3889, 3030, 5238, 100, 100, 1260, 100, 10431, 100], []]
17
tensor([[  101,  1188, 11090, 25234,  2741,  2196,  4028,   787,   188,  1148,
         14173,   113,  1287,   123,   131,   122,   782,  1275,   114,   102]])


In [268]:
# One vector per matched proper-noun wordpiece, accumulated over the WHOLE corpus.
# (Creating this list inside the loop would throw away every document's vectors.)
context_vectors = []

# Ids equal to [UNK] carry no information about the original word, so they are skipped.
unk_id = tokenizer.unk_token_id

# Inference mode only needs to be set once, not once per sentence.
model.eval()

for text in tqdm(data.values()):
    # Crudely split into sentences
    sentences = text.strip().split('. ')
    tagged_sentences = [nlp(s) for s in sentences]

    # Known limitation: spaCy tokens are looked up as whole entries in BERT's vocabulary,
    # so any token that is not itself a vocabulary entry is converted to the [UNK] id.
    propn_ids = [tokenizer.convert_tokens_to_ids([tok.text for tok in doc if tok.pos_ in ['PROPN']]) for doc in tagged_sentences]

    for sent, ids in zip(sentences, propn_ids):
        # De-duplicate (order-preserving) so a repeated proper noun is not collected
        # several times, and drop [UNK].
        wanted_ids = [x for x in dict.fromkeys(ids) if x != unk_id]
        if not wanted_ids:
            # Nothing to look up in this sentence: skip the expensive forward pass.
            continue

        # truncation=True keeps over-long "sentences" within the model's maximum length.
        enc_sent = tokenizer.encode_plus(sent, return_tensors='pt', truncation=True)
        sent_ids = enc_sent.input_ids.squeeze(0)

        with torch.no_grad():
            outputs = model(enc_sent.input_ids, enc_sent.attention_mask)
            # Third output element holds the hidden states (enabled by output_hidden_states=True).
            hidden_states = outputs[2]

        # Stack the per-layer states, drop the size-1 batch axis, then put the token axis first.
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)

        for wanted in wanted_ids:
            # Visit every occurrence separately with a plain int index, so `[-4:]`
            # always slices the layer axis; an id with no occurrence contributes nothing.
            for index in (sent_ids == wanted).nonzero().flatten().tolist():
                context_vectors.append(torch.sum(token_embeddings[index][-4:], dim=0))

  1%|          | 55/5100 [00:26<40:54,  2.06it/s]  


KeyboardInterrupt: 