In [5]:
from transformers import BertTokenizer, BertModel
from kmeans_pytorch import kmeans, kmeans_predict
from tqdm import tqdm
from pprint import pprint
import torch
import numpy as np
import pandas as pd
import json
from datasets import load_from_disk
from collections import Counter, defaultdict

In [2]:
model_name = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained("bert-base-cased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Preparation

The CoNLL2003 dataset is first used to evaluate performance. The ground truth ner tags are used only select named entities to allow the class allocation to be tested independently by assumung perfect extraction. Later work will be conducted into identifying which entities should be labelled. NER tags are mapped to a non-BIO based scheme to reduce the number of clusters.

In [6]:
!python format_conll2003.py --bert_model $model_name

with open('./data/CoNLL2003/tag_mappings.json') as file:
    tag_mappings = json.load(file)

dataset = load_from_disk('./data/CoNLL2003')

print(dataset['train'][0])

Found cached dataset conll2003 (/home/william/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 675.77it/s]
Loading cached processed dataset at /home/william/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-320df3d2a03ea85a.arrow
Loading cached processed dataset at /home/william/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-0a73fd1c3c751351.arrow
Loading cached processed dataset at /home/william/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-9f967a16c9b36547.arrow
{'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'ner_tags': [2, 0, 4,

## Training

Define a method to extract the context vectors of the extracted entities

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)
def get_context_vectors(input_ids, attention_mask, target_mask):
    # Ignore gradient to improve performance
    with torch.no_grad():
        outputs = model(input_ids.to(device), attention_mask.to(device))
        hidden_states = outputs.hidden_states

    # Extract model embeddings layer activations
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove batches dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap layer and token dimensions
    token_embeddings = token_embeddings.permute(1, 0, 2)
    
    # Identify indices within encoded text to calculate context embeddings
    target_indices = target_mask.to(device).nonzero()

    # Use the sum of the last 4 embedding layers as an aggregation of context for the selected indices
    stacked_token_embeddings = token_embeddings.repeat(torch.sum(target_mask), 1, 1, 1)
    embedding_aggregate = torch.sum(stacked_token_embeddings[torch.arange(0, torch.sum(target_mask)), target_indices[:, 1], -4:], dim=1)

    return embedding_aggregate

Extract all of the context vectors using the defined method on the training split

In [8]:
context_vectors = torch.empty(0, 768).to(device)

model.eval()
model.to(device)

for example in tqdm(dataset['train'], desc='Collecting Contexts'):
    input_ids = torch.tensor(example['input_ids'])
    attention_mask = torch.ones_like(input_ids)
    target_mask = torch.tensor(example['target_mask'])

    embedding_aggregate = get_context_vectors(input_ids, attention_mask, target_mask)
    context_vectors = torch.cat((context_vectors, embedding_aggregate))

print(context_vectors.shape)
print(sum(sum(x[0]) for x in dataset['train']['target_mask']))

Collecting Contexts: 100%|██████████| 14041/14041 [02:41<00:00, 87.16it/s]

torch.Size([65030, 768])
65030





## Cluster

Cluster context vectors using kmeans with $k = 4$

In [9]:
# Seeds for reproducing cluster centres
torch.random.seed = 42
torch.cuda.seed = 42
np.random.seed(42)

distance = 'cosine' # 'euclidean'

cluster_ids, cluster_centres = kmeans(context_vectors, 4, distance=distance, device=device)

running k-means on cuda..


[running kmeans]: 39it [00:11,  3.33it/s, center_shift=0.000000, iteration=39, tol=0.000100]  


Manually evaluate clusters

In [18]:
i = 3
example = dataset['train'][i]

input_ids = torch.tensor(example['input_ids'])
attention_mask = torch.ones_like(input_ids)
target_mask = torch.tensor(example['target_mask'])

test_context_vectors = get_context_vectors(input_ids, attention_mask, target_mask)

if len(test_context_vectors) == 1:
    test_context_vectors = test_context_vectors.repeat(2, 1)

test_cluster_ids = kmeans_predict(test_context_vectors, cluster_centres, distance=distance, device=device)
test_clusters_ids_list = test_cluster_ids.squeeze().tolist()

token_ids = {token: tokenizer.encode(token)[1:-1] for token, ner_tag in zip(example['tokens'], example['ner_tags']) if ner_tag != 0}
token_predictions = {token: [test_clusters_ids_list.pop(0) for _ in range(len(ids))] for token, ids in token_ids.items()}

print(token_predictions)

predicting on cuda..
{'European': [3], 'Commission': [3], 'German': [3], 'British': [3]}


## Evaluate

Calculate context vectors for all validation samples

In [11]:
val_context_vectors = torch.empty(0, 768).to(device)

model.eval()
model.to(device)

for example in tqdm(dataset['validation'], desc='Collecting Contexts'):
    input_ids = torch.tensor(example['input_ids'])
    attention_mask = torch.ones_like(input_ids)
    target_mask = torch.tensor(example['target_mask'])

    embedding_aggregate = get_context_vectors(input_ids, attention_mask, target_mask)
    val_context_vectors = torch.cat((val_context_vectors, embedding_aggregate))

print(val_context_vectors.shape)
print(sum(sum(x[0]) for x in dataset['validation']['target_mask']))

Collecting Contexts: 100%|██████████| 3250/3250 [00:37<00:00, 86.81it/s]


torch.Size([16225, 768])
16225


Identify cluster allocations

In [15]:
val_cluster_ids = kmeans_predict(val_context_vectors, cluster_centres, distance=distance, device=device)

predicting on cuda..


Assign cluster ids to each token in the dataset and obtain a single cluster id for items with multiple, i.e. words composed of sub-word level tokens, by taking the mode or majority vote. Using `max(set(lst), key=lst.count))` for majority voting yields the most frequent list item or, in the case that there is a tie for most frequent, the item with the lowest value:

```python
a = [1, 1, 1, 0, 0]
b = [1, 1, 0, 0]
max(set(a), key=a.count)
# >> 1
max(set(b), key=b.count)
# >> 0
```
Another implementation may find that the head sub-word token is more informative.

Also collect the ground truth NER tag for each entity for comparison. `Note`: The cluster id numbers will not correspond to the NER tag numbers. They are two sets of labels that need to be mapped to eachother in some way.

Finally, find which mapping of cluster ids to NER ids yields the best f-1 performance. `Note`: This calculation does not consider entities that weren't tagged by using PROPN. This is to control for the initial entity extraction task which will be reviewed at another stage. This is to just evaluate whether this style of clustering is suitable for identifying entity classes.

In [16]:
def evaluate(cluster_ids, split):
    sample_predictions = []

    for example in tqdm(split):
        token_ids = {f'{i}-{token}': {
            'ids': tokenizer.encode(token)[1:-1],
            'ner_tag': ner_tag
        } for i, (token, ner_tag) in enumerate(zip(example['tokens'], example['ner_tags'])) if ner_tag != 0}

        token_predictions = {token: {
            'cluster_id': max(set((lst := [cluster_ids.pop(0) for _ in range(len(attributes['ids']))])), key=lst.count),
            'ner_tag': attributes['ner_tag']
        } for token, attributes in token_ids.items()}

        sample_predictions.append(token_predictions)

    mapping_freqs = Counter([(x_i['cluster_id'], x_i['ner_tag']) for x in sample_predictions for x_i in x.values()])
    p = defaultdict(dict)

    for k, v in mapping_freqs.items():
        p[k[0]].update({k[1]: v})

    df = pd.DataFrame.from_dict(data=p, orient='index').sort_index()
    df = df.reindex(sorted(df.columns), axis=1)

    eye = np.eye(len(df.columns), dtype=bool)
    total = np.sum(df.to_numpy().flatten())

    results = []
    for i in range(len(df.columns)):
        mapping = list(df[(df * np.roll(eye, i, axis=1)) != 0].stack().index)
        accuracy = np.sum((df * np.roll(eye, i, axis=1)).to_numpy().flatten()) / total * 100
        results.append({'mapping': mapping, 'accuracy': accuracy})

    return results

In [17]:
val_clusters_ids_list = val_cluster_ids.squeeze().tolist()
results = evaluate(val_clusters_ids_list, dataset['validation'])
pprint(results)

100%|██████████| 3250/3250 [00:01<00:00, 2071.12it/s]

[{'accuracy': 17.59851214692549, 'mapping': [(0, 1), (1, 2), (2, 3), (3, 4)]},
 {'accuracy': 5.579449029408346, 'mapping': [(0, 2), (1, 3), (2, 4), (3, 1)]},
 {'accuracy': 30.861327443914917, 'mapping': [(0, 3), (1, 4), (2, 1), (3, 2)]},
 {'accuracy': 45.96071137975125, 'mapping': [(0, 4), (1, 1), (2, 2), (3, 3)]}]





# Different Aggregations

Using a more testable implementation of the work in this notebook, explore variations on the original model to evaluate improvement. First import developed class

In [19]:
from context_vector_clustering import ContextClustering, CatNLayers, SumNLayers, MeanNLayers, SelectLayerN

Fit the class to the training examples

In [31]:
X_fit = ContextClustering(random_state=42).fit(dataset['train'], aggregation=CatNLayers(-4))

RuntimeError: Trying to create tensor with negative dimension -3072: [0, -3072]

## Evaluate

Predict the class assignment of the validation set

In [21]:
y_hat = X_fit.predict(dataset['validation'])
y_hat_list = y_hat.squeeze().tolist()

results = evaluate(y_hat_list, dataset['validation'])
pprint(results)

Collecting Contexts: 100%|██████████| 3250/3250 [00:36<00:00, 87.85it/s]


predicting on cuda..


100%|██████████| 3250/3250 [00:01<00:00, 2231.32it/s]

[{'accuracy': 14.576310589329303, 'mapping': [(0, 1), (1, 2), (2, 3), (3, 4)]},
 {'accuracy': 7.509008485412065, 'mapping': [(0, 2), (1, 3), (2, 4), (3, 1)]},
 {'accuracy': 53.89980239451354, 'mapping': [(0, 3), (1, 4), (2, 1), (3, 2)]},
 {'accuracy': 24.01487853074509, 'mapping': [(0, 4), (1, 1), (2, 2), (3, 3)]}]





# Next Steps

1) Subword-token aggregation when calculating context vectors
2) Getting generic context vector instead of word based context vector
3) Multi-word token aggregation for single entity context vector
4) Different aggregation techniques