In [13]:
import json

import pandas as pd
import torch
from sklearn.cluster import KMeans, AgglomerativeClustering
from transformers import AutoTokenizer, BertModel

In [2]:
# load our augmented dataset json file
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform's default text encoding.
with open('augmented_test.json', 'r', encoding='utf-8') as f:
    augmented_dataset = json.load(f)

In [3]:
# Load the pretrained tokenizer and encoder. Both are built from the same
# checkpoint name so they cannot drift apart.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def embed_documents(documents):
    """Embed each document as the mean of its token hidden states.

    Uses the module-level ``tokenizer`` and ``model``. Each document is
    tokenized on its own, truncated to at most 512 tokens, run through the
    model, and the final-layer hidden states are averaged over the token axis.
    Only positions flagged by the attention mask contribute to the average, so
    padding (if the tokenizer emits any) does not dilute the embedding.

    Args:
        documents: iterable of documents; each item is passed to the tokenizer
            as-is (presumably a single string per document; verify against
            the dataset schema).

    Returns:
        list of tensors, one per document, in input order. Each tensor keeps
        the batch dimension produced by the tokenizer, so the list can be
        stacked with ``torch.cat``. An empty input yields an empty list.
    """
    doc_embeddings = []
    # Inference only: without no_grad every embedding would retain its autograd graph.
    with torch.no_grad():
        for doc in documents:
            # truncation=True keeps long documents within the 512-token limit;
            # no padding is requested because each document is encoded alone.
            inputs = tokenizer(doc, max_length=512, truncation=True, return_tensors="pt")
            outputs = model(**inputs)

            last_hidden_states = outputs.last_hidden_state
            # we're going to average across the real tokens to get the sentence embedding
            mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
            token_sum = (last_hidden_states * mask).sum(dim=1)
            # clamp guards against a division by zero if a mask were ever all zeros
            token_count = mask.sum(dim=1).clamp(min=1)
            doc_embeddings.append(token_sum / token_count)

    return doc_embeddings

In [17]:
def cluster_documents_kmeans(documents, n_clusters=3, random_state=0):
    """Embed ``documents`` and cluster the embeddings with k-means.

    Args:
        documents: iterable of documents, forwarded to ``embed_documents``.
        n_clusters: number of clusters to form. Defaults to 3, the value that
            was previously hard-coded.
        random_state: seed forwarded to ``KMeans``. Defaults to 0, the value
            that was previously hard-coded.

    Returns:
        The ``labels_`` attribute of the fitted ``KMeans`` estimator: one
        cluster id per row of the embedding matrix.

    Raises:
        Whatever ``torch.cat`` / ``KMeans.fit`` raise for unusable input, e.g.
        an empty document list or fewer documents than ``n_clusters``.
    """
    doc_embeddings = embed_documents(documents)
    # turn our list of embeddings into a numpy matrix
    # (.cpu() is a no-op on CPU tensors and keeps this working if the model lives on a GPU)
    doc_embeddings = torch.cat(doc_embeddings).detach().cpu().numpy()
    # n_init is pinned to 10 because scikit-learn later changed its default to
    # "auto"; pinning keeps results stable across versions and silences the
    # FutureWarning emitted in between.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state).fit(doc_embeddings)
    return kmeans.labels_

def cluster_documents_hierarchical(documents, n_clusters=3):
    """Embed ``documents`` and cluster the embeddings agglomeratively.

    Args:
        documents: iterable of documents, forwarded to ``embed_documents``.
        n_clusters: number of clusters to form. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        The ``labels_`` attribute of the fitted ``AgglomerativeClustering``
        estimator: one cluster id per row of the embedding matrix.

    Raises:
        Whatever ``torch.cat`` / ``AgglomerativeClustering.fit`` raise for
        unusable input, e.g. an empty document list.
    """
    doc_embeddings = embed_documents(documents)
    # turn our list of embeddings into a numpy matrix
    # (.cpu() is a no-op on CPU tensors and keeps this working if the model lives on a GPU)
    doc_embeddings = torch.cat(doc_embeddings).detach().cpu().numpy()
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters).fit(doc_embeddings)
    return hierarchical.labels_

In [None]:
# load our indices to use
# NOTE(review): sample_indices is loaded but nothing below reads it yet.
# TODO: either use it to select dataset entries or drop the load.
sample_indices = pd.read_csv('sample_indices.csv')

# Number of dataset entries to process. 1 matches the previous behaviour, where
# the loop stopped after the first entry; set to None to run the full dataset.
MAX_EXAMPLES = 1


def _canonical_labels(labels):
    """Relabel cluster ids by order of first appearance.

    Cluster ids are arbitrary, so two clusterings describe the same partition
    exactly when their canonical label sequences are equal.
    """
    first_seen = {}
    return [first_seen.setdefault(int(label), len(first_seen)) for label in labels]


# iterate through our data documents and get the embeddings
num_different = 0
for data in augmented_dataset[:MAX_EXAMPLES]:
    k_clusters = cluster_documents_kmeans(data['documents'])
    h_clusters = cluster_documents_hierarchical(data['documents'])
    print(k_clusters)
    print(h_clusters)
    # increment counter if they're different (compared as partitions, not raw ids)
    if _canonical_labels(k_clusters) != _canonical_labels(h_clusters):
        num_different += 1

print(num_different)