In [132]:
# off the shelf BERT from Huggingface
from transformers import BertModel, BertTokenizer
# numpy
import numpy as np
# operator
import operator
# pandas
import pandas as pd
# sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.cluster import AgglomerativeClustering, KMeans

In [5]:
# Single source of truth for the Hugging Face checkpoint name: the tokenizer and
# the model are both loaded from it, so they cannot drift apart when it changes.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [76]:
def read_as_df(filename):
    '''
    Read a .tsv of tagged sample data into a pandas DataFrame.

    The file is parsed without a header row (``header=None``) and the second
    line of the file is skipped (``skiprows=[1]``). The fixed column names
    below are then assigned, and the first parsed row (the file's own first
    line) is dropped.

    The index is reset afterwards so rows are labelled 0..n-1. The rest of the
    notebook looks rows up with 0-based integers (e.g. ``df['Sample'][i]``),
    which would otherwise miss label 0 after the first row is dropped.

    NOTE(review): because the file's first line is parsed as data, columns
    presumably come back as strings (callers apply ``float()`` to ``Rating``)
    -- confirm against the data file.

    Parameters
    ----------
    filename : path of the tab-separated file; it must contain 10 columns.

    Returns
    -------
    pandas.DataFrame with columns Text_Loc, Sample, Rating, Specificity, Adj,
    Adv, Noun, Verb, Adp, Time and a fresh 0-based index.
    '''
    df = pd.read_csv(filename, sep="\t", header=None, skiprows=[1])
    df.columns = ["Text_Loc", "Sample", "Rating", "Specificity", "Adj", "Adv", "Noun", "Verb", "Adp", "Time"]
    # Drop the file's own first line, then relabel rows from 0 so that positional
    # and label-based lookups agree.
    return df.iloc[1:, :].reset_index(drop=True)

In [9]:
def tokenize_sample(sample):
    '''
    Tokenize one text sample with the notebook-level ``tokenizer``.

    Returns whatever the tokenizer produces when asked for PyTorch tensors
    (``return_tensors="pt"``).
    '''
    encoded = tokenizer(sample, return_tensors="pt")
    return encoded

In [33]:
def augment_bert_embedding(bert_output, nums_to_add):
    '''
    Helper to append extra numeric features to a 1-D BERT embedding.
    Could be used in future; it has no application in this process, since the
    embeddings are not used as model input (not enough data).

    Parameters
    ----------
    bert_output : 1-D array-like embedding vector (left unmodified).
    nums_to_add : 1-D sequence of numbers to append; may be empty.

    Returns
    -------
    New numpy array of length ``len(bert_output) + len(nums_to_add)`` holding
    the embedding followed by the extra values, in the embedding's dtype.
    '''
    base = np.asarray(bert_output)
    # Cast the extras to the embedding's dtype so the result keeps that dtype
    # (e.g. float32) rather than being upcast by the concatenation.
    extras = np.asarray(nums_to_add, dtype=base.dtype)
    # Concatenating also handles an empty `nums_to_add`; a negative slice of
    # length zero (`[-0:]`) would instead address the whole array.
    return np.concatenate([base, extras])

In [26]:
def get_embeddings(df):
    '''
    Run every sample in ``df['Sample']`` through BERT and collect the per-token
    embeddings.

    Rows are visited in DataFrame order by iterating over the column values, so
    every row is processed (including the last one) and the result does not
    depend on how the DataFrame index is labelled.

    Parameters
    ----------
    df : DataFrame with a 'Sample' column of strings.
         NOTE(review): each sample is assumed to be wrapped in a leading and a
         trailing "..." -- three characters are stripped from each end
         unconditionally.

    Returns
    -------
    list of numpy arrays, one per row, each of shape (n_tokens, hidden_size):
    row ``j`` of a matrix is the last-hidden-state vector of the j-th token id
    produced by the tokenizer for that sample.
    '''
    mat_list = []
    for raw_sample in df['Sample']:
        # get sample, remove leading + trailing ellipses
        sample = raw_sample[3:-3]

        # A word may be split into several word pieces,
        # e.g. "insinuating" -> in/##sin/##uating (3 different tokens),
        # so the number of rows can exceed the number of words.
        token_inputs = tokenize_sample(sample)
        outputs = model(**token_inputs)

        # The batch holds a single sample; index 0 is its (n_tokens, hidden_size)
        # matrix, which is already the stack of all per-token vectors.
        mat_list.append(outputs.last_hidden_state[0].detach().numpy())
    return mat_list

In [38]:
def average_embeddings(embedding_list):
    '''
    Collapse each (n, 768) token-embedding matrix to a single vector by
    averaging over its rows, so that samples of different lengths can be
    compared with one another.

    Returns a list with one averaged vector per input matrix, in input order.
    '''
    return [np.mean(token_matrix, axis=0) for token_matrix in embedding_list]

In [137]:
def k_means(embeddings):
    '''
    Cluster `embeddings` into two groups with k-means.

    Parameters
    ----------
    embeddings : array-like with one row per sample. The model is fit on this
                 argument, not on any notebook-level variable.

    Returns
    -------
    The fitted model's ``labels_``: one cluster id per row of `embeddings`.
    '''
    # Fixed random_state keeps the cluster assignment reproducible across runs.
    k = KMeans(n_clusters=2, random_state=46)
    k.fit(embeddings)
    return k.labels_

In [111]:
def agg_clustering(embeddings):
    '''
    Cluster `embeddings` into two groups with agglomerative clustering using
    Ward linkage.

    Parameters
    ----------
    embeddings : array-like with one row per sample. The model is fit on this
                 argument, not on any notebook-level variable.

    Returns
    -------
    The fitted model's ``labels_``: one cluster id per row of `embeddings`.
    '''
    clustering_model = AgglomerativeClustering(n_clusters=2, linkage="ward")
    clustering_model.fit(embeddings)
    return clustering_model.labels_

In [112]:
def clustering_eval(assignments, gold_labels):
    '''
    Print two scores comparing cluster `assignments` with `gold_labels`:
    normalized mutual information, then accuracy, separated by a space.

    NOTE(review): cluster ids are arbitrary, so the accuracy figure depends on
    which cluster happens to be numbered 0 or 1; NMI is not affected by that.
    '''
    nmi = metrics.normalized_mutual_info_score(gold_labels, assignments)
    accuracy = accuracy_score(gold_labels, assignments)
    print(nmi, accuracy)

In [113]:
def print_clustering_assignments(assignments):
    '''
    Print, for each cluster, the `Text_Loc` values of the samples assigned to it.

    Parameters
    ----------
    assignments : sequence of cluster ids; the entry at position i belongs to
                  the i-th row of the notebook-level `samples` DataFrame.

    Output is one line per cluster, in order of first appearance:
    ``Cluster <id> [<Text_Loc>, ...]``.
    '''
    text_locs = samples['Text_Loc']
    clustered_samples = {}
    for sample_pos, cluster_id in enumerate(assignments):
        # Positional lookup: `assignments` is ordered like the rows of
        # `samples`, whatever labels the DataFrame index carries.
        clustered_samples.setdefault(cluster_id, []).append(text_locs.iloc[sample_pos])

    for cluster_id, members in clustered_samples.items():
        print(f'Cluster {cluster_id} {members}')

In [77]:
# Load the tagged samples; the path is relative to the notebook's working directory.
samples = read_as_df("data/samples_data.tsv")

In [138]:
# Derive a binary gold label from the rating: 1 when Rating >= 3.11, otherwise 0.
# Ratings are passed through float() before the comparison.
# NOTE(review): the origin of the 3.11 cut-off is not shown here -- confirm.
samples = samples.assign(
    label=lambda frame: frame['Rating'].map(lambda rating: int(float(rating) >= 3.11))
)

In [27]:
# Run BERT over the samples: one per-token embedding matrix per processed sample.
sample_embeddings = get_embeddings(samples)

In [109]:
# Spot-check: binarized label of the row labelled 0.
samples['label'][0]

0

In [46]:
# Mean-pool each sample's token matrix into one vector and collect the vectors
# in a single array, which is what the clustering functions are fit on.
avg_embeddings = np.array(average_embeddings(sample_embeddings))

In [141]:
# Compute the k-means assignments in this cell so it runs on a fresh kernel:
# no other visible cell assigns `k_lab`.
k_lab = k_means(avg_embeddings)
clustering_eval(k_lab, samples['label'])

0.0022937486004810255 0.48484848484848486


In [142]:
# Cluster the averaged embeddings with agglomerative (Ward-linkage) clustering.
agg_lab = agg_clustering(avg_embeddings)

In [143]:
# Score the agglomerative assignments against the binarized rating labels
# (prints NMI followed by accuracy).
clustering_eval(agg_lab, samples['label'])

1.6552928531198963e-06 0.48760330578512395
