In [69]:
# csv
import csv
# off the shelf BERT from Huggingface
from transformers import BertModel, BertTokenizer
# numpy
import numpy as np
# pandas
import pandas as pd
# sklearn

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
'''
Read in .tsv of tagged sample data as a Pandas data frame
Add appropriate header to the columns as well.
'''
def read_as_df(filename):
    df = pd.read_csv(filename, sep="\t", header = None, skiprows=[1])
    df.columns = ["Text_Loc", "Sample", "Rating", "Specificity","Adj", "Adv", "Noun", "Verb", "Adp", "Time"]
    df = df.iloc[1: , :]
    return df

In [66]:
'''
Helper that invokes https://huggingface.co/transformers/fast_tokenizers.html
'''
def tokenize_sample(sample):
    # tokenize
    return tokenizer(sample, return_tensors="pt")

In [67]:
'''
Helper to add extra information to a 768-dimension BERT embedding.
Could be used in future
No application in this notebook since these embeddings are just for a base-truth classification
'''
def augment_bert_embedding(bert_output, nums_to_add):
    temp = np.array(nums_to_add)
    augmented_bert_output = np.resize(bert_output, len(bert_output) + len(nums_to_add))
    augmented_bert_output[-len(nums_to_add):] = temp
    return augmented_bert_output

In [44]:
'''
Extract the embedding values for each token in the samples
Uses the word_piece tokenization strategy (as BERT is wont to do).
Outputs 1 (num_words, 768) entry in a list per sample
'''
def get_embeddings(df):
    
    mat_list = []
    for i in range (1, len(df) - 1):
        # get sample, remove leading + trailing ellipses
        sample = df['Sample'][i][3:-3]
        
        token_inputs = tokenize_sample(sample)

        # converts input_ids to their tokenized form
        # ie "insinuating" -> is word-pieced into in/##sin/##uating (3 diff tokens!)
        tokens=tokenizer.convert_ids_to_tokens(token_inputs["input_ids"][0])
        outputs = model(**token_inputs)
        embedding_list = []

        # need to make a matrix for each sample
        for index, token in enumerate(tokens):
            # find the rep
            bert_embedding = outputs.last_hidden_state[0][index].detach().numpy()
            embedding_list.append(bert_embedding)
        mat = np.stack(embedding_list, axis=0)
        mat_list.append(mat)
    return mat_list

In [59]:
'''
Also track which tokens get sent to which ids
Note that each representation is "bidirectional" (which informs the value its embedding takes)
But stable per word; that is, "he" maps to embedding ID 2002 across every sample
So, can reliably keep track of tokens to ids using a dict
'''
def get_embedding_token_ids(df):
    
    token_word_mapping = {}
    count = 0
    for i in range (1, len(df) - 1):
        count += 1
        sample = df['Sample'][i][3:-3]
        token_inputs = tokenize_sample(sample)
        tokens=tokenizer.convert_ids_to_tokens(token_inputs["input_ids"][0])
        token_ids = np.array(token_inputs["input_ids"][0])
        for index, value in enumerate(token_ids.tolist()):
            if value not in token_word_mapping:
                token_word_mapping[value] = tokens[index]
    
    return token_word_mapping

In [12]:
samples = read_as_df("data/samples_data.tsv")

In [15]:
samples.head()

Unnamed: 0,Text_Loc,Sample,Rating,Specificity,Adj,Adv,Noun,Verb,Adp,Time
1,../Gutenberg/samples/adam_bede_601425_602225.txt,"...him, if he had known it, that the general a...",2.5,4.758620689655173,16,10,36,24,20,1
2,../Gutenberg/samples/middlemarch_1718514_17193...,"...passionate exclamation, as if some torture ...",2.0,2.9454545454545453,9,8,21,35,20,6
3,../Gutenberg/samples/tom_jones_1740540_1741340...,"...so vicious a passion from your heart, and y...",1.5,4.823529411764706,10,10,29,29,16,7
4,../Gutenberg/samples/the_jungle_691598_692398.txt,"...intensity, staring at the platform as if no...",2.0,3.4074074074074074,12,13,22,33,9,11
5,../Gutenberg/samples/frankenstein_7967_8767.txt,"...tastes are like my own, to approve or amend...",1.8,4.428571428571429,18,9,25,25,19,9


In [20]:
# list of matrices of embeddings
sample_embeddings = get_embeddings(samples)

In [23]:
# number of "tokens" in the first sample
sample_embeddings[0].shape

(179, 768)

In [63]:
# look up dict for id <-> token
sample_id_token_mapping = get_embedding_token_ids(samples)

In [64]:
# value for he
sample_id_token_mapping[2002]

'he'

In [65]:
# number of "tokens" across the sampes
len(sample_id_token_mapping)

7367

In [70]:
# write to file
with open('data/sample_embeddings.csv', 'w') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerows(sample_embeddings)

In [71]:
# write to file
with open('data/sample_id_token_mappings.txt', 'w') as f:
    print(sample_id_token_mapping, file=f)