# Prep dataset

In [None]:
import pandas as pd
import numpy as np
import os
import scipy
import scipy.stats

from google.colab import drive
# Mount Google Drive so the CSV files are reachable from the Colab runtime.
drive.mount('/content/drive')

# `data_foloder` is prepended to every input file name and `result` is the prefix
# of every output file name.
# NOTE(review): both are combined by plain string concatenation, so `data_foloder`
# has to end with a path separator -- confirm against the real paths.
data_foloder = '/here'
result = 'here'

# One dataframe per condition.
baseline = pd.read_csv(f'{data_foloder}here.csv')
incoh10 = pd.read_csv(f'{data_foloder}here.csv')
incoh20 = pd.read_csv(f'{data_foloder}here.csv')
incoh50 = pd.read_csv(f'{data_foloder}here.csv')
ineff10 = pd.read_csv(f'{data_foloder}here.csv')
ineff20 = pd.read_csv(f'{data_foloder}here.csv')
ineff50 = pd.read_csv(f'{data_foloder}here.csv')

# Preview the first rows of the baseline frame.
baseline.head()

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/content'

# Install lib and dependencies

In [10]:
# Import PyTorch. The transformers package from Hugging Face (installed in the next cell)
# gives us a PyTorch interface for working with BERT.

import torch
from collections import OrderedDict

In [None]:
!pip install transformers

In [4]:
# Check whether a GPU instance is available.
# https://www.databricks.com/blog/2021/10/28/gpu-accelerated-sentiment-analysis-using-pytorch-and-huggingface-on-databricks.html
# 'cuda': GPU; 'cpu' is the fallback when CUDA is not available.
# NOTE(review): DEVICE is only defined here -- none of the visible cells move the
# model or the input tensors to it.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
DEVICE

device(type='cuda')

In [5]:
# Load the pre-trained BERT model and tokenizer.
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained('bert-base-uncased', # for both word level and utterance level
           output_hidden_states = True) # also return the hidden states of every layer
# NOTE(review): `.to(DEVICE)` is not called on the model, and defining DEVICE in the
# previous cell does not move anything; as written the model stays on the device it
# was loaded on (presumably the CPU -- confirm).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

# Contextualized word embeddings

Put the input text into a specific format that BERT can read: add ```[CLS]``` to the beginning and ```[SEP]``` to the end of the input, tokenize it, and convert the tokenized BERT input to tensor format.

In [6]:
def bert_text_preparation(text, tokenizer, max_length=512):
  """
  Preprocesses text input in a way that BERT can interpret.

  Args:
    text: the raw response text.
    tokenizer: object providing `tokenize(str)` and `convert_tokens_to_ids(list)`.
    max_length: upper bound on the number of tokens, special tokens included.
      Longer inputs are cut and the final token ([SEP]) is kept, so the sequence
      never exceeds what the model can take (512 positions for bert-base-uncased).
      Pass None to disable the cut.

  Returns:
    (tokenized_text, tokens_tensor, segments_tensor): the list of token strings,
    the token ids as a [1 x n_tokens] tensor and an all-ones tensor of the same shape.
  """
  marked_text = "[CLS] " + text + " [SEP]" # add special tokens
  tokenized_text = tokenizer.tokenize(marked_text)

  # A cut made on whitespace-separated words upstream does not bound the number of
  # subword tokens, so enforce the limit here and keep the closing token.
  if max_length is not None and len(tokenized_text) > max_length:
    tokenized_text = tokenized_text[:max_length - 1] + tokenized_text[-1:]

  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # find token IDs
  segments_ids = [1] * len(indexed_tokens) # one entry per token, all ones

  # convert inputs to tensors with a leading batch dimension of size 1
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensor = torch.tensor([segments_ids])

  return tokenized_text, tokens_tensor, segments_tensor

To obtain the actual BERT embeddings, we take the preprocessed input text, which is now represented by tensors, and pass it through our pre-trained BERT model.

Which vector works best as a contextualized embedding depends on the task.

According to Devlin et al. (2019), the sum of the last four layers of the model worked well for NLP tasks.

In [7]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Obtains BERT embeddings for tokens, in context of the given response.

    Both tensors are handed to `model` positionally. The result is a list with one
    vector per token: the sum of that token's hidden states over the last four layers.
    """
    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        model_output = model(tokens_tensor, segments_tensor)

    # element 2 of the model output is the sequence of per-layer hidden states
    layer_states = model_output[2]

    # stack the layers along a new leading dimension, drop dimension 1 (the batch)
    # and bring the token dimension to the front:
    # [layers x 1 x tokens x hidden] -> [tokens x layers x hidden]
    per_token_layers = torch.stack(layer_states, dim=0).squeeze(dim=1).permute(1, 0, 2)

    # for every token, add up its vectors from the last four layers
    return [torch.sum(layers[-4:], dim=0) for layers in per_token_layers]

Create contextual embeddings for each response.

In [None]:
# Gather all condition dataframes, give each one an empty `bert_emb` column
# and write it out once as <result><position in dfs>.csv
dfs = [baseline, incoh10, incoh20, incoh50, ineff10, ineff20, ineff50]
for temp, df in enumerate(dfs):
  df["bert_emb"] = ''
  df.to_csv(result + str(temp) + '.csv')

# preview the last dataframe of the list
df.head()

In [None]:
# Compute the contextual token embeddings of every response and store them in `bert_emb`.
# Each dataframe is written to its own file, numbered by its position in `dfs`
# (the previous version reused a stale counter, so every frame went to the same file).
for file_idx, df in enumerate(dfs):
    for i in df.index:
        if df.at[i, 'n_words'] > 4:  # only process responses longer than 4 words
            # Keep only the first 431 whitespace-separated words. BERT uses a subword
            # tokenizer (WordPiece) and accepts at most 512 subword tokens, so the cut
            # on words leaves room for words that are split into several subwords.
            words = df.at[i, 'content'].split(' ')[:431]
            sentence = ' '.join(words)
            tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
            list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

            # There is one embedding per token, in token order, so the embeddings of
            # the actual text are everything between the leading [CLS] and the
            # trailing [SEP].
            context_embeddings = list(list_token_embeddings[1:-1])

            # .at writes into the frame itself; chained `df[col][i] = ...` may write
            # to a temporary copy instead. The column holds '' so its dtype is object
            # and can store a list.
            df.at[i, 'bert_emb'] = context_embeddings

        if i % 5 == 0:
            print('progress: ', i)

    # NOTE(review): CSV stores the tensors as their text representation.
    df.to_csv(result + str(file_idx) + '.csv')

df.head()

# Stats and similarities functions

In [15]:
# stats ignoring nan, apply to all LMs 
from numpy import nanmedian

import scipy
def iqr(x):
  """Interquartile range of `x`; NaN entries are left out of the computation."""
  values = np.array(x)
  return scipy.stats.iqr(values, nan_policy='omit')

from numpy import nanquantile
def q5(x):
    """0.05 quantile of `x`, computed while ignoring NaN entries."""
    values = np.array(x)
    return np.nanquantile(values, 0.05)

def q95(x):
    """0.95 quantile of `x`, computed while ignoring NaN entries."""
    values = np.array(x)
    return np.nanquantile(values, 0.95)

In [16]:
# cosine_similarity, apply to all LMs
def cosine_similarity(a, b):
    """Cosine of the angle between vectors `a` and `b`: dot product over the product of the norms."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# MV5/10

In [17]:
# Average semantic similarity of each word in 5- or 10- words window

def divide_chunks(l, n):
    """Yield consecutive slices of `l` of length `n`; the last slice holds whatever is left."""
    yield from (l[start:start + n] for start in range(0, len(l), n))
  
# Sanity check: n is the number of elements each chunk should have.
# A 13-token example cut into chunks of 5 gives two full chunks and one of 3.
test = ['Alex','broke','the','vase','accidentally','.','But','Kai','did','it','on','purpose','.']
chopped = [chunk for chunk in divide_chunks(test, 5)]
print(chopped)

[['Alex', 'broke', 'the', 'vase', 'accidentally'], ['.', 'But', 'Kai', 'did', 'it'], ['on', 'purpose', '.']]


In [18]:
def combinations(lst): # get w1, w2 combinations
    """
    Return every unordered pair [w1, w2] of items at two different positions of `lst`.

    Pairs are ordered by the position of w1, then of w2. An item is never paired
    with itself, but two equal items at different positions (a repeated word, two
    identical vectors) do form a pair: the exclusion is made by position, not by
    comparing values, so it also works for items whose `!=` is not a plain boolean.

    input: a list (typically of <= 5 or <= 10 tokens); it is not modified.
    """
    size = len(lst)
    return [[lst[first], lst[second]]
            for first in range(size)
            for second in range(first + 1, size)]

# Sanity check: list the word pairs produced for one five-token window.
testing = ['Alex', 'broke', 'the', 'vase', 'accidentally']
test_result = combinations(testing)
print(test_result)

[['Alex', 'broke'], ['Alex', 'the'], ['Alex', 'vase'], ['Alex', 'accidentally'], ['broke', 'the'], ['broke', 'vase'], ['broke', 'accidentally'], ['the', 'vase'], ['the', 'accidentally'], ['vase', 'accidentally']]


In [None]:
# Prepare the dataframes: add one empty column per (window size, statistic) pair
# next to the embeddings.

mvs = ['5', '10']
stats = ['_median', '_iqr', '_q5', '_q95']
mv_stat_columns = ['bert_word_mv' + mv + stat for mv in mvs for stat in stats]
for df in dfs:
    # create new empty columns
    for cur in mv_stat_columns:
        df[cur] = ''
df.head()

In [24]:
# `df` is the dataframe left over from the last loop iteration;
# the entry for its first response is a list of 431 tensors (see the output below).
len(df['bert_emb'][0]) # it's a list of 431 tensors

431

In [34]:
# Shape of a single token embedding of the first response.
df['bert_emb'][0][1].shape # each tensor has 768 dimensions

torch.Size([768])

In [None]:
# combinations 'alex broke the vase': 
# alex broke, alex the, alex vase; broke the, broke vase; the vase
# Parola et al is inspired by Pauselli et al (p76): 
# ''Coherence is the average similarity of each word to each of the other
# words in the list, regardless of order or proximity. ''
# w2 = word_embed[word_id+1] # adjacent neighbour only
for temp_file, df in enumerate(dfs):
    print(temp_file)
    for mv in mvs:
        # print progress
        cur = 'bert_word_mv' + mv
        print('current: ', cur)
        df[cur + '_similarity'] = '' # save the cosine similarities; all stats are derived from there

        # loop over each response
        for i in df.index:
            response_embeddings = df.at[i, 'bert_emb']
            if type(response_embeddings) == float:
                # No embeddings for this response (a float such as NaN): leave its
                # cells empty instead of reusing the values of the previous response.
                continue

            # chop 1 big response sequence into 5/10-token chunks and
            # collect the mean pairwise similarity of every chunk
            chunk_means = []
            for word_embed in divide_chunks(response_embeddings, int(mv)):
                # convert the list of tensors to a list of lists
                word_embed = [x.numpy().tolist() for x in word_embed]
                pair_sims = [cosine_similarity(w1, w2) for w1, w2 in combinations(word_embed)]
                # a chunk with fewer than two tokens has no pair; its mean is NaN
                chunk_means.append(np.nanmean(pair_sims))

            # .at writes into the frame itself; chained `df[col][i] = ...` may write to
            # a temporary copy. The target columns hold '' (object dtype), so a list fits.
            # The list has one mean per chunk of the response; keep it for later stats.
            df.at[i, cur + '_similarity'] = chunk_means

            # add other stats here
            df.at[i, cur + '_median'] = np.nanmedian(chunk_means)
            df.at[i, cur + '_q5'] = q5(chunk_means)
            df.at[i, cur + '_q95'] = q95(chunk_means)
            df.at[i, cur + '_iqr'] = iqr(chunk_means)
    df.to_csv(result + str(temp_file) + '.csv')
df.head()

In [52]:
# List the columns of the last processed dataframe after the moving-window step.
df.columns

Index(['grid', 'content', 'n_words', 'bert_emb', 'bert_word_mv5_median',
       'bert_word_mv5_iqr', 'bert_word_mv5_q5', 'bert_word_mv5_q95',
       'bert_word_mv10_median', 'bert_word_mv10_iqr', 'bert_word_mv10_q5',
       'bert_word_mv10_q95', 'bert_word_mv5_similarity',
       'bert_word_mv10_similarity'],
      dtype='object')

# K1:10

In [None]:
import ast # a module that evaluates mathematical expressions and statements

# inter-token distances 1..10, kept as strings because they are glued into column names
ks = [str(distance) for distance in range(1, 11)]
stats = ['_median', '_iqr', '_q5', '_q95']
k_stat_columns = ['bert_word_k' + k + stat for k in ks for stat in stats]
for df in dfs:
    # create new empty columns
    for cur in k_stat_columns:
        df[cur] = ''
df.head()

In [None]:
# Coherence at inter-token distance k: for every response, the cosine similarity of
# each token embedding with the one k positions later, plus summary statistics.
ks=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

for temp_file, df in enumerate(dfs):
    # loop through each k
    for k in ks:
        cur = 'bert_word_k' + k 
        print('Coherence k ', k, 'temp_file: ', temp_file) # progress
        df[cur + '_similarity'] = ''
        distance = int(k)
        # loop through each individual's response 
        for i in df.index:
            response_embeddings = df.at[i, 'bert_emb']
            if type(response_embeddings) != float:
                # pair every embedding with the one `distance` positions later;
                # zip stops at the last position that still has such a partner
                sims = [cosine_similarity(w1, w2)
                        for w1, w2 in zip(response_embeddings, response_embeddings[distance:])]

                # .at writes into the frame itself; chained `df[col][i] = ...` may
                # write to a temporary copy. The columns hold '' (object dtype).
                df.at[i, cur + '_similarity'] = sims # similarity scores for that response
                df.at[i, cur + '_iqr'] = iqr(sims) # add other stats here
                df.at[i, cur + '_median'] = np.nanmedian(sims)
                df.at[i, cur + '_q5'] = q5(sims)
                df.at[i, cur + '_q95'] = q95(sims)
    df.to_csv(result + str(temp_file) + '.csv')
df.head()

In [56]:
# List the columns of the last processed dataframe after the k-distance step.
df.columns

Index(['grid', 'content', 'n_words', 'bert_emb', 'bert_word_mv5_median',
       'bert_word_mv5_iqr', 'bert_word_mv5_q5', 'bert_word_mv5_q95',
       'bert_word_mv10_median', 'bert_word_mv10_iqr', 'bert_word_mv10_q5',
       'bert_word_mv10_q95', 'bert_word_mv5_similarity',
       'bert_word_mv10_similarity', 'bert_word_k1_median', 'bert_word_k1_iqr',
       'bert_word_k1_q5', 'bert_word_k1_q95', 'bert_word_k2_median',
       'bert_word_k2_iqr', 'bert_word_k2_q5', 'bert_word_k2_q95',
       'bert_word_k3_median', 'bert_word_k3_iqr', 'bert_word_k3_q5',
       'bert_word_k3_q95', 'bert_word_k4_median', 'bert_word_k4_iqr',
       'bert_word_k4_q5', 'bert_word_k4_q95', 'bert_word_k5_median',
       'bert_word_k5_iqr', 'bert_word_k5_q5', 'bert_word_k5_q95',
       'bert_word_k6_median', 'bert_word_k6_iqr', 'bert_word_k6_q5',
       'bert_word_k6_q95', 'bert_word_k7_median', 'bert_word_k7_iqr',
       'bert_word_k7_q5', 'bert_word_k7_q95', 'bert_word_k8_median',
       'bert_word_k8_iqr', 'be