# Extracting Word Embeddings in BERT

This script tokenizes each speech document into words, and runs BERT model. Among 13 hidden layers of BERT model output, it extracts the last layer which corresponds to word embeddings. Since there are duplicate words within one speech document, it collapses multiple words into one by avering out embedding values. 

- This script uses Fast Tokenizer from the "AutoTokenizer" package. 


In [3]:
from transformers import BertModel, BertTokenizer, AutoTokenizer
import numpy as np
import streamlit as st
import re
import pandas as pd
from datetime import datetime
import nltk
import torch

In [7]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [4]:
#input is "light.csv" which does not include stop words. 
df = pd.read_csv('../../../data/processed/light.csv')
# Filter
timestamps = df.year.to_list()
texts = df.text.to_list()
text = texts[1]


In [6]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,ccode_iso,session,year,text
0,1,AFG,7,1952,I consider great honour privilege share opport...


In [16]:
  
# Tokenize the text
tokenized_text = tokenizer.tokenize(text)
    
truncate_length = len(tokenized_text) - 512 + 2  # +2 to account for [CLS] and [SEP]
        
# Truncate the beginning and end of the text
truncated_text = tokenized_text[truncate_length//2 : -truncate_length//2]

marked_text = ["[CLS] "] + truncated_text + [" [SEP]"]
# Add special tokens [CLS] and [SEP]
        
# Convert tokens to ids
indexed_tokens = tokenizer.convert_tokens_to_ids(marked_text)
        
# Create attention mask
attention_mask = [1] * len(indexed_tokens)

In [152]:
#This one prints out the tokenized word pieces, along with indices. 

for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

It              100
privilege    19,800
express       4,745
,             1,145
Mr            1,871
.             1,263
President     2,541
,             1,714
con           1,353
##gratulations  1,583
Afghanistan   6,241
delegation    1,958
election      2,030
,             1,607
just            119
##ly          1,284
unanimously   2,059
voted         3,519
Assembly      1,362
.             1,169
It            6,561
also          7,616
privilege       117
extend        2,218
fellow        2,174
representatives 16,286
greeting      9,113
##s           3,519
Royal           119
Afghan        1,130
Government    2,157
,               117
well         23,614
sincere       7,279
##st          3,681
wishes          117
success      11,565
current         117
session      21,820
General      13,378
Assembly     14,819
.            10,774
Our           3,235
attachment    3,844
United        1,311
Nations       4,309
Charter       6,551
principles      119
complete      1,109
ad            7

In [20]:
# Pad sequences to max_seq_length
if len(indexed_tokens) < 512:
    indexed_tokens.append(0)
    attention_mask.append(0)

In [63]:
# Convert lists to PyTorch tensors
tokenized_texts = []
tokens_tensors = []
attention_masks = []

    
tokens_tensors.append(torch.tensor(indexed_tokens))
attention_masks.append(torch.tensor(attention_mask))
tokenized_texts.append(tokenized_text)

# Convert lists to PyTorch tensors
tokens_tensors = torch.stack(tokens_tensors)
attention_masks = torch.stack(attention_masks)


In [64]:
# Run the BERT model
with torch.no_grad():
    outputs = model(input_ids=tokens_tensors.view(-1, tokens_tensors.size(-1)), attention_mask=attention_masks.view(-1, attention_masks.size(-1)))



In [190]:
pd_words = pd.Series(marked_text, name='term')
print(pd_words.shape)

hidden_states = outputs[2][0].squeeze().numpy()
print(hidden_states.shape)

df_outputs = pd.DataFrame(hidden_states)
df_outputs["term"] = pd_words

#Each column represents each term. Dimension is 768 X 512.

(512,)
(512, 768)
            0         1         2         3         4         5         6  \
0   -0.118897 -0.518255  0.159338 -0.461482 -0.003488 -0.453042 -0.212884   
1    0.932840 -0.000157  0.029107 -0.740883  0.241258  1.106739 -0.891506   
2    0.091746  0.066845 -0.104953  0.101292  0.095589  0.919574  1.302369   
3    0.404450 -0.390100 -0.416748  0.195306  0.137170  0.059420 -0.139206   
4   -0.325989 -0.142688  0.105910 -0.326140 -0.423716  0.301251 -0.412353   
..        ...       ...       ...       ...       ...       ...       ...   
507 -0.599033  0.895255  0.119296  0.099264 -0.163519 -1.028048  0.527986   
508 -0.301014  0.420417 -1.077342 -0.835100  1.221513 -1.501802  0.221088   
509 -0.305659 -0.064368  0.096934 -0.491793  0.149717  0.113267  0.376909   
510 -0.066669  0.016273  0.371624 -0.115370 -0.437943 -0.797991 -0.567259   
511 -0.409629 -0.746030  0.954548 -0.989581 -0.666579 -0.456214 -0.482902   

            7         8         9  ...       759       76

In [205]:
df_outputs_embedding = df_outputs.groupby(['term']).mean()

In [206]:
df_outputs_embedding.to_csv("../../../output/embeddings.csv")

# Post Analysis

In [133]:
print ("Number of layers:", len(hidden_states), "(initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print('Type of hidden_states:', type(hidden_states))

print('Tensor shape for each layer: ', hidden_states[0].size())


Number of layers: 1 (initial embeddings + 12 BERT layers)
Number of batches: 512
Number of tokens: 768
Type of hidden_states: <class 'torch.Tensor'>
Tensor shape for each layer:  torch.Size([512, 768])


In [155]:
token_embeddings = hidden_states[-1]
token_embeddings = torch.squeeze(token_embeddings, dim=0)

print(token_embeddings.shape)
word_embedding = token_embeddings[0, :].numpy()

list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]


torch.Size([512, 768])


In [128]:

token_vecs_cat = torch.cat((token_embeddings[-1:], token_embeddings[-2:], token_embeddings[-3:], token_embeddings[-4:]), dim=0)

print('Shape is: %d x %d' % (token_vecs_cat.shape[0], token_vecs_cat.shape[1]))


Shape is: 10 x 768


In [151]:

# Collapse the embedding matrix by averaging duplicates
collapsed_token_vecs = torch.zeros_like(token_vecs_cat)
unique_tokens, unique_indices = torch.unique_consecutive(token_vecs_cat, return_inverse=True)

# Add a dimension to unique_indices to match the dimensions of collapsed_token_vecs
unique_indices = unique_indices.unsqueeze(1)

# Iterate over unique_tokens
for i in range(len(unique_tokens)):
    # Create a mask for the current unique token
    mask = (unique_indices == i).squeeze()

    # Average the corresponding vectors in the original matrix
    collapsed_token_vecs[mask] = torch.mean(token_vecs_cat[mask], dim=0)

collapsed_token_vecs_np = collapsed_token_vecs.numpy()

# Create a DataFrame from the collapsed embedding matrix
df_collapsed = pd.DataFrame(collapsed_token_vecs_np)

# Save the final DataFrame to a CSV file
#df_collapsed.to_csv('final_embeddings.csv', index=False)

print(df_collapsed.size)


        0         1         2         3         4         5         6    \
0 -0.409629 -0.746030  0.954548 -0.989581 -0.666579 -0.456214 -0.482902   
1 -0.066669  0.016273  0.371624 -0.115370 -0.437943 -0.797991 -0.567259   
2 -0.409629 -0.746030  0.954548 -0.989581 -0.666579 -0.456214 -0.482902   
3 -0.305659 -0.064368  0.096934 -0.491793  0.149717  0.113267  0.376909   
4 -0.066669  0.016273  0.371624 -0.115370 -0.437943 -0.797991 -0.567259   
5 -0.409629 -0.746030  0.954548 -0.989581 -0.666579 -0.456214 -0.482902   
6 -0.301014  0.420417 -1.077342 -0.835100  1.221513 -1.501802  0.221088   
7 -0.305659 -0.064368  0.096934 -0.491793  0.149717  0.113267  0.376909   
8 -0.066669  0.016273  0.371624 -0.115370 -0.437943 -0.797991 -0.567259   
9 -0.409629 -0.746030  0.954548 -0.989581 -0.666579 -0.456214 -0.482902   

        7         8         9    ...       758       759       760       761  \
0 -0.474517 -0.324448 -0.554452  ...  0.937598  0.148158  0.841253 -0.650847   
1 -0.527095 -0

In [195]:
from sklearn.metrics.pairwise import cosine_similarity

term_a = "problems"
term_b = "small"

# Check if the terms are in the tokenized text
if term_a in tokenized_text and term_b in tokenized_text:
    # Get the indices of the terms in the tokenized text
    index_sovereignty = tokenized_text.index(term_a)
    index_territory = tokenized_text.index(term_b)

    # Extract the embeddings for the terms
    embedding_sovereignty = token_embeddings[index_sovereignty, :]
    embedding_territory = token_embeddings[index_territory, :]

    # Reshape the embeddings to be 2D arrays
    embedding_sovereignty = embedding_sovereignty.reshape(1, -1)
    embedding_territory = embedding_territory.reshape(1, -1)

    # Compute cosine similarity
    similarity_score = cosine_similarity(embedding_sovereignty, embedding_territory)

    print(f"Cosine Similarity between '{term_a}' and '{term_b}':", similarity_score[0, 0])
else:
    print(f"One or both of the terms '{term_a}' and '{term_b}' not found in the tokenized text.")


Cosine Similarity between 'problems' and 'small': 0.007380645


In [125]:
print(tokenized_texts[0])

['It', 'privilege', 'express', ',', 'Mr', '.', 'President', ',', 'con', '##gratulations', 'Afghanistan', 'delegation', 'election', ',', 'just', '##ly', 'unanimously', 'voted', 'Assembly', '.', 'It', 'also', 'privilege', 'extend', 'fellow', 'representatives', 'greeting', '##s', 'Royal', 'Afghan', 'Government', ',', 'well', 'sincere', '##st', 'wishes', 'success', 'current', 'session', 'General', 'Assembly', '.', 'Our', 'attachment', 'United', 'Nations', 'Charter', 'principles', 'complete', 'ad', '##herence', 'principles', 'human', 'rights', 'self', '-', 'determination', 'peoples', 'based', 'ideological', 'grounds', 'also', 'result', 'long', 'experience', 'free', 'small', 'country', 'controversial', 'events', 'modern', 'history', '.', 'We', 'believe', 'peace', 'world', 'can', 'secured', 'bases', ',', 'certain', 'future', 'prosperity', 'depends', 'peace', '.', 'In', 'saying', ',', 'posing', 'moral', '##ists', ',', 'contrary', ',', 'hum', '##ility', 'expressing', 'conviction', 'fellow', 'Me

In [127]:
word_index = tokenized_texts[0].index("unanimously")
word_embedding = token_embeddings[0, word_index].numpy()
print(word_embedding)

0.2421685
