<h2 style="text-align: center;"><b>Problem One: Average Contextualized Embeddings</h2>


Load previous data

In [46]:
import os
import pickle
from gensim.models import KeyedVectors

# ---- 1. Vector totals checkpoint ----
vector_checkpoint_path = "vector_checkpoint.pkl"
if os.path.exists(vector_checkpoint_path):
    with open(vector_checkpoint_path, "rb") as f:
        saved = pickle.load(f)
    vector_totals = saved["vector_totals"]
    token_count = saved["token_count"]
    start_i_vectors = saved["start_i"]
    print(f"[Vectors] Loaded checkpoint at i = {start_i_vectors}")
else:
    vector_totals = {}
    token_count = {}
    start_i_vectors = 0
    print("[Vectors] Starting fresh")

# ---- 2. GloVe checkpoint ----
glove_checkpoint_path = "glove_checkpoint.pkl"
if os.path.exists(glove_checkpoint_path):
    with open(glove_checkpoint_path, "rb") as f:
        saved = pickle.load(f)
    start_i_glove = saved["glove_words_done"]
    word_embeddings = saved["word_embeddings"]
    print(f"[GloVe] Loaded checkpoint at batch {start_i_glove}")
else:
    start_i_glove = 0
    word_embeddings = {}
    print("[GloVe] Starting fresh")

# ---- 3. KeyedVectors checkpoint ----
gensim_checkpoint_path = "gensim_checkpoint.kv"
if os.path.exists(gensim_checkpoint_path):
    wv = KeyedVectors.load(gensim_checkpoint_path)
    start_idx = len(wv)
    print("[Gensim] Loaded KeyedVectors checkpoint")
else:
    wv = None  # will initialize later
    start_idx = 0
    print("[Gensim] Starting fresh")

[Vectors] Starting fresh
[GloVe] Starting fresh
[Gensim] Starting fresh


Initialize System

In [47]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


Now read in the assignment file and generate conextualized vectors for the tokens present

In [None]:
# Read in file
file = "assignment4-dataset.txt"
small = "small.txt"
large = "large.txt"
million = "million.txt"

with open(large, 'r') as f:
    lines = f.read().splitlines()

In [None]:
# Function to save vector totals checkpoint
import pickle

vector_checkpoint_path = "vector_checkpoint.pkl"

def save_vectors(i):
    with open(vector_checkpoint_path + ".tmp", "wb") as f:
        pickle.dump({
            "vector_totals": vector_totals,
            "token_count": token_count,
            "start_i": i
        }, f)
    os.replace(vector_checkpoint_path + ".tmp", vector_checkpoint_path)
    print(f"[Vectors] Checkpoint saved at i = {i}")

In [None]:
# Function to process model output and accumulate vectors
def get_vector(output, input_ids, attention_mask):
    embeddings = output.last_hidden_state
    for i in range(embeddings.size(0)):
        token_ids = input_ids[i].cpu()
        mask = attention_mask[i]
        for j, id in enumerate(token_ids):
            if mask[j] == 1:
                vector = embeddings[i,j,:].cpu()
                id_int = id.item()
                if id not in vector_totals:
                    vector_totals[id_int] = vector.cpu()
                    token_count[id_int] = 1
                else:
                    vector_totals[id_int] += vector.cpu()
                    token_count[id_int] += 1

Used the Facebook/Roberta-Base model to tokenize and convert to token ID's, and then to create contextualized embeddings

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Transformer name used
name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaModel.from_pretrained(name)
model.to(device)

batch_size = 100
save_every = 50
for b, i in enumerate(range(start_i_vectors, len(lines), batch_size)):
    tqdm.write(f'Processing lines {i} to {min(i+batch_size, len(lines))}...')
    batch_words = lines[i:i+batch_size]
    encoded = tokenizer(batch_words, padding=True, truncation=True, return_tensors='pt')

    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Create contextual embeddings
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        get_vector(output, input_ids, attention_mask)
    
    del batch_words, encoded, input_ids, attention_mask, output
    torch.cuda.empty_cache()
    
    # save every N batches
    if (i - start_i_vectors) % (save_every * batch_size) == 0 and i > start_i_vectors:
        save_vectors(i)

Create the average vectors for each token

In [52]:
vector_averages = {}
for token in vector_totals:
    vector_averages[token] = vector_totals[token] / token_count[token]

<h2 style="text-align: center;"><b>Problem Two: most_similar() function</h2>


These are the ID's which should not be included in the words

In [53]:
start_id = tokenizer.convert_tokens_to_ids('<s>')
end_id = tokenizer.convert_tokens_to_ids('</s>')
pad_id = tokenizer.pad_token_id
print(f'start_id: {start_id}, end_id: {end_id}, pad_id: {pad_id}')

start_id: 0, end_id: 2, pad_id: 1


Using the glove file, find the token sequences associated with different words and gather the embeddings that pair with words

In [54]:
# Function to save GloVe checkpoint
glove_checkpoint_path = "glove_checkpoint.pkl"

def save_glove(i):
    with open(glove_checkpoint_path + ".tmp", "wb") as f:
        pickle.dump({
            "glove_words_done": i,  # you can also save partial embeddings if needed
            "word_embeddings": word_embeddings
        }, f)
    os.replace(glove_checkpoint_path + ".tmp", glove_checkpoint_path)
    print(f"[GloVe] Checkpoint saved at batch {i}")

In [55]:
# Function to get word embeddings
def get_word_embeddings(batch, input_ids):
    for i, word in enumerate(batch):
        ids = input_ids[i]
        word_vector = []
        for j in range(len(ids)):
            token_id = input_ids[i][j].item()
            if token_id == start_id or token_id == end_id or token_id == pad_id:
                continue
            if token_id in vector_averages:
                word_vector.append(vector_averages[token_id])
        if len(word_vector) == 0:
            continue
        word_vector = torch.stack(word_vector).mean(dim=0)
        word_embeddings[word] = word_vector

In [None]:
# Read in GloVe vocabulary file
glove_file = "glove.6B.300d-vocabulary.txt"
with open(glove_file, 'r') as f:
    glove_words = f.read().splitlines()

# Process GloVe words in batches
save_every = 500

for b, i in enumerate(range(start_i_glove, len(glove_words), batch_size)):
    tqdm.write(f'Processing lines {i} to {min(i+batch_size, len(glove_words))}...')
    glove_batch = glove_words[i:i+batch_size]
    glove_tokens = tokenizer(glove_batch, padding=True, truncation=True, return_tensors='pt')

    glove_input_ids = glove_tokens.input_ids.to(device)
    glove_attention_mask = glove_tokens.attention_mask.to(device)

    get_word_embeddings(glove_batch, glove_input_ids)
        
    del glove_batch, glove_tokens, glove_input_ids, glove_attention_mask
    torch.cuda.empty_cache()

     # save every N batches
    if b % save_every == 0 and b > 0:
        save_glove(i)


Make the words and their corresponding vectors into KeyedVectors so that we can use the most_similar() function

In [57]:
# Save KeyedVectors
gensim_checkpoint_path = "gensim_checkpoint.kv"

def save_wv():
    wv.save(gensim_checkpoint_path)
    print("[Gensim] KeyedVectors saved")

In [None]:
# Convert word embeddings to KeyedVectors
from gensim.models import KeyedVectors

hidden_dim = next(iter(word_embeddings.values())).shape[0]
if wv is None:
    wv = KeyedVectors(vector_size=hidden_dim)

save_every = 10000
words = list(word_embeddings.keys())
vectors = [vec.numpy() for vec in word_embeddings.values()]


for i in range(start_idx, len(words), batch_size):
    tqdm.write(f'Processing words {i} to {min(i+batch_size, len(words))}...')
    batch_words = words[i:i+batch_size]
    batch_vectors = vectors[i:i+batch_size]

    wv.add_vectors(batch_words, batch_vectors)

    # save checkpoint every N batches
    if (i // batch_size) % (save_every // batch_size) == 0:
        save_wv()

Now do the most_similar() function

In [59]:
wv.most_similar('cactus')

[('cavalcanti', 0.9735779166221619),
 ('cavalcante', 0.9732860326766968),
 ('casket', 0.9722654819488525),
 ('calker', 0.9716652631759644),
 ('dotc', 0.971451997756958),
 ('cringe', 0.9713144898414612),
 ('criers', 0.9711087942123413),
 ('couderc', 0.9703741073608398),
 ('cossus', 0.970361053943634),
 ('cruce', 0.9703434109687805)]

In [60]:
wv.most_similar('cake')

[('fruitcake', 0.9662887454032898),
 ('cakey', 0.9655447006225586),
 ('cakebread', 0.9634582996368408),
 ('cakewalk', 0.960048496723175),
 ('mooncake', 0.958672285079956),
 ('shortcake', 0.9537740349769592),
 ('cakelike', 0.9502132534980774),
 ('beefcake', 0.9404978156089783),
 ('poundcake', 0.9397675395011902),
 ('pattycake', 0.9386507272720337)]

In [61]:
wv.most_similar('angry')

[('ryang', 0.9999999403953552),
 ('ryanggang', 0.9819610714912415),
 ('ryong', 0.9736228585243225),
 ('usry', 0.973404049873352),
 ('ryner', 0.9702824354171753),
 ('pry', 0.970122218132019),
 ('mlanghenry', 0.9697545170783997),
 ('terry', 0.9693702459335327),
 ('morry', 0.968619704246521),
 ('ryler', 0.9682397842407227)]

In [62]:
wv.most_similar('quickly')

[('clearly', 0.9788289070129395),
 ('quietly', 0.9755452275276184),
 ('dryly', 0.9753377437591553),
 ('fixedly', 0.9751794338226318),
 ('securely', 0.9747836589813232),
 ('smartly', 0.9746143221855164),
 ('hotly', 0.9745563864707947),
 ('cleanly', 0.9741897583007812),
 ('complexly', 0.9726245999336243),
 ('correctly', 0.9718415141105652)]

In [63]:
wv.most_similar('between')

[('betweenness', 0.9571166038513184),
 ('inbetween', 0.9554120302200317),
 ('inbetweeners', 0.9402250647544861),
 ('in-between', 0.9328943490982056),
 ('go-between', 0.932777464389801),
 ('bounderby', 0.9227321743965149),
 ('bytitle', 0.9213169813156128),
 ('byerley', 0.9161116480827332),
 ('byproducts', 0.9160301089286804),
 ('bynes', 0.9155882000923157)]

In [64]:
wv.most_similar('the')

[('theile', 0.9662163257598877),
 ('theo', 0.9658365845680237),
 ('thea', 0.9654410481452942),
 ('thep', 0.9651058912277222),
 ('edythe', 0.9639862775802612),
 ('therence', 0.9637654423713684),
 ('lythe', 0.9636275768280029),
 ('theus', 0.9635955691337585),
 ('thean', 0.9631367325782776),
 ('theodo', 0.9628977179527283)]