<h2 style="text-align: center;"><b>Problem One: Average Contextualized Embeddings</b></h2>


Load previous data

In [46]:
import os
import pickle
from gensim.models import KeyedVectors

# Resume support: each of the three stages below restores its state from disk
# when a checkpoint file exists, and otherwise starts from empty state.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# checkpoint files that were written by this notebook.

# ---- 1. Vector totals checkpoint ----
vector_checkpoint_path = "vector_checkpoint.pkl"
if not os.path.exists(vector_checkpoint_path):
    # No checkpoint: empty accumulators, start at the first line.
    vector_totals, token_count, start_i_vectors = {}, {}, 0
    print("[Vectors] Starting fresh")
else:
    with open(vector_checkpoint_path, "rb") as checkpoint_file:
        saved = pickle.load(checkpoint_file)
    vector_totals, token_count, start_i_vectors = (
        saved["vector_totals"],
        saved["token_count"],
        saved["start_i"],
    )
    print(f"[Vectors] Loaded checkpoint at i = {start_i_vectors}")

# ---- 2. GloVe checkpoint ----
glove_checkpoint_path = "glove_checkpoint.pkl"
if not os.path.exists(glove_checkpoint_path):
    start_i_glove, word_embeddings = 0, {}
    print("[GloVe] Starting fresh")
else:
    with open(glove_checkpoint_path, "rb") as checkpoint_file:
        saved = pickle.load(checkpoint_file)
    start_i_glove, word_embeddings = saved["glove_words_done"], saved["word_embeddings"]
    print(f"[GloVe] Loaded checkpoint at batch {start_i_glove}")

# ---- 3. KeyedVectors checkpoint ----
gensim_checkpoint_path = "gensim_checkpoint.kv"
if not os.path.exists(gensim_checkpoint_path):
    wv = None  # will initialize later
    start_idx = 0
    print("[Gensim] Starting fresh")
else:
    wv = KeyedVectors.load(gensim_checkpoint_path)
    # Resume after the keys that are already stored.
    start_idx = len(wv)
    print("[Gensim] Loaded KeyedVectors checkpoint")

[Vectors] Starting fresh
[GloVe] Starting fresh
[Gensim] Starting fresh


Initialize System

In [47]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas (tqdm.pandas() registers the progress_* methods,
# e.g. DataFrame.progress_apply, on pandas objects)
# NOTE(review): no cell visible in this section calls a pandas progress_* method.
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device: CUDA only when it is both requested and available
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# random seed (set to None to leave the generators unseeded)
seed = 1234

# seed the Python, NumPy and PyTorch generators with the same value
if seed is not None:
    print(f'random seed: {seed}')
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

device: cuda
random seed: 1234


Now read in the assignment file and generate contextualized vectors for the tokens present

In [None]:
# Read in file
# Candidate corpora; `large` is the one that is actually opened below.
file = "assignment4-dataset.txt"
small = "small.txt"
large = "large.txt"
million = "million.txt"

# `lines` holds one string per line of the corpus, without line terminators.
with open(large) as corpus_file:
    corpus_text = corpus_file.read()
lines = corpus_text.splitlines()

In [None]:
# Function to save vector totals checkpoint
import pickle

vector_checkpoint_path = "vector_checkpoint.pkl"

def save_vectors(i):
    """Write the running vector sums and token counts to the checkpoint file.

    The payload is pickled to ``<checkpoint>.tmp`` first and then moved over the
    real checkpoint with ``os.replace``, so an interrupted write never leaves a
    half-written checkpoint under the final name.

    Args:
        i: Value stored under the ``"start_i"`` key of the checkpoint.
    """
    tmp_path = vector_checkpoint_path + ".tmp"
    payload = {
        "vector_totals": vector_totals,
        "token_count": token_count,
        "start_i": i,
    }
    with open(tmp_path, "wb") as out:
        pickle.dump(payload, out)
    os.replace(tmp_path, vector_checkpoint_path)
    print(f"[Vectors] Checkpoint saved at i = {i}")

In [None]:
# Function to process model output and accumulate vectors
def get_vector(output, input_ids, attention_mask):
    """Accumulate per-token embedding sums and occurrence counts for one batch.

    For every position where ``attention_mask`` equals 1, the hidden-state vector
    at that position is added to ``vector_totals[token_id]`` and
    ``token_count[token_id]`` is incremented. Both dicts are module-level and are
    keyed by plain Python ints.

    Args:
        output: Model output exposing ``last_hidden_state`` indexed as
            ``[batch, position, :]``.
        input_ids: Token ids indexed as ``[batch, position]``.
        attention_mask: Same indexing as ``input_ids``; only positions equal to 1
            are accumulated.
    """
    # Move everything to the CPU once per batch instead of once per token.
    embeddings = output.last_hidden_state.detach().cpu()
    ids_cpu = input_ids.cpu()
    keep = attention_mask.cpu() == 1

    for row in range(embeddings.size(0)):
        row_keep = keep[row]
        row_ids = ids_cpu[row][row_keep].tolist()
        row_vectors = embeddings[row][row_keep]
        for token_id, vector in zip(row_ids, row_vectors):
            # The membership test must use the int id: a torch.Tensor hashes by
            # object identity, so testing the tensor against int keys never
            # matches and every occurrence would overwrite the running total.
            if token_id in vector_totals:
                vector_totals[token_id] += vector
                token_count[token_id] += 1
            else:
                # clone() gives the total its own storage, so the in-place "+="
                # above never writes into (or keeps alive) the batch tensor.
                vector_totals[token_id] = vector.clone()
                token_count[token_id] = 1

Use Facebook's RoBERTa-base model (`roberta-base`) to tokenize the text and convert it to token IDs, and then to create contextualized embeddings

In [51]:
from transformers import RobertaTokenizer, RobertaModel

# Transformer name used
name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaModel.from_pretrained(name)
model.to(device)
# Inference only: state explicitly that dropout must be off, so the embeddings
# do not depend on the model's training flag.
model.eval()

batch_size = 100   # lines per forward pass
save_every = 50    # checkpoint cadence, in batches

# Index of the first line that has NOT been accumulated yet. This is what gets
# written to the checkpoint: storing the start of the batch just processed
# would make a resumed run add that batch to the totals a second time.
next_i = start_i_vectors
for b, i in enumerate(range(start_i_vectors, len(lines), batch_size), start=1):
    next_i = min(i + batch_size, len(lines))
    tqdm.write(f'Processing lines {i} to {next_i}...')
    batch_words = lines[i:next_i]
    encoded = tokenizer(batch_words, padding=True, truncation=True, return_tensors='pt')

    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Create contextual embeddings and fold them into the running totals
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        get_vector(output, input_ids, attention_mask)

    # Drop references to the batch tensors before the next iteration.
    del batch_words, encoded, input_ids, attention_mask, output
    torch.cuda.empty_cache()

    # save every N batches
    if b % save_every == 0:
        save_vectors(next_i)

# Persist the tail of the run too, unless the last batch was just checkpointed
# or nothing was processed at all.
if next_i != start_i_vectors and b % save_every != 0:
    save_vectors(next_i)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing lines 0 to 100...
Processing lines 100 to 200...
Processing lines 200 to 300...
Processing lines 300 to 400...
Processing lines 400 to 500...
Processing lines 500 to 600...
Processing lines 600 to 700...
Processing lines 700 to 800...
Processing lines 800 to 900...
Processing lines 900 to 1000...
Processing lines 1000 to 1100...
Processing lines 1100 to 1200...
Processing lines 1200 to 1300...
Processing lines 1300 to 1400...
Processing lines 1400 to 1500...
Processing lines 1500 to 1600...
Processing lines 1600 to 1700...
Processing lines 1700 to 1800...
Processing lines 1800 to 1900...
Processing lines 1900 to 2000...
Processing lines 2000 to 2100...
Processing lines 2100 to 2200...
Processing lines 2200 to 2300...
Processing lines 2300 to 2400...
Processing lines 2400 to 2500...
Processing lines 2500 to 2600...
Processing lines 2600 to 2700...
Processing lines 2700 to 2800...
Processing lines 2800 to 2900...
Processing lines 2900 to 3000...
Processing lines 3000 to 3100..

Create the average vectors for each token

In [52]:
vector_averages = {}
for token in vector_totals:
    vector_averages[token] = vector_totals[token] / token_count[token]

<h2 style="text-align: center;"><b>Problem Two: most_similar() function</b></h2>


These are the IDs of the special tokens (`<s>`, `</s>`, and padding), which should not be included when building the word vectors

In [53]:
# Resolve both sentence-delimiter tokens in a single lookup (a list of tokens
# in gives a list of ids out); the padding id is exposed by the tokenizer.
start_id, end_id = tokenizer.convert_tokens_to_ids(['<s>', '</s>'])
pad_id = tokenizer.pad_token_id
print(f'start_id: {start_id}, end_id: {end_id}, pad_id: {pad_id}')

start_id: 0, end_id: 2, pad_id: 1


Using the GloVe vocabulary file, find the token sequence associated with each word and gather the embeddings that pair with the words

In [54]:
# Function to save GloVe checkpoint
glove_checkpoint_path = "glove_checkpoint.pkl"

def save_glove(i):
    """Write the word embeddings built so far to the GloVe checkpoint file.

    The payload is pickled to ``<checkpoint>.tmp`` and then moved into place with
    ``os.replace`` so the final file name never holds a partial write.

    Args:
        i: Progress marker stored under the ``"glove_words_done"`` key.
    """
    tmp_path = glove_checkpoint_path + ".tmp"
    payload = {
        "glove_words_done": i,
        "word_embeddings": word_embeddings,
    }
    with open(tmp_path, "wb") as out:
        pickle.dump(payload, out)
    os.replace(tmp_path, glove_checkpoint_path)
    print(f"[GloVe] Checkpoint saved at batch {i}")

In [55]:
# Function to get word embeddings
def get_word_embeddings(batch, input_ids):
    """Build one vector per word by averaging the vectors of its tokens.

    For each word in ``batch``, the ids in the matching row of ``input_ids`` are
    looked up in the module-level ``vector_averages``. The start, end and padding
    ids are skipped, and so are ids that have no entry in ``vector_averages``.
    The mean of the vectors found is stored in the module-level
    ``word_embeddings`` under the word; a word with no usable token gets no entry.

    Args:
        batch: Sequence of words, aligned row-by-row with ``input_ids``.
        input_ids: Tensor of token ids indexed as ``[word, position]``.
    """
    skipped_ids = (start_id, end_id, pad_id)
    for row, word in enumerate(batch):
        pieces = [
            vector_averages[token_id]
            for token_id in input_ids[row].tolist()
            if token_id not in skipped_ids and token_id in vector_averages
        ]
        if pieces:
            word_embeddings[word] = torch.stack(pieces).mean(dim=0)

In [56]:
# Read in GloVe vocabulary file
glove_file = "glove.6B.300d-vocabulary.txt"
# Explicit encoding so the word list does not depend on the platform default.
# NOTE(review): presumes the vocabulary file is UTF-8 like the published GloVe
# files -- confirm for this extract.
with open(glove_file, 'r', encoding='utf-8') as f:
    glove_words = f.read().splitlines()

# Process GloVe words in batches
save_every = 500  # checkpoint cadence, in batches

# NOTE(review): each word is tokenized on its own, with no leading space. With
# RoBERTa's byte-level BPE that may yield different token ids than the same
# word gets in the middle of a sentence -- confirm this matches the intent.

# Index of the first word that has NOT been handled yet; this is the value a
# resumed run should start from.
glove_next_i = start_i_glove
for b, i in enumerate(range(start_i_glove, len(glove_words), batch_size), start=1):
    glove_next_i = min(i + batch_size, len(glove_words))
    tqdm.write(f'Processing lines {i} to {glove_next_i}...')
    glove_batch = glove_words[i:glove_next_i]
    glove_tokens = tokenizer(glove_batch, padding=True, truncation=True, return_tensors='pt')

    # Only the ids are needed, and only for dictionary lookups, so they stay on
    # the CPU: no model is run in this loop.
    get_word_embeddings(glove_batch, glove_tokens.input_ids)

    del glove_batch, glove_tokens

    # save every N batches
    if b % save_every == 0:
        save_glove(glove_next_i)

# Persist the tail of the run too, unless the last batch was just checkpointed
# or nothing was processed at all.
if glove_next_i != start_i_glove and b % save_every != 0:
    save_glove(glove_next_i)


Processing lines 0 to 100...
Processing lines 100 to 200...
Processing lines 200 to 300...
Processing lines 300 to 400...
Processing lines 400 to 500...
Processing lines 500 to 600...
Processing lines 600 to 700...
Processing lines 700 to 800...
Processing lines 800 to 900...
Processing lines 900 to 1000...
Processing lines 1000 to 1100...
Processing lines 1100 to 1200...
Processing lines 1200 to 1300...
Processing lines 1300 to 1400...
Processing lines 1400 to 1500...
Processing lines 1500 to 1600...
Processing lines 1600 to 1700...
Processing lines 1700 to 1800...
Processing lines 1800 to 1900...
Processing lines 1900 to 2000...
Processing lines 2000 to 2100...
Processing lines 2100 to 2200...
Processing lines 2200 to 2300...
Processing lines 2300 to 2400...
Processing lines 2400 to 2500...
Processing lines 2500 to 2600...
Processing lines 2600 to 2700...
Processing lines 2700 to 2800...
Processing lines 2800 to 2900...
Processing lines 2900 to 3000...
Processing lines 3000 to 3100..

Make the words and their corresponding vectors into KeyedVectors so that we can use the most_similar() function

In [57]:
# Save KeyedVectors
gensim_checkpoint_path = "gensim_checkpoint.kv"

def save_wv():
    """Save the module-level ``wv`` object to ``gensim_checkpoint_path``."""
    destination = gensim_checkpoint_path
    wv.save(destination)
    print("[Gensim] KeyedVectors saved")

In [58]:
# Convert word embeddings to KeyedVectors
from gensim.models import KeyedVectors

# Fail with a clear message instead of a bare StopIteration from next().
if not word_embeddings:
    raise ValueError("word_embeddings is empty; build the word embeddings before this cell")

# Every word vector has the same length; read it off the first one.
hidden_dim = next(iter(word_embeddings.values())).shape[0]
if wv is None:
    wv = KeyedVectors(vector_size=hidden_dim)

save_every = 10000  # checkpoint cadence, in words
words = list(word_embeddings.keys())
vectors = [vec.numpy() for vec in word_embeddings.values()]

# Convert the cadence to batches; never 0, so a batch_size larger than
# save_every cannot cause a division by zero.
batches_per_save = max(1, save_every // batch_size)

# Resuming relies on `words` being in the same order as when the checkpoint was
# written: start_idx skips the keys assumed to be stored in wv already.
unsaved = False
for b, i in enumerate(range(start_idx, len(words), batch_size), start=1):
    tqdm.write(f'Processing words {i} to {min(i+batch_size, len(words))}...')
    batch_words = words[i:i+batch_size]
    batch_vectors = vectors[i:i+batch_size]

    wv.add_vectors(batch_words, batch_vectors)
    unsaved = True

    # save checkpoint every N batches
    if b % batches_per_save == 0:
        save_wv()
        unsaved = False

# Write the tail as well, so the checkpoint on disk holds every word.
if unsaved:
    save_wv()

Processing words 0 to 100...
[Gensim] KeyedVectors saved
Processing words 100 to 200...
Processing words 200 to 300...
Processing words 300 to 400...
Processing words 400 to 500...
Processing words 500 to 600...
Processing words 600 to 700...
Processing words 700 to 800...
Processing words 800 to 900...
Processing words 900 to 1000...
Processing words 1000 to 1100...
Processing words 1100 to 1200...
Processing words 1200 to 1300...
Processing words 1300 to 1400...
Processing words 1400 to 1500...
Processing words 1500 to 1600...
Processing words 1600 to 1700...
Processing words 1700 to 1800...
Processing words 1800 to 1900...
Processing words 1900 to 2000...
Processing words 2000 to 2100...
Processing words 2100 to 2200...
Processing words 2200 to 2300...
Processing words 2300 to 2400...
Processing words 2400 to 2500...
Processing words 2500 to 2600...
Processing words 2600 to 2700...
Processing words 2700 to 2800...
Processing words 2800 to 2900...
Processing words 2900 to 3000...
Pro