Initialize the system

In [None]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# hook tqdm progress bars into pandas
tqdm.pandas()

# flip to False to stay on the CPU even when a GPU is present
use_gpu = True

# pick the compute device: CUDA only when it is both requested and available
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# seed shared by every random number generator below (None skips seeding)
seed = 1234

# seed Python, NumPy and PyTorch so repeated runs draw the same numbers
if seed is not None:
    print(f'random seed: {seed}')
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

device: cuda
random seed: 1234


Read in the input file (small.txt is a smaller file used for testing)

In [2]:
# Full assignment dataset and the smaller file used for quick test runs.
file = "assignment4-dataset.txt"
small = "small.txt"

# Load the text, one entry per line (pass `file` instead of `small` to read
# the full dataset). The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's default locale encoding.
with open(small, 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

Use the `roberta-base` model (Facebook's RoBERTa) to tokenize the text and convert it to token IDs, and then to create contextualized embeddings

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Transformer name used
name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaModel.from_pretrained(name)
model.to(device)
# Inference only: make sure dropout is disabled so the embeddings are deterministic
model.eval()

# Convert to token ids (every line is padded to a common length in one batch)
encoded = tokenizer(lines, padding=True, truncation=True, return_tensors='pt')

input_ids = encoded.input_ids.to(device)
attention_mask = encoded.attention_mask.to(device)

# Create contextual embeddings.
# This is pure feature extraction, so no gradients are needed; without
# no_grad() the autograd graph for the whole forward pass is kept alive,
# which wastes memory and leaves every downstream tensor attached to it.
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Find every unique token and average its embedding vectors over all of its occurrences

In [31]:
# Running sum of embedding vectors and number of occurrences per token string.
vector_totals = {}
token_count = {}

embeddings = output.last_hidden_state

# Move everything to the CPU once, instead of transferring one vector (and
# comparing one mask element on the device) per token.
embeddings_cpu = embeddings.detach().cpu()
token_id_rows = input_ids.cpu().tolist()
mask_rows = attention_mask.cpu().tolist()

for i, token_ids in enumerate(token_id_rows):
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    mask = mask_rows[i]
    for j, token in enumerate(tokens):
        # only positions the attention mask marks as real input (1) are counted
        if mask[j] == 1:
            vector = embeddings_cpu[i, j, :]
            if token not in vector_totals:
                # clone(): `vector` is a view into the embedding tensor, and
                # when the model already runs on the CPU `.cpu()` makes no
                # copy. Storing the view and then doing `+=` on it would
                # overwrite the model output in place and make re-running
                # this cell give different results.
                vector_totals[token] = vector.clone()
                token_count[token] = 1
            else:
                vector_totals[token] += vector
                token_count[token] += 1

# Mean embedding per token.
# NOTE(review): any special tokens the tokenizer inserts at unmasked positions
# are averaged as well - confirm that is intended.
vector_averages = {
    token: total / token_count[token] for token, total in vector_totals.items()
}