
# Getting Embeddings from XLM RoBERTa
Created by: Sebastián Vallejo V.

Updated: July 8 2022

Call libraries:

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import random
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import os
import re
import string
from torch.utils.data import Dataset, DataLoader
import transformers
from torch.utils.data import TensorDataset
import time
import datetime
import subprocess
from scipy.sparse import csr_matrix
import scipy
import pyarrow as pa
import pyarrow.parquet as pq

Define functions:

In [None]:
# time:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first, so the result
    never carries a fractional part.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# check gpu usage:
def show_gpu(msg):
    """Print *msg* followed by the memory usage of the first GPU.

    The figures come from a single ``nvidia-smi`` call (used and total
    memory, no units) and are reported as a percentage plus the raw numbers.
    This is a diagnostic helper only: when ``nvidia-smi`` is missing (for
    example on a CPU-only machine), fails, or returns something that cannot
    be parsed, a short "unavailable" line is printed instead of raising, so
    the surrounding computation is not interrupted.

    ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
    """
    try:
        # One call returns both fields, one "used, total" line per GPU.
        report = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used,memory.total',
                '--format=csv,nounits,noheader'],
            encoding='utf-8')
        first_gpu = report.strip().split('\n')[0]  # only the first GPU is reported
        used, total = (int(field) for field in first_gpu.split(','))
        pct = used/total
    except (OSError, subprocess.CalledProcessError, ValueError, ZeroDivisionError):
        print('\n' + msg, 'unavailable (could not query nvidia-smi)')
        return

    print('\n' + msg, f'{100*pct:2.1f}% ({used} out of {total})')

Load models. Choose models depending on GPU capabilities. 

In [None]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint.
checkpoint = "xlm-roberta-large"
# NOTE(review): do_lower_case is a BERT-style option; confirm it has any
# effect for this tokenizer before relying on lower-casing.
tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint, do_lower_case=True)
model = XLMRobertaModel.from_pretrained(checkpoint)

Import dataframe.
The models use up all of the GPU memory, so I have to do it in chunks. I also have to reset the indices every time.

In [None]:
# os.chdir("/Users/sebastian/OneDrive - University Of Houston/Papers and Chapters/Gendered Speech in Gendered Institutions/Data/data_styles")
data = pd.read_excel(r"speeches_sent_14.xlsx")
# Process one chunk of rows at a time (the full file does not fit on the GPU).
data = data.iloc[0:10000]
# Row 3004 is excluded from this chunk.
# NOTE(review): the reason is not recorded here -- confirm it still applies
# when the chunk boundaries change.
data = data.drop(3004)
# Renumber the rows 0..n-1 and discard the old labels in one step.
# (The previous `data.drop('index', 1)` passed `axis` positionally, which
# pandas 2.0 no longer accepts.)
data = data.reset_index(drop=True)
print(len(data))

Create tokenized data for tensor dataset. Adjust max_len depending on GPU capabilities. 

In [None]:
max_len = 250

# Encode every text to exactly max_len token ids: longer texts are truncated,
# shorter ones are padded, and no special tokens are added.
encoded_rows = [
    tokenizer.encode_plus(
        text,
        add_special_tokens=False,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        is_split_into_words=True,
    )
    for text in data['text']
]

# Join the per-text tensors along dim 0 into one tensor each
# (required step for TensorDataset, which helps with batching).
input_ids = torch.cat([enc['input_ids'] for enc in encoded_rows], dim=0)
attn_masks = torch.cat([enc['attention_mask'] for enc in encoded_rows], dim=0)

Torch seeds for replicability

In [None]:
seed_val = 1984

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

torch.cuda.empty_cache()  # Clear GPU cache if necessary

Set the model parameters. Adjust batch_size depending on GPU capabilities. 

In [None]:
class Settings:
    """Run-wide constants: random seed, batch size and compute device."""
    seed = 1984
    batch_size = 16
    # Use the first CUDA GPU when one is available, otherwise the CPU.
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"

Prepare tensor dataset

In [None]:
# Pair each row of token ids with its attention mask.
dataset = TensorDataset(input_ids, attn_masks)
# shuffle=False: batches are served in the order of the rows of the dataset.
train_dataloader = DataLoader(
    dataset,
    batch_size=Settings.batch_size,
    shuffle=False,
)

Now we run the model over every batch to obtain the embeddings (a forward pass only; no weights are updated).

In [None]:
# Extract the last hidden state of the model for every text in the dataloader.
# Only a forward pass is needed, so:
#   * the model is put in eval mode (dropout off -> reproducible embeddings);
#   * everything runs under torch.no_grad(), so no autograd graph is kept on
#     the GPU (this is what previously caused out-of-memory errors);
#   * there is no zero_grad / gradient clipping, because no gradients exist.
total_t0 = time.time()  # Measure the total run time

show_gpu('Initial GPU memory usage:')  ### Just testing memory usage>
model.to(Settings.device)  # Move the model once, not on every batch
model.eval()
show_gpu('GPU memory usage after loading training objects:')  ### Just testing memory usage>

print("Start of model")

batch_outputs = []  # one CPU tensor per batch, concatenated once at the end
with torch.no_grad():
    # Iterate the dataloader directly; a second manual iterator is not needed.
    for batch in train_dataloader:
        ### Create batches
        b_input_ids = batch[0].to(Settings.device)
        b_input_mask = batch[1].to(Settings.device)

        ### Run model
        output = model(input_ids=b_input_ids, attention_mask=b_input_mask)

        ### Save last_hidden_state (output[0]) on the CPU to free GPU memory
        batch_outputs.append(output[0].detach().cpu())

        ### Just testing memory usage>
        show_gpu('GPU memory usage after training batch:')
        torch.cuda.empty_cache()  # Clear GPU cache if necessary
        show_gpu('GPU memory usage after clearing cache:')

# A single concatenation avoids re-copying all earlier batches on every step.
# An empty dataloader yields an empty tensor, as before.
embeddings_batch = torch.cat(batch_outputs) if batch_outputs else torch.Tensor([])

#######
print("End of model")
print("Total time:", format_time(time.time() - total_t0))
# Delete model to make room in memory:
show_gpu('GPU memory usage after training:')  ### Just testing memory usage>
del model
torch.cuda.empty_cache()  # Clear GPU cache if necessary
show_gpu('GPU memory usage after deleting model:')  ### Just testing memory usage>

Each observation in embeddings_batch is a speech. To obtain the embeddings for each word, we loop over the individual observations and build a matrix in which each column is a word (token), each row is a speech (speech_id), and each cell is an embedding. The XLM-RoBERTa tokenizer breaks many words into smaller sub-word pieces, so we have to put the pieces back together and average their embeddings.

In [None]:
print("shape:", embeddings_batch.shape)


def _merge_word_pieces(tokens, vectors, max_pieces=5):
    """Average sub-word piece vectors into one vector per distinct word.

    A word starts at every token that contains "▁" and absorbs the tokens
    that follow it for as long as they do not contain "▁". Tokens without
    "▁" never start a word of their own.

    Args:
        tokens: list of token strings for one text.
        vectors: 2-D NumPy array; row k is the embedding of tokens[k].
        max_pieces: maximum number of pieces merged into one word. The
            default of 5 reproduces the earlier hand-unrolled limit; pass
            None to merge every piece of arbitrarily long words.

    Returns:
        dict mapping each word (its pieces concatenated) to a 1-D array: the
        mean over the word's pieces, averaged again over repeated
        occurrences of the same word in the text.
    """
    n_tokens = len(tokens)
    occurrences = {}  # word -> list of per-occurrence mean vectors
    for start, token in enumerate(tokens):
        if "▁" not in token:
            continue
        limit = n_tokens if max_pieces is None else min(start + max_pieces, n_tokens)
        stop = start + 1
        # Bounded by `limit`, so a text ending on a continuation piece can no
        # longer run past the last token.
        while stop < limit and "▁" not in tokens[stop]:
            stop += 1
        word = "".join(tokens[start:stop])
        occurrences.setdefault(word, []).append(vectors[start:stop].mean(axis=0))
    ## Mean of repeated words **** Might need to think more about this (e.g. what if there is word play) ****
    ## But it seems fine if the UoA is a sentence.
    return {word: np.mean(vecs, axis=0) for word, vecs in occurrences.items()}


speech_columns = []  # one Series (index = words) per speech
print("Start of loop")

for i in range(len(embeddings_batch)):  # Start loop
    print("For loop number", i, sep=" ")

    # Keep the real tokens plus (when the text is padded) the first padding
    # position, which is renamed '▁pad▁' so that it closes the last word.
    # NOTE(review): as before, '▁pad▁' therefore shows up as a word of its
    # own in the output -- confirm the downstream code drops or expects it.
    n_real = int((attn_masks[i] == 1).sum())
    keep = min(n_real + 1, embeddings_batch.shape[1])

    vectors = embeddings_batch[i, :keep].detach().cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[i][:keep].tolist())
    tokens = ['▁pad▁' if tok == "<pad>" else tok for tok in tokens]

    print("Index Length:", keep - 1, sep=" ")

    word_vectors = _merge_word_pieces(tokens, vectors, max_pieces=5)

    # Each cell holds the embedding as a one-element list wrapping the list
    # of floats (same nested layout as before). The object array is filled
    # element by element so the nested lists are never read as extra
    # dimensions.
    cells = np.empty(len(word_vectors), dtype=object)
    for k, vec in enumerate(word_vectors.values()):
        cells[k] = [vec.tolist()]
    speech_columns.append(
        pd.Series(cells, index=list(word_vectors), name=data['text_id'][i], dtype=object)
    )

## Bind all: one outer alignment on the words instead of one merge per speech
## (DataFrame.append no longer exists in pandas 2.0, and repeated merges
## re-copied the whole table on every iteration).
if speech_columns:
    embeddings_final = pd.concat(speech_columns, axis=1).sort_index()
    # As before, the column holding the words is called 'text_id' and every
    # other column is named after the speech's text_id.
    embeddings_final = embeddings_final.rename_axis('text_id').reset_index()
else:
    embeddings_final = pd.DataFrame(columns=['text_id'])

print("End of loop")

To keep the usual shape of a dfm object, we transpose the matrix:

In [None]:
# Use the 'text_id' column (the words) as column headers and put one speech
# per row.
embeddings_final_t = embeddings_final.set_index('text_id').transpose()
# The row labels at this point are the former column names; they are replaced
# by a plain 0..n-1 index.
embeddings_final_t = embeddings_final_t.reset_index(drop=True)

Save and continue in R:

In [None]:
# embeddings_final_t.to_excel('short_emb.xlsx')
# embeddings_final_t.to_feather('test.feather')
# Smallest file size: Parquet with Brotli compression.
output_file = 'speeches_emb_14_1.parquet'
embeddings_final_t.to_parquet(output_file, compression='BROTLI')