# First adding the boiler plate code to get the seeds and GPU and imports all setup.

In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm




# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


# Next retreiving the text sentances and adding each sentance to a data frame to next tokenize them. I should probably also do our usual cleaning up of the sentances as well.

In [3]:
# Create the data frame that will store all the sentances and then eventually their tokens.
sentances_df = None

In [4]:
# First open the file that contains all the sentances.
i = 0

with open('datafiles/assignment4-dataset.txt','r') as text:

    sentances = text.readlines()
    
    sentances_df = pd.DataFrame({"sentence":sentances})

In [5]:
sentances_df

Unnamed: 0,sentence
0,The White Monkey is a 1925 American silent dra...
1,It was released by First National Pictures on ...
2,Plot\n
3,"As described in a film magazine review, Fleur,..."
4,He knows that Fleur married Michael without an...
...,...
4468820,1868 births\n
4468821,1947 deaths\n
4468822,German Assyriologists\n
4468823,German male non-fiction writers\n


# Next import the tokenizer for the reberta model that will be used to tokenize our sentances.

In [6]:
from transformers import AutoTokenizer

transformer_name = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)
ignore_index = -100

In [7]:
def tokenize_sentance(batch):
    # First calll tokenizer.
    tokenized_sentances = tokenizer(
        batch['sentence'],
        truncation=True,
        #padding=True
        #return_tensors="pt"
    )

    # Now add word id's to each of the sentances.
    #word_ids_lst = []
    #for i in range(len(batch['sentence'])):
        #curr_word_ids = tokenized_sentances.word_ids(batch_index=i)

        #curr_word_ids = [w_id if w_id!=None else ignore_index for w_id in curr_word_ids]
        
        #word_ids_lst.append(curr_word_ids)
    #tokenized_sentances['word_ids'] = word_ids_lst

    return tokenized_sentances

In [8]:
from datasets import Dataset

sentances_ds = Dataset.from_pandas(sentances_df)

sentances_ds = sentances_ds.map(tokenize_sentance,batched=True,remove_columns=['sentence'])

Map:   0%|          | 0/4468825 [00:00<?, ? examples/s]

In [9]:
print(sentances_ds[0])

{'input_ids': [0, 133, 735, 34546, 16, 10, 36248, 470, 8454, 4149, 822, 6, 3660, 30, 4720, 11998, 8, 8996, 4810, 1587, 1127, 338, 6, 1813, 11549, 6, 8, 4858, 8804, 4, 50118, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
from transformers import RobertaModel

RobertaModel.from_pretrained(transformer_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [11]:
from torch.utils.data import DataLoader

batch_size=100

# Now I want to just experiment with what happens if I pass one example to the classifier.

reb_pre_trained = RobertaModel.from_pretrained(transformer_name, add_pooling_layer=False)
reb_pre_trained.to(device)
sentances_inputs_ds = sentances_ds.select_columns(['input_ids','attention_mask'])
mod_embeding_sz = reb_pre_trained.config.hidden_size
#sentances_ds.set_format(type="torch")


# Now we collate within a batch and than we create the data loader to pass everything to 
# the model to produce the contextualized embeddings.
def collate_batch(batch):
    batch = tokenizer.pad(batch,return_tensors="pt")
    #batch_sentance_examples = {}
        
    #for key,value in batch.items():
        #if(isinstance(value,torch.Tensor)):
            #batch_sentance_examples[key] = value.to(device)
        #else:
            #print(f"Found somehting not a tensor:\n\n {value}")

    return batch

sentances_dl = DataLoader(sentances_ds,batch_size=batch_size,collate_fn=collate_batch)

# By the end of this loop this dict should be of the form {input_id:(cont_embed_sum,num_inst),...}
cont_embed_dict = {}


j = 1
with torch.no_grad():
    for batch in tqdm(sentances_dl,desc=f"On batch number {i}"):
        #if j==2: break

        #print(batch)

        # Send evything to the GPU.
        batch = {key: (value.to(device) if isinstance(value,torch.Tensor) else value) for key,value in batch.items()}
        
        #for key,value in batch.items():
            #if(isinstance(value,torch.Tensor)):
                #batch_sentance_examples[key] = value.to(device)
            #else:
                #print(f"Found somehting not a tensor:\n\n {value}")
            
        # Get the model to produce the contextualized embeddings.
        enc_curr = reb_pre_trained(**batch).last_hidden_state.detach().cpu()

        # Send evrything back to the cpu.
        batch = {key: value.detach().cpu() if isinstance(value,torch.Tensor) else value for key,value in batch.items()}

        # Now that the batch is back on the CPU for processing we can process all of the batches
        # contextualized embeddings.
        # Reminder to self: batch['attention_mask'/'input_ids'][sentance_num,token/token_mask]
        #                   enc_curr[sentance_num,embedding_num for token at position, embedding vector val]

        for i in range(enc_curr.shape[0]):
            # Get embeddings for current sentance.
            sent_embeds_curr = enc_curr[i,:,:].numpy()
            sent_atten_mask = batch['attention_mask'][i,:].numpy()
            sent_input_ids = batch['input_ids'][i,:].numpy()

            # Now loop through each embedding.
            for k in range(enc_curr.shape[1]):

                # First check with the attention mask if the current embedding is meaningful.
                if sent_atten_mask[k] != 0:
                    # If the attention mask is not zero than add its embedding to the running sum
                    # and increment the number of occurences.
                    curr_input_id = sent_input_ids[k]
                    if curr_input_id not in cont_embed_dict:
                        cont_embed_dict[curr_input_id] = {'tot_oc':0,'sum_embed':np.zeros(mod_embeding_sz)}
 
                    # Now that there is definently an input for this id the next thing to do is to add
                    # this embedding entry to its running total.
                    cont_embed_dict[curr_input_id]['tot_oc'] = cont_embed_dict[curr_input_id]['tot_oc'] + 1
                    #print(np.reshape(sent_embeds_curr[k,:],-1))
                    cont_embed_dict[curr_input_id]['sum_embed'] = np.add(cont_embed_dict[curr_input_id]['sum_embed'],
                                                                            sent_embeds_curr[k,:])
                    
        
        j += 1

On batch number 0:   0%|          | 0/44689 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
len(cont_embed_dict.keys())

49504

In [13]:
avg_cont_embed_dict = {}

for key, value in cont_embed_dict.items():
    avg_cont_embed_dict[key] = value['sum_embed'] / value['tot_oc']


avg_cont_embed_df = pd.DataFrame({'tok_id':avg_cont_embed_dict.keys(), \
                                  'avg_cont_embed':avg_cont_embed_dict.values()})

In [14]:
display(avg_cont_embed_df.head())

Unnamed: 0,tok_id,avg_cont_embed
0,0,"[-0.05390735231835314, 0.08659995588741708, -0..."
1,133,"[-0.06945269355797774, 0.005067573929508234, 0..."
2,735,"[0.0278572527410288, 0.17078800192580665, -0.0..."
3,34546,"[0.0019569261815032732, 0.042429491532477666, ..."
4,16,"[0.0823113064755886, 0.21211939386916828, 0.04..."


In [17]:
# Saving the token id's in a new csv file that just has the token_id's
avg_cont_embed_df[['tok_id']].to_csv('datafiles/tok_id_embeds.csv',index=False)

# Save numpy part as well.
twod_emb_vecs_np = np.stack(avg_cont_embed_df['avg_cont_embed'].to_numpy())
np.save('datafiles/cont_embed_vecs.npy',twod_emb_vecs_np)

In [23]:
# Now make sure we can load everything properly.
embed_tok_id_df = pd.read_csv('datafiles/tok_id_embeds.csv')
embed_vecs = np.load('datafiles/cont_embed_vecs.npy')

# Now to get back the original data frame.
avg_cont_embed_df = pd.DataFrame({'tok_id':embed_tok_id_df['tok_id'].to_numpy(),'avg_cont_embed':list(embed_vecs)})

In [24]:
avg_cont_embed_df.shape

(49504, 2)

In [25]:
display(avg_cont_embed_df.head())

Unnamed: 0,tok_id,avg_cont_embed
0,0,"[-0.05390735231835314, 0.08659995588741708, -0..."
1,133,"[-0.06945269355797774, 0.005067573929508234, 0..."
2,735,"[0.0278572527410288, 0.17078800192580665, -0.0..."
3,34546,"[0.0019569261815032732, 0.042429491532477666, ..."
4,16,"[0.0823113064755886, 0.21211939386916828, 0.04..."
