In [1]:
import csv

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

In [2]:
# WordPiece tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [3]:
# Read the `sentence` column of data.csv into a plain list of strings.
#
# newline='' hands line-ending handling to the csv module, which the csv
# documentation requires so that quoted fields containing embedded newlines
# are parsed correctly. The encoding is pinned to UTF-8 (the default
# pandas.read_csv uses for this same file) so the result does not depend on
# the platform's locale.
with open('../data/data.csv', 'r', newline='', encoding='utf-8') as f:
    all_sentences = [row['sentence'] for row in csv.DictReader(f)]

In [6]:
# Load the whole table (every column) with pandas.
# `pd` is imported together with the other dependencies in the first cell.
df = pd.read_csv("../data/data.csv")

In [7]:
# Dimensions of the DataFrame read from data.csv, as (rows, columns).
df.shape

(5759, 19)

In [5]:
# Number of sentences collected from the `sentence` column by the csv reader.
len(all_sentences)

5759

In [8]:
# Debugging limit: only the first MAX_SENTENCES sentences are embedded.
# Set to None to embed every sentence in all_sentences.
MAX_SENTENCES = 1

# tokens and embeddings as is from bert (one entry per processed sentence)
all_tokens = []
all_embeddings = []

# word pieces merged back into whole tokens, with their embeddings averaged
tokenized_sentences_no_wordpiece = []
embedding_sentences_no_wordpiece = []


def _is_wordpiece_continuation(token):
    """Return True for a WordPiece continuation token such as '##ing'."""
    return len(token) > 2 and token.startswith('##')


def _merge_wordpieces(tokens, embedding):
    """Merge WordPiece continuations into whole tokens and average their embeddings.

    Args:
        tokens: list of token strings for one sentence, in order.
        embedding: tensor indexed by token position (row i belongs to
            tokens[i]).

    Returns:
        (merged_tokens, merged_embedding): the tokens with every run of '##'
        continuations appended (prefix stripped) to the token before it, and
        a stacked tensor holding one row per merged token. A token that was
        not split keeps its original row; a split token gets the mean of the
        rows of all its pieces.
    """
    # TODO: write it into the format of InstanceFeatures
    # TODO: build token_id_orig_map
    merged_tokens = []
    piece_sums = []    # running sum of the piece embeddings of each merged token
    piece_counts = []  # number of pieces that went into each merged token

    for position, token in enumerate(tokens):
        # A continuation with nothing before it cannot be attached anywhere,
        # so it is kept as a token of its own.
        if merged_tokens and _is_wordpiece_continuation(token):
            merged_tokens[-1] += token[2:]
            # .add() returns a new tensor, so `embedding` itself is never modified.
            piece_sums[-1] = piece_sums[-1].add(embedding[position])
            piece_counts[-1] += 1
        else:
            merged_tokens.append(token)
            piece_sums.append(embedding[position])
            piece_counts.append(1)

    merged_rows = [
        total if count == 1 else torch.div(total, count)
        for total, count in zip(piece_sums, piece_counts)
    ]
    return merged_tokens, torch.stack(merged_rows)


# Load the model once: it is identical for every sentence, and reloading the
# weights on each iteration dominated the running time.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
model.eval()

for sentence in all_sentences[:MAX_SENTENCES]:
    # The tokenizer adds [CLS]/[SEP] itself and returns input ids, segment ids
    # and attention mask as batched tensors. For a single sentence the segment
    # ids are all 0 (sentence A).
    encoded = tokenizer(sentence, return_tensors='pt')

    tokenized_sentence = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].tolist())
    all_tokens.append(tokenized_sentence)

    # Inference only: do not build the autograd graph.
    with torch.no_grad():
        output = model(**encoded)

    # squeeze the tensor to remove the batch dimension
    embedding = torch.squeeze(output.last_hidden_state, dim=0)
    all_embeddings.append(embedding)

    # combine the embedding of the word pieces and take the average
    token_no_wordpiece, embedding_no_wordpiece = _merge_wordpieces(tokenized_sentence, embedding)
    tokenized_sentences_no_wordpiece.append(token_no_wordpiece)
    embedding_sentences_no_wordpiece.append(embedding_no_wordpiece)
    # TODO: save everything into InstanceFeatures

print("Complete")

Complete


In [None]:
# Sanity checks on the lists filled by the embedding cell.

# Original sentences
print(len(all_sentences))

# All tokenized sentence with word piece
print(len(all_tokens))

# All embeddings with word piece
print(len(all_embeddings))

# all_tokens and all_embeddings must have the same length
assert len(all_tokens) == len(all_embeddings), \
    "all_tokens and all_embeddings differ in length"

# All tokenized sentence without word piece
print(len(tokenized_sentences_no_wordpiece))

# All embeddings without word piece
print(len(embedding_sentences_no_wordpiece))

# tokenized_sentences_no_wordpiece and embedding_sentences_no_wordpiece must have the same length
assert len(tokenized_sentences_no_wordpiece) == len(embedding_sentences_no_wordpiece), \
    "tokenized_sentences_no_wordpiece and embedding_sentences_no_wordpiece differ in length"

# The shape of an embedding is [# of tokens, 768]
print(all_embeddings[0].size())
print(embedding_sentences_no_wordpiece[0].size())
