In [1]:
import os
import codecs
import json
import numpy as np
from functools import reduce
from operator import add
import torch

In [2]:
from transformers import BertJapaneseTokenizer, BertModel

# Name of the pretrained checkpoint, kept in one place so the tokenizer and the
# encoder are always loaded from the same source.
BERT_CHECKPOINT = "cl-tohoku/bert-base-japanese"

# Notebook-level globals; process_word_vec reads both of them.
tokenizer = BertJapaneseTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Transcript file whose 'word_list' entry is loaded into `words`.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
TEXT_JSON_PATH = '/home/wu/repo/gesture-generation-using-WGAN/data/takekuchi/source/text/text21.json'

# The built-in open() with an explicit encoding replaces the legacy codecs.open().
with open(TEXT_JSON_PATH, 'r', encoding='utf8') as f:
    words = json.load(f)['word_list']

In [40]:
def process_word_vec(words, bert_tokenizer=None, bert_model=None):
    """
    Embed recognized words with BERT, one feature vector per entry of `words`.

    Every word is tokenized on its own, the resulting tokens are joined into a
    single sequence wrapped in the tokenizer's CLS/SEP tokens, and that sequence
    is encoded in one forward pass. The hidden states of the tokens that belong
    to one word are then averaged into that word's feature.

    NOTE(review): the whole word list is sent through the encoder as one
    sequence, so it presumably has to fit within the encoder's maximum input
    length — confirm for long transcripts.

    Args:
        words - list of dict {'word', 'start_time', 'end_time'}; only the
            'word' entry is read here
        bert_tokenizer - tokenizer to use; defaults to the notebook-level
            `tokenizer`
        bert_model - encoder to use; defaults to the notebook-level `model`
    Returns:
        features - numpy array of shape (len(words), hidden_size). The average
            for the first word also covers the leading CLS token and the
            average for the last word also covers the trailing SEP token.
            A word that produces no tokens (and owns neither CLS nor SEP) gets
            an all-zero vector instead of NaN. An empty `words` yields an
            array of shape (0, hidden_size).
    """
    tok = tokenizer if bert_tokenizer is None else bert_tokenizer
    encoder = model if bert_model is None else bert_model

    # Gather the per-word pieces in lists and concatenate once at the end;
    # concatenating inside the loop would re-copy the growing tensors each time.
    id_parts = [torch.LongTensor([[tok.cls_token_id]])]
    type_parts = [torch.LongTensor([[0]])]
    mask_parts = [torch.LongTensor([[1]])]
    num_words = []  # number of tokens produced for each entry of `words`

    for word in words:
        bert_tokens = tok(word['word'], return_tensors='pt')

        # Drop the first and last token (CLS/SEP) the tokenizer puts around
        # each individual word; the sequence gets a single pair of its own.
        id_parts.append(bert_tokens['input_ids'][:, 1:-1])
        type_parts.append(bert_tokens['token_type_ids'][:, 1:-1])
        mask_parts.append(bert_tokens['attention_mask'][:, 1:-1])

        num_words.append(bert_tokens['input_ids'].size(-1) - 2)

    id_parts.append(torch.LongTensor([[tok.sep_token_id]]))
    type_parts.append(torch.LongTensor([[0]]))
    mask_parts.append(torch.LongTensor([[1]]))

    tokens = {
        'input_ids': torch.cat(id_parts, dim=-1),
        'token_type_ids': torch.cat(type_parts, dim=-1),
        'attention_mask': torch.cat(mask_parts, dim=-1),
    }

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden = encoder(**tokens)['last_hidden_state']
    embeddings = hidden.detach().numpy()[0]  # one row per token of the sequence
    hidden_size = embeddings.shape[-1]

    if not num_words:
        # Nothing to average; keep the feature dimension so callers can still
        # concatenate the result with other feature arrays.
        return np.zeros((0, hidden_size), dtype=embeddings.dtype)

    features = []  # one averaged vector per entry of `words`
    last_index = len(num_words) - 1
    start_word_index = 0
    for i, num_word in enumerate(num_words):
        end_word_index = start_word_index + num_word
        if i == 0:
            end_word_index += 1  # the first word also owns the CLS token
        if i == last_index:
            end_word_index += 1  # the last word also owns the SEP token

        span = embeddings[start_word_index:end_word_index]
        if len(span):
            # Average the features of the multiple tokens in one word.
            feature = np.mean(span, axis=0)
        else:
            # A word that tokenized to nothing: averaging an empty slice
            # would give NaN, so fall back to a zero vector.
            feature = np.zeros(hidden_size, dtype=embeddings.dtype)
        features.append(feature)
        start_word_index = end_word_index
    features = np.stack(features)

    assert len(features) == len(words), "lenght of features must match length of words"
    return features