In [13]:
import os
# Restrict the GPUs visible to this process to physical device 7. This is set
# before torch is imported so the restriction is already in the environment
# when CUDA gets initialised; the single visible GPU is then addressed as
# "cuda:0" / "cuda" in the rest of the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
import torch
from transformers import BertTokenizer, BertModel

In [14]:
# Checkpoint identifier shared by the tokenizer and the encoder so that both
# are loaded from the same pretrained model.
model_version = 'allenai/scibert_scivocab_uncased'
do_lower_case = True

# Tokenizer: the lower-casing flag is passed through explicitly.
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

# Encoder: load the pretrained weights, then move the module to the first
# visible CUDA device.
model = BertModel.from_pretrained(model_version)
model = model.to("cuda:0")

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def embed_text(text, model):
    """Return the last hidden states of ``model`` for a single piece of text.

    The text is encoded with the notebook-level ``tokenizer`` (it is not a
    parameter of this function), wrapped into a batch of size one and passed
    through ``model`` with gradient tracking disabled.

    Args:
        text: The string to embed.
        model: A ``torch.nn.Module`` that is called as ``model(input_ids)``
            and whose first output element is the last hidden state.

    Returns:
        ``outputs[0]`` moved to the CPU. For a BERT encoder this is presumably
        of shape ``(1, num_tokens, hidden_size)`` -- confirm against the
        model in use.
    """
    # Build the input on whatever device the model lives on rather than
    # assuming CUDA, so the function also works for CPU models or for a model
    # placed on a non-default GPU.
    device = next(model.parameters()).device

    with torch.no_grad():
        input_ids = torch.tensor(tokenizer.encode(text), device=device).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0].to('cpu')  # The last hidden-state is the first element of the output tuple

    return last_hidden_states

In [5]:
from tqdm import tqdm
import json
# Load strings from an existing vocab. The context manager makes sure the
# file handle is closed as soon as the JSON has been parsed.
with open("simple_ner_spacy/model-best/vocab/strings.json", 'r', encoding='utf-8') as vocab_file:
    vocab_list = json.load(vocab_file)

# Maps each stripped vocab string to its embedding, stored as a nested list.
strings = {}
for line in tqdm(vocab_list):
    string = line.strip()
    if string in strings:
        # Different raw entries can collapse to the same key after strip();
        # the embedding for that key is already stored, so don't recompute it.
        continue
    # Average the hidden states over dimension 1 (the token axis, given the
    # batch-of-one input built in embed_text); the leading batch dimension is
    # kept, so the stored value is a list containing a single vector.
    em1 = embed_text(string, model).mean(1)
    strings[string] = em1.detach().numpy().tolist()

100%|██████████| 44145/44145 [06:34<00:00, 111.88it/s]


In [8]:
# Save the result in JSON. The context manager guarantees the file is flushed
# and closed once the dump finishes instead of relying on garbage collection.
with open('scibert_vectors.json', 'w', encoding='utf-8') as json_file:
    json.dump(strings, json_file)

In [12]:
# Save the result in a spaCy-loadable text format: a "<rows> <dims>" header
# line followed by one "<word> <v1> <v2> ..." line per entry.
with open('scibert_layer0.vec', 'w', encoding='utf-8') as f:
    for position, (word, nested_vector) in enumerate(strings.items()):
        # Each entry is a list wrapping a single vector; take that vector.
        values = nested_vector[0]
        if position == 0:
            # The header is emitted once, just before the first row, using
            # that row's length as the dimensionality.
            f.write(f"{len(strings)} {len(values)}\n")
        vector_str = ' '.join(format(x, '5.4f') for x in values)
        f.write(f"{word} {vector_str}\n")