```
// Copyright 2020 Twitter, Inc.
// SPDX-License-Identifier: Apache-2.0
```

# Generate Transformer Embeddings

Take an existing BERT model (with or without TPP pre-training) and generate embeddings from it on translation pairs.


## Set up libraries


In [None]:
%pip install transformers==3.5.1 datasets==1.1.2 torch==1.4.0 seqeval==1.2.2 gensim==3.8.1

In [None]:
import re
from pathlib import Path

import numpy as np
import torch
from transformers import AutoModel, BertTokenizerFast


In [None]:
# Local model directory; the tokenizer is loaded from it here and the model
# weights are loaded from the same directory in the next cell.
model_dir = Path("../models/multi_tatoeba_2t_en_hi_ja_ar_equal_bce_model/")
model_dir = model_dir.expanduser()
# NOTE(review): `truncation` / `padding` look like encode-time options; confirm
# whether they have any effect when passed to `from_pretrained`.
tokenizer = BertTokenizerFast.from_pretrained(
    str(model_dir),
    **{"max_len": 512, "truncation": True, "padding": True},
)


In [None]:
# Load the model from `model_dir` and switch it to evaluation mode
# (`eval()` returns the module itself, so the rebinding keeps the same object).
model = AutoModel.from_pretrained(str(model_dir))
model = model.eval()


In [None]:
# Encode two sample sentences as one padded batch of PyTorch tensors.
# `truncation=True` is needed for `max_length` to be enforced: with padding
# enabled and truncation left unset, over-long inputs are not cut to 512 tokens.
batch = tokenizer.batch_encode_plus(
    ["This is a great world", "Obama went to Paris and Trump to London"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
batch


PDP: Data is scrubbed. For more information visit go/pycx-pdp.

In [None]:
# Run the model on the sample batch. This is inference only, so the forward
# pass is wrapped in `torch.no_grad()` to avoid recording an autograd graph.
with torch.no_grad():
    output = model(**batch)
output


PDP: Data is scrubbed. For more information visit go/pycx-pdp.

In [None]:
# Show the shape of each element of the model output.
[tensor.shape for tensor in output]


PDP: Data is scrubbed. For more information visit go/pycx-pdp.

In [None]:
# Input file for the loading cell below, which parses it one JSON object per line.
data_file = Path("../data/en_ar_tatoeba.json")
data_file = data_file.expanduser()


In [None]:
import json


In [None]:
# Read sentences from `data_file`, one JSON object per non-blank line. Every
# sentence in a record's "unique_label_desc" list is labelled with the index of
# the line it came from, so sentences from the same record share a label.
sentences = []
labels = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with data_file.open(encoding="utf-8") as fp:
    for i, line in enumerate(fp):
        line = line.strip()
        if not line:
            continue
        # Only line indices 0..1000 are consumed; stop at the first non-blank
        # line past that.
        if i > 1000:
            break
        record = json.loads(line)
        line_sents = record["unique_label_desc"]
        sentences.extend(line_sents)
        labels.extend([i] * len(line_sents))


In [None]:
# Preview the first ten loaded sentences.
sentences[0:10]


PDP: Data is scrubbed. For more information visit go/pycx-pdp.

In [None]:
# Embed `sentences` with the currently loaded `model`, `batch_size` sentences at
# a time, keeping element [1] of the model output for every batch
# (presumably the pooled sentence representation for a BERT model — confirm).
batch_size = 20
embeddings = []
# Inference only: no_grad() avoids building an autograd graph for each batch,
# so the result can be converted to NumPy without `.detach()`.
with torch.no_grad():
    for i in range(0, len(sentences), batch_size):
        sents = sentences[i : i + batch_size]
        batch = tokenizer.batch_encode_plus(
            sents,
            padding=True,
            truncation=True,  # without this, max_length is not enforced
            max_length=512,
            return_tensors="pt",
        )
        emb = model(**batch)[1].numpy()
        embeddings.append(emb)


In [None]:
# Stack the per-batch arrays row-wise into a single array. Note that this
# rebinds `embeddings` from a list of arrays to one NumPy array.
embeddings = np.vstack(embeddings)


In [None]:
# Save the stacked embeddings together with their per-sentence labels in one
# .npz archive, under the keys "embeddings" and "labels".
embeddings_path = Path("../data/en_ar_embeddings.ft.npz")
embeddings_path = embeddings_path.expanduser()
np.savez(
    embeddings_path,
    embeddings=embeddings,
    labels=labels,
)


In [None]:
# Replace `model` with the stock "bert-base-multilingual-uncased" checkpoint;
# the embeddings computed from it are saved as `.base.npz` further down.
# NOTE(review): `tokenizer` is not reloaded here, so the following cells keep
# using the tokenizer loaded from the earlier model directory — confirm that
# both checkpoints share the same vocabulary.
model_dir = "bert-base-multilingual-uncased"
model = AutoModel.from_pretrained(model_dir)
model = model.eval()


In [None]:
# Embed `sentences` again with the currently loaded `model`, `batch_size`
# sentences at a time, keeping element [1] of the model output for every batch
# (presumably the pooled sentence representation for a BERT model — confirm),
# then stack the per-batch arrays into one array.
batch_size = 20
embeddings = []
# Inference only: no_grad() avoids building an autograd graph for each batch,
# so the result can be converted to NumPy without `.detach()`.
with torch.no_grad():
    for i in range(0, len(sentences), batch_size):
        sents = sentences[i : i + batch_size]
        batch = tokenizer.batch_encode_plus(
            sents,
            padding=True,
            truncation=True,  # without this, max_length is not enforced
            max_length=512,
            return_tensors="pt",
        )
        emb = model(**batch)[1].numpy()
        embeddings.append(emb)
embeddings = np.vstack(embeddings)


In [None]:
# Save the embeddings computed in the previous cell, with the same labels, in a
# separate .npz archive under the keys "embeddings" and "labels".
embeddings_path = Path("../data/en_ar_embeddings.base.npz")
embeddings_path = embeddings_path.expanduser()
np.savez(
    embeddings_path,
    embeddings=embeddings,
    labels=labels,
)
