# AWESOME: Aligning Word Embedding Spaces of Multilingual Encoders

In [124]:
# %pip install transformers==3.1.0
import torch
import transformers
import itertools
from word_alignment_visualization import show_word_alignments
from IPython.display import display
import spacy
import spacy_alignments as tokenizations
import os

In [134]:
# Load multilingual BERT and its matching tokenizer from the same checkpoint,
# then place the model on the GPU when CUDA is available (CPU otherwise).
checkpoint = 'bert-base-multilingual-cased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = transformers.BertTokenizer.from_pretrained(checkpoint)
model = transformers.BertModel.from_pretrained(checkpoint).to(device)



In [121]:
# Ask spaCy to prefer GPU execution, then load the "en_core_web_sm" pipeline.
# `nlp_en` is used further down to tokenize the source sentences.
spacy.prefer_gpu()
nlp_en = spacy.load("en_core_web_sm")

## Import and tokenize texts

In [129]:
# NOTE(review): absolute, machine-specific path. The relative file name
# "c1_aligned.txt" opened in the next cell resolves against this directory.
%cd /workspace/novel-tongue/notebooks/aligned_texts

/workspace/novel-tongue/notebooks/aligned_texts


In [135]:
# Read the sentence-aligned corpus into two parallel lists.
# Records are laid out three lines apart: the source sentence, the target
# sentence on the following line, and a third line that is never read.
src_sents = []
tgt_sents = []

# The text contains non-ASCII characters (e.g. the em dash), so the encoding
# is set explicitly instead of relying on the platform default.
with open("c1_aligned.txt", "r", encoding="utf-8") as f:
    c1 = f.read()

lines = c1.splitlines()

# Stop at len(lines) - 1 so that lines[i + 1] always exists; a trailing
# source line without a target is skipped rather than raising IndexError.
for i in range(0, len(lines) - 1, 3):
    src_sents.append(lines[i])
    tgt_sents.append(lines[i + 1])

In [136]:
# Tokenize every aligned (source, target) pair with the mBERT tokenizer.
# zip() stops at the shorter list, so both results always have equal length.
tokenized_pairs = [
    (tokenizer.tokenize(src_line), tokenizer.tokenize(tgt_line))
    for src_line, tgt_line in zip(src_sents, tgt_sents)
]
src_tokenized_sents = [pair[0] for pair in tokenized_pairs]
tgt_tokenized_sents = [pair[1] for pair in tokenized_pairs]

print(src_tokenized_sents)
print(tgt_tokenized_sents)

[['All', 'in', 'the', 'golden', 'afternoon'], ['Full', 'leis', '##ure', '##ly', 'we', 'gli', '##de', ';', 'For', 'both', 'our', 'oa', '##rs', ',', 'with', 'little', 'skill', ','], ['By', 'little', 'arms', 'are', 'pl', '##ied', ',', 'While', 'little', 'hands', 'make', 'vain', 'pret', '##ence'], ['Our', 'wa', '##ndering', '##s', 'to', 'guide', '.'], ['Ah', ',', 'cruel', 'Three', '!'], ['In', 'such', 'an', 'hour', '.', 'Ben', '##eath', 'such', 'dream', '##y', 'weather', '.'], ['To', 'be', '##g', 'a', 'tale', 'of', 'br', '##eath', 'too', 'weak'], ['To', 'st', '##ir', 'the', 'tin', '##iest', 'feat', '##her', '!'], ['Yet', 'what', 'can', 'one', 'poor', 'voice', 'av', '##ail'], ['Against', 'three', 'tongue', '##s', 'together', '?'], ['Imperio', '##us', 'Prima', 'flash', '##es', 'forth', 'Her', 'edi', '##ct', '"', 'to', 'begin', 'it', '"', '[UNK]'], ['In', 'gent', '##ler', 'tone', 'Sec', '##unda', 'hopes'], ['"', 'There', 'will', 'he', 'non', '##sens', '##e', 'in', 'it', '!', '"', '[UNK]'], ['

## Run the model and print the resulting alignments.

In [None]:
# ---------------------------------------------------------------------------
# Loop-invariant setup: defined once instead of once per sentence pair.
# ---------------------------------------------------------------------------
class color:
  """ANSI escape sequences used to style the printed alignment pairs."""
  PURPLE = '\033[95m'
  CYAN = '\033[96m'
  DARKCYAN = '\033[36m'
  BLUE = '\033[94m'
  GREEN = '\033[92m'
  YELLOW = '\033[93m'
  RED = '\033[91m'
  BOLD = '\033[1m'
  UNDERLINE = '\033[4m'
  END = '\033[0m'

align_layer = 8   # index into the hidden-states tuple returned by the model
threshold = 1e-8  # a subword pair is kept only if both softmax directions exceed this
model.eval()

# One entry per sentence pair: [source tokens, target tokens, [[i, j], ...]]
word_alignments = []

for sent_src, sent_tgt in zip(src_tokenized_sents, tgt_tokenized_sents):

  # pre-processing
  # NOTE(review): sent_src / sent_tgt already hold tokenizer output (pieces
  # such as '##ure'), and each piece is tokenized again here. Confirm this is
  # intended rather than passing whitespace-split words.
  token_src = [tokenizer.tokenize(word) for word in sent_src]
  token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
  wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
  wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

  ids_src = tokenizer.prepare_for_model(
    list(itertools.chain(*wid_src)),
    return_tensors='pt',
    model_max_length=tokenizer.model_max_length,
    truncation=True,
  )['input_ids'].to(device)
  ids_tgt = tokenizer.prepare_for_model(
    list(itertools.chain(*wid_tgt)),
    return_tensors='pt',
    truncation=True,
    model_max_length=tokenizer.model_max_length,
  )['input_ids'].to(device)

  # For every subword position, the index of the input item it came from.
  sub2word_map_src = [i for i, word_list in enumerate(token_src) for _ in word_list]
  sub2word_map_tgt = [i for i, word_list in enumerate(token_tgt) for _ in word_list]

  # alignment
  with torch.no_grad():
    # [0, 1:-1] takes the single batch item and drops the first and last
    # positions (presumably the special tokens added by prepare_for_model).
    out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
    out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]

    # Similarity matrix: rows are source subwords, columns are target subwords.
    dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

    softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
    softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

    # Keep only pairs that pass the threshold in both directions.
    softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)

  align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
  align_words = set()
  # .tolist() yields plain Python ints, so the maps are not indexed with tensors.
  for i, j in align_subwords.tolist():
    align_words.add( (sub2word_map_src[i], sub2word_map_tgt[j]) )

  # printing
  alignments = []
  for i, j in sorted(align_words):
    print(f'{color.BOLD}{color.BLUE}{sent_src[i]}{color.END}==={color.BOLD}{color.RED}{sent_tgt[j]}{color.END}')
    alignments.append([i, j])

  word_alignments.append([sent_src, sent_tgt, alignments])


print(word_alignments)

Display the resulting alignments for the last two sentence pairs

In [140]:
# Render the last two sentence pairs; each entry unpacks into
# (source tokens, target tokens, list of [i, j] index pairs).
for src_tokens, tgt_tokens, alignment in word_alignments[-2:]:
    figure = show_word_alignments(src_tokens, tgt_tokens, alignment)
    display(figure)

In [145]:
# NOTE(review): absolute, machine-specific path. The relative file name
# "c1_word_aligned.txt" written below resolves against this directory.
%cd /workspace/novel-tongue/notebooks/word_alignments

/workspace/novel-tongue/notebooks/word_alignments


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [147]:
# Save results: one record per sentence pair, made of the space-joined source
# tokens, the space-joined target tokens, the alignment index pairs, and a
# blank separator line.
# The encoding is explicit so non-ASCII tokens are written the same way on
# every platform; records are streamed instead of concatenated into one string.
with open("c1_word_aligned.txt", "w", encoding="utf-8") as f:
    for src_tokens, tgt_tokens, alignment in word_alignments:
        f.write(" ".join(src_tokens) + "\n" + " ".join(tgt_tokens) + "\n" + str(alignment) + "\n\n")

## Align spaCy and mBert tokenizations

Get spaCy tokenized sentences

In [115]:
# Run each source sentence through the spaCy pipeline and keep only the
# token texts (target sentences are not processed here).
src_spacy_tokenized_sents = [
    [tok.text for tok in nlp_en(sentence)]
    for sentence in src_sents
]

print(src_spacy_tokenized_sents)

[['All', 'in', 'the', 'golden', 'afternoon'], ['Full', 'leisurely', 'we', 'glide', ';', 'For', 'both', 'our', 'oars', ',', 'with', 'little', 'skill', ','], ['By', 'little', 'arms', 'are', 'plied', ',', 'While', 'little', 'hands', 'make', 'vain', 'pretence'], ['Our', 'wanderings', 'to', 'guide', '.'], ['Ah', ',', 'cruel', 'Three', '!'], ['In', 'such', 'an', 'hour', '.', 'Beneath', 'such', 'dreamy', 'weather', '.'], ['To', 'beg', 'a', 'tale', 'of', 'breath', 'too', 'weak'], ['To', 'stir', 'the', 'tiniest', 'feather', '!'], ['Yet', 'what', 'can', 'one', 'poor', 'voice', 'avail'], ['Against', 'three', 'tongues', 'together', '?'], ['Imperious', 'Prima', 'flashes', 'forth', 'Her', 'edict', '"', 'to', 'begin', 'it', '"', '—'], ['In', 'gentler', 'tone', 'Secunda', 'hopes'], ['"', 'There', 'will', 'he', 'nonsense', 'in', 'it', '!', '"', '—'], ['While', 'Tertia', 'interrupts', 'the', 'tale'], ['Not', 'more', 'than', 'once', 'a', 'minute', '.'], ['Anon', ',', 'to', 'sudden', 'silence', 'won', ','

In [None]:
def transform(input_list):
    """Flatten a list of index lists into ``[i, j]`` pairs.

    Args:
        input_list: a sequence whose element at position ``i`` is an iterable
            of indices ``j``.

    Returns:
        A list with one ``[i, j]`` entry per index, ordered by ``i`` and then
        by the order of ``j`` within each sub-list.
    """
    return [[i, j] for i, sublist in enumerate(input_list) for j in sublist]


# Compare the spaCy and mBERT tokenizations of the same source sentence.
for spacy_src_tokens, mbert_src_tokens in zip(src_spacy_tokenized_sents, src_tokenized_sents):
    print(spacy_src_tokens, mbert_src_tokens)

    # a2b presumably lists, for each spaCy token, the indices of the mBERT
    # tokens it corresponds to (b2a being the reverse) -- TODO confirm.
    a2b, b2a = tokenizations.get_alignments(spacy_src_tokens, mbert_src_tokens)

    display(show_word_alignments(spacy_src_tokens, mbert_src_tokens, transform(a2b)))