In [11]:
import spacy
import json
import spacy_alignments as tokenizations


In [9]:
# Parallel sentence pair analysed throughout this cell (English source, Spanish target).
src_text = "how funny it'll seem to come out among the people that walk with their heads downward!"
tgt_text = "¡Qué divertido sería salir donde vive esta gente que anda cabeza abajo!"

# Load one spaCy pipeline per language and run it over the matching sentence.
nlp_src = spacy.load("en_core_web_sm")
nlp_tgt = spacy.load("es_core_news_sm")

doc_src = nlp_src(src_text)
doc_tgt = nlp_tgt(tgt_text)

# Surface text of every token, in document order.
src_tokens = [token.text for token in doc_src]
tgt_tokens = [token.text for token in doc_tgt]
# json.dumps gives a double-quoted list; ensure_ascii=False keeps non-ASCII
# characters (e.g. "¡", "é") readable instead of emitting \uXXXX escapes.
print("Source tokens:", json.dumps([i.strip() for i in src_tokens], ensure_ascii=False))
print("Target tokens:", json.dumps([i.strip() for i in tgt_tokens], ensure_ascii=False))

# Part-of-speech tag (token.pos_) for each token.
src_pos = [token.pos_ for token in doc_src]
tgt_pos = [token.pos_ for token in doc_tgt]
print("Source POS tags:", src_pos)
print("Target POS tags:", tgt_pos)

# (token text, dependency label, head token text) triples for each token.
src_dep = [(token.text, token.dep_, token.head.text) for token in doc_src]
tgt_dep = [(token.text, token.dep_, token.head.text) for token in doc_tgt]
print("Source dependencies:", src_dep)
print("Target dependencies:", tgt_dep)

Source tokens: ["how", "funny", "it", "'ll", "seem", "to", "come", "out", "among", "the", "people", "that", "walk", "with", "their", "heads", "downward", "!"]
Target tokens: ["\u00a1", "Qu\u00e9", "divertido", "ser\u00eda", "salir", "donde", "vive", "esta", "gente", "que", "anda", "cabeza", "abajo", "!"]
Source POS tags: ['SCONJ', 'ADJ', 'PRON', 'AUX', 'VERB', 'PART', 'VERB', 'ADP', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'PRON', 'NOUN', 'ADV', 'PUNCT']
Target POS tags: ['PUNCT', 'DET', 'ADJ', 'AUX', 'VERB', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'NOUN', 'ADV', 'PUNCT']
Source dependencies: [('how', 'advmod', 'funny'), ('funny', 'acomp', 'seem'), ('it', 'nsubj', 'seem'), ("'ll", 'aux', 'seem'), ('seem', 'ROOT', 'seem'), ('to', 'aux', 'come'), ('come', 'xcomp', 'seem'), ('out', 'prt', 'come'), ('among', 'prep', 'come'), ('the', 'det', 'people'), ('people', 'pobj', 'among'), ('that', 'nsubj', 'walk'), ('walk', 'relcl', 'people'), ('with', 'prep', 'walk'), ('their', 'poss', '

In [15]:
# Build the spaCy token list straight from doc_src rather than reading the
# global `src_tokens`: that name is reassigned to toy example data in another
# cell of this notebook, so relying on it makes this cell's result depend on
# the order in which cells were executed.
spacy_src_tokens = [token.text for token in doc_src]
# Hand-written subword tokenization of the same sentence ("##" prefixes mark
# word continuations) — presumably mBERT WordPiece output; verify if regenerated.
mbert_src_tokens = ["how", "fun", "##ny", "it", "'", "ll", "seem", "to", "come", "out", "among", "the", "people", "that", "walk", "with", "their", "heads", "down", "##ward", "!"]

# a2b[i] lists the mBERT token indices matching spaCy token i; b2a is the reverse mapping.
a2b, b2a = tokenizations.get_alignments(spacy_src_tokens, mbert_src_tokens)
print("a2b:", a2b)
print("b2a:", b2a)

a2b: [[0], [1, 2], [3], [4, 5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18, 19], [20]]
b2a: [[0], [1], [1], [2], [3], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [16], [17]]


In [1]:
from word_alignment_visualization import show_word_alignments

# Toy English/French sentence pair for trying out the alignment visualizer.
# NOTE: `src_tokens` and `tgt_tokens` are also assigned from the spaCy docs in
# an earlier cell; running this cell overwrites those values.
src_tokens = "I love coding".split()
tgt_tokens = "J'aime coder".split()
# One two-element entry per link — looks like [source index, target index];
# TODO confirm the expected order against show_word_alignments.
alignment = [
    [0, 0],
    [1, 0],
    [2, 1],
]

# Render the word-alignment view for the example pair.
show_word_alignments(src_tokens, tgt_tokens, alignment)