# Aligning Vector Spaces

To match the extracted one-word EN terms with their RU translations, I use the vector space alignment method described here:
https://github.com/babylonhealth/fastText_multilingual
align_your_own.ipynb,
fasttext.py

The idea is to align the two vector spaces using anchors (a small dictionary of basic EN words and their RU translations) and then look for matches with cosine similarity.

In [1]:
import numpy as np
%run utility_file    # handles module imports and loading .csv files
from utility_file import Preprocess     # custom class for preprocessing text
import fasttext
import operator

def normalized(a, axis=-1, order=2):
    """
    Scale the vectors of `a` that lie along `axis` to unit `order`-norm.

    Vectors whose norm is exactly zero are divided by 1 instead, so they
    come back unchanged rather than producing a division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # Swap zero norms for 1 so the division below is a no-op for those vectors.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired training matrices for the alignment.

    source_dictionary / target_dictionary are the FastVector objects of the
    source and target languages (anything supporting `in` and `[]` works).
    bilingual_dictionary is a list of translation pair tuples
    [(source_word, target_word), ...].

    A pair is kept only when both of its words are present in their
    respective dictionaries. Returns (source_matrix, target_matrix) as numpy
    arrays whose i-th rows are the vectors of the i-th kept pair.
    """
    paired_vectors = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]
    source_rows = [src_vec for src_vec, _ in paired_vectors]
    target_rows = [tgt_vec for _, tgt_vec in paired_vectors]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns the source space to the target space.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) holding the paired word vectors
    from the bilingual dictionary. With normalize_vectors=True the rows of
    both matrices are first passed through `normalized`.

    Returns the product U @ V of the SVD factors of source^T @ target; apply
    it by right-multiplying source vectors with it.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # SVD of the (dim x dim) product of the paired vectors.
    cross_product = src.T @ tgt
    left, _singular_values, right = np.linalg.svd(cross_product)

    # Dropping the singular values leaves an orthogonal matrix.
    return left @ right

class FastVector:
    """
    In-memory table of word vectors read from a text file.

    The file layout read by __init__ is: a header line "<n_words> <n_dim>",
    then one line per word consisting of the token followed by its
    space-separated vector components.

    Attributes:
        word2id: dict mapping token -> row index in `embed`.
        id2word: list mapping row index -> token.
        n_words, n_dim: the two integers from the header line.
        embed: numpy array of shape (n_words, n_dim) with the vectors.
        softmax_denominators: cache used by translate_inverted_softmax
            (None until the first computation).
    """

    def __init__(self, vector_file='', transform=None):
        """
        Read the vectors from `vector_file` (UTF-8 text).

        If `transform` is given it is applied right away via apply_transform;
        it may be a matrix or a path readable by np.loadtxt.
        """
        self.word2id = {}
        self.id2word = []
        # Must exist before translate_inverted_softmax tests it for None.
        self.softmax_denominators = None

        print('reading word vectors from %s' % vector_file)
        # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
        # cannot decode non-Latin tokens such as Cyrillic words.
        with open(vector_file, 'r', encoding='utf-8') as f:
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # The slice ignores anything after the n_dim components
                # (e.g. an empty string left by a trailing space).
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Right-multiply every vector by `transform`, replacing `embed`.

        `transform` is either a matrix or a string path to a text file that
        np.loadtxt can read.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)
        # Cached denominators were computed from the old vectors.
        self.softmax_denominators = None

    def export(self, outpath):
        """
        Write the vectors to `outpath` (UTF-8) in the same layout that
        __init__ reads, with components formatted to six decimal places.
        """
        # The context manager closes the file even if a write fails.
        with open(outpath, "w", encoding='utf-8') as fout:
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                vector_as_string = " ".join(vector_components)

                out_line = token + " " + vector_as_string + "\n"
                fout.write(out_line)

    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        # source_vector is not normalised here; a positive scale factor does
        # not change which row attains the maximum.
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.
        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from previous call are reused if recalculate=False. This saves
        time if multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)
        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))
                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())
                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size
        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))
        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
                         self.softmax_denominators
        # pick highest score as translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]

    def get_samples(self, nsamples):
        """
        Return a matrix of nsamples randomly sampled vectors from embed.

        Rows are drawn without replacement, so nsamples must not exceed the
        number of rows in embed.
        """
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]

    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """
        Utility function to normalise the rows of a numpy array.

        Rows with zero norm are divided by 1 and therefore stay unchanged.
        """
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """
        Compute cosine similarity between vec_a and vec_b.

        There is no guard against zero-norm inputs.
        """
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        """True if `key` is a token in the vocabulary."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the vector of token `key`; raises KeyError if it is unknown."""
        return self.embed[self.word2id[key]]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_anchors(path):
    '''
    Load a UTF-8 .txt file with a bilingual dictionary and turn it into a
    list of tuples that will be used as anchors for the transformation.

    The file is split on whitespace and its tokens are consumed in
    consecutive pairs. Each pair is swapped on output: the token at the even
    position becomes the SECOND element of the tuple and the token after it
    becomes the first. An unpaired token at the end of the file is ignored.

    Returns a list of (second_token, first_token) tuples; an empty file
    gives an empty list.
    '''
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding="utf-8") as file:
        word_list = file.read().split()

    # zip stops at the shorter slice, which drops a dangling last token
    # instead of failing with an IndexError.
    return list(zip(word_list[1::2], word_list[0::2]))

In [3]:
# Loading the previously saved models and the anchor file

# Each FastVector reads a whole .vec text file into memory.
en_dictionary = FastVector(vector_file='en_model_converted.vec')
ru_dictionary = FastVector(vector_file='ru_model_converted.vec')
# load_anchors swaps each token pair of the file, so with RU-first entries
# the tuples come out as (EN word, RU word) - presumably the file's layout,
# given its name; verify against the data.
enru_anchors = load_anchors('ru-en.txt')

reading word vectors from en_model_converted.vec
reading word vectors from ru_model_converted.vec


In [4]:
# form the training matrices
# (EN is the source space, RU the target; only anchor pairs found in both
# vocabularies contribute a row)
source_matrix, target_matrix = make_training_matrices(en_dictionary, ru_dictionary, enru_anchors)

# learn and apply the transformation
# (apply_transform replaces en_dictionary.embed; ru_dictionary is left as is)
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

Below I check how the system works: I enter an arbitrary word and see what translation the system finds for it, if it finds any at all.

In [5]:
# Interactive sanity check: up to 100 lookups, ended early by typing 'stop'.
for _ in range(100):
    word = input('Look up word or say stop: ')
    if word == 'stop':
        break
    # Test vocabulary membership explicitly instead of catching every
    # exception, so real errors are not reported as a missing word.
    if word not in en_dictionary:
        print('No such word')
        continue
    print(ru_dictionary.translate_nearest_neighbour(en_dictionary[word]))

Look up word or say stop: beautiful
настрой
Look up word or say stop: building
здание
Look up word or say stop: day
день
Look up word or say stop: cat
котик
Look up word or say stop: sun
бесконечный
Look up word or say stop: mano
мано
Look up word or say stop: swim
штурвал
Look up word or say stop: ship
знать
Look up word or say stop: island
остров
Look up word or say stop: chest
сундук
Look up word or say stop: dorin
дорин
Look up word or say stop: monkey
рия
Look up word or say stop: workshop
настолько
Look up word or say stop: stop


# Finding matches

The system clearly doesn't work well, as the test in the previous cell shows. I tried to improve its performance by making it look for the closest match only among the 3 most common words appearing in the parallel target segments.

In [6]:
# loading .csv corpus

path = 'pi2.csv'
source_lang = 'English'
target_lang = 'Russian'

# load_separate_corpora_from_csv is not defined in this notebook; it
# presumably comes from utility_file (%run above) - TODO confirm.
source_list, target_list = load_separate_corpora_from_csv(path, source_lang, target_lang)

In [13]:
import pickle

# Restore the pickled keyword collection.
with open('keywords.pkl', 'rb') as f:
    keywords = pickle.load(f)

# Restore the pickled clean corpora (EN first, then RU).
with open('clean_en_corpus.pkl', 'rb') as f:
    clean_en_corpus = pickle.load(f)
with open('clean_ru_corpus.pkl', 'rb') as f:
    clean_ru_corpus = pickle.load(f)

Next I'm building a dictionary where the keys are terms from the EN corpus and the values are the top 3 most common words in the parallel segments of the RU corpus. To narrow down the pool further, I will only be looking at words of the same POS (part of speech).

In [14]:
from collections import Counter

# POS groups. Each list mixes the tag names checked against the EN tagger
# output (`nlp(...)[0].pos_`) and against the RU analyser output
# (`morph.parse(...)[0].tag.POS`), so one list serves both membership tests.
noun_list = ['NOUN']
verb_list = ['VERB', 'INFN']
ad_list = ['ADJ', 'ADJF', 'ADJS', 'ADV', 'ADVB']     # both adverbs and adjectives

short_keywords = [keyword for keyword in keywords if len(keyword.split()) == 1]     # only looking at unigrams
# NOTE: a dict keeps only the last RU segment for an EN segment that occurs
# more than once in the corpus.
clean_corpus_as_dict = dict(zip(clean_en_corpus, clean_ru_corpus))
keyword_pool_dict = {}
for keyword in short_keywords:
    source_pos = nlp(keyword)[0].pos_
    # Pick the POS group of the keyword. None means the tag belongs to no
    # group; it must be reset on every iteration so that a value left over
    # from the previous keyword is never reused.
    pos = next(
        (group for group in (noun_list, verb_list, ad_list) if source_pos in group),
        None,
    )
    # only looking at strings < 80 symbols, as longer strings are less likely
    # to contain terms and will clutter the corpus
    target_pool_sent = [
        target_sent
        for source_sent, target_sent in clean_corpus_as_dict.items()
        if len(source_sent) < 80 and keyword in source_sent.split()
    ]
    target_pool_words = ' '.join(target_pool_sent).split()
    if pos is None:
        # No usable POS for this keyword: keep every candidate word rather
        # than filtering by an unrelated POS group.
        target_pool_pos = target_pool_words
    else:
        target_pool_pos = [word for word in target_pool_words if str(morph.parse(word)[0].tag.POS) in pos]
    most_common = Counter(target_pool_pos).most_common(3)
    keyword_pool_dict[keyword] = [key for key, val in most_common]

In [15]:
# taking a look at the pool

# orient='index' makes every EN keyword a row label; the columns hold its
# candidate RU words in the order stored in keyword_pool_dict.
keyword_pool_df = pd.DataFrame.from_dict(keyword_pool_dict, orient='index')
keyword_pool_df

Unnamed: 0,0,1,2
sky,небо,кристалл,дракон
flower,цвет,цветок,корзинка
banish,прогнать,прогонять,получить
bundle,комплект,снег,шиворот
lantern,отправить,получить,украсить
...,...,...,...
time,время,награда,прибыль
safe,безопасный,точно,любой
monkey,обезьянка,сезон,игрок
golem,голем,голь,сумка


In [16]:
# Now from the pool above I'll be picking the word with the highest cosine similarity to the source term

termbase = {}
for source_term, candidates in keyword_pool_dict.items():
    # A source term without a vector cannot be scored at all.
    if source_term not in en_dictionary:
        continue
    source_vector = en_dictionary[source_term]
    # Score only the candidates that have a vector in the RU model.
    cos_dict = {
        target_term: FastVector.cosine_similarity(source_vector, ru_dictionary[target_term])
        for target_term in candidates
        if target_term in ru_dictionary
    }
    # No scored candidate -> no termbase entry for this source term.
    if cos_dict:
        termbase[source_term] = max(cos_dict.items(), key=operator.itemgetter(1))[0]

In [17]:
# taking a look

# One row per EN term (orient='index'); the single column holds the RU word
# stored for it in termbase.
termbase_df = pd.DataFrame.from_dict(termbase, orient='index')
termbase_df

Unnamed: 0,0
sky,кристалл
flower,цветок
banish,прогнать
bundle,комплект
lantern,украсить
...,...
time,награда
safe,точно
monkey,обезьянка
golem,голь


Looks more or less fine.
Now pickling the results.

In [35]:
import pickle

# Persist the final EN -> RU termbase.
with open('termbase_unigrams.pkl', 'wb') as f:
    pickle.dump(termbase, f)
# Persist the top-3 candidate pool for each EN keyword.
with open('termbase_top_3.pkl', 'wb') as f:
    pickle.dump(keyword_pool_dict, f)