In [3]:
import numpy as np
import fasttext
import operator

def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        if source in source_dictionary and target in target_dictionary:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

class FastVector:
    def __init__(self, vector_file='', transform=None):
        self.word2id = {}
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def export(self, outpath):

        fout = open(outpath, "w")

        fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
        for token in self.id2word:
            vector_components = ["%.6f" % number for number in self[token]]
            vector_as_string = " ".join(vector_components)

            out_line = token + " " + vector_as_string + "\n"
            fout.write(out_line)

        fout.close()
    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.
        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from previous call are reused if recalculate=False. This saves
        time if multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)
        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))
                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())
                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size
        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))
        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
                         self.softmax_denominators
        # pick highest score as translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]

    def get_samples(self, nsamples):
        """Return a matrix of nsamples randomly sampled vectors from embed"""
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]
    
    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """Utility function to normalise the rows of a numpy array."""
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm


    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]

In [61]:
ruen = open("ru-en2.txt", encoding="utf-8").read()

ruen_list = ruen.split()
enru_dict = []
for ind in range(len(ruen_list)):
    if ind % 2 == 0:
        pair = (ruen_list[ind+1], ruen_list[ind])
        enru_dict.append(pair)

In [63]:
en_dictionary = FastVector(vector_file='en_model_fb.vec')
ru_dictionary = FastVector(vector_file='ru_model_fb.vec')


reading word vectors from en_model_fb.vec
reading word vectors from ru_model_fb.vec


In [64]:
# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary, ru_dictionary, enru_dict)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

In [74]:
en_vector = en_dictionary["key"]
ru_vector = ru_dictionary["ключ"]
print(FastVector.cosine_similarity(en_vector, ru_vector))

0.7919760657988609


In [66]:
ru_dictionary.translate_nearest_neighbour(en_dictionary["hotel"])

'отель'

In [32]:
import pickle
with open('D:\\python\\CL\\gob\\keywords_short.pkl', 'rb') as f:
    keywords = pickle.load(f)

In [20]:
import pandas as pd

data = pd.read_csv('D:\\python\\CL\\gob\\pi2.csv', sep=';', low_memory=False)
data = data[['string_id','Russian', 'English']]
index_names = data[data['Russian'].str.contains("тест" or "test")==True].index
# # index_names
data.drop(index_names, inplace = True)
data = data.dropna()
data = data.drop_duplicates()
data = data[data.Russian != data.English]
index_names2 = data[data['string_id'].str.contains("achieve", "title") ==True].index
data.drop(index_names2, inplace = True)
data = data[['Russian', 'English']].drop_duplicates()
data_dict = pd.Series(data.Russian.values,index=data.English).to_dict()

In [67]:
data_with_keyword = data[data['English'].str.lower().str.contains("key")]
russian = data_with_keyword.iloc[15]["Russian"]
for word in preprocess_text_ru(russian).split():
    ru_vector = ru_dictionary[word]
    print(word, FastVector.cosine_similarity(en_dictionary["key"], ru_vector))

открыть 0.6342611412358431
замок 0.5073786675186719
ключ 0.7919760657988609
спасти 0.4275333458327296
зверюшка 0.6906345955807068


In [21]:
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()
import re
import nltk
from nltk.corpus import stopwords

def preprocess_text_ru(item):
    item = re.sub(r"\d+%", " ", item)
    item = re.sub(r"x\d+", " ", item)
    item = re.sub(r"\d+", " ", item)
    item = re.sub(r"\n", " ", item)
    item = re.sub(r"\[.+\]", " ", item)
    item = re.sub(r"\\.+\\;", " ", item)
    item = re.sub(r"http.+", " ", item)
    item = re.sub(r"\{.*\}", " ", item)
    item = re.sub(r" [xX] ", " ", item)
    item = re.sub(r"%[sd]", " ", item)
    item = re.sub(r"<.+>", " ", item)
    item = re.sub(r"[\U00010000-\U0010ffff]", " ", item)
    item = re.sub(r"[!@#$%\^\&\*()_=+\?\!:;\",\.\\»«—№]", " ", item)
    item = re.sub(r"\s+", " ", item)
    item = item.strip(' ')
    item = item.lower()
    
     # Lemmatization
#     punct = ".!:?"
#     try:
#         if item[-1] not in punct:
#             item = item + "."
#     except Exception:
#         pass
    tokens = item.split()
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    tokens = [word for word in tokens if word not in stopwords.words('russian')]
#     tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [22]:
import spacy
nlp = spacy.load('en_core_web_sm')
def preprocess_text_en(item):
    item = re.sub(r"\d+%", " ", item)
    item = re.sub(r"x\d+", " ", item)
    item = re.sub(r"\d+", " ", item)
    item = re.sub(r"\n", " ", item)
    item = re.sub(r"\[.+\]", " ", item)
    item = re.sub(r"\\+.+;", " ", item)
    item = re.sub(r"http.+", " ", item)
    item = re.sub(r"\{.*\}", " ", item)
    item = re.sub(r" [xX] ", " ", item)
    item = re.sub(r"%[sd]", " ", item)
    item = re.sub(r"<.+>", " ", item)
    item = re.sub(r"[\U00010000-\U0010ffff]", " ", item)
    item = re.sub(r"[!@#$%\^\&\*()_=+\?\!:;\",\.\\»«—-]", " ", item)
    item = re.sub(r"\s+", " ", item)
    item = item.strip(' ')
    item = item.lower()
    
     # Lemmatization
#     punct = ".!:?"
#     try:
#         if item[-1] not in punct:
#             item = item + "."
#     except Exception:
#         pass
    tokens = item.split()
    tokens = [nlp(word)[0].lemma_ if word != "flowerbed" else "flowerbed" for word in tokens]
#     tokens = [word for word in tokens if nltk.pos_tag([word])[0][1] == "NN" or "NNP"]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
#     tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [30]:
from collections import Counter

In [68]:
termbase = {}
keyvsru = {}
for keyword in list(keywords):
#     print(keyword)
    data_with_keyword = data[data['English'].str.lower().str.contains(keyword) == True]
    data_dict = pd.Series(data_with_keyword.Russian.values,index=data_with_keyword.English).to_dict()
    data_ru = []
    for x in data_dict:
        clean_string = preprocess_text_en(x)
        if keyword in clean_string.split():
            data_ru.append(data_dict[x])
    term_corpus = [preprocess_text_ru(sentence) for sentence in data_ru if len(sentence) < 80]
    data_ru_str = ' '.join(term_corpus)
    ru_pool = Counter(data_ru_str.split()).most_common(3)
    ru_keys = [key for key, val in ru_pool]
    cos_dict = {}
    for word in ru_keys:
        try:
            ru_vector = ru_dictionary[word]
            cos_dict[word] = FastVector.cosine_similarity(en_dictionary[keyword], ru_vector)
        except:
            continue
    try:
        translation = max(cos_dict.items(), key=operator.itemgetter(1))[0]
        termbase[keyword] = translation
    except:
        continue
        
    keyvsru[keyword] = ru_keys
print(keyvsru)
termbase

{'turtle': ['черепаха', 'черепашка', 'черепашонка'], 'past': ['прошлое', 'старый', 'победитель'], 'head': ['голова', 'остров', 'беда'], 'statuette': ['статуэтка', 'подарок', 'уникальный'], 'reduce': ['прибыль', 'здание', 'персонал'], 'candy': ['леденец', 'сладкий', 'любовный'], 'shuttle': ['мяу-шаттл', 'шаттл', 'сломать'], 'bunch': ['гроздь', 'букет', 'цвет'], 'surprise': ['сюрприз', 'открыть', 'получить'], 'mistake': ['ошибка', 'сыграть', 'совершить'], 'area': ['территория', 'купить', 'восточный'], 'bridge': ['мост', 'спасти', 'получить'], 'tea': ['чай', 'чаепитие', 'чайный'], 'tap': ['нажать', 'начать', 'сундук'], 'heart': ['сердце', 'сердечко', 'шоколадный'], 'pirate': ['пиратский', 'яхта', 'пират'], 'clean': ['уборка', 'порядок', 'это'], 'museum': ['музей', 'остров', 'провести'], 'fairy': ['фея', 'винтербелла', 'получить'], 'break': ['разбить', 'хотеть', 'здание'], 'cake': ['торт', 'тропический', 'кусочек'], 'launch': ['запустить', 'нажать', 'событие'], 'winner': ['победитель', 'ме

{'turtle': 'черепашонка',
 'past': 'прошлое',
 'head': 'голова',
 'statuette': 'статуэтка',
 'reduce': 'персонал',
 'candy': 'леденец',
 'shuttle': 'шаттл',
 'bunch': 'букет',
 'surprise': 'сюрприз',
 'mistake': 'ошибка',
 'area': 'восточный',
 'bridge': 'мост',
 'tea': 'чай',
 'tap': 'нажать',
 'heart': 'сердце',
 'pirate': 'пиратский',
 'clean': 'это',
 'museum': 'музей',
 'fairy': 'винтербелла',
 'break': 'разбить',
 'cake': 'кусочек',
 'launch': 'событие',
 'winner': 'победитель',
 'merry': 'весёлый',
 'lightning': 'молния',
 'happen': 'случиться',
 'score': 'набрать',
 'mastery': 'мастерство',
 'cozy': 'уютный',
 'unioner': 'союз',
 'jam': 'варение',
 'news': 'новость',
 'architect': 'кастлинга',
 'expand': 'расширить',
 'float': 'замок',
 'package': 'зелёный',
 'cruise': 'лайнер',
 'florist': 'флорист',
 'remove': 'извлечь',
 'activate': 'активировать',
 'health': 'здоровье',
 'thank': 'это',
 'design': 'чертёж',
 'passion': 'страсть',
 'meyer': 'мейер',
 'lao': 'лао',
 'friend':

In [69]:
import pickle
with open('termbase_unigrams_X.pkl', 'wb') as f:
       pickle.dump(termbase, f)

In [70]:
with open('termbase_3_X.pkl', 'wb') as f:
       pickle.dump(keyvsru, f)