In [1]:
from gensim.models.fasttext import FastText
import fasttext
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer
import pandas as pd

import nltk
nltk.download('stopwords')

import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('D:\\python\\CL\\gob\\pi222.csv', sep=';', low_memory=False)
data = data[['Russian', 'English']]
index_names = data[data['Russian'].str.contains("тест" or "test")==True].index
data.drop(index_names, inplace = True)
data = data.dropna()
data = data[data.Russian != data.English]
data = data.drop_duplicates(subset=['English', 'Russian'])
data_ru = data['Russian'].tolist()

In [4]:
def preprocess_text(item):
    item = re.sub(r"\d+%", " ", item)
    item = re.sub(r"x\d+", " ", item)
    item = re.sub(r"\d+", " ", item)
    item = re.sub(r"\n", " ", item)
    item = re.sub(r"\[.+\]", " ", item)
    item = re.sub(r"\\.+\\;", " ", item)
    item = re.sub(r"http.+", " ", item)
    item = re.sub(r"\{.*\}", " ", item)
    item = re.sub(r" [xX] ", " ", item)
    item = re.sub(r"%[sd]", " ", item)
    item = re.sub(r"<.+>", " ", item)
    item = re.sub(r"[\U00010000-\U0010ffff]", " ", item)
    item = re.sub(r"[!@#$%\^\&\*()_=+\?\!:;\",\.\\»«—№]", " ", item)
    item = re.sub(r"\s+", " ", item)
    item = item.strip(' ')
    item = item.lower()
    
    tokens = item.split()
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    tokens = [word for word in tokens if word not in stopwords.words('russian')]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [5]:
final_corpus = [preprocess_text(sentence) for sentence in data_ru if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

In [20]:
embedding_size = 15
window_size = 60
min_word = 5
down_sampling = 1e-2

ft_model = FastText(word_tokenized_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

In [11]:
print(ft_model.wv['ключ'])

[-0.8197248  -0.2606538   0.09210553  0.4402155  -0.90772134  0.3352683
 -1.8322273   0.1421681  -1.0818759   0.64429766  0.84142375 -0.5680857
 -0.01411205 -0.5607831   0.41943598 -0.7892568  -0.34489664  1.1795076
 -0.5877148   0.31188834 -0.46960866  0.53191334  0.39933002  0.08426717
  0.9956709   0.5206764  -0.5992941  -0.2144051  -0.96196294 -0.6895037
  0.5823305   0.25126904  0.07979515 -0.27547386  0.30598214  0.7957636
 -0.21333589  0.3610347  -0.7073324  -0.02736245  0.87782854 -0.38474396
 -0.42535922  0.33477142  0.30496755 -0.7367027   0.54787797  0.5072618
 -1.1048433  -0.88642305 -0.877676   -0.5686525   0.03365183 -0.33665943
 -0.08682787 -1.495623   -1.1425389  -1.8019029   0.4931613   0.25050402]


In [27]:
semantically_similar_words = {words: [item[0] for item in ft_model.wv.most_similar([words], topn=5)]
                  for words in ['кристалл', 'наоми', 'ключ', 'остров', 'строить', 'награда']}

for k,v in semantically_similar_words.items():
    print(k+":"+str(v))

кристалл:['купить', 'ресурс', 'лавка', 'полно', 'докупить']
наоми:['казаться', 'решить', 'мано', 'забирать', 'рука']
ключ:['старинный', 'представитель', 'лежать', 'цивилизация', 'наслать']
остров:['решить', 'отметить', 'всё', 'начать', 'должный']
строить:['условие', 'поднебесный', 'аксессуар', 'пятизвёздочный', 'обладатель']
награда:['приз', 'отличный', 'получать', 'аплодисменты', 'стать']


In [26]:
print(ft_model.wv.similarity(w1='мано', w2='дорин'))
print(ft_model.wv.similarity(w1='отель', w2='событие'))
print(ft_model.wv.similarity(w1='ключ', w2='сундук'))

0.9062874
0.78880155
0.60436195


In [28]:
from gensim.models.fasttext import save_facebook_model
save_facebook_model(ft_model, "ru_model_fb.bin", encoding='utf-8')