In [1]:
from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer
import pandas as pd

import re

# NOTE(review): despite the name `stemmer`, this object is NLTK's WordNet lemmatizer.
# None of the cells visible in this notebook reference it (lemmatization in
# preprocess_text goes through spaCy's `nlp` instead) — confirm it is still needed.
stemmer = WordNetLemmatizer()

In [2]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; preprocess_text uses `nlp` to obtain
# the lemma of each token.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Location of the ';'-separated file that provides the 'Russian' and 'English' columns.
DATA_PATH = 'D:\\python\\CL\\gob\\pi2.csv'

data = pd.read_csv(DATA_PATH, sep=';', low_memory=False)
data = data[['Russian', 'English']]

# Drop rows whose Russian text contains "тест" or "test".
# The previous condition was written as `"тест" or "test"`, which Python evaluates
# to just "тест" (`or` returns its first truthy operand), so the Latin spelling was
# never checked. A regex alternation matches either spelling; na=False makes
# missing values count as "no match" instead of relying on `== True`.
index_names = data[data['Russian'].str.contains("тест|test", na=False)].index
# Non-inplace drop: `data` is a column subset of the frame that was read, and
# mutating it in place can trigger pandas' SettingWithCopyWarning.
data = data.drop(index_names)

# Remove rows with a missing value in either column, then exact duplicate pairs.
data = data.dropna()
data = data.drop_duplicates(subset=['English', 'Russian'])
# Discard rows where both columns hold the same text
# (presumably untranslated strings — TODO confirm).
data = data[data.Russian != data.English]

# Plain Python list of the remaining English strings, consumed by later cells.
data_en = data['English'].tolist()

In [5]:
# Substitutions applied in this exact order; every match is replaced by one space.
# NOTE(review): the bracket/brace/tag patterns are greedy, so everything between the
# first opener and the last closer in a string is removed — confirm that is intended.
_CLEANUP_PATTERNS = (
    r"\d+%",                                    # percentages such as "50%"
    r"x\d+",                                    # multipliers such as "x3"
    r"\d+",                                     # any remaining digits
    r"\n",                                      # line breaks
    r"\[.+\]",                                  # [...] spans
    r"\\+.+;",                                  # backslash-introduced sequences ending in ';'
    r"http.+",                                  # from "http" to the end of the string
    r"\{.*\}",                                  # {...} spans
    r" [xX] ",                                  # a lone "x"/"X" between spaces
    r"%[sd]",                                   # printf-style %s / %d placeholders
    r"<.+>",                                    # <...> spans
    r"[\U00010000-\U0010ffff]",                 # characters outside the Basic Multilingual Plane
    r"[!@#$%\^\&\*()_=+\?\!:;\",\.\\»«—-]",     # punctuation and symbols
    r"\s+",                                     # collapse whitespace runs
)

# Filled on first use so the NLTK stop-word list is read only once.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(item):
    """Clean one string and return it as space-joined, lower-cased lemmas.

    Steps:
      1. Apply the regex substitutions in ``_CLEANUP_PATTERNS`` in order, each
         replacing its matches with a single space.
      2. Strip surrounding spaces and lower-case the result.
      3. Split on whitespace and replace each word by the lemma of the first
         spaCy token of that word; "flowerbed" is kept verbatim.
      4. Drop lemmas that appear in NLTK's English stop-word list.

    Args:
        item: the raw text to clean (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives.
    """
    for pattern in _CLEANUP_PATTERNS:
        item = re.sub(pattern, " ", item)
    item = item.strip(' ')
    item = item.lower()

    tokens = item.split()
    # "flowerbed" bypasses the lemmatizer and is passed through unchanged.
    tokens = [nlp(word)[0].lemma_ if word != "flowerbed" else "flowerbed" for word in tokens]

    # The stop-word lookup used to call stopwords.words('english') once per token,
    # rebuilding the list and scanning it linearly each time; the cached frozenset
    # gives the same membership answers with a single load.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [6]:
# Run preprocess_text over every English string that is not blank after stripping.
final_corpus = list(
    map(preprocess_text, (sentence for sentence in data_en if sentence.strip() != ''))
)

# Split each cleaned string into tokens with NLTK's WordPunctTokenizer; the result
# is a list of token lists, one per cleaned string.
word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = list(map(word_punctuation_tokenizer.tokenize, final_corpus))

In [9]:
# Training hyper-parameters for the FastText model.
# NOTE(review): the vector printed in the saved output further down has 60
# components while embedding_size is 15 here, so that output presumably comes from
# a run with different settings — re-run the notebook top to bottom to confirm.
embedding_size = 15     # passed as `size`
window_size = 60        # passed as `window`
min_word = 5            # passed as `min_count`
down_sampling = 1e-2    # passed as `sample`

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs`. Keep them as long as the
# environment pins gensim 3.x.
ft_model = FastText(
    sentences=word_tokenized_corpus,
    sg=1,
    size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    iter=100,
)

In [14]:
# Print the embedding vector the trained model returns for 'piñata'.
print(ft_model.wv['piñata'])

[ 0.3109057   0.10911747 -0.17765515 -1.2655255  -0.30690217  0.73108137
 -0.9664295   0.19486967 -0.81640035  1.0871066   0.5006442  -0.6488135
  1.1173453   0.5123931  -0.86224246  0.3509511   1.8848457  -0.42018625
  0.59783083 -0.625262    0.62150514  1.252173   -0.1979469   0.493237
 -0.03667224 -0.9095915  -0.39532563 -1.1574185   0.03999989  1.137751
  0.05025969 -1.2376442  -0.4071073   0.28774914 -0.77259207 -1.1513922
 -1.3808985  -0.01335795 -0.6939756   1.2324158   0.14021489 -0.01564856
 -0.39438525 -0.4661787   0.32009938  0.0781133  -0.2658846   0.5526048
 -0.6981289  -0.89133275  0.25068483  0.25183654 -0.08277569 -0.3388166
 -0.40359524 -1.5942771   0.5266754  -0.6431995   0.29430228  1.3802017 ]


In [11]:
# For each probe word, keep only the words (not the scores) of its five nearest
# neighbours as returned by the model's most_similar lookup.
semantically_similar_words = {
    words: [neighbour for neighbour, _score in ft_model.wv.most_similar([words], topn=5)]
    for words in ['crystal', 'naomi', 'key', 'island', 'build', 'reward']
}

# One "word:[neighbours]" line per probe word.
for k, v in semantically_similar_words.items():
    print(f"{k}:{v}")

crystal:['buy', 'wondershop', 'purchase', 'currency', 'bank']
naomi:['come', 'say', 'cause', 'suppose', 'since']
key:['lock', 'purr', 'mrrr', 'vial', 'odd']
island:['set', 'paradise', 'micro', 'ultrabonus', 'middle']
build:['upgrade', 'building', 'eco', 'entertainment', 'instal']
reward:['prize', 'awesome', 'earn', 'willy', 'get']


In [10]:
# Print the model's similarity score for a few hand-picked word pairs, one per line.
for first, second in [('dorin', 'mano'), ('event', 'hotel'), ('gold', 'coin')]:
    print(ft_model.wv.similarity(w1=first, w2=second))

0.8932379
0.87251574
0.60690033


In [12]:
from gensim.models.fasttext import save_facebook_model
# Write the trained model to "en_model_fb.bin" (a relative path, so it is resolved
# against the current working directory) via gensim's save_facebook_model.
save_facebook_model(ft_model, "en_model_fb.bin", encoding='utf-8')