In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np  
stemmer = WordNetLemmatizer()
import spacy
from nltk.corpus import stopwords
import torch
import tensorflow
from sklearn.metrics.pairwise import cosine_similarity
import string 
import random
import nltk

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2100000

In [3]:
def preprocess_text(item):
    item = re.sub(r"\d+%", " ", item)
    item = re.sub(r"x\d+", " ", item)
    item = re.sub(r"\d+", " ", item)
    item = re.sub(r"\n", " ", item)
    item = re.sub(r"\[.+\]", " ", item)
    item = re.sub(r"\\+.+;", " ", item)
    item = re.sub(r"http.+", " ", item)
    item = re.sub(r"\{.*\}", " ", item)
    item = re.sub(r" [xX] ", " ", item)
    item = re.sub(r"%[sd]", " ", item)
    item = re.sub(r"<.+>", " ", item)
    item = re.sub(r"[\U00010000-\U0010ffff]", " ", item)
    item = re.sub(r"[!@#$%\^\&\*()_=+\?\!:;\",\.\\»«—✨]", " ", item)
    item = re.sub(r"\s+", " ", item)
    item = item.strip(' ')
    item = item.lower()
    
    tokens = item.split()
    tokens = [nlp(word)[0].lemma_ if word != "flowerbed" else "flowerbed" for word in tokens]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [7]:
data = pd.read_csv('pi2.csv', sep=';', low_memory=False)
data = data[['string_id','Russian', 'English']]
index_names = data[data['Russian'].str.contains("тест" or "test")==True].index
data.drop(index_names, inplace = True)
data = data.dropna()
data = data.drop_duplicates()
data = data[data.Russian != data.English]
index_names2 = data[data['string_id'].str.contains("achieve", "title") ==True].index
data.drop(index_names2, inplace = True)
data = data[['Russian', 'English']].drop_duplicates()
data_en = data['English'].tolist()
final_corpus = [preprocess_text(sentence) for sentence in data_en if len(sentence) <= 80 and len(sentence) > 0]

In [9]:
data_str = ' '.join(final_corpus)

In [10]:
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2100000
doc = nlp(data_str)

In [11]:
nouns = set()
for token in doc:
    if token.pos_ == "NOUN":
        nouns.add(token.lemma_)

In [17]:
verbs = set()
for token in doc:
    if token.pos_ == "VERB":
        if token.text != "\'ve" and token.text != "s" and token.text != "\'m" and token.text != "\'re":
            if len(token.text) > 2:
                verbs.add(token.lemma_)

{'figure', 'handle', 'grow', 'click', 'solve', 'mind', 'florist', 'mask', 'generate', 'herme', 'bring', 'say', 'cover', 'change', 'coordinate', 'legend', 'flowerbe', 'dedicate', 'splendid', 'edge', 'inform', 'create', 'splurge', 'multicolore', 'live', 'crowd', 'approve', 'fire', 'nip', 'view', 'collapse', 'plan', 'egypt', 'envy', 'remind', 'summon', 'unlock', 'fly', 'eek', 'wave', 'ultraoffer', 'let', 'come', 'explode', 'distract', 'bet', 'abandon', 'bloom', 'stabilize', 'mess', 'bite', 'earn', 'fun', 'facebook', 'arrrghh', 'swat', 'bear', 'gurgle', 'harm', 'read', 'advance', 'amphitheater', 'smell', 'draw', 'nova', 'call', 'access', 'pose', 'jump', 'strengthen', 'know', 'sail', 'coffer', 'root', 'tell', 'flatter', 'exchange', 'apply', 'fish', 'elsa', 'invite', 'champion', 'dolphin', 'moor', 'envelope', 'weep', 'donate', 'rise', 'avoid', 'microphone', 'scare', 'apologize', 'fall', 'run', 'protect', 'intrigue', 'wanna', 'rate', 'wonder', 'repel', 'slip', 'remain', 'mistake', 'arrive', '

In [45]:
all_pos = nouns.union(verbs)

In [19]:
stop_words = "english"

# Extract candidate words/phrases
count = TfidfVectorizer(min_df = 5, stop_words=stop_words).fit(final_corpus)
bag_of_words = count.transform(final_corpus)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
all_candidates = []
for item in words_freq[:700]:
    all_candidates.append(item[0])

In [None]:
candidates = [x for x in all_candidates in x in nouns]
candidates2 = [x for x in all_candidates in x in verbs]
keywords = set(candidates + candidates2)

In [23]:
print(keywords)

{'sky', 'grow', 'click', 'florist', 'development', 'beauty', 'renaissance', 'bring', 'say', 'change', 'item', 'test', 'legend', 'guy', 'calm', 'reserve', 'hologram', 'adventure', 'dynamite', 'stone', 'frost', 'earth', 'create', 'water', 'patrick', 'tonic', 'live', 'demolition', 'plan', 'truck', 'championship', 'friendship', 'road', 'shuttle', 'remind', 'summon', 'unlock', 'fly', 'traveler', 'palace', 'bakery', 'wave', 'eek', 'preparation', 'research', 'come', 'let', 'shaman', 'piggy', 'diner', 'device', 'abandon', 'plantation', 'club', 'spider', 'building', 'century', 'bite', 'pet', 'earn', 'fun', 'facebook', 'giraffe', 'market', 'player', 'breath', 'bear', 'mood', 'black', 'read', 'refill', 'smell', 'draw', 'museum', 'access', 'sand', 'opportunity', 'cream', 'game', 'know', 'badge', 'bunch', 'money', 'battle', 'bouncy', 'coffer', 'diving', 'cart', 'tell', 'unioner', 'operation', 'power', 'rescue', 'feature', 'fish', 'enlightenment', 'invite', 'champion', 'monkey', 'dolphin', 'fairy', 

In [24]:
import pickle
with open('keywords_short.pkl', 'wb') as f:
       pickle.dump(set(keywords), f)