# Эмбеддинги

In [None]:
!pip install --trusted-host pypi --trusted-host files.pythonhosted.org gensim
# либо
!conda install -c conda-forge gensim

# colab.research.google.com

In [1]:
import gensim.downloader as api
# Load the 'glove-twitter-100' vectors by name through gensim's downloader API.
# Every later cell accesses the embeddings through the global `model`.
model = api.load('glove-twitter-100')

In [2]:
# Analogy query: words that are similar to "king" and "woman" while being
# dissimilar to "man" (the recorded output ranks 'queen' first).
model.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7052315473556519),
 ('prince', 0.6666139364242554),
 ('mother', 0.6436765193939209),
 ('royal', 0.6417251229286194),
 ('father', 0.5952690243721008),
 ('african', 0.5883978009223938),
 ('princess', 0.5882176160812378),
 ('called', 0.5842776894569397),
 ('meets', 0.5840279459953308),
 ('american', 0.5815179944038391)]

In [3]:
# Select the 1000 words with the highest stored frequency count.
# gensim 4 removed `model.vocab` in favour of `key_to_index` / `get_vecattr`,
# so pick whichever API the installed version provides.
if hasattr(model, "key_to_index"):      # gensim >= 4
    vocabulary = model.key_to_index

    def word_count(word):
        """Return the frequency count stored for `word` (gensim >= 4 API)."""
        return model.get_vecattr(word, "count")
else:                                   # gensim < 4
    vocabulary = model.vocab

    def word_count(word):
        """Return the frequency count stored for `word` (gensim < 4 API)."""
        return model.vocab[word].count

words = sorted(vocabulary.keys(), key=word_count, reverse=True)[:1000]

# Spot-check: show every 100th of the selected words.
print(words[::100])

['<user>', '_', 'please', 'apa', 'justin', 'text', 'hari', 'playing', 'once', 'sei']


In [4]:
import numpy as np

In [5]:
# Stack the embedding of every selected word into one 2-D array (one row per word).
word_vectors = np.array(list(map(model.get_vector, words)))

## Визуализация эмбеддингов слов с помощью t-SNE

In [6]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot with auxiliary info shown on hover.

    x, y    -- coordinates of the points.
    radius  -- marker size passed to the scatter glyph.
    alpha   -- marker opacity.
    color   -- a single colour name (applied to every point) or a per-point sequence.
    width, height -- figure dimensions.
    show    -- when true, display the figure immediately.
    kwargs  -- extra per-point columns; each one becomes a hover-tooltip row.

    Returns the bokeh figure.
    """
    # A single colour string is expanded to one entry per point.
    point_colors = [color] * len(x) if isinstance(color, str) else color

    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    # One tooltip row per extra column, referencing the column as "@name".
    hover_rows = [(name, "@" + name) for name in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_rows))

    if show:
        pl.show(fig)
    return fig

In [7]:
from sklearn.manifold import TSNE

# Map word vectors onto a 2-D plane with t-SNE.
# verbose=100 makes t-SNE log its progress; random_state is fixed so that the
# stochastic layout is repeatable when the notebook is re-run.
word_tsne = TSNE(n_components=2, verbose=100, random_state=42).fit_transform(word_vectors)

# Standardize each output axis to zero mean and unit standard deviation.
word_tsne = (word_tsne - word_tsne.mean(axis=0)) / word_tsne.std(axis=0)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.011s...
[t-SNE] Computed neighbors for 1000 samples in 0.200s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 1.716134
[t-SNE] Computed conditional probabilities in 0.058s
[t-SNE] Iteration 50: error = 67.7214432, gradient norm = 0.3293672 (50 iterations in 5.143s)
[t-SNE] Iteration 100: error = 68.8186417, gradient norm = 0.2893867 (50 iterations in 5.612s)
[t-SNE] Iteration 150: error = 68.3477020, gradient norm = 0.3092389 (50 iterations in 5.808s)
[t-SNE] Iteration 200: error = 69.9302826, gradient norm = 0.2813768 (50 iterations in 5.928s)
[t-SNE] Iteration 250: error = 70.4856110, gradient norm = 0.2742253 (50 iterations in 6.047s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.485611
[t-SNE] Iteration 300: error = 1.2729193, gradient norm = 0.0031593 (50 iterations in 4.506s)
[t-SNE] Iteration 350: error = 1.1057683, gradient norm = 0.00172

In [8]:
# Scatter the 2-D word coordinates; the `token` keyword becomes a hover-tooltip
# column, so hovering a point shows its word.
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], color='green', token=words)

# Простая вопросно-ответная система

In [9]:
# download the data:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt
# alternative download link: https://yadi.sk/i/BPQrUu1NaTduEw

--2019-07-23 10:45:37--  https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.70.1, 2620:100:6026:1::a27d:4601
Connecting to www.dropbox.com (www.dropbox.com)|162.125.70.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/obaitrix9jyu84r/quora.txt [following]
--2019-07-23 10:45:38--  https://www.dropbox.com/s/dl/obaitrix9jyu84r/quora.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucee9f8d2a1436081d8f85aaac07.dl.dropboxusercontent.com/cd/0/get/AlO83dtRMXQI5keckJC6eUTuoeBPuzvvaYw6Unkc-QnDndkXNqpSHr96Wiiwa76n2AuJD6RA2lOF7zVTXWRRUHNCetHv0-IJE81GBBShj13s0Q/file?dl=1# [following]
--2019-07-23 10:45:38--  https://ucee9f8d2a1436081d8f85aaac07.dl.dropboxusercontent.com/cd/0/get/AlO83dtRMXQI5keckJC6eUTuoeBPuzvvaYw6Unkc-QnDndkXNqpSHr96Wiiwa76n2AuJD6RA2lOF7zVTXWRRUHNCetHv0-IJE81GBBShj13s0Q/file?dl=1
Resolving ucee9f8d

In [25]:
import numpy as np

# Read the corpus as a list of lines (each keeps its trailing newline).
# The context manager closes the file handle once the lines are loaded.
with open("./quora.txt", encoding='utf-8') as quora_file:
    data = list(quora_file)
data[50]

"What TV shows or books help you read people's body language?\n"

In [11]:
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance; later cells reuse it through the global `tokenizer`.
tokenizer = WordPunctTokenizer()

# Sanity check on one sample line (the recorded output shows punctuation split
# into separate tokens).
print(tokenizer.tokenize(data[50]))

['What', 'TV', 'shows', 'or', 'books', 'help', 'you', 'read', 'people', "'", 's', 'body', 'language', '?']


In [13]:
# data_tok: one list of lowercase tokens per line of data.
data_tok = []
for line in data:
    data_tok.append(tokenizer.tokenize(line.lower()))

In [14]:
def get_phrase_embedding(phrase):
    """
    Convert a phrase to a vector by averaging its word embeddings.

    The phrase is lowercased and split with the global `tokenizer`; tokens that
    are missing from the global `model` vocabulary are skipped.

    Returns a 1-D array of length `model.vector_size`: the mean of the known
    tokens' vectors, or float32 zeros when no token is in the vocabulary.
    """
    tokens = tokenizer.tokenize(phrase.lower())

    # `token in model` is the membership test that works on both gensim 3.x and
    # 4.x; the `model.vocab` attribute used previously was removed in gensim 4.
    known_vectors = [model.get_vector(token) for token in tokens if token in model]

    if not known_vectors:
        # No usable token: fall back to the zero vector.
        return np.zeros([model.vector_size], dtype='float32')

    return np.array(known_vectors).mean(axis=0)

In [15]:
# Consider only a subsample for a first run: every (len(data) // 1000)-th line,
# i.e. roughly 1000 evenly spaced phrases.
# max(1, ...) keeps the slice step valid when data has fewer than 1000 lines
# (a step of 0 would raise ValueError).
chosen_phrases = data[::max(1, len(data) // 1000)]

# compute vectors for chosen phrases
phrase_vectors = np.array([get_phrase_embedding(phrase)
                           for phrase in chosen_phrases])

In [16]:
# Map the phrase vectors into 2-D space with t-SNE, then normalize.
# random_state is fixed so that the stochastic layout is repeatable when the
# notebook is re-run; verbose makes t-SNE log its progress.
phrase_vectors_2d = TSNE(verbose=1000, random_state=42).fit_transform(phrase_vectors)

# Standardize each output axis to zero mean and unit standard deviation.
phrase_vectors_2d = (phrase_vectors_2d -
                     np.mean(phrase_vectors_2d, axis=0)) / np.std(phrase_vectors_2d, axis=0)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1001 samples in 0.015s...
[t-SNE] Computed neighbors for 1001 samples in 0.234s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1001
[t-SNE] Computed conditional probabilities for sample 1001 / 1001
[t-SNE] Mean sigma: 0.477874
[t-SNE] Computed conditional probabilities in 0.065s
[t-SNE] Iteration 50: error = 77.5572433, gradient norm = 0.3269058 (50 iterations in 5.253s)
[t-SNE] Iteration 100: error = 78.5946274, gradient norm = 0.3329623 (50 iterations in 5.780s)
[t-SNE] Iteration 150: error = 81.2704926, gradient norm = 0.2874392 (50 iterations in 5.772s)
[t-SNE] Iteration 200: error = 79.5576019, gradient norm = 0.3191328 (50 iterations in 6.076s)
[t-SNE] Iteration 250: error = 82.7367554, gradient norm = 0.2854584 (50 iterations in 6.363s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.736755
[t-SNE] Iteration 300: error = 2.0265028, gradient norm = 0.0034143 (50 iterations in 4.063s)

In [17]:
# Scatter the 2-D phrase coordinates; the hover tooltip shows the first
# 50 characters of each phrase.
draw_vectors(phrase_vectors_2d[:, 0], phrase_vectors_2d[:, 1],
             phrase=[phrase[:50] for phrase in chosen_phrases],
             radius=20,)

In [18]:
# Embed every line of data; row i of data_vectors corresponds to data[i].
data_vectors = np.array(list(map(get_phrase_embedding, data)))

In [19]:
from sklearn.metrics.pairwise import cosine_distances
def find_nearest(query, k=10):
    """
    Return the k lines of `data` closest to the text `query`, most similar first.

    Closeness is the cosine distance between the query's phrase embedding and
    each row of the global `data_vectors`; the matching lines are taken from
    the global `data`.
    """
    # Shape the query embedding as a single-row matrix for the pairwise call.
    query_row = get_phrase_embedding(query).reshape(1, -1)

    # One distance per data line (the result has a single column).
    distance_to_query = cosine_distances(data_vectors, query_row).ravel()

    # Ascending distance == descending similarity; keep the first k positions.
    nearest = np.argsort(distance_to_query)[:k]

    return [data[position] for position in nearest]

In [20]:
# Example query against the whole corpus.
results = find_nearest(query="How do i enter the matrix?", k=10)

# Each returned line keeps its trailing newline, so joining with '' prints
# one result per line.
print(''.join(results))

How do I get to the dark web?
What should I do to enter hollywood?
How do I use the Greenify app?
What can I do to save the world?
How do I win this?
How do I think out of the box? How do I learn to think out of the box?
How do I find the 5th dimension?
How do I use the pad in MMA?
How do I estimate the competition?
What do I do to enter the line of event management?



In [21]:
# Query with a short fragment; as the cell's last expression, the returned
# list is displayed directly (newlines included).
find_nearest(query="How does Trump?", k=10)

['What does Donald Trump think about Israel?\n',
 'What books does Donald Trump like?\n',
 'What does Donald Trump think of India?\n',
 'What does India think of Donald Trump?\n',
 'What does Donald Trump think of China?\n',
 'What does Donald Trump think about Pakistan?\n',
 'What companies does Donald Trump own?\n',
 'What does Dushka Zapata think about Donald Trump?\n',
 'How does it feel to date Ivanka Trump?\n',
 'What does salesforce mean?\n']

In [26]:
# Another example query; the returned list is displayed as the cell's output.
find_nearest(query="How to learn python?", k=10)

['Is Python easy to learn?\n',
 'How can I learn to write idiomatic Python?\n',
 'How to learn coding?\n',
 'How easy is it to learn Python?\n',
 'How long does it take to learn Python?\n',
 'How does one learn to learn?\n',
 'How important is it to learn Python?\n',
 'How can I learn to use Drupal?\n',
 'How difficult is it to learn Python?\n',
 'What is the easy way to learn python programming?\n']

In [None]:
!pip install nltk