Ноутбук подготовлен на основе материалов курса NLP Школы анализа данных: https://github.com/yandexdataschool/nlp_course

# Эмбеддинги

In [1]:
!pip install gensim



In [None]:
import gensim.downloader as api
# Load the 'glove-twitter-100' model via gensim.downloader; `model` is the
# word-embedding object used by the cells below for word lookups.
model = api.load('glove-twitter-100')

In [None]:
# Analogy query: words that are similar to "king" and "woman" while being
# dissimilar to "man".
model.most_similar(positive=["king", "woman"], negative=["man"])

In [None]:
# Pick the 1000 most frequent words of the embedding vocabulary.
# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises
# AttributeError) in favour of `key_to_index` / `get_vecattr`, so support
# both APIs instead of assuming the pre-4.0 one.
if hasattr(model, "key_to_index"):
    # gensim >= 4.0
    words = sorted(model.key_to_index,
                   key=lambda word: model.get_vecattr(word, "count"),
                   reverse=True)[:1000]
else:
    # gensim < 4.0
    words = sorted(model.vocab,
                   key=lambda word: model.vocab[word].count,
                   reverse=True)[:1000]

# Print every 100th selected word as a quick sanity check.
print(words[::100])

In [None]:
import numpy as np

In [None]:
# Collect the embedding of every selected word into one array,
# one entry per word and in the same order as `words`.
word_vectors = np.array(list(map(model.get_vector, words)))

## Визуализация эмбеддингов слов с помощью t-SNE

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Build an interactive bokeh scatter plot with per-point hover info.

    Args:
        x, y: point coordinates (assumed to be sequences of equal length;
            this is not checked here).
        radius: marker size passed to ``scatter``.
        alpha: marker opacity.
        color: either one colour name applied to every point, or a
            per-point collection of colours.
        width, height: figure dimensions.
        show: when true, the figure is displayed before returning.
        **kwargs: extra per-point columns; each one is added to the data
            source and listed in the hover tooltip under its keyword name.

    Returns:
        The bokeh figure.
    """
    # A single colour name is repeated so that every point has its own entry.
    point_colors = [color] * len(x) if isinstance(color, str) else color

    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    source = bm.ColumnDataSource(columns)

    plot = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    plot.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)

    # One tooltip row per extra column: label -> "@column" reference.
    hover_fields = [(name, "@" + name) for name in kwargs]
    plot.add_tools(bm.HoverTool(tooltips=hover_fields))

    if show:
        pl.show(plot)
    return plot

In [None]:
from sklearn.manifold import TSNE

# Project the word vectors onto a 2-D plane with t-SNE
# (verbose=100 makes it report its progress).
word_tsne = TSNE(n_components=2, verbose=100).fit_transform(word_vectors)

# Standardise each of the two axes: subtract its average and divide by the
# square root of its variance.
tsne_center = np.average(word_tsne, axis=0)
tsne_scale = np.var(word_tsne, axis=0) ** 0.5
word_tsne = (word_tsne - tsne_center) / tsne_scale

In [None]:
# Plot the normalised 2-D word coordinates in green; the `token` column makes
# each point's word available in the hover tooltip.
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], color='green', token=words)

# Простая вопросно-ответная система

In [None]:
# download the data:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt
# alternative download link: https://yadi.sk/i/BPQrUu1NaTduEw

In [None]:
import numpy as np

# Read the corpus: `data` gets one string per line of the file (the trailing
# newline is kept). The context manager closes the file handle, which the
# bare `list(open(...))` form left open, and the explicit encoding keeps the
# result independent of the platform's locale default.
with open("./quora.txt", encoding="utf-8") as corpus_file:
    data = list(corpus_file)

# Peek at one sample line.
data[50]

In [None]:
from nltk.tokenize import WordPunctTokenizer
# Tokenizer instance shared by the cells below.
tokenizer = WordPunctTokenizer()

# Show how one sample line is split into tokens.
print(tokenizer.tokenize(data[50]))

In [None]:
# Tokenize the lowercased text of every line in `data`;
# the result holds one list of tokens per line.
data_tok = [tokenizer.tokenize(line.lower()) for line in data]

In [None]:
def get_phrase_embedding(phrase):
    """
    Convert a phrase to a vector by averaging its word embeddings.

    The phrase is lowercased and tokenized with the global `tokenizer`;
    tokens for which the global `model` has no vector are skipped.

    :param phrase: text to embed
    :returns: array of length `model.vector_size` - the mean of the known
        tokens' vectors, or all zeros (float32) when no token is known
    """
    tokens = tokenizer.tokenize(phrase.lower())

    # `word in model` is supported by KeyedVectors on both gensim < 4.0 and
    # gensim >= 4.0, whereas the `model.vocab` attribute was removed in 4.0.
    known_vectors = [model.get_vector(word) for word in tokens if word in model]

    if not known_vectors:
        # Nothing to average: fall back to a zero vector of the right size.
        return np.zeros([model.vector_size], dtype='float32')

    return np.mean(known_vectors, axis=0)

In [None]:
# Consider only ~1k evenly spaced phrases for a first run: the stride is
# len(data) // 1000, so about 1000 lines are kept.
# max(1, ...) keeps the slice valid when `data` has fewer than 1000 lines
# (a stride of 0 would raise ValueError).
chosen_phrases = data[::max(1, len(data) // 1000)]

# Compute the embedding of every chosen phrase.
phrase_vectors = np.array([get_phrase_embedding(phrase)
                           for phrase in chosen_phrases])

In [None]:
# Project the phrase vectors into a low-dimensional space with t-SNE
# (the cells below read columns 0 and 1 of the result).
phrase_vectors_2d = TSNE(verbose=1000).fit_transform(phrase_vectors)

# Normalise: subtract the per-axis mean and divide by the per-axis
# standard deviation.
phrase_center = np.mean(phrase_vectors_2d, axis=0)
phrase_scale = np.std(phrase_vectors_2d, axis=0)
phrase_vectors_2d = (phrase_vectors_2d - phrase_center) / phrase_scale

In [None]:
# Scatter the 2-D phrase embeddings; the `phrase` column puts the first
# 50 characters of each phrase into the hover tooltip.
draw_vectors(phrase_vectors_2d[:, 0], phrase_vectors_2d[:, 1],
             phrase=[phrase[:50] for phrase in chosen_phrases],
             radius=20,)

In [None]:
# Embed every line of `data`; entry i of the result corresponds to data[i].
data_vectors = np.array(list(map(get_phrase_embedding, data)))

In [None]:
from sklearn.metrics.pairwise import cosine_distances
def find_nearest(query, k=10):
    """
    Return the k lines of `data` whose embeddings are closest to the query.

    The query is embedded with `get_phrase_embedding` and compared against
    the global `data_vectors` by cosine distance; the matching entries of
    the global `data` are returned ordered from most to least similar.
    """
    # cosine_distances expects 2-D inputs, so turn the query into a single row.
    query_row = get_phrase_embedding(query).reshape(1, -1)

    # Result has one column (the single query); flatten it to one distance per line.
    dist_to_query = cosine_distances(data_vectors, query_row).ravel()

    # Smallest distance first, truncated to the k best matches.
    nearest = np.argsort(dist_to_query)[:k]

    return [data[i] for i in nearest]

In [None]:
# Retrieve the 10 lines of `data` nearest to the query.
results = find_nearest(query="How do i enter the matrix?", k=10)

# The lines were read from the file with their newlines kept, so joining
# with an empty string prints one result per line.
print(''.join(results))

In [None]:
# Look up the 10 nearest lines for this query; the returned list is the
# value of the cell.
find_nearest(query="How does Trump?", k=10)

In [None]:
# Look up the 10 nearest lines for this query; the returned list is the
# value of the cell.
find_nearest(query="Why don't i ask a question myself?", k=10)