## Downloading the Dataset

In [1]:
# Download the Quora questions file into ../../data. The URL is quoted so the
# shell passes the "?dl=1" query string through verbatim instead of treating
# "?" as a glob character.
!wget "https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1" -O ./../../data/quora.txt

--2021-02-01 12:09:59--  https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.64.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.64.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/obaitrix9jyu84r/quora.txt [following]
--2021-02-01 12:10:00--  https://www.dropbox.com/s/dl/obaitrix9jyu84r/quora.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc01e1bf8c4067832692ff1b75a3.dl.dropboxusercontent.com/cd/0/get/BIFrdUrI7Hj8DThVykVCcG4aBJ8Ge8PAOlRxgi82sM-DD7Y_E9NOyGkAZhWgx2mrLYl4ztDHusnpKtbT0ZL4T0Z84JRFgCV96o0mZG3k0Zd7Tg/file?dl=1# [following]
--2021-02-01 12:10:01--  https://uc01e1bf8c4067832692ff1b75a3.dl.dropboxusercontent.com/cd/0/get/BIFrdUrI7Hj8DThVykVCcG4aBJ8Ge8PAOlRxgi82sM-DD7Y_E9NOyGkAZhWgx2mrLYl4ztDHusnpKtbT0ZL4T0Z84JRFgCV96o0mZG3k0Zd7Tg/file?dl=1
Resolving uc01e

In [65]:
import numpy as np

# Read the questions (one per line, trailing newline kept) from the path the
# download cell above writes to. The context manager closes the file handle
# as soon as every line has been loaded.
with open("./../../data/quora.txt", encoding="utf-8") as quora_file:
    data = list(quora_file)
data[25]

'Will Windows run well enough to use Excel and Outlook in a VM on the new 2015 MacBook?\n'

## Dataset Preprocessing

### Tokenization and Lower-casing

In [66]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Lower-case every line first, then split it into tokens; the result is one
# list of tokens per input line.
data_tok = list(map(tokenizer.tokenize, map(str.lower, data)))
data_tok[25]

['will',
 'windows',
 'run',
 'well',
 'enough',
 'to',
 'use',
 'excel',
 'and',
 'outlook',
 'in',
 'a',
 'vm',
 'on',
 'the',
 'new',
 '2015',
 'macbook',
 '?']

## Word Embeddings

### Training a Word2Vec model

In [68]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized questions and keep only the resulting word
# vectors (`.wv`) rather than the full model object.
model = Word2Vec(
    sentences=data_tok,
    size=50,       # length of each word vector
    window=5,      # context window size
    min_count=5,   # minimum frequency for a word to be kept in the vocabulary
).wv

#### Fetching a word vector

In [69]:
# Look up the embedding of a single token in the vectors trained above
# (50 values, matching size=50).
model.get_vector('summer')

array([ 0.7385393 ,  0.63008845,  0.77939546,  1.4554639 , -3.1996126 ,
        2.0371144 ,  0.03636565,  0.23144506,  1.5959564 ,  0.22834463,
       -0.62894785,  0.61704826, -0.46078905, -0.0538972 , -0.94687885,
       -1.4508221 , -1.0214924 , -0.78742516, -1.816266  ,  1.0934535 ,
        2.9447496 , -0.2534724 ,  0.12067267,  0.88463753,  1.669909  ,
       -0.4169887 , -0.60448426,  0.19963191, -0.45109794,  1.0248686 ,
        0.28451452,  0.509868  ,  2.1150348 ,  0.00464265,  0.47768047,
       -2.369642  , -0.43056014, -0.15063794,  0.22011667,  0.2568941 ,
       -0.64014983,  1.1725929 ,  1.5738049 , -0.9799925 ,  0.29579413,
       -2.3261747 ,  1.1361938 ,  1.4870391 , -1.6521168 , -0.47875366],
      dtype=float32)

### Loading a Pre-trained model

In [76]:
import gensim.downloader as api

# List the names of all pre-trained models available through gensim's downloader
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [77]:
# NOTE: this rebinds `model`. From here on it refers to the pre-trained
# 'glove-twitter-100' vectors, not the vectors trained on quora.txt above.
model = api.load('glove-twitter-100')

In [78]:
# Same lookup as before, now against the pre-trained vectors loaded above.
model.get_vector('summer')

array([-0.89576  , -0.92183  ,  0.14913  , -0.25121  ,  0.10397  ,
        0.46965  ,  0.70299  , -0.39807  , -0.65015  , -0.20826  ,
       -0.2851   , -0.11455  , -3.5304   ,  0.37717  , -0.11199  ,
       -0.019414 , -0.4496   ,  0.42657  , -0.39661  ,  0.13738  ,
       -0.50071  , -0.10388  ,  0.23345  , -0.21111  ,  0.25814  ,
       -0.14755  , -0.057121 , -0.48793  ,  0.28059  , -0.73449  ,
       -1.1377   ,  0.084311 , -0.31729  , -0.085831 , -0.052985 ,
        0.62416  ,  0.26254  ,  0.27203  ,  0.17596  ,  0.54661  ,
       -0.94407  , -0.020613 ,  0.62208  , -0.24703  ,  0.10993  ,
        0.61814  ,  0.027522 , -0.17343  , -0.70895  ,  0.34309  ,
        0.028017 ,  0.020393 ,  0.37327  ,  0.29174  ,  0.044602 ,
        0.016544 ,  0.37339  , -0.11816  ,  0.27341  , -0.46893  ,
       -0.25912  ,  0.10473  , -0.52668  , -0.71805  , -0.18869  ,
       -0.42315  , -0.26882  , -0.15147  ,  0.32614  , -0.36439  ,
        0.18147  , -0.0058194,  0.85306  ,  0.41213  ,  0.1851

#### Fetching most similar words

In [79]:
# Nearest neighbours of 'food': a list of (word, similarity score) pairs,
# highest score first.
model.most_similar('food')

[('eat', 0.8108313083648682),
 ('breakfast', 0.7947741150856018),
 ('eating', 0.7805092930793762),
 ('lunch', 0.776797890663147),
 ('dinner', 0.7663010954856873),
 ('foods', 0.7606096267700195),
 ('coffee', 0.7571052312850952),
 ('meal', 0.7550615668296814),
 ('meat', 0.7423633337020874),
 ('cooking', 0.7363327741622925)]

#### Similarity between two words

In [80]:
# Similarity score between the vectors of the two words (a single float).
model.similarity('pizza', 'pasta')

0.78086126

#### Odd one out

In [81]:
# Returns the one word from the list that fits least with the others.
model.doesnt_match(['burger', 'noodles', 'pizza', 'pasta', 'apple'])

'apple'

### Analogies

In [82]:
# NOTE(review): this query combines king + queen - woman. The usual analogy
# example is positive=['king', 'woman'], negative=['man'] (expected answer:
# 'queen') — confirm which of the two is intended here.
model.most_similar(positive=['king', 'queen'], negative=['woman'], topn=3)

[('prince', 0.6462740898132324),
 ('royal', 0.5924025774002075),
 ('aka', 0.5809680223464966)]

In [83]:
# Query built from three positive words and one negative word; topn is left
# at its default, which yields the ten results shown below.
model.most_similar(positive=['car', 'plane', 'ship'], negative=['person'])

[('boat', 0.7703709006309509),
 ('helicopter', 0.7153111696243286),
 ('flight', 0.7139747142791748),
 ('jet', 0.6905894875526428),
 ('truck', 0.6806634068489075),
 ('crash', 0.6783006191253662),
 ('airplane', 0.6663592457771301),
 ('cruise', 0.650026798248291),
 ('aircraft', 0.6479072570800781),
 ('crashed', 0.6444786787033081)]

## Visualising Word Vectors

In [84]:
# Rank the vocabulary by stored word count (largest first) and keep the top 1000 words
words = sorted(model.vocab, key=lambda w: model.vocab[w].count, reverse=True)[:1000]

# Preview every 50th of the selected words
words[::50]

['<user>',
 'o',
 '_',
 '>>',
 'please',
 'itu',
 'apa',
 'their',
 'justin',
 'same',
 'text',
 'already',
 'hari',
 'head',
 'playing',
 'yet',
 'once',
 'sexy',
 'sei',
 'fast']

In [85]:
# Stack the embedding of every selected word into one 2-D array (one row per word).
word_vectors = np.array(list(map(model.get_vector, words)))

### Principal Component Analysis (PCA)

PCA tries to find axes along which most of the variance occurs.

In [87]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot whose points show auxiliary info on hover.

    Args:
        x, y: coordinates of the points.
        radius: marker size, forwarded to ``fig.scatter`` as ``size``.
        alpha: marker opacity, forwarded to ``fig.scatter``.
        color: either one colour name, which is applied to every point, or a
            per-point collection of colours.
        width, height: figure dimensions, forwarded to ``pl.figure``.
        show: if true, display the figure with ``pl.show`` before returning.
        **kwargs: additional per-point columns (for example ``token=words``);
            every key becomes a field of the hover tooltip.

    Returns:
        The bokeh figure that was created.
    """
    # A single colour name is expanded to one entry per point.
    if isinstance(color, str):
        color = [color] * len(x)

    columns = {'x': x, 'y': y, 'color': color}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    # One tooltip row per extra column: label -> "@column" reference.
    hover_fields = [(name, "@" + name) for name in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_fields))

    if show:
        pl.show(fig)
    return fig

In [88]:
from sklearn.decomposition import PCA

# Reduce the word vectors to two PCA components so that each word can be
# drawn as a point in a 2-D scatter plot.
pca = PCA(n_components=2)
word_vectors_pca = pca.fit_transform(word_vectors)

In [89]:
# Plot the two PCA coordinates of every word; hovering over a point shows its token.
draw_vectors(
    x=word_vectors_pca[:, 0],
    y=word_vectors_pca[:, 1],
    token=words,
)

### t-SNE plot

In [90]:
from sklearn.manifold import TSNE

# Embed the word vectors in two dimensions: the scatter plot of this embedding
# uses only two coordinates per word, so a third t-SNE component would be
# computed and then ignored. t-SNE is stochastic; fixing random_state keeps
# the layout identical across reruns.
tsne = TSNE(n_components=2, random_state=42)
word_vectors_tsne = tsne.fit_transform(word_vectors)

In [91]:
# Plot the first two t-SNE coordinates of every word in green; hovering over a
# point shows its token.
draw_vectors(
    x=word_vectors_tsne[:, 0],
    y=word_vectors_tsne[:, 1],
    color='green',
    token=words,
)