In [102]:
import pandas as pd
import gensim
from sklearn.manifold import TSNE

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import random

# [Word2Vec Algorithm](https://en.wikipedia.org/wiki/Word2vec)

### What is it?
It's a Natural Language Processing [NLP] algorithm that transforms words to vectors.

### When do I use it?
When I want to explore semantics of words. For example: find word's oposites, find context word etc.

### Why should I use it?
1] NLP

2] It's a general idea of mapping string of elements onto vectors (and vectors are good to work with). I can use it for recommending next product or a song in playlist.

# How does it work?

* Word2Vec is actualy shallow Neural Network [NN] (1 hidden layer).

* **Starting point:** 
    * We have N words. 
    * Each word is represented by N-dimensional vector with 1 on index position and 0s elsewhere (one-hot encoding).


* We let the NN predict word's naighbours.


* We cut out only the guts of the trained NN - scored hidden leayer values for each word.


* **End point:** 
    * Each word is represented by only M-dimension vector (M << N), that carries some context information.   **:-)**

### Why is it important to have vectors instead of words? Because we have the Algebra!



## Word2Vec: one-hot encoding
<img src="one_hot.png" alt="one_hot" style="width: 600px;"/>

## Word2Vec: Word2Vec output 
<img src="w2v_output.png" alt="w2v_output" style="width: 600px;"/>

## Approaches:
* **Skipgram** [SG]
    * Uses neighbour words as an input to NN and central word as an output.

* **Continuous Bag Of Words** [CBOW]
    * Uses central word as an input to NN and neighbour words as an output.

## Word2Vec: Word neighborhood
<img src="word_neighbour.png" alt="word_neighbour" style="width: 600px;"/>

## Word2Vec: Skipgram method
<img src="skipgram.png" alt="skipgram" style="width: 600px;"/>

## Word2Vec: CBOW method
<img src="cbow.png" alt="skipgram" style="width: 600px;"/>

## Dataset - QUORA

In [8]:
# Load the dataset
df = pd.read_csv("data/quora.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
# Transfer the dataset into list of lists
def read_questions(row,column_name):
    return gensim.utils.simple_preprocess(str(row[column_name]).encode('utf-8'))
    
documents = []
for index, row in df.iterrows():
    documents.append(read_questions(row,"question1"))
    if row["is_duplicate"] == 0:
        documents.append(read_questions(row,"question2"))

In [248]:
# document example 
documents[:4]

[['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  'in',
  'india'],
 ['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market'],
 ['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'noor', 'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'stole',
  'the',
  'kohinoor',
  'koh',
  'noor',
  'diamond',
  'back']]

## Model build

In [240]:
# Lets train the word2vec model using skipgram/cbow
w2v_model = gensim.models.Word2Vec(size=150, window=10, min_count=5, sg=0, workers=10)
w2v_model.build_vocab(documents) 
w2v_model.train(sentences=documents, total_examples=len(documents), epochs=w2v_model.epochs)

(25176933, 35144510)

## Model exploration

In [249]:
# Model vocabulary
print(f"Our vocabulary hase {len(w2v_model.wv.vocab)} words.")
w2v_model.wv.vocab

Our vocabulary hase 27775 words.


{'what': <gensim.models.keyedvectors.Vocab at 0x16a82f780>,
 'is': <gensim.models.keyedvectors.Vocab at 0x15c867400>,
 'the': <gensim.models.keyedvectors.Vocab at 0x15fd12ef0>,
 'step': <gensim.models.keyedvectors.Vocab at 0x15fd12d30>,
 'by': <gensim.models.keyedvectors.Vocab at 0x15fd12b70>,
 'guide': <gensim.models.keyedvectors.Vocab at 0x15fd12898>,
 'to': <gensim.models.keyedvectors.Vocab at 0x15fd12780>,
 'invest': <gensim.models.keyedvectors.Vocab at 0x15fd12668>,
 'in': <gensim.models.keyedvectors.Vocab at 0x15fd12550>,
 'share': <gensim.models.keyedvectors.Vocab at 0x15fd12438>,
 'market': <gensim.models.keyedvectors.Vocab at 0x15fd12320>,
 'india': <gensim.models.keyedvectors.Vocab at 0x15fd12208>,
 'story': <gensim.models.keyedvectors.Vocab at 0x15fd120f0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x153c38780>,
 'kohinoor': <gensim.models.keyedvectors.Vocab at 0x153c38ba8>,
 'koh': <gensim.models.keyedvectors.Vocab at 0x153c38080>,
 'noor': <gensim.models.keyedvectors.Voc

In [250]:
word = 'trump'
print(f"Word {word} is represented by {len(w2v_model.wv[word])}-dim vector:")
w2v_model.wv[word]

Word trump is represented by 150-dim vector:


array([ 3.3107355 ,  0.8425358 ,  4.1214967 ,  0.16413495, -1.8511533 ,
        0.5406486 , -1.0591987 ,  2.2244828 , -1.127227  , -2.043492  ,
        0.34423184, -0.58979374, -1.942646  , -2.915745  ,  2.1186101 ,
        1.3629088 , -2.0968988 , -1.2693385 ,  1.4190816 , -1.4458344 ,
       -2.2402678 , -1.7194806 ,  0.18369651, -1.0895953 , -0.17645195,
        4.461869  , -0.7583205 , -3.6195562 , -1.4466676 , -0.28347042,
        0.1111536 ,  2.6102455 ,  1.2059026 ,  2.3565419 ,  2.6914415 ,
       -1.6903051 ,  2.6847966 , -1.995088  ,  0.32408017,  1.7590213 ,
        1.528895  ,  1.3157607 , -1.6772395 , -0.38203248, -1.6400788 ,
       -0.43243533,  3.3234317 , -0.991555  , -2.2962174 ,  0.4065657 ,
        4.102692  ,  4.671234  , -3.3851779 , -3.5112364 , -1.0444226 ,
       -1.6023419 ,  0.40471596, -0.45547566, -2.298546  ,  1.613397  ,
        2.619818  ,  1.1592376 ,  1.7702707 , -1.5848597 , -0.74017453,
        0.68113613,  0.22783634,  0.5760455 ,  4.014598  ,  2.82

In [251]:
# TSNE representation of N words
N = 1000
wanted_vocab = random.sample(list(w2v_model.wv.vocab), N)
X = w2v_model[wanted_vocab] # X is an array of word vectors, each vector containing 150 tokens
tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=5000, random_state=23)
Y = tsne_model.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [252]:
# Plot
trace = go.Scatter(
    x = Y[:,0],
    y = Y[:,1],
    text = list(wanted_vocab),
    mode='text'
)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

## Using the model

In [253]:
# Find similar word 1
words1 = ['trump']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('trumps', 0.7672150135040283),
 ('knuth', 0.6954089999198914),
 ('hoffman', 0.6228437423706055),
 ('bernie', 0.5904607772827148),
 ('obama', 0.5755990743637085),
 ('hillary', 0.5698307752609253),
 ('democrats', 0.5683139562606812),
 ('sanders', 0.5480324029922485),
 ('president', 0.5426574945449829),
 ('election', 0.5415182113647461)]

In [257]:
# Find similar word 2
words1 = ['panda', 'rabbit', 'dog']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('pug', 0.748351514339447),
 ('fleas', 0.7307619452476501),
 ('chow', 0.7273343801498413),
 ('rib', 0.7270840406417847),
 ('hamster', 0.7243354320526123),
 ('shih', 0.7230414748191833),
 ('brindle', 0.7135178446769714),
 ('snake', 0.7134368419647217),
 ('ketchup', 0.7119420170783997),
 ('broccoli', 0.7117798328399658)]

In [258]:
# Find similar word 4
words1 = ['husband', 'man']
words2 = ['woman']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('wife', 0.7469571828842163),
 ('son', 0.7107111811637878),
 ('daughter', 0.6980760097503662),
 ('mother', 0.6955826282501221),
 ('father', 0.6858428120613098),
 ('dad', 0.6769495010375977),
 ('brother', 0.6665967702865601),
 ('sister', 0.6537143588066101),
 ('mom', 0.6525180339813232),
 ('boyfriend', 0.6416940689086914)]

In [259]:
# Find similar word 5
words1 = ['president', 'trump']
words2 = ['strength']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('obama', 0.6424059271812439),
 ('trumps', 0.6129918098449707),
 ('congress', 0.5975081920623779),
 ('elected', 0.5833556056022644),
 ('barack', 0.5784204006195068),
 ('democrats', 0.5746055245399475),
 ('bernie', 0.5696306824684143),
 ('republican', 0.568333625793457),
 ('knuth', 0.5595422387123108),
 ('elect', 0.5541716814041138)]

In [261]:
# Find similar word 4
words1 = ['programming','beginner']
words2 = []
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('python', 0.7925356030464172),
 ('java', 0.7660596370697021),
 ('coding', 0.7453685402870178),
 ('javascript', 0.7324598431587219),
 ('php', 0.7028157114982605),
 ('programmer', 0.7022966146469116),
 ('framework', 0.6619861125946045),
 ('beginners', 0.654211163520813),
 ('learning', 0.6265166401863098),
 ('tutorial', 0.6224186420440674)]

In [265]:
# What should not be there? 1
w2v_model.wv.doesnt_match(['tesla', 'bmw', 'superman', 'mercedes'])


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



'superman'

In [267]:
# What should not be there? 2
w2v_model.wv.doesnt_match(['trump', 'president', 'wall', 'business'])

'business'

In [268]:
# What should not be there? 3
w2v_model.wv.doesnt_match(['weed', 'beer', 'herion', 'cocain'])

'weed'

## Sources
[Wiki](https://en.wikipedia.org/wiki/Word2vec)

[Good Article 1](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)

[Good Article 1](http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.XJfruC1_HUo)

[Tensor Flow Article](https://www.tensorflow.org/tutorials/representation/word2vec)