# Word2Vec
(by Tevfik Aytekin)

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import gutenberg
import gensim 
from gensim.models import Word2Vec 
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer

# You need to call nltk.download() to download all the nltk corpora

In [3]:
text = gutenberg.raw('austen-emma.txt')
print("num_words: ", len(gutenberg.words('austen-emma.txt')))


num_words:  192427


In [5]:
text[:1000]

"[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nShe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.  Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.\n\nSixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.  Between _them_ it was more the intimacy\nof sisters.  Even before Miss Taylor had ceased to hold the nominal\noffice of 

In [4]:
text = remove_stopwords(text)

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
data = [] 
for i in sent_tokenize(text): 
    temp = [] 
    for j in tokenizer.tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 

In [7]:
data[0]

['emma',
 'by',
 'jane',
 'austen',
 '1816',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 'handsome',
 'clever',
 'and',
 'rich',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 'seemed',
 'to',
 'unite',
 'some',
 'of',
 'the',
 'best',
 'blessings',
 'of',
 'existence',
 'and',
 'had',
 'lived',
 'nearly',
 'twenty',
 'one',
 'years',
 'in',
 'the',
 'world',
 'with',
 'very',
 'little',
 'to',
 'distress',
 'or',
 'vex',
 'her']

In [8]:
# Create model 
model = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 

In [9]:
print("Cosine similarity between 'home' " + 
               "and 'happy' - CBOW : ", 
    model.wv.similarity('home', 'happy')) 

Cosine similarity between 'home' and 'happy' - CBOW :  0.9942075


In [10]:
model.wv.most_similar(positive='friend', topn=5)

[('isabella', 0.9989608526229858),
 ('towards', 0.9989355802536011),
 ('looks', 0.9988465905189514),
 ('near', 0.9987842440605164),
 ('became', 0.9986475706100464)]

In [11]:
model.wv['friend']

array([ 0.45827785, -0.44172487, -0.12176165,  0.12233178,  0.05520872,
       -0.6077416 ,  0.7178411 , -0.2231695 ,  0.55209386, -0.14880037,
       -0.348411  ,  0.25484174, -0.43844402, -0.28176418,  0.38730022,
       -0.74136615, -0.19781218,  0.00579581, -0.05395631, -0.30161655,
       -0.2701442 ,  0.20550719,  0.09370674, -0.1381972 ,  0.4678049 ,
       -0.10224696, -0.2221067 ,  0.2768428 , -0.2958746 , -0.8448977 ,
       -0.5154456 , -0.6028929 ,  0.17660762, -0.33711678,  0.46171984,
        0.2075493 , -0.17261714, -0.3107516 ,  0.16176172, -0.3937709 ,
        0.04082135,  0.09299976,  0.17602955,  0.33017817, -0.19537713,
       -0.08492216,  0.97739416, -0.19566737,  0.44349873, -0.2778141 ,
       -0.27458706, -0.1363758 , -0.4018571 , -0.34044376,  0.1628924 ,
       -0.08806945,  0.5782594 ,  0.0487115 ,  0.29075357,  0.00820251,
       -0.5616262 , -0.53499365, -0.7801935 , -0.49364096, -0.15957908,
        0.35536233, -0.35399276, -0.10840175, -0.951173  , -0.21