# Word2Vec
(by Tevfik Aytekin)

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import gutenberg
import gensim 
from gensim.models import Word2Vec 
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer

# You need to call nltk.download() to download all the nltk corpora

In [2]:
text = gutenberg.raw('austen-emma.txt')
print("num_words: ", len(gutenberg.words('austen-emma.txt')))


num_words:  192427


In [3]:
text

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nShe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister\'s marriage,\nbeen mistress of his house from a very early period.  Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.\n\nSixteen years had Miss Taylor been in Mr. Woodhouse\'s family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.  Between _them_ it was more the intimacy\nof sisters.  Even before Miss Taylor had ceased to hold the nominal\noffice o

In [4]:
text = remove_stopwords(text)

In [5]:
text

'[Emma Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome, clever, rich, comfortable home happy disposition, unite best blessings existence; lived nearly twenty-one years world little distress vex her. She youngest daughters affectionate, indulgent father; had, consequence sister\'s marriage, mistress house early period. Her mother died long ago indistinct remembrance caresses; place supplied excellent woman governess, fallen little short mother affection. Sixteen years Miss Taylor Mr. Woodhouse\'s family, governess friend, fond daughters, particularly Emma. Between _them_ intimacy sisters. Even Miss Taylor ceased hold nominal office governess, mildness temper hardly allowed impose restraint; shadow authority long passed away, living friend friend mutually attached, Emma liked; highly esteeming Miss Taylor\'s judgment, directed chiefly own. The real evils, indeed, Emma\'s situation power having way, disposition think little herself; disadvantages threatened alloy enjoyments.

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
data = [] 
for i in sent_tokenize(text): 
    temp = [] 
    for j in tokenizer.tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 

In [7]:
data[0]

['emma',
 'jane',
 'austen',
 '1816',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 'handsome',
 'clever',
 'rich',
 'comfortable',
 'home',
 'happy',
 'disposition',
 'unite',
 'best',
 'blessings',
 'existence',
 'lived',
 'nearly',
 'twenty',
 'one',
 'years',
 'world',
 'little',
 'distress',
 'vex',
 'her']

In [8]:
# Create model 
model = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 

In [9]:
print("Cosine similarity between 'home' " + 
               "and 'happy' - CBOW : ", 
    model.wv.similarity('home', 'happy')) 

Cosine similarity between 'home' and 'happy' - CBOW :  0.99994093


In [10]:
model.wv.most_similar(positive='friend', topn=5)

[('till', 0.9999628067016602),
 ('family', 0.9999600052833557),
 ('and', 0.9999597072601318),
 ('little', 0.9999592304229736),
 ('mind', 0.9999588131904602)]

In [11]:
model.wv['friend']

array([-0.42059702,  0.2711903 ,  0.1519365 ,  0.5805409 ,  0.7753635 ,
        0.47260705, -0.08873856, -0.7698414 ,  0.347189  , -0.4282772 ,
        0.62105495,  0.20584102,  0.6597244 , -0.17318264,  0.43999222,
        0.1285696 ,  0.15118577, -0.09584107,  0.28068653, -0.2719674 ,
        0.00533978,  0.0937654 , -0.13289893, -0.51470137, -0.85861236,
        0.12996769,  0.1578027 , -0.67818445,  0.7893912 , -0.7245244 ,
        0.7225449 ,  0.7294694 , -0.60866046,  0.06454292,  0.7137646 ,
       -0.5325863 , -0.5337018 , -0.35156614,  0.1676913 ,  0.5829252 ,
        0.9811033 ,  0.29591152,  0.14874783,  0.4647443 , -0.65932107,
       -0.89643246, -0.32003412, -0.8863192 , -0.60318285,  0.66881526,
        0.27380848,  0.55726   , -0.10656347,  0.1258    , -0.5323947 ,
       -0.9232209 , -0.43012798, -0.35740045,  0.25187495, -0.03354563,
       -0.16953687,  0.86769694,  0.0524689 ,  0.09903435,  0.7810086 ,
        0.5834954 , -0.1495611 , -0.31491235, -0.4169392 , -0.16