In [15]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

import gensim, logging
# Emit timestamped INFO-level log records so gensim's progress messages show up in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import os
import multiprocessing
from gensim.models.word2vec import Word2Vec

In [16]:
# Shakespeare.txt from Gutenberg open source http://norvig.com/ngrams/

# gensim's Word2Vec consumes its corpus as a stream of sentences (each one a list of tokens), so this class wraps the corpus directory in an iterator that yields one tokenized line at a time
class GetSentencesFromDir(object):
    """Re-iterable stream of tokenized lines from every file in a directory.

    Each call to ``__iter__`` re-reads the files from disk, so the corpus is
    never held in memory and the object can be consumed several times.

    Args:
        dirname: Directory whose regular files make up the corpus.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of whitespace-separated tokens per line.

        Blank lines yield an empty list, exactly as ``str.split()`` returns.
        """
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sentence order identical on every pass and on every machine.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip subdirectories and other non-file entries instead of crashing in open().
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as it is exhausted
            # (or when the generator is closed early), so handles are not leaked.
            # NOTE(review): files are decoded with the platform default encoding.
            with open(path) as corpus_file:
                for line in corpus_file:
                    yield line.split()
 
# Corpus location: set the SHAKESPEARE_DIR environment variable to point elsewhere;
# when it is unset the original hard-coded path is used.
CORPUS_DIR = os.environ.get('SHAKESPEARE_DIR', '/shakespeare_dir')
sentences = GetSentencesFromDir(CORPUS_DIR) # a memory-friendly iterator

In [None]:
#from gensim.corpora.wikicorpus import WikiCorpus
#wiki = WikiCorpus("some_wiki_articles_dump.xml.bz2", lemmatize=False, dictionary={})
#sentences = list(wiki.get_texts())

In [29]:
# Hyper-parameters handed straight to Word2Vec; all but one CPU core (minimum one) are used as workers.
params = dict(
    size=20,
    window=5,
    min_count=2,
    workers=max(1, multiprocessing.cpu_count() - 1),
    sample=1E-3,
    iter=2,
)

# Word2Vec takes a sequence of tokenized sentences as input.
# A first pass over that sequence collects the words and their frequencies to build the internal dictionary structure;
# the corpus is then traversed again for each training epoch (`iter`).
# Approximate RAM for the trained model: unique_tokens * nn_size * float_size * 3 matrices.
model = Word2Vec(sentences, **params)

2018-11-09 17:29:40,985 : INFO : collecting all words and their counts
2018-11-09 17:29:40,987 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-09 17:29:41,013 : INFO : PROGRESS: at sentence #10000, processed 82874 words, keeping 8218 word types
2018-11-09 17:29:41,040 : INFO : PROGRESS: at sentence #20000, processed 159884 words, keeping 12353 word types
2018-11-09 17:29:41,067 : INFO : PROGRESS: at sentence #30000, processed 240235 words, keeping 15238 word types
2018-11-09 17:29:41,090 : INFO : PROGRESS: at sentence #40000, processed 319260 words, keeping 18133 word types
2018-11-09 17:29:41,116 : INFO : PROGRESS: at sentence #50000, processed 394354 words, keeping 20518 word types
2018-11-09 17:29:41,145 : INFO : PROGRESS: at sentence #60000, processed 475506 words, keeping 22757 word types
2018-11-09 17:29:41,172 : INFO : PROGRESS: at sentence #70000, processed 553869 words, keeping 25222 word types
2018-11-09 17:29:41,196 : INFO : PROGRESS: at se

In [30]:
# Number of distinct words held in the trained model's vocabulary.
len(model.wv.vocab)

17786

In [21]:
# Analogy query: the single word ranked closest to 'woman' + 'king' - 'man'.
# Queried through model.wv (the word vectors); calling most_similar on the model itself is deprecated.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

2018-11-09 17:18:01,996 : INFO : precomputing L2-norms of word weight vectors


[('duke', 0.9445843696594238)]

In [23]:
# Odd-one-out query over the four words.
# Queried through model.wv (the word vectors); calling doesnt_match on the model itself is deprecated.
model.wv.doesnt_match("breakfast cereal dinner lunch".split())



'dinner'

In [25]:
# Similarity score between the vectors of the two words.
# Queried through model.wv (the word vectors); calling similarity on the model itself is deprecated.
model.wv.similarity('woman', 'man')

0.9364297

In [26]:
# Nearest neighbours of "man" in the embedding space.
# Queried through model.wv (the word vectors); calling most_similar on the model itself is deprecated.
model.wv.most_similar("man")

[('thing', 0.9505487084388733),
 ('woman', 0.9364296793937683),
 ('fool', 0.8970590829849243),
 ('fellow', 0.8291076421737671),
 ('maid', 0.8163024187088013),
 ('gentleman', 0.80623459815979),
 ('word', 0.804107666015625),
 ('little', 0.7943037748336792),
 ('knave', 0.7773409485816956),
 ('bachelor', 0.771576464176178)]

In [27]:
# Nearest neighbours of "queen" in the embedding space.
# Queried through model.wv (the word vectors); calling most_similar on the model itself is deprecated.
model.wv.most_similar("queen")

[('mistress', 0.9322229623794556),
 ('sister', 0.9245880842208862),
 ('prince', 0.9186954498291016),
 ('uncle', 0.9049043655395508),
 ('captain', 0.8951232433319092),
 ('nurse', 0.8933846354484558),
 ('daughter', 0.8857458829879761),
 ('servant', 0.884211003780365),
 ('errand', 0.881351113319397),
 ('mother', 0.88055020570755)]