In [1]:
import gensim
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import Phrases, TfidfModel
from gensim.models.phrases import Phraser
from gensim.test.utils import datapath 
from gensim.models.word2vec import Word2Vec, Text8Corpus
import logging

In [2]:
# Route log records to the notebook as "timestamp: LEVEL: message" lines.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
# Set the threshold on the root logger explicitly so INFO records are emitted
# even if basicConfig() above was a no-op (it is when handlers already exist).
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Location of the text8 corpus file.
# NOTE: this is a machine-specific absolute path; change it before running
# the notebook on another machine.
path = '/home/stefano/text8.txt'
# Pass the file name straight to Text8Corpus. gensim.test.utils.datapath() is
# not used here: it resolves names relative to gensim's bundled test-data
# directory and is not meant for external files.
sentences = Text8Corpus(path)

In [4]:
# First pass: collect word and word-pair counts over the raw corpus
# (library defaults are used; no thresholds are passed explicitly).
ngram = Phrases(sentences)

2019-03-17 00:20:58,385: INFO: collecting all words and their counts
2019-03-17 00:20:58,394: INFO: PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-03-17 00:21:32,770: INFO: collected 4400410 word types from a corpus of 17005207 words (unigram + bigrams) and 1701 sentences
2019-03-17 00:21:32,771: INFO: using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [5]:
# Build a Phraser from the first-pass counts; later cells apply it with
# `phraser[...]` to obtain phrase-merged token lists.
phraser = Phraser(ngram)

2019-03-17 00:21:32,777: INFO: source_vocab length 4400410
2019-03-17 00:22:20,365: INFO: Phraser built with 46644 phrasegrams


In [6]:
# Second pass: collect counts again, this time over the corpus as transformed
# by the first-pass phraser, so already-merged pairs can combine with a
# neighbouring token.
# NOTE: this rebinds `ngram`. Because `phraser` is itself rebound later from
# this result, re-running this cell out of order feeds it a different input.
ngram = Phrases(phraser[sentences])

2019-03-17 00:22:20,373: INFO: collecting all words and their counts
2019-03-17 00:22:20,414: INFO: PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-03-17 00:23:37,212: INFO: collected 5032987 word types from a corpus of 15682679 words (unigram + bigrams) and 1701 sentences
2019-03-17 00:23:37,213: INFO: using 5032987 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [7]:
# Rebuild the Phraser from the second-pass counts. This rebinds `phraser`,
# replacing the one built from the first pass.
phraser = Phraser(ngram)

2019-03-17 00:23:37,316: INFO: source_vocab length 5032987
2019-03-17 00:23:45,205: INFO: Phraser added 50000 phrasegrams
2019-03-17 00:24:19,796: INFO: Phraser added 100000 phrasegrams
2019-03-17 00:24:51,905: INFO: Phraser built with 106242 phrasegrams


In [8]:
import re  # unused in this cell; retained so any cell relying on `re` being imported still works

# Write the phrase-merged corpus to disk: one sentence per line, tokens
# separated by single spaces.
with open('trigramNew.txt', 'w') as outfile:
    for sent in sentences:
        # Join the tokens directly instead of formatting the list with str()
        # and stripping quotes/brackets/commas with a regex: the result is the
        # same for plain word tokens, tokens that happen to contain one of
        # those characters are no longer mangled, and the pattern with invalid
        # string escapes ("\[", "\]", "\,") is gone.
        outfile.write("%s\n" % " ".join(phraser[sent]))

In [42]:
# Train word embeddings on the phrase-merged corpus.
model = Word2Vec(
    phraser[sentences],
    size=300,      # length of each embedding vector
    window=5,      # context window used during training
    min_count=3,   # tokens seen fewer than 3 times are dropped from the vocabulary
    iter=15,       # number of training passes (epochs) over the corpus
    workers=4,     # number of worker threads used for training
)

2019-03-16 23:49:23,933: INFO: collecting all words and their counts
2019-03-16 23:49:23,980: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-16 23:50:23,471: INFO: collected 303818 word types from a corpus of 15414447 raw words and 1701 sentences
2019-03-16 23:50:23,472: INFO: Loading a fresh vocabulary
2019-03-16 23:50:24,461: INFO: effective_min_count=3 retains 148861 unique words (48% of original 303818, drops 154957)
2019-03-16 23:50:24,462: INFO: effective_min_count=3 leaves 15223683 word corpus (98% of original 15414447, drops 190764)
2019-03-16 23:50:24,975: INFO: deleting the raw counts dictionary of 303818 items
2019-03-16 23:50:24,986: INFO: sample=0.001 downsamples 34 most-common words
2019-03-16 23:50:24,987: INFO: downsampling leaves estimated 11184712 word corpus (73.5% of prior 15223683)
2019-03-16 23:50:25,710: INFO: estimated required memory for 148861 words and 300 dimensions: 431696900 bytes
2019-03-16 23:50:25,711: INFO: resetting la

In [2]:
# Load previously saved vectors from 'trigramNew.model' in the working
# directory and bind them to `model` for the queries below.
# NOTE(review): no visible cell writes 'trigramNew.model' — confirm where the
# file is produced (and in which format) so the notebook can be re-run from a
# fresh kernel.
model = gensim.models.KeyedVectors.load('trigramNew.model')

In [3]:
# Nearest neighbours of the point halfway between the 'rome' and 'italy' vectors.
model.similar_by_vector((model['rome'] + model['italy']) / 2)

[('rome', 0.8934516906738281),
 ('italy', 0.8622043132781982),
 ('sicily', 0.6930198073387146),
 ('constantinople', 0.6692287921905518),
 ('ravenna', 0.6685149669647217),
 ('venice', 0.6466781497001648),
 ('gaul', 0.6280035972595215),
 ('naples', 0.6222332119941711),
 ('carthage', 0.6148161888122559),
 ('greece', 0.6058224439620972)]

In [4]:
# Pairwise similarity score between the two words.
model.similarity('rome', 'sicily')

0.56155753

In [15]:
# List the 50 entries most similar to 'apple' (topn=50); the result is
# displayed as the cell output.
model.similar_by_word('apple', topn=50)

[('microsoft', 0.7188221216201782),
 ('macintosh', 0.664719820022583),
 ('amiga', 0.6572433114051819),
 ('intel', 0.6489650011062622),
 ('apple_computer', 0.6417526006698608),
 ('ibm', 0.633816123008728),
 ('hypercard', 0.6093382835388184),
 ('apple_ii', 0.6077905297279358),
 ('sgi', 0.6049947738647461),
 ('amd', 0.6048518419265747),
 ('operating_system', 0.578680157661438),
 ('os_x', 0.5765261650085449),
 ('motorola', 0.5752401947975159),
 ('ms_dos', 0.5723409056663513),
 ('internet_explorer', 0.5652309060096741),
 ('compaq', 0.5647855401039124),
 ('mac_os', 0.5640397667884827),
 ('ibm_pc', 0.5568265914916992),
 ('desktop', 0.5558806657791138),
 ('atari', 0.5543843507766724),
 ('coleco', 0.553312361240387),
 ('macromedia', 0.5501058101654053),
 ('sony', 0.5487701892852783),
 ('gui', 0.5475488305091858),
 ('os', 0.5457783937454224),
 ('personal_computer', 0.5415133237838745),
 ('powerpc', 0.5398414134979248),
 ('intel_based', 0.5353753566741943),
 ('cp_m', 0.5351274013519287),
 ('windo

In [14]:
# Ask the model which of the three words does not belong with the others.
model.doesnt_match(['apple', 'banana', 'computer'])

'banana'