In [62]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
import numpy as np

In [38]:
folder = "/mnt/c/Users/danis_000/NLPembeddings/"
file = "enwiki-latest-pages-articles1.xml-p10p30302.bz2"
model_name = "wiki.en.word2vec.model"

In [15]:
wiki = WikiCorpus(folder+file)

In [31]:
class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True
    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument([c for c in content], [title])

In [32]:
documents = TaggedWikiDocument(wiki)

In [33]:
cores = multiprocessing.cpu_count()

In [35]:
# PV-DBOW 
model = Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, epochs=5, workers=cores)

In [36]:
%time model.build_vocab(documents)

CPU times: user 44.6 s, sys: 3.06 s, total: 47.7 s
Wall time: 5min 24s


In [37]:
%time model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

  """Entry point for launching an IPython kernel.


CPU times: user 2h 2min 31s, sys: 20 s, total: 2h 2min 51s
Wall time: 57min


In [39]:
model.save(model_name)

In [40]:
model2 = Doc2Vec.load(model_name)

In [53]:
model.docvecs.most_similar(positive=["Atlantic Ocean"], topn=20)

[('Antarctic Circumpolar Current', 0.6999729871749878),
 ('North Atlantic Deep Water', 0.6625993847846985),
 ('Oligocene', 0.6617370843887329),
 ('Norwegian Sea', 0.6291526556015015),
 ('Pacific Ocean', 0.6272916793823242),
 ('Indian Ocean', 0.6067275404930115),
 ('North Atlantic Current', 0.5943110585212708),
 ('Strait of Gibraltar', 0.5925824046134949),
 ('Mediterranean Sea', 0.5882935523986816),
 ('Geography of the Falkland Islands', 0.5858149528503418),
 ('Caribbean Sea', 0.5847128629684448),
 ('North Sea', 0.5846424698829651),
 ('Paleogene', 0.5840442776679993),
 ('Miocene', 0.5774627327919006),
 ('Pleistocene', 0.5772634744644165),
 ('Pliocene', 0.5717372894287109),
 ('Red Sea', 0.5674933195114136),
 ('Holocene', 0.5654497146606445),
 ('Phanerozoic', 0.5631661415100098),
 ('Geography of Saint Pierre and Miquelon', 0.5591236352920532)]

In [58]:
model.docvecs.most_similar_to_given("Atlantic Ocean", ["Atlantic Ocean", "Atom"])

'Atlantic Ocean'

In [59]:
model.docvecs

<gensim.models.keyedvectors.Doc2VecKeyedVectors at 0x7f61c3bf82e8>

In [67]:
np.dot(model["machine"]/np.sum(model["machine"]**2), model["turing"]/np.sum(model["turing"]**2))

0.036783617

In [69]:
cosine = lambda x, y : np.dot(x/np.sum(x**2), y/np.sum(y**2))

In [72]:
car = model["car"]
turing = model["turing"]
machine = model["machine"]
print("Car and Turing", cosine(car,turing))
print("Machine and Turing", cosine(machine,turing))
print("Machine and Car", cosine(machine,car))

Car and Turing 0.009717448
Machine and Turing 0.036783617
Machine and Car 0.020174999
