In [10]:
#python example to train doc2vec model (with or without pre-trained word embeddings)

import gensim.models as g
import logging
import codecs

In [None]:

#configuration for the training run; every name below is read by later cells

#logging: emit INFO-level messages with a timestamp and level prefix
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#file locations
train_corpus = "toy_data/train_docs.txt"    #input corpus
saved_path = "toy_data/model.bin"           #where the trained model is written
#pretrained word embeddings; set to None to train without them
#(the training call in this notebook has the pretrained_emb argument commented out)
pretrained_emb = "toy_data/pretrained_word_embeddings.txt"

#doc2vec hyper-parameters (each one is forwarded to g.Doc2Vec by the training cell)
vector_size = 300           #passed as size
window_size = 15            #passed as window
min_count = 1               #passed as min_count
sampling_threshold = 1e-5   #passed as sample
negative_size = 5           #passed as negative
train_epoch = 100           #passed as iter
dm = 0                      #training algorithm: 0 = dbow, 1 = dmpv
worker_count = 1            #number of parallel processes (passed as workers)


In [6]:

#train doc2vec model
#wrap the training file at train_corpus in gensim's TaggedLineDocument so it can be handed to Doc2Vec
#(presumably one document per line of the file, as the class name suggests -- confirm against the gensim docs)
docs = g.doc2vec.TaggedLineDocument(train_corpus)

#bare expression: shows the object's repr as the cell output
docs

<gensim.models.doc2vec.TaggedLineDocument at 0x10c035780>

In [8]:
#build and train the doc2vec model on docs using the hyper-parameters from the configuration cell
#note: pretrained_emb is deliberately NOT passed here -- the original call had that argument commented out
#(presumably it needs a gensim build that accepts the keyword -- confirm before re-enabling)
model = g.Doc2Vec(
    docs,
    size=vector_size,
    window=window_size,
    min_count=min_count,
    sample=sampling_threshold,
    negative=negative_size,
    hs=0,
    dm=dm,
    dm_concat=1,
    dbow_words=1,
    iter=train_epoch,
    workers=worker_count,
)

#persist the trained model to saved_path
model.save(saved_path)

2017-11-28 19:50:51,052 : INFO : collecting all words and their counts
2017-11-28 19:50:51,053 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-11-28 19:50:51,090 : INFO : collected 11097 word types and 1000 unique tags from a corpus of 1000 examples and 84408 words
2017-11-28 19:50:51,091 : INFO : Loading a fresh vocabulary
2017-11-28 19:50:51,130 : INFO : min_count=1 retains 11097 unique words (100% of original 11097, drops 0)
2017-11-28 19:50:51,131 : INFO : min_count=1 leaves 84408 word corpus (100% of original 84408, drops 0)
2017-11-28 19:50:51,181 : INFO : deleting the raw counts dictionary of 11097 items
2017-11-28 19:50:51,182 : INFO : sample=1e-05 downsamples 3599 most-common words
2017-11-28 19:50:51,183 : INFO : downsampling leaves estimated 22704 word corpus (26.9% of prior 84408)
2017-11-28 19:50:51,184 : INFO : estimated required memory for 11097 words and 300 dimensions: 33381300 bytes
2017-11-28 19:50:51,227 : INFO : resetting layer 

In [9]:
#display the path the trained model was saved to
saved_path

'toy_data/model.bin'

In [11]:
#inference settings

#file locations
#note: this rebinds `model` from the trained Doc2Vec object (training cell) to a path string
model = "toy_data/model.bin"                 #saved model to load
test_docs = "toy_data/test_docs.txt"         #documents to infer vectors for
output_file = "toy_data/test_vectors.txt"    #destination for the inferred vectors

#inference hyper-parameters (forwarded to infer_vector as alpha and steps)
start_alpha = 0.01
infer_epoch = 1000



In [12]:
#load model
m = g.Doc2Vec.load(model)

#read the test documents: one whitespace-tokenised document per line of the utf-8 file
#the with-block guarantees the file handle is closed (the original left it open)
#note: test_docs is rebound from a path string to a list of token lists, so this cell
#cannot be re-run without first re-running the cell that sets test_docs to the path
with codecs.open(test_docs, "r", "utf-8") as test_file:
    test_docs = [ x.strip().split() for x in test_file.readlines() ]



2017-11-28 19:54:23,473 : INFO : loading Doc2Vec object from toy_data/model.bin
2017-11-28 19:54:23,720 : INFO : loading wv recursively from toy_data/model.bin.wv.* with mmap=None
2017-11-28 19:54:23,721 : INFO : setting ignored attribute syn0norm to None
2017-11-28 19:54:23,721 : INFO : loading docvecs recursively from toy_data/model.bin.docvecs.* with mmap=None
2017-11-28 19:54:23,722 : INFO : setting ignored attribute cum_table to None
2017-11-28 19:54:23,723 : INFO : loaded toy_data/model.bin


In [13]:
#infer test vectors
#write one line per test document: the inferred vector's components separated by single spaces
#the with-block flushes and closes the file even if infer_vector raises part-way through
with open(output_file, "w") as output:
    for d in test_docs:
        vector = m.infer_vector(d, alpha=start_alpha, steps=infer_epoch)
        output.write( " ".join([str(x) for x in vector]) + "\n" )

In [15]:
#display the path the inferred test vectors were written to
output_file

'toy_data/test_vectors.txt'