# Build and train doc2vec model

In [8]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec

from collections import OrderedDict
import multiprocessing
from gensim.models.doc2vec import TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec

import sys

# Number of CPUs reported by the OS. Not referenced by the code visible in
# this file; the Model class calls multiprocessing.cpu_count() again itself.
cores = multiprocessing.cpu_count()
# Abort early when gensim reports FAST_VERSION <= -1.
# NOTE(review): presumably this means the compiled (fast) training routines
# are unavailable -- confirm against the gensim documentation.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

from collections import namedtuple
import nltk
from gensim.models.doc2vec import  LabeledSentence
import sys
from os import listdir

# Record type for one document: `words` (document contents), `tags` (list of
# document ids) and `category` (the document's label).
# NOTE(review): the namedtuple typename is spelled 'NewsGroupDocument' while
# the variable is `NewsgroupDocument`; only repr()/pickling see the typename.
NewsgroupDocument = namedtuple('NewsGroupDocument', 'words tags category')

import numpy as np

In [2]:
# to print current epochs
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a running epoch counter on a single line."""

    def __init__(self):
        # Index of the next epoch to be reported (starts at zero).
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Do nothing when an epoch starts; only epoch ends are reported."""
        return None

    def on_epoch_end(self, model):
        """Print the index of the epoch that just finished and advance the counter."""
        # The very first report carries the 'Epoch' label; later ones only the number.
        template = 'Epoch #{}' if self.epoch == 0 else '#{}'
        print(template.format(self.epoch), end=' ')
        self.epoch += 1

In [1]:
class Model():
    """Wrap the construction, training, saving and use of one doc2vec model.

    Constructing an instance immediately converts the documents to tagged
    documents and then trains (and saves) a model, unless a file named
    `model_name` already exists in the models directory.
    """

    def __init__(self, documents, labels, clean_param_str, model_name):
        """Store the inputs, tag the documents and trigger training.

        Args:
            documents: iterable of document contents; each item is stored as
                the `words` field of a tagged document
                (presumably a list of tokens -- confirm against the caller).
            labels: sequence indexed in parallel with `documents`; each entry
                becomes the `category` of the matching document.
            clean_param_str: text describing how the corpus was cleaned; it is
                only written to the model info file.
            model_name: file name under which the trained model is saved.
        """
        self.documents = documents
        self.labels = labels
        self.clean_param_str = clean_param_str
        self.model_name = model_name
        self.doc2vec_model = None
        self.path_to_models = 'models/'  # path to trained doc2vec models

        self.tagged_docs = self.convert_to_doc2vec_format()

        self.build_and_train()

    def convert_to_doc2vec_format(self):
        """Return the documents as a list of NewsgroupDocument records.

        Each document is tagged with a single 1-based id and carries its label
        as `category`. Labels are looked up by index, so a `labels` sequence
        shorter than `documents` raises IndexError.
        """
        return [
            NewsgroupDocument(doc_contents, [doc_id + 1], self.labels[doc_id])
            for doc_id, doc_contents in enumerate(self.documents)
        ]

    def train(self, parameters):
        """Train a Doc2Vec model, save it and record its settings.

        Args:
            parameters: keyword arguments forwarded to the Doc2Vec constructor.

        The trained model is saved under `path_to_models + model_name`, a
        record is appended to the model info file, and the model is kept in
        `self.doc2vec_model`.
        """
        epoch_logger = EpochLogger()

        # initialise model
        model = Doc2Vec(**parameters, callbacks=[epoch_logger])

        print('training doc2vec model...', end="")  # -------------------
        model.build_vocab(self.tagged_docs)
        # The callbacks are handed to train() explicitly: the constructor only
        # runs them when it is given a corpus and trains by itself.
        model.train(
            self.tagged_docs,
            total_examples=model.corpus_count,
            epochs=model.epochs,
            callbacks=[epoch_logger],
        )

        # save model to disk
        model.save(self.path_to_models + self.model_name)

        # write information about model to file
        self._write_model_info(parameters)

        print('\ntrained model!', self.model_name)  # -------------------

        self.doc2vec_model = model

    def _write_model_info(self, parameters):
        """Append the model name, its parameters and the cleaning info to model_info.txt."""
        clean_line = 'clean: ' + self.clean_param_str  # how this doc2vec model was cleaned
        # Terminate the record so that the next appended record starts on its own line.
        if not clean_line.endswith('\n'):
            clean_line += '\n'

        with open(self.path_to_models + 'model_info.txt', 'a') as info_file:
            info_file.write(self.model_name + '\n')
            info_file.write(str(parameters) + '\n')
            info_file.write(clean_line)

    def build_and_train(self):
        """Train with the fixed parameter set unless the model file already exists.

        When a file named `model_name` is already present in the models
        directory, nothing is trained and a warning is printed instead, so an
        existing model is never overwritten.
        """
        import os

        parameters = dict(
            dm=0,
            vector_size=100,
            epochs=20,
            min_count=4,
            workers=multiprocessing.cpu_count(),
            negative=5,
            hs=0,
            sample=0,
        )

        # Make sure the target directory exists; listing or saving into a
        # missing directory would otherwise fail.
        os.makedirs(self.path_to_models, exist_ok=True)

        if self.model_name in listdir(self.path_to_models):
            print("Overwrite danger! CHANGE MODEL NAME!")
        else:
            self.train(parameters=parameters)

    def infer_all_vectors(self):
        """Return a numpy array holding one inferred vector per tagged document.

        The model trained by this instance is used when available; otherwise
        (training was skipped because the file already existed) the model is
        loaded from `path_to_models + model_name`.
        """
        model = self.doc2vec_model
        if model is None:
            model = Doc2Vec.load(self.path_to_models + self.model_name)

        return np.asarray([model.infer_vector(doc.words) for doc in self.tagged_docs])