In [1]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import numpy as np
import sys
import random
from sklearn.linear_model import LogisticRegression



In [2]:
## Helper class that turns line-per-document text files into TaggedDocument lists for Doc2Vec
class TaggedLineSentence(object):
    """
    Turn line-oriented text files into lists of TaggedDocument objects.

    sources: dict of {file name: tag prefix, ...}. Every line of a file is
        treated as one document and tagged '<prefix>_<line number>'.

    Provides two methods:
        to_array: convert each line to a TaggedDocument and collect them in a list
        perm: return a shuffled copy of that list
    """
    def __init__(self, sources):
        self.sources = sources
        # Filled in by to_array(); None means the files have not been read yet.
        self.sentences = None

    def to_array(self):
        """
        Read every file in self.sources and build the document list.

        The list is stored on self.sentences and also returned. It is only
        stored once all files were read, so a failing read does not leave a
        partially filled list behind.
        """
        collected = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    # TaggedDocument([word1, word2 ...], [tagx])
                    words = utils.to_unicode(line).split()
                    collected.append(TaggedDocument(words, [prefix + '_%s' % item_no]))
        self.sentences = collected
        return self.sentences

    def perm(self):
        """
        Return a new, randomly ordered list of the documents.

        self.sentences itself is left untouched. If to_array() has not been
        called yet, it is called here first instead of failing with an
        AttributeError.
        """
        # getattr keeps this working for instances whose __init__ was bypassed.
        if getattr(self, 'sentences', None) is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)    # shuffles in place and returns None
        return shuffled

In [3]:
# Folder that holds the corpus files.
DATA_DIR = 'E:/kaggle/avito/imdb_testset/aclImdb_v1'

# (file name, tag prefix) pairs; the prefix becomes the first part of every
# document tag built from that file.
_CORPUS_FILES = [
    ('test_neg.txt', 'TEST_NEG'),
    ('test_pos.txt', 'TEST_POS'),
    ('train_neg.txt', 'TRAIN_NEG'),
    ('train_pos.txt', 'TRAIN_POS'),
    ('train_unsup.txt', 'TRAIN_UNS'),
]

# Mapping of full file path -> tag prefix, in the order listed above.
sources = {DATA_DIR + '/' + file_name: tag for file_name, tag in _CORPUS_FILES}

In [4]:
# Wrap the file -> tag-prefix mapping; no file is read until to_array() is called.
sentences = TaggedLineSentence(sources)

In [6]:
# Untrained Doc2Vec model; the vocabulary is built and training is run in the
# following cells. Parameter meanings are described in the gensim Doc2Vec
# API reference.
model = Doc2Vec(
    min_count=1,
    window=10,
    vector_size=100,
    sample=1e-4,
    negative=5,
    dm=1,
    workers=7,
)

In [7]:
# to_array() reads every source file and keeps the resulting list on
# `sentences.sentences`, which perm() later makes shuffled copies of.
tagged_documents = sentences.to_array()
model.build_vocab(tagged_documents)

In [9]:
# Train in N_PASSES outer passes, reshuffling the documents before each one.
#
# A plain model.train(...) call decays the learning rate from model.alpha to
# model.min_alpha, so calling it repeatedly makes the rate jump back up at
# the start of every pass. To get one continuous decay over all passes, each
# call is given its own slice of the overall schedule through
# start_alpha / end_alpha.
N_PASSES = 2

# Read the configured rates once, before the first train() call.
# NOTE(review): train() appears to store the start_alpha/end_alpha it was
# given on the model - confirm against the installed gensim version.
alpha_start, alpha_end = model.alpha, model.min_alpha

for epoch in range(N_PASSES):
    print('epoch %d' % epoch)
    pass_start_alpha = alpha_start + (alpha_end - alpha_start) * epoch / N_PASSES
    pass_end_alpha = alpha_start + (alpha_end - alpha_start) * (epoch + 1) / N_PASSES
    model.train(sentences.perm(),
                total_examples=model.corpus_count,
                epochs=model.epochs,
                start_alpha=pass_start_alpha,
                end_alpha=pass_end_alpha)

# Put the configured rates back so that re-running this cell starts from the
# same schedule.
model.alpha, model.min_alpha = alpha_start, alpha_end

epoch 0


  This is separate from the ipykernel package so we can avoid doing imports until


epoch 1


In [10]:
# Number of labelled documents per class (POS / NEG) in each of the two splits.
N_PER_CLASS = 12500


def build_split_arrays(docvecs, split, n_per_class, vector_size):
    """
    Collect the document vectors and labels for one split.

    docvecs: object indexable by document tag (here model.docvecs)
    split: tag prefix of the split, 'TRAIN' or 'TEST'
    n_per_class: number of positive and of negative documents to fetch
    vector_size: length of one document vector

    Returns (arrays, labels). Rows 0 .. n_per_class-1 hold the '<split>_POS_<i>'
    vectors with label 1; rows n_per_class .. 2*n_per_class-1 hold the
    '<split>_NEG_<i>' vectors with label 0.

    Raises whatever the docvecs lookup raises when a tag is missing.
    """
    arrays = np.zeros((2 * n_per_class, vector_size))
    labels = np.zeros(2 * n_per_class)
    for i in range(n_per_class):
        arrays[i] = docvecs['%s_POS_%d' % (split, i)]
        arrays[n_per_class + i] = docvecs['%s_NEG_%d' % (split, i)]
    # Positives fill the first half; the second half keeps its initial 0.
    labels[:n_per_class] = 1
    return arrays, labels


# note that the document vectors are looked up by tag through model.docvecs;
# the row width is taken from the model instead of being repeated as a literal.
train_arrays, train_labels = build_split_arrays(
    model.docvecs, 'TRAIN', N_PER_CLASS, model.vector_size)
test_arrays, test_labels = build_split_arrays(
    model.docvecs, 'TEST', N_PER_CLASS, model.vector_size)

In [11]:
# Fit a default-configured logistic regression on the training document vectors.
classifier = LogisticRegression().fit(train_arrays, train_labels)

# Evaluate on the test split; kept as the last expression so the notebook
# displays the returned score.
classifier.score(test_arrays, test_labels)

0.83692