In [30]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled
    
#sources = {'test_neg.txt':'TEST_NEG', 'test_pos.txt':'TEST_POS', 'train_neg.txt':'TRAIN_NEG', 'train_pos.txt':'TRAIN_POS'}

sources = {'test_neg.txt':'TEST_NEG', 'test_pos.txt':'TEST_POS', 'test_neu.txt': 'TEST_NEU','train_neg.txt':'TRAIN_NEG', 'train_pos.txt':'TRAIN_POS', 'train_neu.txt': 'TRAIN_NEU' }
sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

for epoch in range(10):
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter)


model.save('./imdb2.d2v')
model = Doc2Vec.load('./imdb2.d2v')






In [25]:
#print(model.most_similar('книга'))
    

print(model['TEST_NEG_0'])

[ 0.47419265  0.22265063 -0.46373898  0.29644212  0.24933659 -0.03566697
  0.95894581  0.49829221  0.40344387 -0.21912631  0.00672929  0.45505264
  0.45625553  0.27178591  0.15036167 -0.711007    0.64220738 -0.25538936
 -0.19682243 -0.11161936 -0.55217767 -0.8003155  -0.23982289 -0.19997309
  0.09913149 -0.96925551  1.30868506 -0.25909501 -0.00620278 -0.63689172
 -0.04055128  0.00948371  0.95462632  0.15182315  0.16749103  0.13801953
  0.38410836 -0.80300552  0.40023613  0.68217188 -0.35519913 -0.80504477
 -0.26363084 -0.23686907  0.46063983 -0.69996244 -0.15165094  0.01434649
 -0.24660118 -0.34766579 -0.88195491 -0.37955654  0.2783905   0.29337656
  0.13391115 -0.20515758 -0.63516182  0.87443525 -0.90199155 -0.09723683
 -0.01127618 -0.02132701 -0.39249209 -0.67206371  0.19638811  0.4132849
 -0.13170499  0.59632611 -0.89579099 -0.76429737 -0.39151546 -0.53849131
  1.36337841 -0.51752335  0.04309887 -0.47788385  0.82222766 -0.75321454
 -0.07771119 -0.23919851  0.2561022  -0.47832492  0.

In [31]:
#train_arrays = numpy.zeros((800, 100))
#train_labels = numpy.zeros(800)

train_arrays = numpy.zeros((1200, 100))
train_labels = numpy.zeros(1200)

#for i in range(400):
for i in range(400):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    prefix_train_neu= 'TRAIN_NEU_' + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[400 + i] = model[prefix_train_neg]
    train_arrays[800 + i] = model[prefix_train_neu]
    train_labels[i] = 1
    train_labels[400 + i] = -1
    train_labels[800 + i] = 0
    
print(train_arrays)

print(train_labels)


test_arrays = numpy.zeros((300, 100))
test_labels = numpy.zeros(300)

for i in range(100):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    prefix_test_neu= 'TEST_NEU_' + str(i)
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[100 + i] = model[prefix_test_neg]
    test_arrays[200 + i] = model[prefix_test_neu]
    test_labels[i] = 1
    test_labels[100 + i] = -1
    test_labels[200 + i] = 0
    
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

classifier.score(test_arrays, test_labels)

[[-1.06430757 -0.04599246 -1.07595217 ..., -1.65198135 -1.24342155
   0.30501071]
 [ 1.00206351  1.96291304  3.28503036 ...,  0.3572861  -1.08156443
  -1.51788533]
 [-0.56483889  0.20518304 -0.8927598  ..., -0.89522284 -1.06609261
  -0.84506726]
 ..., 
 [-0.45927173  0.1324628  -0.33953124 ...,  0.13001081 -0.48279086
  -1.35075331]
 [ 0.16524084 -1.84224975 -0.86276621 ...,  0.05120652 -0.90570652
  -0.10345212]
 [-0.40863401  0.82991469  1.13864255 ..., -0.28418246 -1.064291
   0.34514821]]
[ 1.  1.  1. ...,  0.  0.  0.]


0.55666666666666664