In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

#sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


Using TensorFlow backend.


In [2]:
#For each line, it stores the list of words and its label
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    print(item_no)
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    #When training the model is better that in each epoch the sequence of sentences is randomized.
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [3]:
sources = {'survey_data/test2_neg.txt':'TEST_NEG', 'survey_data/test2_pos.txt':'TEST_POS', 'IMDB_data/train-neg.txt':'TRAIN_NEG', 'IMDB_data/train-pos.txt':'TRAIN_POS', 'IMDB_data/train-unsup.txt':'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

In [4]:
#Model: Building the Vocabulary Table
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

In [5]:
#Training
for epoch in range(10):
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter)

In [6]:
#Inspect the model
model.most_similar('good')

[('great', 0.7218390703201294),
 ('nice', 0.6715968251228333),
 ('decent', 0.6600950360298157),
 ('bad', 0.6057783961296082),
 ('excellent', 0.5706406831741333),
 ('fine', 0.5537208914756775),
 ('terrific', 0.5260274410247803),
 ('solid', 0.4898747205734253),
 ('ok', 0.4893348813056946),
 ('passable', 0.48271363973617554)]

In [7]:
model.most_similar('terrible')

[('horrible', 0.7575660347938538),
 ('awful', 0.6870864629745483),
 ('bad', 0.6592953205108643),
 ('horrendous', 0.6562106609344482),
 ('poor', 0.6472522020339966),
 ('atrocious', 0.6378101110458374),
 ('abysmal', 0.6051012277603149),
 ('dreadful', 0.5724244117736816),
 ('laughable', 0.5478890538215637),
 ('horrid', 0.5400127172470093)]

In [8]:
model.docvecs['TRAIN_NEG_0']

array([  1.32900566e-01,   1.28046945e-01,   8.17108691e-01,
        -1.28636205e+00,  -3.84286582e-01,  -1.72999933e-01,
        -8.85443110e-03,  -2.78399229e-01,   8.16115141e-02,
         2.15010598e-01,   1.33238006e+00,  -3.49347025e-01,
        -3.23957533e-01,   6.87140822e-01,  -7.11193621e-01,
        -1.46157312e+00,  -3.74920905e-01,  -1.07739776e-01,
         2.67666042e-01,  -2.92389154e-01,   5.87848604e-01,
        -4.88268673e-01,   3.62446487e-01,  -1.37731612e-01,
        -2.38904700e-01,  -5.42249858e-01,  -2.63350487e-01,
        -1.05008967e-01,  -5.50208986e-01,   1.36532769e-01,
         6.91218317e-01,   6.64953232e-01,  -5.26533425e-01,
         3.16222250e-01,  -6.38727784e-01,   2.78452002e-02,
         5.57127059e-01,   1.26426172e+00,  -4.30101156e-01,
         2.96567738e-01,  -1.18174350e+00,   1.73902422e-01,
         1.20896352e-02,   8.58125329e-01,  -5.18964171e-01,
         1.21728711e-01,  -5.90283096e-01,  -1.10897079e-01,
        -9.23990458e-02,

In [9]:
#Saving and Loading Models
model.save('./imdb2.d2v')

In [10]:
#And load it
model = Doc2Vec.load('./imdb2.d2v')

In [11]:
#Classifying Sentiments : use vectors to train a classifier

#create 2 parallel numpy arrays.
train_arrays = numpy.zeros((25000, 100)) #contains the vectors
train_labels = numpy.zeros(25000) #contains the labels

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[12500 + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [12]:
print(train_arrays)

[[ 0.68797094  0.76457405  0.32956964 ...,  0.15791994  0.28586188
   0.31513771]
 [ 1.22653186 -0.33190563  0.78921872 ..., -0.44848043 -1.05658019
   1.10037112]
 [-0.55923301 -1.30939567  0.12608586 ...,  0.4221797  -1.08332312
   0.34219408]
 ..., 
 [-0.60330433 -0.83032215  1.2487278  ...,  0.13907033  0.29023203
  -0.64993906]
 [ 1.22132766 -0.1648977  -1.46663046 ..., -0.24398085 -1.69610143
   1.56680501]
 [-0.20465086  0.29308414  0.39453363 ..., -0.35600445 -0.95190465
   0.24759872]]


In [13]:
print(train_labels)

[ 1.  1.  1. ...,  0.  0.  0.]


In [14]:
#Testing Vectors
test_arrays = numpy.zeros((40, 100))
test_labels = numpy.zeros(40)

for i in range(20):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_arrays[20 + i] = model.docvecs[prefix_test_neg]
    test_labels[i] = 1
    test_labels[20 + i] = 0

In [15]:
test_arrays[2]
test_labels[2]

1.0

In [16]:
#Classification
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
classifier.score(test_arrays, test_labels)

0.67500000000000004

In [18]:
predict_labels = classifier.predict(test_arrays)
print(predict_labels)

[ 1.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.
  0.  0.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.  1.
  0.  0.  0.  0.]


In [19]:
print(test_labels)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.]


In [20]:
print( confusion_matrix(test_labels, predict_labels))
print()
print( classification_report(test_labels, predict_labels))
print()
print ("Accuracy: ", accuracy_score(test_labels, predict_labels))

[[14  6]
 [ 7 13]]

             precision    recall  f1-score   support

        0.0       0.67      0.70      0.68        20
        1.0       0.68      0.65      0.67        20

avg / total       0.68      0.68      0.67        40


Accuracy:  0.675
