In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# shuffle
from random import shuffle

# logging
import logging
import os.path
import sys
import _pickle as pickle
#import cPickle as pickle   #Note: in python3, _pickle was used instead of cpickle

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

class LabeledLineSentence(object):

    def __init__(self, sources):
        self.sources = sources

        flipped = {}

        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

sources = {'test-neg-headlines.txt':'TEST_NEG', 'test-pos-headlines.txt':'TEST_POS', 'train-neg-headlines.txt':'TRAIN_NEG', 'train-pos-headlines.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

for epoch in range(50):
    logger.info('Epoch %d' % epoch)
    model.train(sentences.sentences_perm(),
                total_examples=model.corpus_count,
                epochs=model.iter,
    )

model.save('./imdb.d2v')


2017-12-13 13:25:02,945 : INFO : running c:\anaconda2\envs\tensorflow\lib\site-packages\ipykernel_launcher.py -f C:\Users\tianx\AppData\Roaming\jupyter\runtime\kernel-cb96e0aa-82f2-459e-8d90-9a9e7b9aa454.json
2017-12-13 13:25:03,114 : INFO : collecting all words and their counts
2017-12-13 13:25:03,130 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-12-13 13:25:03,197 : INFO : collected 2998 word types and 1636 unique tags from a corpus of 1636 examples and 15172 words
2017-12-13 13:25:03,210 : INFO : Loading a fresh vocabulary
2017-12-13 13:25:03,249 : INFO : min_count=1 retains 2998 unique words (100% of original 2998, drops 0)
2017-12-13 13:25:03,256 : INFO : min_count=1 leaves 15172 word corpus (100% of original 15172, drops 0)
2017-12-13 13:25:03,309 : INFO : deleting the raw counts dictionary of 2998 items
2017-12-13 13:25:03,336 : INFO : sample=0.0001 downsamples 809 most-common words
2017-12-13 13:25:03,345 : INFO : downsampling leaves estim

2017-12-13 13:25:09,956 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:09,970 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:09,984 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:09,991 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:10,036 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:10,039 : INFO : training on 75860 raw words (48274 effective words) took 0.8s, 61430 effective words/s
2017-12-13 13:25:10,053 : INFO : Epoch 7
2017-12-13 13:25:10,064 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:10,729 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:10,744 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:10,755 : INFO : worker thread fini

2017-12-13 13:25:15,944 : INFO : Epoch 14
2017-12-13 13:25:15,954 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:16,697 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:16,719 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:16,735 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:16,747 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:16,759 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:16,780 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:16,902 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:16,912 : INFO : training on 75860 raw words (48053 effective words) took 0.9s, 51317 effective words/s
2017-12-13 13:25:16,921 : INFO : Epoch 15
2017-12-

2017-12-13 13:25:23,440 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:23,497 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:23,503 : INFO : training on 75860 raw words (48121 effective words) took 0.9s, 52887 effective words/s
2017-12-13 13:25:23,513 : INFO : Epoch 22
2017-12-13 13:25:23,523 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:24,203 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:24,209 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:24,220 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:24,240 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:24,251 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:24,263 : INFO : worker thread fin

2017-12-13 13:25:30,703 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:30,708 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:30,721 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:30,731 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:30,738 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:30,781 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:30,785 : INFO : training on 75860 raw words (48319 effective words) took 0.8s, 64260 effective words/s
2017-12-13 13:25:30,798 : INFO : Epoch 30
2017-12-13 13:25:30,806 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:31,484 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:31,499 : INFO : worker thread fin

2017-12-13 13:25:37,171 : INFO : Epoch 37
2017-12-13 13:25:37,179 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:37,939 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:37,992 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:38,015 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:38,031 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:38,038 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:38,053 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:38,129 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:38,138 : INFO : training on 75860 raw words (48244 effective words) took 0.9s, 52880 effective words/s
2017-12-13 13:25:38,160 : INFO : Epoch 38
2017-12-

2017-12-13 13:25:46,337 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:46,345 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:46,358 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:46,378 : INFO : PROGRESS: at 92.09% examples, 43949 words/s, in_qsize 1, out_qsize 1
2017-12-13 13:25:46,396 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:46,449 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:46,456 : INFO : training on 75860 raw words (47933 effective words) took 1.1s, 44227 effective words/s
2017-12-13 13:25:46,461 : INFO : Epoch 45
2017-12-13 13:25:46,468 : INFO : training model with 7 workers on 2998 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:47,282 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:47,329 : INFO : worker

In [4]:
model.most_similar('good')

2017-12-13 16:53:32,372 : INFO : precomputing L2-norms of word weight vectors


[('label', 0.9580812454223633),
 ('azd', 0.9504972696304321),
 ('selumetinib', 0.9473737478256226),
 ('orphan', 0.9285398721694946),
 ('status', 0.9002965688705444),
 ('immune', 0.8734671473503113),
 ('cancer', 0.8715850114822388),
 ('acerta', 0.8587213754653931),
 ('drug', 0.8533174991607666),
 ('divested', 0.8283589482307434)]

In [5]:
model.most_similar('increase')

[('death', 0.8398493528366089),
 ('suggests', 0.824703574180603),
 ('onglyza', 0.750079870223999),
 ('aims', 0.6878533363342285),
 ('diabetes', 0.6724579334259033),
 ('data', 0.633600115776062),
 ('rate', 0.6235688328742981),
 ('fda', 0.6177887916564941),
 ('azd', 0.6037763357162476),
 ('ratio', 0.599388599395752)]

In [6]:
train_arrays = numpy.zeros((1142,100))
train_labels = numpy.zeros((1142))
for i in range(691):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_labels[i] = 1
for i in range(451):
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i+451] = model.docvecs[prefix_train_neg]
    train_labels[i+451] = 0

In [7]:
model.docvecs['TRAIN_POS_690']

array([ 0.03945563, -0.36040515, -0.81756526, -0.24152143, -0.10632309,
       -0.16919744, -0.25610462, -0.49840897, -0.31717134,  0.39454591,
        0.14760627,  0.07900509, -0.20074853,  0.11773267, -0.11075872,
       -0.09591108, -0.21288222, -0.11662704, -0.09278214,  0.01988182,
        0.67003477,  0.44293499, -0.43041474,  0.10573954, -0.06735239,
        0.15747884,  0.20386039, -0.11460374, -0.03202608, -0.13767077,
       -0.09739126, -0.33748809, -0.26100436, -0.47627538,  0.12671883,
       -0.47341105, -0.45879179,  0.4798207 , -0.10793679, -0.16957298,
       -0.01653943, -0.06163348, -0.09538887,  0.40632832, -0.11863247,
       -0.08789644, -0.34918171, -0.1562492 ,  0.46208948,  0.1314804 ,
       -0.02277557,  0.1046523 , -0.11416169, -0.20348941, -0.21107732,
        0.11291379,  0.32570395,  0.50531346, -0.20624495, -0.15568005,
        0.30965835, -0.13383634, -0.44210556, -0.34768116,  0.5247438 ,
       -0.30657843, -0.17650133,  0.07744043, -0.22693506,  0.15

In [8]:
print(train_arrays)

[[ 0.1035524   0.32813051 -0.22517553 ..., -0.58696747 -0.46383634
  -0.06675959]
 [-0.50630361 -0.07826949 -0.25960752 ..., -0.03545469 -0.87155759
  -0.39803335]
 [ 0.06695969  0.11612151 -0.42143866 ...,  0.00394759 -0.33607072
   0.05373187]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [9]:
print(train_labels[450])

1.0


In [10]:
test_arrays = numpy.zeros((491,100))
test_labels = numpy.zeros((491))
for i in range(288):
    prefix_test_pos = 'TEST_POS_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_labels[i] = 1
for i in range(203):
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i+288] = model.docvecs[prefix_test_neg]
    test_labels[i+288] = 0

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
classifier.score(test_arrays, test_labels)

0.66191446028513234

In [14]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_arrays, train_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error
housing_predictions = tree_reg.predict(train_arrays)
tree_mse = mean_squared_error(train_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [16]:
from sklearn.model_selection import cross_val_score
precision_list = cross_val_score(classifier, test_arrays, test_labels, cv=10, scoring='precision')
precision_list

array([ 0.76666667,  0.74193548,  0.68571429,  0.64705882,  0.7       ,
        0.70967742,  0.71875   ,  0.75      ,  0.70588235,  0.64516129])

In [17]:
sum = 0
for item in precision_list:
    sum += item
precision = sum/len(precision_list)
precision

0.70708463223999285

In [18]:
recall_list = cross_val_score(classifier, test_arrays, test_labels, cv=10, scoring='recall')
recall_list

array([ 0.79310345,  0.79310345,  0.82758621,  0.75862069,  0.72413793,
        0.75862069,  0.79310345,  0.62068966,  0.85714286,  0.71428571])

In [19]:
sum_recall = 0
for item in recall_list:
    sum_recall += item
recall = sum_recall/len(recall_list)
recall

0.76403940886699506

In [20]:
f1 = 2*(precision * recall)/(precision + recall)
f1

0.73445951441193469