In [2]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# shuffle
from random import shuffle

# logging
import logging
import os.path
import sys
import _pickle as pickle
#import cPickle as pickle   #Note: in python3, _pickle was used instead of cpickle

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

class LabeledLineSentence(object):

    def __init__(self, sources):
        self.sources = sources

        flipped = {}

        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

for epoch in range(50):
    logger.info('Epoch %d' % epoch)
    model.train(sentences.sentences_perm(),
                total_examples=model.corpus_count,
                epochs=model.iter,
    )

model.save('./imdb.d2v')


2017-12-13 13:24:57,241 : INFO : running c:\anaconda2\envs\tensorflow\lib\site-packages\ipykernel_launcher.py -f C:\Users\tianx\AppData\Roaming\jupyter\runtime\kernel-d2a6e91b-b10c-4dea-9b42-8b9f3076776b.json
2017-12-13 13:24:57,588 : INFO : collecting all words and their counts
2017-12-13 13:24:57,590 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-12-13 13:24:57,637 : INFO : collected 2712 word types and 2497 unique tags from a corpus of 2497 examples and 14434 words
2017-12-13 13:24:57,639 : INFO : Loading a fresh vocabulary
2017-12-13 13:24:57,661 : INFO : min_count=1 retains 2712 unique words (100% of original 2712, drops 0)
2017-12-13 13:24:57,663 : INFO : min_count=1 leaves 14434 word corpus (100% of original 14434, drops 0)
2017-12-13 13:24:57,703 : INFO : deleting the raw counts dictionary of 2712 items
2017-12-13 13:24:57,706 : INFO : sample=0.0001 downsamples 755 most-common words
2017-12-13 13:24:57,714 : INFO : downsampling leaves estim

2017-12-13 13:25:05,191 : INFO : Epoch 6
2017-12-13 13:25:05,205 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:06,363 : INFO : PROGRESS: at 13.84% examples, 6217 words/s, in_qsize 7, out_qsize 0
2017-12-13 13:25:06,377 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:06,387 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:06,408 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:06,418 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:06,435 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:06,445 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:06,468 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:06,470 : INFO : training on 72170 raw words (47174 ef

2017-12-13 13:25:13,426 : INFO : Epoch 13
2017-12-13 13:25:13,436 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:14,451 : INFO : PROGRESS: at 27.51% examples, 13086 words/s, in_qsize 6, out_qsize 1
2017-12-13 13:25:14,458 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:14,504 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:14,521 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:14,541 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:14,553 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:14,557 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:14,580 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:14,584 : INFO : training on 72170 raw words (47388 

2017-12-13 13:25:22,072 : INFO : Epoch 20
2017-12-13 13:25:22,085 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:23,215 : INFO : PROGRESS: at 13.69% examples, 6094 words/s, in_qsize 7, out_qsize 0
2017-12-13 13:25:23,233 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:23,270 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:23,279 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:23,295 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:23,301 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:23,341 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:23,347 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:23,356 : INFO : training on 72170 raw words (47379 e

2017-12-13 13:25:30,840 : INFO : Epoch 27
2017-12-13 13:25:30,851 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:31,873 : INFO : PROGRESS: at 13.94% examples, 6528 words/s, in_qsize 7, out_qsize 0
2017-12-13 13:25:31,887 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:31,895 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:31,900 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:31,911 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:31,918 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:31,926 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:31,951 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:31,953 : INFO : training on 72170 raw words (47457 e

2017-12-13 13:25:39,646 : INFO : Epoch 34
2017-12-13 13:25:39,655 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:40,923 : INFO : PROGRESS: at 13.78% examples, 5758 words/s, in_qsize 7, out_qsize 0
2017-12-13 13:25:40,974 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:40,978 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:41,031 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:41,069 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:41,073 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:41,089 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:41,100 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:41,109 : INFO : training on 72170 raw words (47349 e

2017-12-13 13:25:50,380 : INFO : Epoch 41
2017-12-13 13:25:50,391 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:51,559 : INFO : PROGRESS: at 27.75% examples, 11887 words/s, in_qsize 6, out_qsize 1
2017-12-13 13:25:51,562 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:51,597 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:51,629 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:51,637 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:51,648 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:51,657 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:51,677 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:51,680 : INFO : training on 72170 raw words (47571 

2017-12-13 13:25:58,697 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-12-13 13:25:58,701 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-12-13 13:25:58,710 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-13 13:25:58,718 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-13 13:25:58,723 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-13 13:25:58,746 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-13 13:25:58,749 : INFO : training on 72170 raw words (47278 effective words) took 0.9s, 51648 effective words/s
2017-12-13 13:25:58,756 : INFO : Epoch 49
2017-12-13 13:25:58,764 : INFO : training model with 7 workers on 2712 vocabulary and 100 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-12-13 13:25:59,605 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-12-13 13:25:59,621 : INFO : worker thread fin

In [13]:
model.most_similar('good')

[('qihu', 0.7555435299873352),
 ('load', 0.7273313999176025),
 ('automakers', 0.7238010168075562),
 ('spot', 0.7107664346694946),
 ('domestic', 0.7089092135429382),
 ('loaded', 0.6962469816207886),
 ('terribly', 0.6833244562149048),
 ('biotechs', 0.6804578304290771),
 ('some', 0.6749091148376465),
 ('note', 0.6718221306800842)]

In [9]:
model.most_similar('mispriced')

[('totally', 0.9983320236206055),
 ('absolute', 0.995948851108551),
 ('pure', 0.9928412437438965),
 ('garbage', 0.9846278429031372),
 ('stores', 0.9469236731529236),
 ('empty', 0.9068784713745117),
 ('merchant', 0.8186815977096558),
 ('ross', 0.7849423289299011),
 ('usio', 0.7547214031219482),
 ('rbc', 0.7533202767372131)]

In [10]:
model.most_similar('downtrend')

[('retest', 0.8763420581817627),
 ('helping', 0.8424099683761597),
 ('itting', 0.8358184099197388),
 ('broken', 0.771192729473114),
 ('line', 0.7534346580505371),
 ('average', 0.7364001274108887),
 ('resistance', 0.7189692854881287),
 ('trend', 0.7136515378952026),
 ('beautiful', 0.7084102630615234),
 ('bidders', 0.7053914070129395)]

In [11]:
model.most_similar('bullish')

[('awesomely', 0.8679783940315247),
 ('ief', 0.8635176420211792),
 ('tlt', 0.8232649564743042),
 ('crossovers', 0.8114673495292664),
 ('decreased', 0.797480583190918),
 ('abc', 0.7540429830551147),
 ('conversations', 0.7478561401367188),
 ('ewz', 0.7472898960113525),
 ('twitter', 0.729065477848053),
 ('dip', 0.7279219627380371)]

In [12]:
model.most_similar('increase')

[('steep', 0.8865960240364075),
 ('view', 0.8646053671836853),
 ('posts', 0.8605137467384338),
 ('rewards', 0.8602394461631775),
 ('patient', 0.8539063930511475),
 ('bright', 0.8084157705307007),
 ('wall', 0.8080934286117554),
 ('tops', 0.8000960350036621),
 ('percent', 0.7816009521484375),
 ('global', 0.7796220183372498)]

In [14]:
train_arrays = numpy.zeros((1700,100))
train_labels = numpy.zeros((1700))
for i in range(1119):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_labels[i] = 1
for i in range(581):
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i+1119] = model.docvecs[prefix_train_neg]
    train_labels[i+1119] = 0

In [15]:
print(train_arrays)

[[ 0.0177709  -0.22923303 -0.0342895  ...,  0.07055531  0.0153681
  -0.10305636]
 [ 0.06589964  0.17978436  0.21489561 ..., -0.29530606 -0.00678638
  -0.60204124]
 [ 0.00429939 -0.08192944  0.47957188 ..., -0.10725647  0.07766831
  -0.3334069 ]
 ..., 
 [-0.0291836   0.03448483  0.08596164 ..., -0.06617699  0.09165017
  -0.10532157]
 [-0.11976746  0.44390383  0.34118399 ..., -0.64246607 -0.22330751
  -0.11591343]
 [ 0.05145888  0.00953461  0.03342689 ..., -0.12771818 -0.00633792
  -0.06922003]]


In [16]:
model.docvecs['TRAIN_NEG_580']

array([ 0.05145888,  0.00953461,  0.03342689,  0.18303509,  0.12342305,
       -0.01732425, -0.01066432, -0.07760131,  0.1367396 , -0.09315624,
       -0.07442326, -0.13977994, -0.13990229,  0.09215616,  0.04037021,
       -0.03223286,  0.06288718, -0.07213554, -0.04514331,  0.00884892,
       -0.03508247,  0.06330335, -0.03663046, -0.02831447,  0.03311158,
       -0.06097967,  0.09175718, -0.00285468, -0.09703495, -0.06782337,
        0.00418406,  0.06110639,  0.23725958, -0.00133014, -0.00380763,
        0.00274535, -0.02215949, -0.06008369, -0.05410678, -0.04985861,
        0.22829747, -0.13805626, -0.10648111, -0.15145004,  0.06456409,
        0.01236629,  0.11522083, -0.15615945, -0.12437502,  0.01580986,
        0.07552686,  0.18161389, -0.02375856,  0.02106199, -0.00251829,
       -0.04400126,  0.09193997, -0.06060835,  0.0235026 , -0.04354072,
       -0.02132745, -0.05010912, -0.06865673,  0.15085252, -0.06799036,
       -0.05719998, -0.16104843, -0.18194994,  0.00066329,  0.10

In [17]:
print(train_labels[1118])

1.0


In [18]:
test_arrays = numpy.zeros((794,100))
test_labels = numpy.zeros((794))
for i in range(537):
    prefix_test_pos = 'TEST_POS_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_labels[i] = 1
for i in range(257):
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i+537] = model.docvecs[prefix_test_neg]
    test_labels[i+537] = 0

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
classifier.score(test_arrays, test_labels)

0.71410579345088165

In [22]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_arrays, train_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [72]:
import numpy as np
from sklearn.metrics import mean_squared_error
housing_predictions = tree_reg.predict(train_arrays)
tree_mse = mean_squared_error(train_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [25]:
classifier.score(test_arrays, test_labels)

0.71410579345088165

In [30]:
from sklearn.metrics import precision_score
precision_score(test_arrays, test_labels, average='weighted')

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [35]:
from sklearn.model_selection import cross_val_score
precision_list = cross_val_score(classifier, test_arrays, test_labels, cv=10, scoring='precision')
precision_list

array([ 0.72857143,  0.76470588,  0.7027027 ,  0.72881356,  0.72307692,
        0.72058824,  0.70967742,  0.81818182,  0.765625  ,  0.67164179])

In [44]:
sum = 0
for item in precision_list:
    sum += item
precision = sum/len(precision_list)
precision

0.73335847599015813

In [37]:
recall_list = cross_val_score(classifier, test_arrays, test_labels, cv=10, scoring='recall')
recall_list

array([ 0.94444444,  0.96296296,  0.96296296,  0.7962963 ,  0.87037037,
        0.90740741,  0.81481481,  0.8490566 ,  0.9245283 ,  0.8490566 ])

In [38]:
sum_recall = 0
for item in recall_list:
    sum_recall += item
recall = sum_recall/len(recall_list)
recall

0.88819007686932228

In [46]:
f1 = 2*(precision * recall)/(precision + recall)
f1

0.80338232242733598