In [1]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open

# Directory the archive unpacks into, and the archive's file name.
dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
# Pin the process locale to the portable "C" locale.
locale.setlocale(locale.LC_ALL, 'C')

# U+0085 (NEXT LINE) is replaced by a space when the reviews are read:
# str.splitlines() treats it as a line boundary, and the merged corpus is
# later split into lines, one review per line.
# Compare the numeric major version; comparing the version *string* is a
# lexicographic comparison and only works by accident.
if sys.version_info[0] >= 3:
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]  # noqa: F821 -- only reached on Python 2

# Convert text to lower-case and pad punctuation marks with spaces so they become separate tokens
def normalize_text(text):
    """Lower-case ``text``, turn ``<br />`` tags into spaces and pad punctuation.

    Each of ``. " , ( ) ! ? ; :`` gets a space added on both sides, so a
    later whitespace split yields the mark as a token of its own. No
    punctuation is removed.
    """
    padded = text.lower().replace('<br />', ' ')
    # Same marks, in the same order, as a single string of characters.
    for mark in '.",()!?;:':
        padded = padded.replace(mark, ' ' + mark + ' ')
    return padded

import time

# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement for interval timing.
start = time.perf_counter()

# Build the cache-marker path from `dirname` so it cannot drift from the
# paths used for writing below.
alldata_path = os.path.join(dirname, 'alldata-id.txt')

if not os.path.isfile(alldata_path):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url, stream=True)
            # Fail loudly instead of saving an HTTP error page as the archive.
            r.raise_for_status()
            # Write to a temporary name first: an interrupted download must
            # not leave a truncated file that the isfile() check above would
            # accept on the next run.
            partial = filename + '.part'
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
            os.rename(partial, filename)
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(filename, mode='r') as tar:
            tar.extractall()

    # Concatenate and normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    normalized_parts = []
    for fol in folders:
        output = fol.replace('/', '-') + '.txt'
        # glob order is filesystem-dependent; sort so the line number given
        # to each review is reproducible across runs and machines.
        txt_files = sorted(glob.glob(os.path.join(dirname, fol, '*.txt')))
        reviews = []
        for txt in txt_files:
            with smart_open.smart_open(txt, "rb") as t:
                t_clean = t.read().decode("utf-8")
            for c in control_chars:
                t_clean = t_clean.replace(c, ' ')
            reviews.append(t_clean)
        # One review per line; join once instead of growing a string with +=.
        temp_norm = normalize_text(u''.join(review + u"\n" for review in reviews))
        with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
            n.write(temp_norm.encode("utf-8"))
        normalized_parts.append(temp_norm)
    alldata = u''.join(normalized_parts)

    # Prefix every line with a "_*<line index>" marker token.
    with smart_open.smart_open(alldata_path, 'wb') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

end = time.perf_counter()
print ("Total running time: ", end-start)


Total running time:  0.0004879999999998219


In [2]:
import os.path
# Sanity check: stop here if the merged corpus file is missing.
assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"

In [3]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

In [4]:

# Record for one review: its tokens, its tag list, the split it belongs to
# and its sentiment label (None when unlabeled).
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # Will hold all docs in original order
with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        # First token is the "_*<idx>" marker; the remaining tokens are the words.
        tokens = gensim.utils.to_unicode(line).split()
        alldocs.append(SentimentDocument(
            words=tokens[1:],
            # 'tags=[tokens[0]]' would also work at extra memory cost
            tags=[line_no],
            # Blocks of 25k lines: train, test, then 50k unlabeled 'extra'
            split=['train', 'test', 'extra', 'extra'][line_no // 25000],
            # Blocks of 12.5k lines: [pos, neg] for train and for test, then unknown
            sentiment=[1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no // 12500],
        ))


In [5]:
# Labeled subsets used for fitting and scoring the sentiment classifier.
train_docs = [d for d in alldocs if d.split == 'train']
test_docs = [d for d in alldocs if d.split == 'test']
# Shallow copy of the full corpus, so it can be reshuffled per pass
# without disturbing the order of alldocs.
doc_list = list(alldocs)

doc_counts = (len(doc_list), len(train_docs), len(test_docs))
print('%d docs: %d train-sentiment, %d test-sentiment' % doc_counts)

100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [6]:

#Set-up Doc2Vec Training & Evaluation Models


In [7]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

In [8]:
# CPU count; passed as `workers=` to every Doc2Vec model created below.
cores = multiprocessing.cpu_count()
# NOTE(review): presumably FAST_VERSION == -1 means gensim's optimized training
# routines are unavailable -- confirm against the gensim documentation.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [9]:
# The three Doc2Vec configurations compared in this notebook. All are built with
# size=100, negative=5, hs=0, min_count=2 and workers=cores.
# Order matters: later cells index this list (0 = PV-DM concat, 1 = PV-DBOW,
# 2 = PV-DM average).
simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

In [10]:
# Speed up setup by sharing results of the 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM w/ concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    # Initialize from the template model instead of scanning alldocs again.
    model.reset_from(simple_models[0])
    print(model)

# Ordered name -> model mapping; each key is the model's str() form, as printed above.
models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [11]:
# Le and Mikolov note that combining a paragraph vector from Distributed Bag of Words
# (DBOW) with one from Distributed Memory (DM) improves performance. We follow their
# approach and pair the models for evaluation by concatenating the paragraph vectors
# obtained from each model.

In [12]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# Register two paired models for evaluation:
# PV-DBOW (simple_models[1]) with PV-DM average (simple_models[2]) ...
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
# ... and PV-DBOW (simple_models[1]) with PV-DM concatenation (simple_models[0]).
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

OrderedDict([('Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x1aeb7c518>),
             ('Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x1aeb7c5c0>),
             ('Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x1aeb7c668>),
             ('dbow+dmm',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x11d67b358>),
             ('dbow+dmc',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x11d67b320>)])

In [13]:

# Predictive Evaluation Methods

# Let's define some helper methods for evaluating the performance of our Doc2Vec
# models using their paragraph vectors. We classify document sentiment with a
# logistic regression model fitted on the paragraph embeddings, and compare the
# error rates obtained with the embeddings from our various Doc2Vec models.


In [14]:


import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 


  from pandas.core import datetools


In [15]:
@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports elapsed seconds.

    Inside the ``with`` block the callable returns the time elapsed so far.
    Once the block has exited -- normally or through an exception -- it keeps
    returning the block's total duration.
    """
    start = default_timer()
    elapser = lambda: default_timer() - start
    try:
        # The yielded lambda looks `elapser` up on every call, so the
        # rebinding in the `finally` clause freezes the reported value.
        yield lambda: elapser()
    finally:
        # Runs even when the with-body raises; without try/finally the
        # timer would keep running after a failed block.
        end = default_timer()
        elapser = lambda: end - start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodels Logit model, print its summary and return the fit result.

    ``train_targets`` and ``train_regressors`` are passed to ``sm.Logit``
    unchanged; the fit is run with ``disp=0``.
    """
    fitted = sm.Logit(train_targets, train_regressors).fit(disp=0)
    print(fitted.summary())
    return fitted

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_set sentiments, using supplied model and train_set.

    A logistic regression is fitted on the model's stored vectors for
    ``train_set`` and then used to predict the sentiment of the test documents.

    Parameters:
        test_model: model exposing ``docvecs[tag]`` and ``infer_vector(...)``.
        train_set: documents (with ``sentiment`` and ``tags``) used for fitting.
        test_set: documents (with ``sentiment``, ``tags``, ``words``) to score.
        infer: if true, re-infer test vectors from ``doc.words`` instead of
            looking them up in ``test_model.docvecs``.
        infer_steps, infer_alpha: forwarded to ``infer_vector`` as ``steps``
            and ``alpha``.
        infer_subsample: when inferring and this is below 1.0, only a random
            sample of that fraction of ``test_set`` is scored.

    Returns:
        ``(error_rate, errors, number_of_predictions, predictor)``.
    """

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        # Use the documents that were passed in. The previous code read the
        # notebook-global `test_docs` here, silently ignoring `test_set`.
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)

    # Predict & evaluate: round each predicted probability to 0/1 and compare
    # it with the document's label.
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [16]:

# Bulk Training

# We use an explicit multiple-pass, alpha-reduction approach as sketched in 
# this gensim doc2vec blog post with added shuffling of corpus on each pass.
# https://rare-technologies.com/doc2vec-tutorial/

In [17]:
from collections import defaultdict
# Best (lowest) error rate recorded per model name; names not seen yet default to 1.0.
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [18]:
from random import shuffle
import datetime

# Manual training schedule: `passes` passes over the corpus, lowering the
# learning rate by a fixed step after each pass.
alpha, min_alpha, passes = (0.025, 0.001, 10)
# NOTE(review): with this step the final pass runs at
# alpha - (passes - 1) * alpha_delta, i.e. one step above min_alpha.
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        # Both attributes get the same value -- presumably so the learning rate
        # stays fixed within this single pass (confirm against gensim docs).
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        # '*' marks a result at least as good as the best seen so far for this model.
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        # On the first pass and on every 5th pass, also evaluate with inferred test vectors.
        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            # NOTE(review): strict '<' here versus '<=' above -- ties are marked
            # as best only for the non-inferred results.
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            # `duration` is still the training time measured above.
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2017-11-29 01:12:56.100947
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                 0.04381
Time:                        01:13:41   Log-Likelihood:                -16569.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                4.597e-252
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0331      0.013     -2.501      0.012      -0.059      -0.007
x1            -5.1907      3.646     -1.424      0.155     -12.337       1.955
x2             6.97

*0.416800 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 44.5s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                 0.04381
Time:                        01:13:43   Log-Likelihood:                -16569.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                4.597e-252
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0331      0.013     -2.501      0.012      -0.059      -0.007
x1            -5.1907      3.646     -1.424      0.155    

*0.347600 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 44.5s 9.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2963
Time:                        01:14:07   Log-Likelihood:                -12194.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0719      0.137     -0.524      0.600      -0.341       0.197
x1            -3.9436      3.200     -1.232      

*0.247320 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.4s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2963
Time:                        01:14:08   Log-Likelihood:                -12194.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0719      0.137     -0.524      0.600      -0.341       0.197
x1            -3.9436      3.200     -1.232      0.218     -1

*0.206800 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 15.4s 3.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2412
Time:                        01:14:35   Log-Likelihood:                -13149.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0942      0.029     -3.302      0.001      -0.150      -0.038
x1            -0.8243      0.305     -2.707      0.0

*0.264080 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.5s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2412
Time:                        01:14:36   Log-Likelihood:                -13149.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0942      0.029     -3.302      0.001      -0.150      -0.038
x1            -0.8243      0.305     -2.707      0.007   

*0.209600 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 22.5s 4.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3536
Time:                        01:14:41   Log-Likelihood:                -11201.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0035      0.146     -0.024      0.981      -0.290       0.283
x1            -3.9408      3.436     -1.147     

*0.219080 : 1 passes : dbow+dmm 0.0s 2.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3536
Time:                        01:14:44   Log-Likelihood:                -11201.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0035      0.146     -0.024      0.981      -0.290       0.283
x1            -3.9408      3.436     -1.147      0.251     -10.675       2.793
x2        

*0.176000 : 1 passes : dbow+dmm_inferred 0.0s 8.6s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3034
Time:                        01:14:52   Log-Likelihood:                -12071.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0201      0.139     -0.145      0.885      -0.292       0.251
x1            -5.9038      4.903     -1.204      0.229     -15.513       3.705
x

*0.244920 : 1 passes : dbow+dmc 0.0s 2.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3034
Time:                        01:14:55   Log-Likelihood:                -12071.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0201      0.139     -0.145      0.885      -0.292       0.251
x1            -5.9038      4.903     -1.204      0.229     -15.513       3.705
x2        

*0.213200 : 1 passes : dbow+dmc_inferred 0.0s 12.9s
Completed pass 1 at alpha 0.025000
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.1080
Time:                        01:15:50   Log-Likelihood:                -15457.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0428      0.014     -3.095      0.002      -0.070      -0.016
x1            -3.7919      2.291     -1.655 

*0.354800 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 43.3s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5051
Time:                        01:16:07   Log-Likelihood:                -8575.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5331      0.277     -5.543      0.000      -2.075      -0.991
x1            -3.7674      1.046     -3.600      0.000    

*0.149160 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.3s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3330
Time:                        01:16:31   Log-Likelihood:                -11559.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0305      0.050      0.607      0.544      -0.068       0.129
x1            -0.7071      0.192     -3.691      0.000      -

*0.211440 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.7s 1.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5276
Time:                        01:16:33   Log-Likelihood:                -8185.9
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5936      0.315     -5.062      0.000      -2.211      -0.977
x1            -3.9364      1.131     -3.481      0.000   

*0.142680 : 2 passes : dbow+dmm 0.0s 2.1s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5098
Time:                        01:16:35   Log-Likelihood:                -8495.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5233      0.280     -5.442      0.000      -2.072      -0.975
x1            -5.4780      1.130     -4.848      0.000      -7.693      -3.264
x2        

*0.147200 : 2 passes : dbow+dmc 0.0s 2.7s
Completed pass 2 at alpha 0.022600
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.1608
Time:                        01:17:19   Log-Likelihood:                -14543.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0431      0.014     -2.999      0.003      -0.071      -0.015
x1            -2.0286      1.121     -1.810      0.070

*0.317120 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 42.4s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5607
Time:                        01:17:36   Log-Likelihood:                -7613.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.3637      0.334     -7.070      0.000      -3.019      -1.708
x1            -2.3339      0.318     -7.341      0.000    

*0.128080 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.1s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3753
Time:                        01:18:01   Log-Likelihood:                -10825.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0940      0.069      1.367      0.172      -0.041       0.229
x1            -0.6042      0.145     -4.162      0.000      -

*0.191160 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 24.2s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5763
Time:                        01:18:03   Log-Likelihood:                -7341.4
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1699      0.367     -5.911      0.000      -2.889      -1.450
x1            -2.2048      0.348     -6.337      0.000   

*0.125680 : 3 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5657
Time:                        01:18:05   Log-Likelihood:                -7525.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.3896      0.341     -7.016      0.000      -3.057      -1.722
x1            -2.5021      0.347     -7.220      0.000      -3.181      -1.823
x2        

*0.127600 : 3 passes : dbow+dmc 0.0s 2.2s
Completed pass 3 at alpha 0.020200
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2044
Time:                        01:18:50   Log-Likelihood:                -13787.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0353      0.015     -2.373      0.018      -0.064      -0.006
x1            -1.2703      0.522     -2.435      0.015

*0.289240 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 42.8s 1.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5897
Time:                        01:19:09   Log-Likelihood:                -7109.9
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4678      0.350     -7.052      0.000      -3.154      -1.782
x1            -2.1697      0.192    -11.283      0.000    

*0.117320 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 17.1s 1.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4004
Time:                        01:19:33   Log-Likelihood:                -10390.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0273      0.083      0.327      0.744      -0.136       0.191
x1            -0.5680      0.120     -4.738      0.000      -

*0.183760 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 23.6s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6012
Time:                        01:19:35   Log-Likelihood:                -6910.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1993      0.373     -5.904      0.000      -2.929      -1.469
x1            -2.0912      0.215     -9.745      0.000   

*0.115880 : 4 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.5954
Time:                        01:19:38   Log-Likelihood:                -7010.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4219      0.358     -6.765      0.000      -3.124      -1.720
x1            -2.2459      0.214    -10.498      0.000      -2.665      -1.827
x2        

*0.117040 : 4 passes : dbow+dmc 0.0s 2.2s
Completed pass 4 at alpha 0.017800
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2414
Time:                        01:20:21   Log-Likelihood:                -13146.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0351      0.015     -2.291      0.022      -0.065      -0.005
x1            -0.9399      0.289     -3.255      0.001

*0.269440 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 41.6s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2414
Time:                        01:20:22   Log-Likelihood:                -13146.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0351      0.015     -2.291      0.022      -0.065      -0.005
x1            -0.9399      0.289     -3.255      0.001    

*0.288000 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 41.6s 8.4s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6041
Time:                        01:20:46   Log-Likelihood:                -6861.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2424      0.361     -6.216      0.000      -2.949      -1.535
x1            -1.9405      0.157    -12.394      

*0.114920 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.2s 1.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6041
Time:                        01:20:47   Log-Likelihood:                -6861.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2424      0.361     -6.216      0.000      -2.949      -1.535
x1            -1.9405      0.157    -12.394      0.000      -

*0.130000 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 15.2s 3.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4151
Time:                        01:21:15   Log-Likelihood:                -10136.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0240      0.095      0.251      0.802      -0.163       0.211
x1            -0.4768      0.105     -4.534      0.0

*0.177680 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 23.5s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4151
Time:                        01:21:16   Log-Likelihood:                -10136.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0240      0.095      0.251      0.802      -0.163       0.211
x1            -0.4768      0.105     -4.534      0.000   

*0.202800 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 23.5s 4.6s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6144
Time:                        01:21:21   Log-Likelihood:                -6682.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9981      0.378     -5.291      0.000      -2.738      -1.258
x1            -1.9101      0.176    -10.859     

*0.112520 : 5 passes : dbow+dmm 0.0s 2.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6144
Time:                        01:21:23   Log-Likelihood:                -6682.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9981      0.378     -5.291      0.000      -2.738      -1.258
x1            -1.9101      0.176    -10.859      0.000      -2.255      -1.565
x2        

*0.122800 : 5 passes : dbow+dmm_inferred 0.0s 7.9s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6105
Time:                        01:21:32   Log-Likelihood:                -6748.8
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2080      0.371     -5.944      0.000      -2.936      -1.480
x1            -1.9751      0.178    -11.105      0.000      -2.324      -1.626
x

*0.114240 : 5 passes : dbow+dmc 0.0s 2.6s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6105
Time:                        01:21:34   Log-Likelihood:                -6748.8
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2080      0.371     -5.944      0.000      -2.936      -1.480
x1            -1.9751      0.178    -11.105      0.000      -2.324      -1.626
x2        

*0.128000 : 5 passes : dbow+dmc_inferred 0.0s 12.4s
Completed pass 5 at alpha 0.015400
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2704
Time:                        01:22:27   Log-Likelihood:                -12643.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0325      0.016     -2.073      0.038      -0.063      -0.002
x1            -0.9708      0.200     -4.864 

*0.251760 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 40.9s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6143
Time:                        01:22:43   Log-Likelihood:                -6684.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1593      0.368     -5.873      0.000      -2.880      -1.439
x1            -1.8993      0.141    -13.478      0.000    

*0.112160 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.2s 1.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4252
Time:                        01:23:07   Log-Likelihood:                -9961.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0963      0.105     -0.921      0.357      -0.301       0.109
x1            -0.4707      0.095     -4.934      0.000      -

*0.173360 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.4s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6233
Time:                        01:23:09   Log-Likelihood:                -6528.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.8504      0.381     -4.856      0.000      -2.597      -1.104
x1            -1.8499      0.159    -11.648      0.000   

*0.110400 : 6 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6208
Time:                        01:23:11   Log-Likelihood:                -6571.2
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0599      0.379     -5.431      0.000      -2.803      -1.317
x1            -1.9512      0.162    -12.053      0.000      -2.269      -1.634
x2        

*0.111680 : 6 passes : dbow+dmc 0.0s 2.3s
Completed pass 6 at alpha 0.013000
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.2927
Time:                        01:23:53   Log-Likelihood:                -12257.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0346      0.016     -2.162      0.031      -0.066      -0.003
x1            -0.8406      0.165     -5.087      0.000

*0.242040 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 40.2s 1.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6191
Time:                        01:24:10   Log-Likelihood:                -6600.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1833      0.372     -5.876      0.000      -2.912      -1.455
x1            -1.8718      0.132    -14.168      0.000    

*0.109640 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.2s 1.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4331
Time:                        01:24:34   Log-Likelihood:                -9824.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0982      0.111     -0.881      0.378      -0.317       0.120
x1            -0.4364      0.089     -4.895      0.000      -

*0.170000 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 23.0s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6277
Time:                        01:24:36   Log-Likelihood:                -6452.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0234      0.385     -5.255      0.000      -2.778      -1.269
x1            -1.8387      0.150    -12.287      0.000   

*0.108640 : 7 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6255
Time:                        01:24:38   Log-Likelihood:                -6490.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0865      0.385     -5.417      0.000      -2.841      -1.332
x1            -1.9900      0.153    -13.045      0.000      -2.289      -1.691
x2        

*0.109040 : 7 passes : dbow+dmc 0.0s 2.2s
Completed pass 7 at alpha 0.010600
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3106
Time:                        01:25:21   Log-Likelihood:                -11946.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0350      0.016     -2.151      0.031      -0.067      -0.003
x1            -0.9039      0.151     -6.004      0.000

*0.234320 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 41.3s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6229
Time:                        01:25:38   Log-Likelihood:                -6535.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1533      0.376     -5.721      0.000      -2.891      -1.416
x1            -1.8630      0.127    -14.642      0.000    

*0.108640 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.0s 1.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4379
Time:                        01:26:02   Log-Likelihood:                -9740.2
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1517      0.117     -1.297      0.195      -0.381       0.078
x1            -0.4138      0.085     -4.881      0.000      -

*0.166280 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.5s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6311
Time:                        01:26:04   Log-Likelihood:                -6391.8
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9748      0.389     -5.081      0.000      -2.737      -1.213
x1            -1.8434      0.145    -12.743      0.000   

*0.107040 : 8 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6291
Time:                        01:26:06   Log-Likelihood:                -6427.9
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0671      0.392     -5.276      0.000      -2.835      -1.299
x1            -1.9676      0.148    -13.282      0.000      -2.258      -1.677
x2        

*0.108040 : 8 passes : dbow+dmc 0.0s 2.2s
Completed pass 8 at alpha 0.008200
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3221
Time:                        01:26:48   Log-Likelihood:                -11748.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0348      0.016     -2.119      0.034      -0.067      -0.003
x1            -0.9476      0.144     -6.590      0.000

*0.226840 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 40.2s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6242
Time:                        01:27:03   Log-Likelihood:                -6511.8
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1240      0.379     -5.604      0.000      -2.867      -1.381
x1            -1.8258      0.124    -14.712      0.000    

*0.107160 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 14.6s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4412
Time:                        01:27:27   Log-Likelihood:                -9683.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1689      0.121     -1.396      0.163      -0.406       0.068
x1            -0.3972      0.082     -4.831      0.000      -

*0.165800 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.7s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6322
Time:                        01:27:30   Log-Likelihood:                -6373.2
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9893      0.392     -5.071      0.000      -2.758      -1.220
x1            -1.8160      0.141    -12.843      0.000   

*0.106840 : 9 passes : dbow+dmm 0.0s 2.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6302
Time:                        01:27:32   Log-Likelihood:                -6409.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0523      0.396     -5.184      0.000      -2.828      -1.276
x1            -1.9645      0.146    -13.496      0.000      -2.250      -1.679
x2        

*0.107400 : 9 passes : dbow+dmc 0.0s 2.2s
Completed pass 9 at alpha 0.005800
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3290
Time:                        01:28:14   Log-Likelihood:                -11628.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0326      0.017     -1.973      0.049      -0.065      -0.000
x1            -0.9564      0.141     -6.781      0.000

*0.223760 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 40.4s 1.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.3290
Time:                        01:28:15   Log-Likelihood:                -11628.
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0326      0.017     -1.973      0.049      -0.065      -0.000
x1            -0.9564      0.141     -6.781      0.000   

*0.242400 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 40.4s 8.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6253
Time:                        01:28:39   Log-Likelihood:                -6493.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1533      0.381     -5.656      0.000      -2.900      -1.407
x1            -1.8303      0.123    -14.887     

 0.107320 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 15.3s 1.3s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6253
Time:                        01:28:40   Log-Likelihood:                -6493.3
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1533      0.381     -5.656      0.000      -2.900      -1.407
x1            -1.8303      0.123    -14.887      0.000      

*0.114400 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 15.3s 3.8s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4429
Time:                        01:29:08   Log-Likelihood:                -9654.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1901      0.123     -1.541      0.123      -0.432       0.052
x1            -0.3903      0.081     -4.835      0.

*0.165560 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 22.9s 1.7s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24899
Method:                           MLE   Df Model:                          100
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.4429
Time:                        01:29:09   Log-Likelihood:                -9654.6
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1901      0.123     -1.541      0.123      -0.432       0.052
x1            -0.3903      0.081     -4.835      0.000  

*0.197600 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 22.9s 4.5s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6332
Time:                        01:29:14   Log-Likelihood:                -6357.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0169      0.395     -5.112      0.000      -2.790      -1.244
x1            -1.8222      0.140    -13.003    

*0.106280 : 10 passes : dbow+dmm 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6332
Time:                        01:29:16   Log-Likelihood:                -6357.0
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0169      0.395     -5.112      0.000      -2.790      -1.244
x1            -1.8222      0.140    -13.003      0.000      -2.097      -1.548
x2       

*0.120400 : 10 passes : dbow+dmm_inferred 0.0s 8.6s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6311
Time:                        01:29:25   Log-Likelihood:                -6392.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0795      0.399     -5.214      0.000      -2.861      -1.298
x1            -1.9797      0.145    -13.696      0.000      -2.263      -1.696


 0.107560 : 10 passes : dbow+dmc 0.0s 2.2s
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                25000
Model:                          Logit   Df Residuals:                    24799
Method:                           MLE   Df Model:                          200
Date:                Wed, 29 Nov 2017   Pseudo R-squ.:                  0.6311
Time:                        01:29:27   Log-Likelihood:                -6392.1
converged:                       True   LL-Null:                       -17329.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0795      0.399     -5.214      0.000      -2.861      -1.298
x1            -1.9797      0.145    -13.696      0.000      -2.263      -1.696
x2       

*0.105200 : 10 passes : dbow+dmc_inferred 0.0s 12.5s
Completed pass 10 at alpha 0.003400
END 2017-11-29 01:29:38.487366


In [19]:


# Summarise the best (lowest) error rate recorded per model, lowest first.
# Ties on the rate are broken by model name, matching a plain (rate, name) tuple sort.
print("Err rate Model")
ranked_errors = sorted(best_error.items(), key=lambda item: (item[1], item[0]))
for name, rate in ranked_errors:
    print("%f %s" % (rate, name))



Err rate Model
0.105200 dbow+dmc_inferred
0.106280 dbow+dmm
0.107160 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
0.107400 dbow+dmc
0.114400 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred
0.120400 dbow+dmm_inferred
0.165560 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)
0.197600 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred
0.223760 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
0.242400 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred


In [22]:
# Pick random doc; re-run cell for more examples
doc_count = simple_models[0].docvecs.count
doc_id = np.random.randint(doc_count)
print('for doc %d...' % doc_id)
for model in simple_models:
    # Infer a vector from the chosen document's words with this model, then
    # look up the 3 stored doc vectors most similar to that inferred vector.
    doc_words = alldocs[doc_id].words
    inferred_docvec = model.infer_vector(doc_words)
    neighbours = model.docvecs.most_similar([inferred_docvec], topn=3)
    print('%s:\n %s' % (model, neighbours))

for doc 24184...
Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8):
 [(24184, 0.4600438177585602), (20233, 0.38386455178260803), (68539, 0.3828004002571106)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8):
 [(24184, 0.8946358561515808), (16515, 0.6097397804260254), (64674, 0.6003652811050415)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):
 [(24184, 0.804354190826416), (24552, 0.7427728176116943), (33374, 0.7336629629135132)]


In [24]:


import random

# Pick a random doc and a random model; re-run the cell for more examples.
doc_count = simple_models[0].docvecs.count
doc_id = np.random.randint(doc_count)
model = random.choice(simple_models)

# Request as many neighbours as the model has doc vectors, i.e. *all* similar documents.
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)

target_text = ' '.join(alldocs[doc_id].words)
print(u'TARGET (%d): «%s»\n' % (doc_id, target_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Show the first, middle and last entries of the similarity list.
labels = ('MOST', 'MEDIAN', 'LEAST')
indices = (0, len(sims)//2, len(sims) - 1)
for label, index in zip(labels, indices):
    sim_entry = sims[index]
    sim_text = ' '.join(alldocs[sim_entry[0]].words)
    print(u'%s %s: «%s»\n' % (label, sim_entry, sim_text))



TARGET (99937): «as a classic storytale it has it all . a great plot , characters you fall in love with , evil bad guys etc but the film has aged well as a drama also , i reckon now way later . . . the cast is excellent . top quality actors all over the place . and olivia hussey in this role , breaks my heart every time . i'd say this is one of those classic films that you can watch as a kid , with your parents and their parents and you will all enjoy it ( for the same or different reasons ) it is a crying shame if it's not available on dvd . my vhs copy is falling apart , but i'm lucky since swedish national television airs ivanhoe every christmas . . . .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):

MOST (7286, 0.8346071243286133): «i´m from germany and i love the mvovies . i go 200 times a year . tonight i saw " pecker " , it was a wonderful evening . thank you , mr . waters . everybody who has a chance to see the movie , go ! ! !»

MEDIAN (73675, 0.5

In [25]:
# Pick a random doc and a random model; re-run the cell for more examples.
doc_count = simple_models[0].docvecs.count
doc_id = np.random.randint(doc_count)
model = random.choice(simple_models)

# Request as many neighbours as the model has doc vectors, i.e. *all* similar documents.
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)

target_text = ' '.join(alldocs[doc_id].words)
print(u'TARGET (%d): «%s»\n' % (doc_id, target_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Show the first, middle and last entries of the similarity list.
labels = ('MOST', 'MEDIAN', 'LEAST')
indices = (0, len(sims)//2, len(sims) - 1)
for label, index in zip(labels, indices):
    sim_entry = sims[index]
    sim_text = ' '.join(alldocs[sim_entry[0]].words)
    print(u'%s %s: «%s»\n' % (label, sim_entry, sim_text))


TARGET (71831): «went to the local film festival earlier tonight , or at least the kick-off for the local film festival . the venue couldn't have been worse , but it did aid in the general essence of the movie , hide and creep . the show was at sloss furnace in a concrete pit normally used for rock concerts and had small folding chairs for seating with a plethora of smoking and drinking . so what i am getting at is , it was a fun and unique experience ! now , as a preface i want to tell you that i am a huge fan of the horror genre . i like it all , shock-horror ( naked blood ) , gore-horror ( dead alive ) , classic-horror ( night of the living dead ) , sci-fi-horror ( event horizon ) , suspense-horror ( in the mouth of madness ) , standard-horror ( hell raiser ) , modern-horror ( ginger snaps ) and many others that i could go on listing forever . i like these movies for many different reasons and that is a whole other topic altogether , perhaps in the forums . . . this movie starts out

In [26]:
# Pick a random doc and a random model; re-run the cell for more examples.
doc_count = simple_models[0].docvecs.count
doc_id = np.random.randint(doc_count)
model = random.choice(simple_models)

# Request as many neighbours as the model has doc vectors, i.e. *all* similar documents.
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)

target_text = ' '.join(alldocs[doc_id].words)
print(u'TARGET (%d): «%s»\n' % (doc_id, target_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Show the first, middle and last entries of the similarity list.
labels = ('MOST', 'MEDIAN', 'LEAST')
indices = (0, len(sims)//2, len(sims) - 1)
for label, index in zip(labels, indices):
    sim_entry = sims[index]
    sim_text = ' '.join(alldocs[sim_entry[0]].words)
    print(u'%s %s: «%s»\n' % (label, sim_entry, sim_text))


TARGET (60292): «across the bridge has one of those titles that makes it sound like an arthur miller play but is actually based on one of graham greene's guilt-racked stories . and it's a corker , with a great premise that reminds you that before he moved on to guilt , infidelity and catholicism , greene wrote cracking pulp thrillers like a gun for sale . rod steiger is powerful and shady financier carl schaffner , on the run from the british police in america and trying to cross the border into mexico before he can be extradited . so he does what any one of us would do - kills another person who looks vaguely similar to steal his mexican passport and travel unhindered on that . naturally , things go wrong . he finds himself saddled with the dead man's dog . the dead man turns out to be a killer wanted by the mexican police . and the dead man turns out not to be dead . and that's not the least of it , as the unexpected plot twists mount while schaffner starts to look like the least cor