In [1]:
!pip install testfixtures

Collecting testfixtures
[?25l  Downloading https://files.pythonhosted.org/packages/cc/38/6a885903ede5e7155665b7d2a3a6fa9416df3c90498c6d3d68103a972a17/testfixtures-6.10.0-py2.py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 989kB/s eta 0:00:01
[?25hInstalling collected packages: testfixtures
Successfully installed testfixtures-6.10.0


In [2]:
!pip install statsmodels



In [4]:
%time 

import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
from smart_open import smart_open
import re

# Directory the archive unpacks into, and the archive's file name.
dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
# Switch every locale category to the portable "C" locale.
locale.setlocale(locale.LC_ALL, 'C')
# Filled with one normalized review per entry while the dataset is cleaned up.
all_lines = []

# U+0085 (NEL, "next line") is replaced with a space in every review.
# A `u''` literal yields the same one-character text string on Python 2 and
# Python 3, so no interpreter-version check (or Python-2-only `unichr`) is needed.
control_chars = [u'\x85']
# Lower-case the text and isolate punctuation marks as separate, space-delimited tokens
def normalize_text(text):
    """Return `text` lower-cased, with `<br />` tags turned into spaces and
    each of . " , ( ) ! ? ; : surrounded by a space on both sides."""
    without_breaks = text.lower().replace('<br />', ' ')
    # The captured punctuation character is re-emitted with a space before and after it
    return re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", without_breaks)

# Build aclImdb/alldata-id.txt once; later runs skip straight to the final check.
if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            # Fail loudly on an HTTP error instead of saving an error page as the archive
            r.raise_for_status()
            # Plain binary write: the payload is already a gzip'd tarball, and an
            # extension-aware opener may transparently compress a '.gz' target again.
            with open(filename, 'wb') as f:
                f.write(r.content)
        # if error here, try `tar xfz aclImdb_v1.tar.gz` outside notebook, then re-run this cell
        # NOTE(review): extractall() trusts the member paths stored in the archive, and the
        # archive is fetched over plain HTTP -- only run this against a source you trust.
        with tarfile.open(filename, mode='r') as tar:
            tar.extractall()
    else:
        print("IMDB archive directory already available without download.")

    # Collect & normalize test/train data
    print("Cleaning up dataset...")
    # Folder order fixes the line order of alldata-id.txt, which the loading cell
    # relies on when it assigns split/sentiment by line number.
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    newline = "\n".encode("utf-8")
    for fol in folders:
        output = fol.replace('/', '-') + '.txt'
        # glob returns matches in arbitrary order; sorting keeps document ids stable across runs
        txt_files = sorted(glob.glob(os.path.join(dirname, fol, '*.txt')))
        print(" %s: %i files" % (fol, len(txt_files)))
        # One normalized review per line, per folder (e.g. aclImdb/train-pos.txt)
        with smart_open(os.path.join(dirname, output), "wb") as n:
            for txt in txt_files:
                with smart_open(txt, "rb") as t:
                    one_text = t.read().decode("utf-8")
                for c in control_chars:
                    one_text = one_text.replace(c, ' ')
                one_text = normalize_text(one_text)
                all_lines.append(one_text)
                n.write(one_text.encode("utf-8"))
                n.write(newline)

    # Save to disk for instant re-use on any future runs
    # Each line is prefixed with "_*<index> " so the document id travels with the text
    with smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
        for idx, line in enumerate(all_lines):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"
print("Success, alldata-id.txt is available for next steps.")

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.11 µs
IMDB archive directory already available without download.
Cleaning up dataset...
 train/pos: 12500 files


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


 train/neg: 12500 files
 test/pos: 12500 files
 test/neg: 12500 files
 train/unsup: 50000 files
Success, alldata-id.txt is available for next steps.


In [5]:
%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# Lightweight record usable wherever a `TaggedDocument` is expected (it has `words` and `tags`),
# extended with the `split` and `sentiment` fields that the evaluation cells read.
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

# Lookup tables indexed by line-number bucket of alldata-id.txt:
#   every 25,000 lines -> split: 25k train, 25k test, remaining 50k "extra"
#   every 12,500 lines -> label: [12.5k pos, 12.5k neg] * 2, then unknown
split_per_25k = ['train', 'test', 'extra', 'extra']
sentiment_per_12500 = [1.0, 0.0, 1.0, 0.0, None, None, None, None]

alldocs = []
with smart_open('aclImdb/alldata-id.txt', 'rb', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        alldocs.append(SentimentDocument(
            words=tokens[1:],   # tokens[0] is the "_*<index>" id prefix
            tags=[line_no],     # [tokens[0]] would also work, at extra memory cost
            split=split_per_25k[line_no // 25000],
            sentiment=sentiment_per_12500[line_no // 12500],
        ))

train_docs = [d for d in alldocs if d.split == 'train']
test_docs = [d for d in alldocs if d.split == 'test']

corpus_sizes = (len(alldocs), len(train_docs), len(test_docs))
print('%d docs: %d train-sentiment, %d test-sentiment' % corpus_sizes)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [6]:
from random import shuffle
# Randomly ordered copy of the corpus for training; `alldocs` itself keeps its original order.
doc_list = list(alldocs)
shuffle(doc_list)

In [7]:
%time
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

# Hyper-parameters shared by all three configurations below
common_kwargs = dict(vector_size=100, negative=5, hs=0, min_count=2, sample=0,
                     epochs=20, workers=cores)

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, **common_kwargs),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05', **common_kwargs),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **common_kwargs),
]

# Every model scans the full corpus (train, test and unlabeled docs) for its vocabulary
for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

# Keyed by each model's string form, in the order the models were defined
models_by_name = OrderedDict(zip(map(str, simple_models), simple_models))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
Doc2Vec(dbow,d100,n5,mc2,t8) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t8) vocabulary scanned & state initialized


In [8]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# Pair the PV-DBOW model with each of the two PV-DM variants.
# NOTE(review): presumably ConcatenatedDoc2Vec joins the paired models' vectors -- confirm in gensim.
dbow_model = simple_models[0]
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([dbow_model, simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([dbow_model, simple_models[2]])

In [9]:
import numpy as np
import statsmodels.api as sm
from random import sample
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Build a statsmodels `Logit` on the given targets/regressors and return its fit.

    The fit is run with `disp=0`.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate_for_model(test_model, train_set, test_set,
                         reinfer_train=False, reinfer_test=False,
                         infer_steps=None, infer_alpha=None, infer_subsample=0.2):
    """Fit a logistic predictor on `train_set` vectors and report its error rate on `test_set`.

    Parameters
    ----------
    test_model : object providing `docvecs[tag]` lookup and
        `infer_vector(words, steps=..., alpha=...)`.
    train_set, test_set : sequences of documents exposing `.words`, `.tags`
        and `.sentiment`.
    reinfer_train, reinfer_test : when True, the vectors for that set are
        re-inferred from each document's words; otherwise they are looked up
        in `test_model.docvecs` by the document's first tag.
    infer_steps, infer_alpha : passed through to `infer_vector`.
    infer_subsample : only used when `reinfer_test` is True; if below 1.0, a
        random sample of that fraction of `test_set` is evaluated instead of
        the whole set.

    Returns
    -------
    tuple : (error_rate, errors, number_of_test_predictions, predictor)
    """

    train_targets = [doc.sentiment for doc in train_set]
    if reinfer_train:
        train_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in train_set]
    else:
        train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if reinfer_test:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        # Use the documents passed in, not the notebook-level `test_docs` global,
        # so the vectors line up with the labels compared against below.
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)

    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    # Predictions are rounded to the nearest integer before comparing with the labels
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [10]:
from collections import defaultdict
# Maps a model's name to the error rate recorded by the evaluation cells;
# a name that has not been stored yet reads as 1.0.
error_rates = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [11]:
# Train each base model on the shuffled corpus, then score it on the sentiment task.
for model in simple_models: 
    print("Training %s" % model)
    # `%time` reports CPU/wall time for this single statement
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)
    
    print("\nEvaluating %s" % model)
    %time err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    # Stored under the model's string form for the comparison table printed later
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))

Training Doc2Vec(dbow,d100,n5,mc2,t8)
CPU times: user 20min 22s, sys: 37.9 s, total: 21min
Wall time: 5min 52s

Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)
CPU times: user 2.31 s, sys: 236 ms, total: 2.54 s
Wall time: 794 ms

0.101360 Doc2Vec(dbow,d100,n5,mc2,t8)

Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)
CPU times: user 33min 31s, sys: 1min 41s, total: 35min 13s
Wall time: 8min 59s

Evaluating Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)
CPU times: user 2 s, sys: 223 ms, total: 2.23 s
Wall time: 692 ms

0.153600 Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)

Training Doc2Vec(dm/c,d100,n5,w5,mc2,t8)
CPU times: user 1h 11min 38s, sys: 47.7 s, total: 1h 12min 26s
Wall time: 30min 33s

Evaluating Doc2Vec(dm/c,d100,n5,w5,mc2,t8)
CPU times: user 2.02 s, sys: 264 ms, total: 2.29 s
Wall time: 740 ms

0.223560 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)



In [12]:
# Score the two concatenated model pairs on the same sentiment task; this cell makes no training call.
for model in [models_by_name['dbow+dmm'], models_by_name['dbow+dmc']]: 
    print("\nEvaluating %s" % model)
    # `%time` reports CPU/wall time for this single statement
    %time err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    # Stored under the model's string form for the comparison table printed later
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))


Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)
CPU times: user 4.44 s, sys: 928 ms, total: 5.37 s
Wall time: 1.6 s

0.103200 Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)


Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec(dm/c,d100,n5,w5,mc2,t8)
CPU times: user 4.79 s, sys: 1.04 s, total: 5.82 s
Wall time: 1.55 s

0.102360 Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec(dm/c,d100,n5,w5,mc2,t8)



In [13]:
# Error-rate table, lowest (best) rate first; equal rates are ordered by model name
print("Err_rate Model")
for name, rate in sorted(error_rates.items(), key=lambda entry: (entry[1], entry[0])):
    print("%f %s" % (rate, name))


Err_rate Model
0.101360 Doc2Vec(dbow,d100,n5,mc2,t8)
0.102360 Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec(dm/c,d100,n5,w5,mc2,t8)
0.103200 Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)
0.153600 Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)
0.223560 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)


In [14]:
# Pick random doc; re-run cell for more examples
doc_id = np.random.randint(simple_models[0].docvecs.count)
print('for doc %d...' % doc_id)
target_words = alldocs[doc_id].words
for model in simple_models:
    # Infer a fresh vector from the words alone, then look up its nearest stored doc vectors
    inferred_docvec = model.infer_vector(target_words)
    nearest = model.docvecs.most_similar([inferred_docvec], topn=3)
    print('%s:\n %s' % (model, nearest))

for doc 21677...
Doc2Vec(dbow,d100,n5,mc2,t8):
 [(21677, 0.959844708442688), (77987, 0.562900722026825), (49050, 0.5591022372245789)]
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8):
 [(21677, 0.9112704992294312), (90740, 0.5546159148216248), (15028, 0.5322123765945435)]
Doc2Vec(dm/c,d100,n5,w5,mc2,t8):
 [(21677, 0.9156535267829895), (27290, 0.4557366371154785), (33260, 0.4514159560203552)]


In [15]:
import random

# pick random doc, re-run cell for more examples
doc_id = np.random.randint(simple_models[0].docvecs.count)
# and a random model
model = random.choice(simple_models)
# Request as many neighbours as there are doc vectors, i.e. *all* similar documents
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)

target_text = ' '.join(alldocs[doc_id].words)
print(u'TARGET (%d): «%s»\n' % (doc_id, target_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Show the first, middle and last entries of the similarity list
positions = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in positions:
    hit = sims[index]
    hit_text = ' '.join(alldocs[hit[0]].words)
    print(u'%s %s: «%s»\n' % (label, hit, hit_text))

TARGET (94775): «this was a great flick . i love the donald sutherland character . kind of reminds you of his role in invasion of the body snatchers . i saw this movie when it was released and have been looking for a copy of it on dvd since then ! right now it is only available on vhs .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8):

MOST (35409, 0.8156343102455139): «brilliant and moving performances by tom courtenay and peter finch .»

MEDIAN (96897, 0.4166574478149414): «i don't know what type of kool-aid everyone else has been drinking , but this movie was a total disappointment . brilliant ? huh ? are we watching the same movie ? because the movie i saw had a fairly odious protagonist , a supporting cast that existed solely to be ridiculed , a series of 'crazy' events befalling our 'hapless' protagonist that exist nowhere on the reality continuum , and seemed to have been invented by a corp committee composed of former frat 'dudes' pitching idea

In [16]:
# Shallow copy: a separate list holding the same model objects
word_models = list(simple_models)

In [17]:
import random
from IPython.display import HTML
# Draw random vocabulary words until one has more than 10 occurrences in the first model
vocab_words = word_models[0].wv.index2word
word = random.choice(vocab_words)
while not word_models[0].wv.vocab[word].count > 10:
    word = random.choice(vocab_words)
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'

# One table column per model: its 20 nearest words, one (word, similarity) pair per line
similars_per_model = []
for word_model in word_models:
    neighbours = word_model.wv.most_similar(word, topn=20)
    similars_per_model.append(str(neighbours).replace('), ','),<br>\n'))

header_cells = "</th><th>".join([str(word_model) for word_model in word_models])
body_cells = "</td><td>".join(similars_per_model)
similar_table = ("<table><tr><th>" + header_cells +
                 "</th></tr><tr><td>" + body_cells +
                 "</td></tr></table>")

print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
# Last expression of the cell, so the table is rendered as rich HTML output
HTML(similar_table)

most similar words for 'waqt' (19 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t8)","Doc2Vec(""alpha=0.05"",dm/m,d100,n5,w10,mc2,t8)","Doc2Vec(dm/c,d100,n5,w5,mc2,t8)"
"[('puzzlers', 0.4229496717453003), (""pedestrian's"", 0.4225466251373291), ('gilded', 0.4051579535007477), (""york'is"", 0.4004163146018982), (""shaft's"", 0.3847389221191406), ('issacs', 0.3838694393634796), ('buds--and', 0.37943464517593384), ('-5', 0.3792581260204315), ('sundowners', 0.3786245882511139), ('addle-brained', 0.37728971242904663), ('cloak', 0.3693162500858307), ('godhood', 0.36655810475349426), ('temple', 0.3651615083217621), ('olde', 0.36450743675231934), ('innate', 0.3610934615135193), ('über', 0.3605837821960449), ('renoir', 0.3604622483253479), ('windshields', 0.36016377806663513), ('spectacles', 0.35989752411842346), ('curitz', 0.3594284951686859)]","[('_full', 0.5873367786407471), (""'2001"", 0.577534019947052), ('farscape', 0.567756175994873), ('jodhaa-akbar', 0.563327431678772), ('wormhole', 0.5610390901565552), ('hex', 0.5601495504379272), ('ddlj', 0.5501421689987183), ('-zoom', 0.5493675470352173), ('dyrl', 0.5453404188156128), ('commedia', 0.5445129871368408), ('bb', 0.5419938564300537), ('kryten', 0.5395208597183228), ('spaceland', 0.5364852547645569), (""dell'arte"", 0.5360150337219238), ('byword', 0.5348269939422607), ('joystick', 0.5329856872558594), (""'2009"", 0.5298449993133545), ('*reason*', 0.5264933109283447), ('alfalfa', 0.5256202220916748), ('self-composed', 0.523798942565918)]","[('hddcs', 0.5533608198165894), ('eklavya', 0.5529837608337402), ('ju', 0.5513511896133423), ('scheherazade', 0.5221931338310242), ('jamel', 0.5171080827713013), ('pakeezah', 0.5130993127822876), ('kaurismaki', 0.5128692388534546), ('kabuto', 0.5099593997001648), ('frederik', 0.500555157661438), (""l'iceberg"", 0.49905508756637573), (""'nosferatu'"", 0.4987587630748749), ('yann', 0.4960111975669861), ('palm-reading', 0.4932093620300293), ('sahni', 0.49007648229599), ('startrek', 0.48972398042678833), ('gustave', 0.48845726251602173), ('sayonara', 0.4883417785167694), ('overdosing', 0.48698553442955017), ('rock-a-doodle', 0.4864930212497711), 
('mileena', 0.4835045635700226)]"


In [18]:
# grab the file if not already local
questions_filename = 'questions-words.txt'
if not os.path.isfile(questions_filename):
    # Download the word-analogy questions file
    print("Downloading analogy questions file...")
    url = u'https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt'
    r = requests.get(url)
    # Fail loudly on an HTTP error; otherwise an error page would be saved under
    # the expected name and the availability check below would still pass.
    r.raise_for_status()
    with smart_open(questions_filename, 'wb') as f:
        f.write(r.content)
assert os.path.isfile(questions_filename), "questions-words.txt unavailable"
print("Success, questions-words.txt is available for next steps.")

Downloading analogy questions file...
Success, questions-words.txt is available for next steps.


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [19]:
# Note: this analysis takes many minutes
for model in word_models:
    score, sections = model.wv.evaluate_word_analogies('questions-words.txt')
    # NOTE(review): the last section is presumably the aggregate over all categories -- confirm in gensim docs
    overall = sections[-1]
    correct = len(overall['correct'])
    incorrect = len(overall['incorrect'])
    attempted = correct + incorrect
    print('%s: %0.2f%% correct (%d of %d)' % (model, 100.0 * correct / attempted, correct, attempted))

Doc2Vec(dbow,d100,n5,mc2,t8): 0.00% correct (0 of 14657)
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8): 16.72% correct (2451 of 14657)
Doc2Vec(dm/c,d100,n5,w5,mc2,t8): 18.89% correct (2769 of 14657)


In [20]:
# Re-score every model (base and concatenated) using vectors re-inferred from the
# document words for both the train and the test set, instead of the stored doc vectors.
for model in simple_models + [models_by_name['dbow+dmm'], models_by_name['dbow+dmc']]: 
    print("Evaluating %s re-inferred" % str(model))
    # Separate key so these rates do not overwrite the earlier entries in `error_rates`
    pseudomodel_name = str(model)+"_reinferred"
    # infer_subsample=1.0 evaluates the whole test set (sub-sampling only happens below 1.0)
    %time err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs, reinfer_train=True, reinfer_test=True, infer_subsample=1.0)
    error_rates[pseudomodel_name] = err_rate
    print("\n%f %s\n" % (err_rate, pseudomodel_name))

Evaluating Doc2Vec(dbow,d100,n5,mc2,t8) re-inferred
CPU times: user 6min 29s, sys: 1.59 s, total: 6min 31s
Wall time: 6min 30s

0.102840 Doc2Vec(dbow,d100,n5,mc2,t8)_reinferred

Evaluating Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8) re-inferred
CPU times: user 8min 53s, sys: 2.61 s, total: 8min 56s
Wall time: 13min 31s

0.145960 Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)_reinferred

Evaluating Doc2Vec(dm/c,d100,n5,w5,mc2,t8) re-inferred
CPU times: user 14min 28s, sys: 2.57 s, total: 14min 31s
Wall time: 14min 31s

0.216600 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_reinferred

Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8) re-inferred
CPU times: user 15min 42s, sys: 3.63 s, total: 15min 46s
Wall time: 24min 46s

0.104080 Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8)_reinferred

Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)+Doc2Vec(dm/c,d100,n5,w5,mc2,t8) re-inferred
CPU times: user 20min 56s, sys: 3.97 s, total: 21min
Wall time: 3

In [None]:
# Error-rate table, lowest (best) rate first; equal rates are ordered by model name
print("Err_rate Model")
for name, rate in sorted(error_rates.items(), key=lambda entry: (entry[1], entry[0])):
    print("%f %s" % (rate, name))

In [None]:
import logging
# Timestamp, level and message on every log line
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# Also set the level directly on the root logger
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

In [None]:
# Load IPython's autoreload extension and select mode 2.
# NOTE(review): this sits in the last cell, so it only affects cells run after it -- confirm whether it belongs at the top.
%load_ext autoreload
%autoreload 2