In [1]:
import locale

#from matplotlib import pyplot as plt
import numpy as np

from db import psqlServer
sv = psqlServer()

control_chars = [chr(0x85)]
locale.setlocale(locale.LC_ALL, 'C')



'C'

In [2]:
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

In [3]:
AI = "arx_AI"        # artificial intelligence
FA = "arx_math_FA"   # functional analysis
GR = "arx_GRQC"      # general relativity and quantum cosmology
LG = "arx"           # learning
NT = "arx_math_NT"   # number theory

In [4]:
# How many items in all tables?
UNION = " UNION ALL "
SEL_C = "SELECT COUNT(*) as FOO FROM "
sum_statement = "SELECT SUM(FOO) FROM ("
sum_statement += SEL_C + AI
sum_statement += UNION + SEL_C + FA 
sum_statement += UNION + SEL_C + GR 
sum_statement += UNION + SEL_C + LG 
sum_statement += UNION + SEL_C + NT
sum_statement += ") as B;"

total = sv.execute(sum_statement)[0]['sum']
print(total)

116060


In [5]:
# Per table
SEL_C = "SELECT COUNT(*) FROM "
total_AI = sv.execute(SEL_C + AI)[0]['count']
total_FA = sv.execute(SEL_C + FA)[0]['count']
total_GR = sv.execute(SEL_C + GR)[0]['count']
total_LG = sv.execute(SEL_C + LG)[0]['count']
total_NT = sv.execute(SEL_C + NT)[0]['count']

print("AI:{}".format(total_AI))
print("FA:{}".format(total_FA))
print("GR:{}".format(total_GR))
print("LG:{}".format(total_LG))
print("NT:{}".format(total_NT))

AI:13000
FA:16000
GR:50000
LG:15060
NT:22000


In [6]:
# Get some rows from the AI table
rows = sv.execute("SELECT * FROM arx_GRQC")
corpus = [x['abstract'] for x in rows]
labels = [float(x['has_journal_ref']) for x in rows]

#rows2 = sv.execute("SELECT abstract FROM arx_math_FA LIMIT 10000")
#corpus += [x['abstract'] for x in rows2]
#labels += [x['has_journal_ref'] for x in rows2]

#rows3 = sv.execute("SELECT abstract FROM arx_GRQC LIMIT 30000")
#corpus += [x['abstract'] for x in rows3]
#labels += [x['has_journal_ref'] for x in rows2]


In [7]:
corpus_n = []
jj = 0
for corp in corpus:
    corpus_n.append(u"_*{0} {1}\n".format(jj, normalize_text(corp)))
    jj += 1

In [8]:
len(corpus_n)

50000

In [9]:
import gensim
from collections import namedtuple


In [10]:
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
alldocs = []
for line_no, line in enumerate(corpus_n[0:12500]):
    tokens = gensim.utils.to_unicode(corpus_n[line_no]).split()
    words = tokens[1:]
    tags = [line_no]
    split = 'train'
    #split = ['train', 'test', 'extra', 'extra'][line_no//12500]
    sentiment = labels[line_no]
    #sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//6750]
    alldocs.append(SentimentDocument(words, tags, split, sentiment))

for line_no, line in enumerate(corpus_n[12500:25000]):
    tokens = gensim.utils.to_unicode(corpus_n[line_no]).split()
    words = tokens[1:]
    tags = [line_no]
    split = 'test'
    #split = ['train', 'test', 'extra', 'extra'][line_no//12500]
    sentiment = labels[line_no]
    #sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//6750]
    alldocs.append(SentimentDocument(words, tags, split, sentiment))
    
for line_no, line in enumerate(corpus_n[25000:]):
    tokens = gensim.utils.to_unicode(corpus_n[line_no]).split()
    words = tokens[1:]
    tags = [line_no]
    split = 'extra'
    #split = ['train', 'test', 'extra', 'extra'][line_no//12500]
    sentiment = None
    #sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//6750]
    alldocs.append(SentimentDocument(words, tags, split, sentiment))    

In [11]:
len(alldocs)

50000

In [12]:
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

50000 docs: 12500 train-sentiment, 12500 test-sentiment


In [13]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

In [14]:
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [15]:
simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

In [16]:
simple_models[0].build_vocab(alldocs)

In [17]:
print(simple_models[0])
for model in simple_models[1:]:
    """Reuse shareable structures from other_model."""
    model.reset_from(simple_models[0])
    print(model)

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [18]:
models_by_name = OrderedDict((str(model), model) for model in simple_models)
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [19]:
models_by_name

OrderedDict([('Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x116d075f8>),
             ('Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x116d076a0>),
             ('Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x116d07550>),
             ('dbow+dmm',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x12b90af28>),
             ('dbow+dmc',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x12b90aef0>)])

In [20]:

# Predictive Evaluation Methods

# Let's define some helper methods for evaluating the performance of our Doc2vec
# using paragraph vectors. We will classify document sentiments using a logistic 
# regression model based on our paragraph embeddings. We will compare the error 
# rates based on word embeddings from our various Doc2vec models.
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 


  from pandas.core import datetools


In [21]:
@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    
    # Here is the wtf: test_data is just [None]*500!!!
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [22]:

# Bulk Training

# We use an explicit multiple-pass, alpha-reduction approach as sketched in 
# this gensim doc2vec blog post with added shuffling of corpus on each pass.
# https://rare-technologies.com/doc2vec-tutorial/
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [23]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2017-11-30 23:38:14.995967
*0.255680 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 9.3s 0.5s
*0.271200 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 9.3s 2.2s
*0.252160 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 4.0s 0.5s
*0.284000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 4.0s 1.1s
*0.252400 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 7.1s 0.5s
*0.302400 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 7.1s 1.3s
*0.246080 : 1 passes : dbow+dmm 0.0s 1.0s
*0.313600 : 1 passes : dbow+dmm_inferred 0.0s 2.6s
*0.247840 : 1 passes : dbow+dmc 0.0s 1.0s
*0.316800 : 1 passes : dbow+dmc_inferred 0.0s 3.5s
Completed pass 1 at alpha 0.025000
*0.254320 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 11.7s 0.5s
*0.240320 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 4.4s 0.5s
*0.251440 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 7.3s 0.5s
*0.237120 : 2 passes : dbow+dmm 0.0s 0.9s
*0.240480 : 2 passes : dbow+dmc 0.

In [24]:
# Print best error rates achieved
print("Err rate Model")
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))


Err rate Model
0.222400 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred
0.233600 dbow+dmc
0.234400 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred
0.234640 dbow+dmm
0.237600 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
0.248400 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)
0.249600 dbow+dmm_inferred
0.252000 dbow+dmc_inferred
0.254000 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
0.271200 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred


In [25]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

for doc 19145...
Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8):
 [(6645, 0.7364850044250488), (24693, 0.6765921711921692), (17275, 0.674149751663208)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8):
 [(6645, 0.8889714479446411), (24069, 0.6364575624465942), (22132, 0.6157433986663818)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):
 [(6645, 0.7635817527770996), (21852, 0.7499732375144958), (13736, 0.7415536642074585)]


In [26]:

import random

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))



TARGET (22506): «starting from a lagrangian we perform the full constraint analysis of the hamiltonian for general relativity in the tetrad-connection formulation for an arbitrary value of the immirzi parameter and solve the second class constraints , presenting the theory with a hamiltonian composed of first class constraints which are the generators of the gauge symmetries of the action . in the time gauge we then recover barbero's formulation of gravity .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):

MOST (17328, 0.7840209603309631): «for schwarzschild space-time , distributional expressions of energy-momentum densities and of scalar concomitants of the curvature tensors are examined for a class of coordinate systems which includes those of the schwarzschild and of kerr-schild types as special cases . the energy-momentum density $\tilde t_\mu^{\nu} ( x ) $ of the gravitational source and the gravitational energy-momentum pseudo-tensor density $\tilde 

In [27]:
import random

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))



TARGET (8300): «recent observations of galactic black hole candidates ( bhcs ) suggest that those that are superluminal jet sources have more rapid black hole spin rates than otherwise normal bhcs . this provides observational support for models of relativistic jet formation that extract rotational energy of the central black hole . to investigate this mechanism , we have developed a new general relativistic magnetohydrodynamic code in kerr geometry . here we report on the first numerical simulation of the formation of a relativistic jet in a rapidly-rotating ( a=0 . 95 ) kerr black hole magnetosphere . we assume that the initial velocity of the disk is zero . we find that the maximum velocity of the jet reaches 0 . 93c ( lorentz factor , 2 . 7 ) and the terminal velocity of the jet is 0 . 85c ( lorentz factor , 1 . 9 ) . on the other hand , for a non-rotating ( a=0 ) schwarzschild black hole , the maximum outflow velocity is less than 0 . 6c for initial magnetospheric conditions simil

In [28]:
import random

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))



TARGET (18881): «in recent work on black hole entropy in non-perturbative quantum gravity , an action for the black hole sector of the phase space is introduced and ( partially ) quantized . we give a number of observations on this and related works . in particular we show that ( i ) the entropy calculation applies without change to generally covariant theories having no black hole solutions , ( ii ) the phase space constraint used to select the black hole sector is not the apparent horizon equation , which is the natural phase space constraint separating trapped and untrapped regions on an initial data surface , and ( iii ) there appears to be at least one other phase space constraint which leads to the conclusion that the entropy associated with a bounding two-dimensional surface is proportional to its area .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8):

MOST (17928, 0.5639107823371887): «we study the extent to which d=11 supergravity can be deformed an