In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import db
import cache
import analysis
import settings
import pandas as pd
import gensim
import functools
import gc

Set up database connection

In [17]:
import settings

engine = create_engine(settings.SQL_URL, echo=False)
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
Test database connection

In [3]:
brennan_schwartz = analysis.lookup_by_handle('RePEc:eee:jbfina:v:3:y:1979:i:2:p:133-155', session)
brennan_schwartz_desc = analysis.get_descendants(brennan_schwartz, session, 1)
len(brennan_schwartz_desc)

  util.warn_limited(
  util.warn_limited(


Get all abstracts and save to CSV

In [None]:
abstracts = analysis.build_abstracts_df(session)
abstracts.to_csv('../quantcites_data/abstracts_df.csv', index=None)

In [6]:
Reload abstracts

In [48]:
abstracts = pd.read_csv('../quantcites_data/abstracts_df.csv')

In [24]:
len(abstracts)

800212

In [49]:
abstracts.drop_duplicates(subset="Abstract")
abstracts = abstracts[abstracts.Abstract != "No abstract is available for this item."]
abstracts

Unnamed: 0,Handle,Year,Abstract
0,RePEc:ecm:emetrp:v:53:y:1985:i:2:p:385-407,1985,AbstractThis paper uses an intertemporal gener...
3,RePEc:ecm:emetrp:v:60:y:1992:i:1:p:77-105,1992,This paper presents a unifying theory for valu...
7,RePEc:ucp:jnlbus:v:60:y:1987:i:4:p:473-89,1987,This paper introduces a parametrically parsimo...
8,RePEc:bla:jfinan:v:47:y:1992:i:4:p:1259-82,1992,The authors develop a two-factor general equil...
9,RePEc:bla:mathfi:v:7:y:1997:i:2:p:127-155,1997,A class of term structure models with volatili...
...,...,...,...
800207,RePEc:pra:mprapa:99391,2020,We investigate whether joining the European Mo...
800208,RePEc:wrk:warwec:1280,2020,The United Kingdom (UK) reported record employ...
800209,RePEc:unu:wpaper:wp-2016-87,2016,We investigate whether there are racial and et...
800210,RePEc:sae:joupea:v:57:y:2020:i:1:p:126-139,2020,There is as yet little consensus in the litera...


Build the gensim word2vec pipeline

In [50]:
abstracts_only = list(map(lambda x: gensim.utils.simple_preprocess(x), abstracts['Abstract'].tolist()))

In [30]:
del abstracts
del brennan_schwartz

NameError: name 'abstracts' is not defined

In [31]:
gc.collect()

30

In [34]:
# build vocabulary and train model
model = gensim.models.Word2Vec(abstracts_only, 
                               vector_size=256,
                               window=10,
                               min_count=4,
                               workers=10,
                               epochs=20)

In [41]:
model.save("gensim_word_vectors_256_singleword.model")

In [45]:
len(model.wv)

129348

In [46]:
from gensim.models.phrases import Phrases, Phraser

In [70]:
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=3,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

In [71]:
phrases = build_phrases(abstracts_only)

In [65]:
phrases.save('../gensim_models/phrases_model_threshold7.txt')

In [73]:
phrases[abstracts_only[3]]
abstracts_phrased = list(map(lambda x: phrases[x], abstracts_only))

In [75]:
# Build phrase model
phrase_model = gensim.models.Word2Vec(abstracts_phrased, 
                                      vector_size=256,
                                      window=7,
                                      min_count=4,
                                      workers=10,
                                      epochs=20)

In [87]:
phrase_model.save("../gensim_models/gensim_word_vectors_256_phrases.model")

In [93]:
phrase_model.wv.distance("calibration", "implied_volatility")

0.7229732275009155

In [2]:
# Load existing models
phrases_model = gensim.models.Word2Vec.load("../gensim_models/gensim_word_vectors_256_phrases.model")
phrases = gensim.models.Phrases.load('../gensim_models/phrases_model_threshold7.txt')

In [39]:
def wm_distance_estimator(phrases_list, phrases_model):
    def estimate(abstract_1, abstract_2):
        processed_1 = phrases_list[gensim.utils.simple_preprocess(abstract_1)]
        processed_2 = phrases_list[gensim.utils.simple_preprocess(abstract_2)]
        return phrases_model.wv.wmdistance(processed_1, processed_2)
    return estimate
        
    

In [40]:
wm_estimator = wm_distance_estimator(phrases, phrases_model)

In [72]:
hjm = analysis.lookup_by_handle("RePEc:ecm:emetrp:v:60:y:1992:i:1:p:77-105", session).abstract
hunt_kennedy = analysis.lookup_by_handle("RePEc:spr:finsto:v:4:y:2000:i:4:p:391-408", session).abstract
nelson_siegel = analysis.lookup_by_handle("RePEc:ucp:jnlbus:v:60:y:1987:i:4:p:473-89", session).abstract
jamshidian = analysis.lookup_by_handle("RePEc:spr:finsto:v:1:y:1997:i:4:p:293-330", session).abstract
bradley_crane = analysis.lookup_by_handle("RePEc:inm:ormnsc:v:19:y:1972:i:2:p:139-151", session).abstract
bgm = analysis.lookup_by_handle("RePEc:bla:mathfi:v:7:y:1997:i:2:p:127-155", session).abstract
cir = analysis.lookup_by_handle("RePEc:ecm:emetrp:v:53:y:1985:i:2:p:385-407", session).abstract
andersen = analysis.lookup_by_handle("RePEc:taf:apmtfi:v:7:y:2000:i:1:p:1-32", session).abstract

In [42]:
dist1 = wm_estimator(hjm, hunt_kennedy)

In [43]:
dist1

0.841962866854799

In [44]:
dist2 = wm_estimator(hjm, nelson_siegel)

In [45]:
dist2

0.9102496451253718

In [51]:
dist3 = wm_estimator(hjm, jamshidian)

In [52]:
dist3

0.9191931830335097

In [59]:
dist4 = wm_estimator(hjm, bradley_crane)

In [60]:
dist4

0.8722548682633106

In [63]:
jamshidian
bgm

'A class of term structure models with volatility of lognormal type is analyzed in the general HJM framework. The corresponding market forward rates do not explode, and are positive and mean reverting. Pricing of caps and floors is consistent with the Black formulas used in the market. Swaptions are priced with closed formulas that reduce (with an extra assumption) to exactly the Black swaption formulas when yield and volatility are flat. A two–factor version of the model is calibrated to the U.K. market price of caps and swaptions and to the historically estimated correlation between the forward rates.'

In [66]:
dist5 = wm_estimator(jamshidian, bgm)

In [67]:
dist5

0.7984474484594413

In [70]:
dist6 = wm_estimator(jamshidian, cir)

In [71]:
dist6

0.9377679905619464

In [74]:
dist7 = wm_estimator(jamshidian, andersen)

In [75]:
dist7

0.8210132326610072