### Project Kojak

Using word2vec word embeddings, we will identify words that are homographs (spelled the same, but with multiple meanings) and determine the exact meaning of the word from a context window.

In [1]:
import gensim
import json
import os
import re
from nltk.corpus import stopwords
from pprint import pprint



Using Theano backend.


In [2]:
# Declare stopwords, preprocess the data from source file abstracts.json

stop = stopwords.words('english')
stop+=['?','!','.',',',':',';','[',']','[]','“' ]

class MyPapers(object):
    # a memory-friendly way to load a large corpora
     def __init__(self, dirname):
            self.dirname = dirname
 
     def __iter__(self):
        with open(self.dirname) as data_file:    
            data = json.load(data_file)
        # iterate through all file names in our directory
        for paper in data:
            #yield paper['full_text'].lower().split()
            line = [word for word in paper['full_text'].lower().split() if word not in stop]
            line = [re.sub(r'[?\.,!:;\(\)“]','',l) for l in line]
            yield line
                

In [3]:
#Instantiate iterable on the data

papers = MyPapers('abstract_scraper/full.json') 

In [4]:
#Initialize the model

model = gensim.models.word2vec.Word2Vec(sentences = papers, size=100, window=5, min_count=4, workers=2,sg=1)



In [6]:
model.corpus_count

705

In [5]:
vectors = model.wv

In [6]:
vectors['research']

array([ 0.17825891,  0.02763059, -0.38819501,  0.76435179, -0.1224629 ,
        0.20958181, -0.60579371, -0.10117049, -0.02767174, -0.16134006,
       -0.02301087,  0.43878481,  0.28521818, -0.04950119, -0.05156896,
        0.21588418,  0.2238759 ,  0.166512  , -0.45001572,  0.98030335,
        0.31795076, -0.03204121,  0.78066665,  0.21388581,  0.17579487,
       -0.55938214,  0.34476283,  0.1562061 , -0.21031822,  0.20882048,
        0.03267602, -0.3113665 ,  0.61904722, -0.15829873,  0.50858819,
       -0.12459898, -0.44563791,  0.05365699, -0.65739572,  0.36476001,
       -0.30184168,  0.53788775, -0.00974252,  0.0177775 , -0.12691692,
       -0.48073232,  0.39651164,  0.39430544,  0.27370721, -0.36667523,
       -0.16870221, -0.15307225,  0.17368501,  0.16112219, -0.26217571,
        0.13525565,  0.33305067, -0.11685678, -0.03128953,  0.43858287,
        0.0884467 , -0.26394305,  0.36755243,  0.21608105, -0.47787026,
        0.3113932 , -0.29664072,  0.09352776,  0.00372994, -0.42

In [7]:
model.most_similar('research' ,topn=8)


[(u'researches', 0.67867112159729),
 (u'literature', 0.6695749759674072),
 (u'recommendations', 0.6658555269241333),
 (u'highlight', 0.6631516218185425),
 (u'suggestions', 0.6628372073173523),
 (u'k-12', 0.658211350440979),
 (u'works', 0.6569048166275024),
 (u'methodology', 0.6568661332130432)]

In [8]:
model.most_similar('close' ,topn=8)


[(u'smallest', 0.941720187664032),
 (u'intermediate', 0.936434268951416),
 (u'-th', 0.9309316873550415),
 (u'say', 0.9297983646392822),
 (u'moves', 0.9295343160629272),
 (u'supremum', 0.9284391403198242),
 (u'imply', 0.9256318807601929),
 (u'bid', 0.9244518280029297)]

In [9]:
model.most_similar('use' ,topn=8)


[(u'used', 0.6165218353271484),
 (u'uses', 0.5388023257255554),
 (u'alternative', 0.5333783030509949),
 (u'addition', 0.5080673694610596),
 (u'useful', 0.4978896677494049),
 (u'appropriate', 0.4967584013938904),
 (u'unknown', 0.49574515223503113),
 (u'powerful', 0.49347132444381714)]

In [10]:
model.most_similar('wind' ,topn=8)


[(u'offshore', 0.8907073140144348),
 (u'farm', 0.8756291270256042),
 (u'renewable', 0.8563683032989502),
 (u'farms', 0.8489881157875061),
 (u'electricity', 0.8368750810623169),
 (u'turbine', 0.8103119730949402),
 (u'generation', 0.7943708896636963),
 (u'turbines', 0.7937995791435242)]

In [11]:
model.train(sentences = papers,total_examples=model.corpus_count, epochs = model.iter)

2898247

In [12]:
model.most_similar('research' ,topn=8)


[(u'exploring', 0.5740522146224976),
 (u'researches', 0.5566179752349854),
 (u'advance', 0.5519660711288452),
 (u'extant', 0.5498912930488586),
 (u'innovationthe', 0.5478745102882385),
 (u'discussing', 0.5470035076141357),
 (u'interdisciplinary', 0.5466692447662354),
 (u'suggestion', 0.546507716178894)]

In [13]:
model.most_similar('close' ,topn=8)


[(u'coincide', 0.6834981441497803),
 (u'attracting', 0.6830309629440308),
 (u'recognizable', 0.6794874668121338),
 (u'unequal', 0.6720108985900879),
 (u'ramps', 0.6675475835800171),
 (u'loses', 0.6657685041427612),
 (u'snic', 0.6635018587112427),
 (u'nebentypus', 0.6631600260734558)]

In [14]:
model.most_similar('use' ,topn=8)


[(u'uses', 0.47957053780555725),
 (u'illicit', 0.4514145255088806),
 (u'sure', 0.4439316987991333),
 (u'instead', 0.4426302909851074),
 (u'unavailable', 0.4142072796821594),
 (u'wrong', 0.4112270772457123),
 (u'zoning', 0.40751612186431885),
 (u'encryption', 0.40598928928375244)]

In [15]:
model.most_similar('wind' ,topn=8)


[(u'farm', 0.8397685289382935),
 (u'offshore', 0.8078193068504333),
 (u'farms', 0.760556161403656),
 (u'onshore', 0.757138192653656),
 (u'turbine', 0.7163775563240051),
 (u'lcoe', 0.7067546844482422),
 (u'turbines', 0.6901949644088745),
 (u'photovoltaic', 0.6621192693710327)]

In [16]:
model.train(sentences = papers,total_examples=model.corpus_count, epochs = model.iter)

2898224

In [17]:
model.most_similar('close' ,topn=20)


[(u'recognizable', 0.553083598613739),
 (u'loses', 0.5403499603271484),
 (u'minimizer', 0.538335919380188),
 (u'displayed', 0.5253382325172424),
 (u'terminates', 0.5204147696495056),
 (u'2\u20139', 0.5182613730430603),
 (u'disadvantaged', 0.5089924335479736),
 (u'nullcline', 0.5053083896636963),
 (u'stronger', 0.5035704970359802),
 (u'gotten', 0.503265917301178),
 (u'saddle-node', 0.5026227235794067),
 (u'2\u20132', 0.5018879175186157),
 (u'm-d', 0.5015227794647217),
 (u'tight', 0.5008443593978882),
 (u'periphery', 0.500777542591095),
 (u'neighbors', 0.49957799911499023),
 (u'deform', 0.49761340022087097),
 (u'attracting', 0.49637165665626526),
 (u'intercept', 0.4960039258003235),
 (u'harrison', 0.49500516057014465)]

In [18]:
model.most_similar('use' ,topn=20)


[(u'illicit', 0.5334938764572144),
 (u'uses', 0.5033749341964722),
 (u'sure', 0.47222280502319336),
 (u'rudimentary', 0.45067834854125977),
 (u'zoning', 0.443381667137146),
 (u'used', 0.4423278868198395),
 (u'tam', 0.4411676526069641),
 (u'utilize', 0.438945472240448),
 (u'gis-based', 0.43141502141952515),
 (u'revolutionized', 0.42527222633361816),
 (u'extant', 0.42283767461776733),
 (u'converse', 0.4222954511642456),
 (u'blend', 0.42210543155670166),
 (u'adopted', 0.42109984159469604),
 (u'assists', 0.4208453893661499),
 (u'one-step', 0.4201527237892151),
 (u'ubiquity', 0.4186035990715027),
 (u'gym-goers', 0.41832393407821655),
 (u'recommends', 0.41741371154785156),
 (u'efficient', 0.41677191853523254)]

In [20]:
model.most_similar('right' ,topn=20)


[(u'left', 0.7097992300987244),
 (u'volar', 0.6841639280319214),
 (u'forearm', 0.6467432379722595),
 (u'recognizable', 0.6323999166488647),
 (u'pitcher', 0.617566704750061),
 (u'extremities', 0.6084349155426025),
 (u'epicondyle', 0.6058075428009033),
 (u'palpation', 0.605347216129303),
 (u'softball', 0.5909222364425659),
 (u'corner', 0.590570330619812),
 (u'dorsal', 0.5805616974830627),
 (u'complaint', 0.5794276595115662),
 (u'harrison', 0.5707764625549316),
 (u'tight', 0.5695618391036987),
 (u'phronesis', 0.5656307935714722),
 (u'fold', 0.5650084614753723),
 (u'large-amplitude', 0.5597513914108276),
 (u'wound', 0.5591278076171875),
 (u'cervical', 0.5547938346862793),
 (u'dog', 0.5530991554260254)]

In [21]:
model.most_similar('finance' ,topn=20)


[(u'conic', 0.6877789497375488),
 (u'quants', 0.6690090894699097),
 (u'pricing', 0.6683673858642578),
 (u'neoclassical', 0.6548030376434326),
 (u'continuous-time', 0.6339081525802612),
 (u'dividend', 0.6310253143310547),
 (u'bank\u2019s', 0.6251856684684753),
 (u'liquidity', 0.6193803548812866),
 (u'proposals', 0.6192363500595093),
 (u'sis', 0.6175393462181091),
 (u'venture', 0.6157879829406738),
 (u'academies', 0.6118347644805908),
 (u'discounting', 0.6115192174911499),
 (u'tackling', 0.6090777516365051),
 (u'es', 0.607329785823822),
 (u'advocated', 0.6070413589477539),
 (u'economics', 0.6062716841697693),
 (u'del', 0.6061701774597168),
 (u'bachelier', 0.6060438752174377),
 (u'twenty-first-century', 0.6056631207466125)]

In [22]:
model.most_similar('record' ,topn=20)


[(u'upload', 0.6762716770172119),
 (u'annotate', 0.6587749719619751),
 (u'retrieve', 0.6553597450256348),
 (u'browser', 0.6459447741508484),
 (u'encrypted', 0.6331414580345154),
 (u'queries', 0.6285848617553711),
 (u'tld', 0.6193617582321167),
 (u'debugger', 0.6168504357337952),
 (u'babies\u2019', 0.6138877868652344),
 (u'sender', 0.6113730669021606),
 (u'readings', 0.6051900386810303),
 (u'searchable', 0.6034226417541504),
 (u'text-based', 0.6013210415840149),
 (u'authentication', 0.6010237336158752),
 (u'domain-specific', 0.5972707271575928),
 (u'readable', 0.5944960117340088),
 (u'sideline', 0.5907342433929443),
 (u'shopping', 0.5857497453689575),
 (u'headache\u201d', 0.5853506326675415),
 (u'pulled', 0.5846382975578308)]

In [23]:
model.train(sentences = papers,total_examples=model.corpus_count, epochs = model.iter)

2898434