In [1]:
import numpy as np
import spacy
from scipy.spatial import distance

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
noise = np.random.rand(300)
print(noise.shape)
print(noise[0:20])

(300,)
[0.0711693  0.81933373 0.59072348 0.16224422 0.40065475 0.93172185
 0.23323088 0.20852848 0.06736529 0.56871355 0.46128769 0.93672366
 0.38025336 0.57102121 0.59740994 0.34669868 0.19852334 0.89701945
 0.16326811 0.55943295]


In [4]:
ratio = 0.13

In [5]:
noise = noise * ratio
noise[0:20]

array([0.00925201, 0.10651339, 0.07679405, 0.02109175, 0.05208512,
       0.12112384, 0.03032001, 0.0271087 , 0.00875749, 0.07393276,
       0.0599674 , 0.12177408, 0.04943294, 0.07423276, 0.07766329,
       0.04507083, 0.02580803, 0.11661253, 0.02122485, 0.07272628])

In [6]:
doc = nlp("Wall Street did something 'highly unusual' in Tuesday's session")

In [7]:
token1 = doc[0]
token2 = doc[1]

In [8]:
distance.cosine(token1.vector, token1.vector)

6.568970367659688e-09

In [9]:
distance.cosine(token1.vector, token2.vector)

0.7888519109724299

In [10]:
def experiment_ratio(text, ratio):
    noise = np.random.rand(300)
    noise = noise * ratio
    for token in nlp(text):
        v1 = token.vector
        v2 = v1 + v1 * noise
        print('v2:', distance.cosine(v1, v2), '  noise:', distance.cosine(v1, noise))

In [11]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1000000)

v2: 0.1280623368207342   noise: 0.8412298048073837
v2: 0.14233414452981163   noise: 1.049486320848978
v2: 0.12440611660790579   noise: 0.9814242784290497
v2: 0.13110365333536522   noise: 1.05694023329993
v2: 0.12057261273500275   noise: 1.0641930715026997
v2: 0.13609344502383847   noise: 1.0440888135744641
v2: 0.13657389606860904   noise: 0.9734885084383437
v2: 0.1346077934536818   noise: 0.9147146624215928
v2: 0.13913614384378103   noise: 0.9293084919542522


In [12]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 300)

v2: 0.12360729564565653   noise: 0.8873918100364466
v2: 0.11552534132925185   noise: 1.0208559450056314
v2: 0.11277503857031468   noise: 0.9773640393616633
v2: 0.1156796864988685   noise: 0.9959145125283364
v2: 0.1308927326589555   noise: 1.0416027490451676
v2: 0.1250447608446179   noise: 1.0468487192340294
v2: 0.13001015918346637   noise: 1.0433823750691527
v2: 0.1345227900635203   noise: 0.9972898145585741
v2: 0.11780810551642473   noise: 0.9074664211047752


In [13]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 100)

v2: 0.13108662770116208   noise: 0.865466738041854
v2: 0.11602783523660365   noise: 1.1006019277991006
v2: 0.14067061885768184   noise: 1.0679462630683378
v2: 0.11863902485791256   noise: 1.021816038607252
v2: 0.1076177462658654   noise: 1.0160511154174228
v2: 0.11114332021014062   noise: 1.0129758815620382
v2: 0.12245247212238253   noise: 1.0141866207549757
v2: 0.12571467487219534   noise: 0.9856392601269206
v2: 0.12336552912909537   noise: 0.9192094816633581


In [14]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 10)

v2: 0.09118655595528047   noise: 0.8736301362737098
v2: 0.09151849252823863   noise: 1.0706652368094691
v2: 0.0940032388919595   noise: 0.9614349308911281
v2: 0.09619205588364421   noise: 1.0165811154307285
v2: 0.09967303967751728   noise: 1.0553814525071306
v2: 0.10018505244477316   noise: 1.0281573207973398
v2: 0.10178656169684586   noise: 1.0501610440924618
v2: 0.09643556013416277   noise: 0.9843037496948978
v2: 0.09774037652334622   noise: 0.984111201507666


In [18]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 4.9)

v2: 0.08031423937737137   noise: 0.8245536944420488
v2: 0.08949349773449788   noise: 1.0600799784657227
v2: 0.09328793164253146   noise: 1.0113522098146361
v2: 0.07794324305799083   noise: 1.0453533400916766
v2: 0.07095794443689807   noise: 0.9882234507613242
v2: 0.08388490043123531   noise: 1.067169318057164
v2: 0.07980560923850744   noise: 0.9680398233506209
v2: 0.08795861519517545   noise: 0.9678137314754763
v2: 0.07919805303822047   noise: 0.9031804233438553


In [55]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1)

v2: 0.016897980226474596   noise: 0.8797294674967083
v2: 0.015849536273135123   noise: 0.999734897868424
v2: 0.019468529837083692   noise: 1.019099461360974
v2: 0.015210140071027833   noise: 1.0209295085589083
v2: 0.015777117623011594   noise: 1.0256417359235719
v2: 0.01945440410146171   noise: 1.0366250786824465
v2: 0.016468104595710575   noise: 1.01479079769558
v2: 0.015891661403037216   noise: 0.9771103338300551
v2: 0.019023585062607817   noise: 0.9492001626219537


In [56]:
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 0.1)

v2: 0.0003969865225065261   noise: 0.9198229966070044
v2: 0.00037363356040009954   noise: 1.0566859096182768
v2: 0.0004029995190050206   noise: 1.0338967634926317
v2: 0.00035247268919758046   noise: 0.981684262422088
v2: 0.00035084243178873376   noise: 1.0276939871523694
v2: 0.00033966609406688164   noise: 1.0238632023374339
v2: 0.00037516479527266444   noise: 1.0645569510305153
v2: 0.00040498800552324354   noise: 1.0358677566239163
v2: 0.00038180414563704357   noise: 0.9734253716220406


In [57]:
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 300)

v2: 0.1386425437772001   noise: 0.9696912255956226
v2: 0.12945518492914265   noise: 0.861915955711775
v2: 0.12359748790077707   noise: 1.057544174654151
v2: 0.10350080538345519   noise: 0.9908744695293683
v2: 0.12442469271693135   noise: 0.880355924472626
v2: 0.12053827385993188   noise: 1.0374700347213583
v2: 0.14594456459791327   noise: 1.0501885649748217
v2: 0.12024872156519484   noise: 1.0560096796266316
v2: 0.11049166410903266   noise: 0.9756464005706124
v2: 0.12379258038819663   noise: 0.8890329445145828
v2: 0.11400151584770035   noise: 1.0126147754023278
v2: 0.10350080538345519   noise: 0.9908744695293683
v2: 0.13370173219289994   noise: 0.9263255203526843


In [58]:
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 50)

v2: 0.11200375406091889   noise: 0.9404770906591007
v2: 0.11493324186580689   noise: 0.8749726964532166
v2: 0.12882813714570107   noise: 1.0946500141882753
v2: 0.10684825832394851   noise: 0.9268044335761094
v2: 0.13211797613807785   noise: 0.9113979876610846
v2: 0.14509222049305837   noise: 1.0103738820652903
v2: 0.12113778619016369   noise: 1.0944102830299482
v2: 0.12452597948185529   noise: 1.0937137204055039
v2: 0.11740063058188133   noise: 0.976419533542694
v2: 0.15134624846571998   noise: 0.9626217349848665
v2: 0.12991650790189913   noise: 1.0385413581904304
v2: 0.10684825832394851   noise: 0.9268044335761094
v2: 0.11697168246491285   noise: 0.9488109520550206


In [64]:
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 10)

v2: 0.10910172828274478   noise: 0.9918126486741402
v2: 0.09593506674549224   noise: 0.9007787216833182
v2: 0.10012127150644023   noise: 1.0867410183027733
v2: 0.11002090394117214   noise: 0.9479564004940676
v2: 0.09517136751027488   noise: 0.9269384923466978
v2: 0.11600157267089173   noise: 1.0391689373468862
v2: 0.10306347412918682   noise: 1.0880311528791538
v2: 0.11104945641534014   noise: 1.0496866292874503
v2: 0.11657110801977755   noise: 1.0236474906519502
v2: 0.09017187854478503   noise: 0.9362426838461437
v2: 0.11684278691384409   noise: 1.061398706986406
v2: 0.11002090394117214   noise: 0.9479564004940676
v2: 0.09459708815512768   noise: 0.9541057202515284


In [65]:
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 1)

v2: 0.017186757014795107   noise: 0.9312264369533974
v2: 0.015043893390959018   noise: 0.8873463731479371
v2: 0.01701012800167989   noise: 1.0348997148932773
v2: 0.016675692374469975   noise: 0.9537679347738377
v2: 0.018533861890049907   noise: 0.906610775320838
v2: 0.017837134935857923   noise: 1.0530485609745464
v2: 0.018959388828086565   noise: 1.0937942662567142
v2: 0.018597441322357522   noise: 1.07717713197533
v2: 0.017726476562477944   noise: 0.9689512005256666
v2: 0.017801001922594817   noise: 0.9805172815745954
v2: 0.018494035172423207   noise: 1.0746835108985273
v2: 0.016675692374469975   noise: 0.9537679347738377
v2: 0.017915539919112544   noise: 0.9480579807195426


In [66]:
def experiment2_ratio(text, ratio):
    noise = np.random.rand(300)
    noise = noise * ratio
    for token in nlp(text):
        v1 = token.vector
        v2 = v1 + noise
        print('v2:', distance.cosine(v1, v2), '  noise:', distance.cosine(v1, noise))

In [68]:
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1000000)

v2: 0.9152088621381071   noise: 0.9152123284914833
v2: 1.070556210407779   noise: 1.070561652407139
v2: 0.9883181932184327   noise: 0.9883271549491298
v2: 0.9949536905376715   noise: 0.9949587051891523
v2: 1.0025278066344443   noise: 1.0025308738786756
v2: 1.0092902364927023   noise: 1.0092974503816123
v2: 0.983197569252202   noise: 0.9832016094018784
v2: 0.9383240850763344   noise: 0.9383302051457334
v2: 0.9437974968177358   noise: 0.9438020053383233


In [69]:
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 100)

v2: 0.8312187193859965   noise: 0.8646856822977205
v2: 0.972432088545038   noise: 1.0262369942213174
v2: 0.9439828086597537   noise: 1.032100565618671
v2: 0.9583231263913341   noise: 1.0076231591396951
v2: 1.0536501495386212   noise: 1.0837144706722512
v2: 0.9668702509221347   noise: 1.0378516971094385
v2: 1.0152677157767178   noise: 1.0550043263403508
v2: 0.9266437004209195   noise: 0.9868905788034364
v2: 0.9380775964954187   noise: 0.9824631453408906


In [70]:
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1)

v2: 0.03583615799899098   noise: 0.8425881822096555
v2: 0.017036316136244345   noise: 1.0295017767841772
v2: 0.006434116711719673   noise: 1.0448845997712852
v2: 0.020251688250806255   noise: 1.0392846360100845
v2: 0.05147129622130253   noise: 1.01999621300564
v2: 0.009963941546027755   noise: 1.0852355131333697
v2: 0.030256472175658078   noise: 0.9994917298370202
v2: 0.013080535179240127   noise: 0.9307139521550456
v2: 0.023195780021580292   noise: 0.9043225947177269


In [71]:
experiment2_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 5)

v2: 0.18520427562150765   noise: 0.9707630124327707
v2: 0.4728110244700925   noise: 0.9096868041863712
v2: 0.5028995324940628   noise: 1.0391795882162982
v2: 0.1804397156899975   noise: 0.9899341641904438
v2: 0.43610183859568197   noise: 0.9480234316648422
v2: 0.2538119695754032   noise: 1.0753200567200547
v2: 0.2647302603725741   noise: 1.0104353997666076
v2: 0.15067870470639733   noise: 1.0394201179955407
v2: 0.2773292343390318   noise: 0.9745582836686124
v2: 0.2895800885903028   noise: 0.9337691789398601
v2: 0.3650715449036127   noise: 1.0608421410557085
v2: 0.1804397156899975   noise: 0.9899341641904438
v2: 0.22811617361302694   noise: 0.9777790658682078
