In [1]:
import numpy as np
import spacy
from scipy.spatial import distance

In [2]:
# Load the spaCy pipeline once; every cell below takes its token vectors from it.
# NOTE(review): the code below adds 300-element noise to token.vector, so this
# model is presumably expected to ship 300-dimensional vectors - confirm.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Draw 300 uniform samples from [0, 1) and show the shape plus the first 20 values.
noise = np.random.rand(300)
print(noise.shape, noise[:20], sep='\n')

(300,)
[0.47293428 0.53719633 0.42165152 0.37769479 0.85601732 0.60287564
 0.40167532 0.40782471 0.36169991 0.04687149 0.36860623 0.53159409
 0.98086109 0.66243621 0.35025546 0.80266486 0.97248829 0.09688613
 0.62758945 0.55446467]


In [4]:
# Scale factor applied to the uniform noise in the next cell.
ratio = 0.13

In [5]:
# Bind the scaled vector to a new name so this cell is idempotent:
# re-running it no longer multiplies `noise` by `ratio` a second time.
scaled_noise = noise * ratio
scaled_noise[0:20]

array([0.06148146, 0.06983552, 0.0548147 , 0.04910032, 0.11128225,
       0.07837383, 0.05221779, 0.05301721, 0.04702099, 0.00609329,
       0.04791881, 0.06910723, 0.12751194, 0.08611671, 0.04553321,
       0.10434643, 0.12642348, 0.0125952 , 0.08158663, 0.07208041])

In [6]:
# Run a sample headline through the pipeline; its tokens are indexed in the next cell.
doc = nlp("Wall Street did something 'highly unusual' in Tuesday's session")

In [13]:
# First two tokens of the sample sentence, used for the baseline distances below.
token1, token2 = doc[0], doc[1]

In [14]:
# Baseline: a vector compared with itself should be at cosine distance ~0
# (the tiny non-zero value recorded below is presumably floating-point rounding).
distance.cosine(token1.vector, token1.vector)

6.568970367659688e-09

In [15]:
# Reference point: cosine distance between the vectors of two different tokens.
distance.cosine(token1.vector, token2.vector)

0.7888519109724299

In [50]:
def experiment_ratio(text, ratio, rng=None):
    """Print how multiplicative noise moves each token vector of ``text``.

    One noise vector is built from uniform [0, 1) samples multiplied by
    ``ratio`` and is reused for every token. Each token vector ``v1`` is
    perturbed element-wise as ``v2 = v1 + v1 * noise``. One line is printed
    per token with the cosine distance between ``v1`` and ``v2`` and the
    cosine distance between ``v1`` and the noise vector itself.

    Because the noise multiplies ``v1`` and cosine distance ignores overall
    scale, very large ratios do not keep increasing the ``v2`` distance.

    Args:
        text: String passed to the notebook-level ``nlp`` pipeline.
        ratio: Factor the uniform samples are multiplied by.
        rng: Optional ``numpy.random.Generator`` for reproducible noise.
            When omitted, NumPy's global random state is used.

    Returns:
        None. Results are printed.
    """
    doc = nlp(text)
    if len(doc) == 0:
        # No tokens means nothing to perturb or print.
        return

    # Size the noise from the actual vectors instead of hard-coding 300,
    # so the function also works with pipelines of another vector width.
    dim = doc[0].vector.shape[0]
    unit_noise = np.random.rand(dim) if rng is None else rng.random(dim)
    noise = unit_noise * ratio

    for token in doc:
        v1 = token.vector
        v2 = v1 + v1 * noise
        print('v2:', distance.cosine(v1, v2), '  noise:', distance.cosine(v1, noise))

In [63]:
# Extreme ratio. The noise multiplies v1 element-wise and cosine distance ignores
# overall scale, so the 'v2' column saturates (about 0.11-0.14 in the recorded run)
# instead of growing with ratio.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1000000)

v2: 0.13236163680167035   noise: 0.9004008878504814
v2: 0.11752191441701898   noise: 1.0197819492937412
v2: 0.12637305850104374   noise: 0.9516132576147319
v2: 0.10915060643189933   noise: 0.9426408131035274
v2: 0.13317400250035671   noise: 1.0146826394943447
v2: 0.13699565135352698   noise: 0.9897670760110596
v2: 0.12455529741881433   noise: 0.9887322464413941
v2: 0.13441279389719074   noise: 0.9762125836848314
v2: 0.1401217862832449   noise: 0.9328596572198454


In [59]:
# ratio=300: the recorded 'v2' distances (about 0.11-0.14) are in the same range
# as the ratio=1000000 run, i.e. already saturated.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 300)

v2: 0.12791511201012373   noise: 0.9105860371126124
v2: 0.11983995169776274   noise: 0.9799985317370472
v2: 0.12940608187368874   noise: 0.9764827988254606
v2: 0.14440360697050914   noise: 0.9716322583814554
v2: 0.13062395095176793   noise: 1.001777706313417
v2: 0.1395417597333367   noise: 0.9926082913661297
v2: 0.136896253092485   noise: 0.9805154330648375
v2: 0.11150307637190993   noise: 0.9700964478468665
v2: 0.13970870810565672   noise: 0.9255095640723654


In [52]:
# ratio=100: recorded 'v2' distances (about 0.11-0.16) are still at the saturated level.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 100)

v2: 0.12046466583960114   noise: 0.8613308674868216
v2: 0.15639273546899035   noise: 1.0188226034956145
v2: 0.1121265964210737   noise: 1.01914702231783
v2: 0.12510832453711396   noise: 0.9808360473737364
v2: 0.12940826797407345   noise: 0.9790525118341817
v2: 0.13820763359145016   noise: 1.0335245080692
v2: 0.13933669120172132   noise: 0.9983514008268366
v2: 0.1221235652822471   noise: 0.9827036336535182
v2: 0.11690690352549415   noise: 0.9082510013558791


In [53]:
# ratio=10: recorded 'v2' distances begin to drop (about 0.09-0.11).
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 10)

v2: 0.10915617670586619   noise: 0.897187516443729
v2: 0.10159284487793019   noise: 1.074457256370365
v2: 0.10584091029642662   noise: 1.0409338641225316
v2: 0.09383210377756102   noise: 1.0715554691375888
v2: 0.09405671443815955   noise: 1.0962999692582756
v2: 0.09531905889588455   noise: 1.0864878837134644
v2: 0.10723101546637304   noise: 1.0263525136603997
v2: 0.10152849746028436   noise: 0.9518359261787561
v2: 0.09829411619773587   noise: 0.979618646661605


In [54]:
# ratio=3: recorded 'v2' distances are around 0.05.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 3)

v2: 0.04979548037993198   noise: 0.8950942281330653
v2: 0.05861514234394527   noise: 1.0343294446452325
v2: 0.05445100469928543   noise: 1.0508321643846525
v2: 0.05768182649168885   noise: 1.0311467458661603
v2: 0.0515804292144022   noise: 1.029421053897179
v2: 0.047594595268201645   noise: 1.0380132547201726
v2: 0.05296209310896405   noise: 0.9962709311006526
v2: 0.0521773260613303   noise: 0.9511461639952181
v2: 0.05695941472566146   noise: 0.923170632003105


In [55]:
# ratio=1: recorded 'v2' distances are around 0.015-0.02.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1)

v2: 0.016897980226474596   noise: 0.8797294674967083
v2: 0.015849536273135123   noise: 0.999734897868424
v2: 0.019468529837083692   noise: 1.019099461360974
v2: 0.015210140071027833   noise: 1.0209295085589083
v2: 0.015777117623011594   noise: 1.0256417359235719
v2: 0.01945440410146171   noise: 1.0366250786824465
v2: 0.016468104595710575   noise: 1.01479079769558
v2: 0.015891661403037216   noise: 0.9771103338300551
v2: 0.019023585062607817   noise: 0.9492001626219537


In [56]:
# ratio=0.1: the perturbation is barely visible (recorded 'v2' distances ~0.0004).
# The 'noise' column stays near 1 for every ratio because scaling the noise
# vector does not change its direction.
experiment_ratio('Dividend cuts may mean rethinking your retirement income strategy', 0.1)

v2: 0.0003969865225065261   noise: 0.9198229966070044
v2: 0.00037363356040009954   noise: 1.0566859096182768
v2: 0.0004029995190050206   noise: 1.0338967634926317
v2: 0.00035247268919758046   noise: 0.981684262422088
v2: 0.00035084243178873376   noise: 1.0276939871523694
v2: 0.00033966609406688164   noise: 1.0238632023374339
v2: 0.00037516479527266444   noise: 1.0645569510305153
v2: 0.00040498800552324354   noise: 1.0358677566239163
v2: 0.00038180414563704357   noise: 0.9734253716220406


In [57]:
# Second sentence, ratio=300: same saturated range as the first sentence.
# "the" occurs twice in this sentence, so two printed lines are identical.
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 300)

v2: 0.1386425437772001   noise: 0.9696912255956226
v2: 0.12945518492914265   noise: 0.861915955711775
v2: 0.12359748790077707   noise: 1.057544174654151
v2: 0.10350080538345519   noise: 0.9908744695293683
v2: 0.12442469271693135   noise: 0.880355924472626
v2: 0.12053827385993188   noise: 1.0374700347213583
v2: 0.14594456459791327   noise: 1.0501885649748217
v2: 0.12024872156519484   noise: 1.0560096796266316
v2: 0.11049166410903266   noise: 0.9756464005706124
v2: 0.12379258038819663   noise: 0.8890329445145828
v2: 0.11400151584770035   noise: 1.0126147754023278
v2: 0.10350080538345519   noise: 0.9908744695293683
v2: 0.13370173219289994   noise: 0.9263255203526843


In [58]:
# Second sentence, ratio=50: recorded 'v2' distances are still about 0.11-0.15.
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 50)

v2: 0.11200375406091889   noise: 0.9404770906591007
v2: 0.11493324186580689   noise: 0.8749726964532166
v2: 0.12882813714570107   noise: 1.0946500141882753
v2: 0.10684825832394851   noise: 0.9268044335761094
v2: 0.13211797613807785   noise: 0.9113979876610846
v2: 0.14509222049305837   noise: 1.0103738820652903
v2: 0.12113778619016369   noise: 1.0944102830299482
v2: 0.12452597948185529   noise: 1.0937137204055039
v2: 0.11740063058188133   noise: 0.976419533542694
v2: 0.15134624846571998   noise: 0.9626217349848665
v2: 0.12991650790189913   noise: 1.0385413581904304
v2: 0.10684825832394851   noise: 0.9268044335761094
v2: 0.11697168246491285   noise: 0.9488109520550206


In [64]:
# Second sentence, ratio=10: recorded 'v2' distances are about 0.09-0.12.
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 10)

v2: 0.10910172828274478   noise: 0.9918126486741402
v2: 0.09593506674549224   noise: 0.9007787216833182
v2: 0.10012127150644023   noise: 1.0867410183027733
v2: 0.11002090394117214   noise: 0.9479564004940676
v2: 0.09517136751027488   noise: 0.9269384923466978
v2: 0.11600157267089173   noise: 1.0391689373468862
v2: 0.10306347412918682   noise: 1.0880311528791538
v2: 0.11104945641534014   noise: 1.0496866292874503
v2: 0.11657110801977755   noise: 1.0236474906519502
v2: 0.09017187854478503   noise: 0.9362426838461437
v2: 0.11684278691384409   noise: 1.061398706986406
v2: 0.11002090394117214   noise: 0.9479564004940676
v2: 0.09459708815512768   noise: 0.9541057202515284


In [65]:
# Second sentence, ratio=1: recorded 'v2' distances are about 0.015-0.019.
experiment_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 1)

v2: 0.017186757014795107   noise: 0.9312264369533974
v2: 0.015043893390959018   noise: 0.8873463731479371
v2: 0.01701012800167989   noise: 1.0348997148932773
v2: 0.016675692374469975   noise: 0.9537679347738377
v2: 0.018533861890049907   noise: 0.906610775320838
v2: 0.017837134935857923   noise: 1.0530485609745464
v2: 0.018959388828086565   noise: 1.0937942662567142
v2: 0.018597441322357522   noise: 1.07717713197533
v2: 0.017726476562477944   noise: 0.9689512005256666
v2: 0.017801001922594817   noise: 0.9805172815745954
v2: 0.018494035172423207   noise: 1.0746835108985273
v2: 0.016675692374469975   noise: 0.9537679347738377
v2: 0.017915539919112544   noise: 0.9480579807195426


In [66]:
def experiment2_ratio(text, ratio, rng=None):
    """Print how additive noise moves each token vector of ``text``.

    One noise vector is built from uniform [0, 1) samples multiplied by
    ``ratio`` and is reused for every token. Each token vector ``v1`` is
    shifted as ``v2 = v1 + noise``. One line is printed per token with the
    cosine distance between ``v1`` and ``v2`` and the cosine distance
    between ``v1`` and the noise vector itself.

    The noise is added rather than multiplied, so as ``ratio`` grows ``v2``
    approaches the direction of the noise and the two printed distances
    converge.

    Args:
        text: String passed to the notebook-level ``nlp`` pipeline.
        ratio: Factor the uniform samples are multiplied by.
        rng: Optional ``numpy.random.Generator`` for reproducible noise.
            When omitted, NumPy's global random state is used.

    Returns:
        None. Results are printed.
    """
    doc = nlp(text)
    if len(doc) == 0:
        # No tokens means nothing to perturb or print.
        return

    # Size the noise from the actual vectors instead of hard-coding 300,
    # so the function also works with pipelines of another vector width.
    dim = doc[0].vector.shape[0]
    unit_noise = np.random.rand(dim) if rng is None else rng.random(dim)
    noise = unit_noise * ratio

    for token in doc:
        v1 = token.vector
        v2 = v1 + noise
        print('v2:', distance.cosine(v1, v2), '  noise:', distance.cosine(v1, noise))

In [68]:
# Extreme additive noise: v1 + noise is dominated by the noise, so in the recorded
# run the 'v2' and 'noise' columns agree to about five decimal places.
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1000000)

v2: 0.9152088621381071   noise: 0.9152123284914833
v2: 1.070556210407779   noise: 1.070561652407139
v2: 0.9883181932184327   noise: 0.9883271549491298
v2: 0.9949536905376715   noise: 0.9949587051891523
v2: 1.0025278066344443   noise: 1.0025308738786756
v2: 1.0092902364927023   noise: 1.0092974503816123
v2: 0.983197569252202   noise: 0.9832016094018784
v2: 0.9383240850763344   noise: 0.9383302051457334
v2: 0.9437974968177358   noise: 0.9438020053383233


In [69]:
# ratio=100: 'v2' still points mostly along the noise; in the recorded run it sits
# slightly below the 'noise' column.
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 100)

v2: 0.8312187193859965   noise: 0.8646856822977205
v2: 0.972432088545038   noise: 1.0262369942213174
v2: 0.9439828086597537   noise: 1.032100565618671
v2: 0.9583231263913341   noise: 1.0076231591396951
v2: 1.0536501495386212   noise: 1.0837144706722512
v2: 0.9668702509221347   noise: 1.0378516971094385
v2: 1.0152677157767178   noise: 1.0550043263403508
v2: 0.9266437004209195   noise: 0.9868905788034364
v2: 0.9380775964954187   noise: 0.9824631453408906


In [70]:
# ratio=1: the original vector dominates; recorded 'v2' distances are about 0.006-0.05
# and vary more from token to token than in the multiplicative experiment.
experiment2_ratio('Dividend cuts may mean rethinking your retirement income strategy', 1)

v2: 0.03583615799899098   noise: 0.8425881822096555
v2: 0.017036316136244345   noise: 1.0295017767841772
v2: 0.006434116711719673   noise: 1.0448845997712852
v2: 0.020251688250806255   noise: 1.0392846360100845
v2: 0.05147129622130253   noise: 1.01999621300564
v2: 0.009963941546027755   noise: 1.0852355131333697
v2: 0.030256472175658078   noise: 0.9994917298370202
v2: 0.013080535179240127   noise: 0.9307139521550456
v2: 0.023195780021580292   noise: 0.9043225947177269


In [71]:
# Second sentence, ratio=5: an intermediate regime, with recorded 'v2' distances
# spread between about 0.15 and 0.50 depending on the token.
experiment2_ratio('Jim Cramer deciphers the speculative and blue-chip stocks driving the market', 5)

v2: 0.18520427562150765   noise: 0.9707630124327707
v2: 0.4728110244700925   noise: 0.9096868041863712
v2: 0.5028995324940628   noise: 1.0391795882162982
v2: 0.1804397156899975   noise: 0.9899341641904438
v2: 0.43610183859568197   noise: 0.9480234316648422
v2: 0.2538119695754032   noise: 1.0753200567200547
v2: 0.2647302603725741   noise: 1.0104353997666076
v2: 0.15067870470639733   noise: 1.0394201179955407
v2: 0.2773292343390318   noise: 0.9745582836686124
v2: 0.2895800885903028   noise: 0.9337691789398601
v2: 0.3650715449036127   noise: 1.0608421410557085
v2: 0.1804397156899975   noise: 0.9899341641904438
v2: 0.22811617361302694   noise: 0.9777790658682078
