# Gensim word vector visualization of various word vectors

In [48]:
import numpy as np

# Get the interactive Tools for Matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity

For looking at word vectors, I'll use Gensim. We also use it in hw1 for word vectors. Gensim isn't really a deep learning package. It's a package for for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But its efficient and scalable, and quite widely used.

Our homegrown Stanford offering is GloVe word vectors. Gensim doesn't give them first class support, but allows you to convert a file of GloVe vectors into word2vec format. You can download the GloVe vectors from [the Glove page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip)

(I use the 100d vectors below as a mix between speed and smallness vs. quality. If you try out the 50d vectors, they basically work for similarity but clearly aren't as good for analogy problems. If you load the 300d vectors, they're even better than the 100d vectors.)

### Load word2vec 100d and 300d and look at some of the word vectors

In [3]:
glove_file = datapath("glove.6B\glove.6B.100d.txt")
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


(400000, 100)

In [4]:
glove_file_300d = datapath("glove.6B\glove.6B.300d.txt")
word2vec_glove_file_300d = get_tmpfile("glove.6B.300d.word2vec.txt")
glove2word2vec(glove_file_300d, word2vec_glove_file_300d)

  glove2word2vec(glove_file_300d, word2vec_glove_file_300d)


(400000, 300)

In [5]:
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
model_300d = KeyedVectors.load_word2vec_format(word2vec_glove_file_300d)

In [21]:
model['bachelor']

array([ 0.8322   , -0.14642  , -0.2002   ,  0.32142  ,  0.31173  ,
       -0.030808 ,  0.65621  ,  0.15722  , -0.89315  ,  0.9287   ,
       -0.4073   , -0.012328 ,  0.26558  ,  1.5675   ,  0.54457  ,
       -0.31347  ,  0.57195  ,  0.17359  ,  0.22574  ,  1.0666   ,
       -1.2983   ,  0.2712   ,  0.15471  , -0.92268  ,  0.57024  ,
       -0.068484 ,  0.14368  , -0.94394  ,  0.02954  , -0.92536  ,
       -0.92351  ,  0.97338  , -0.3583   , -0.065045 , -0.51683  ,
        0.44481  , -0.51399  , -0.29038  , -0.59909  ,  0.47418  ,
       -0.41489  , -0.0016607, -0.27886  , -0.31239  , -0.037998 ,
       -0.0026353,  0.30399  ,  0.93702  ,  0.82374  ,  0.24486  ,
       -0.61944  , -0.52649  ,  0.56107  , -0.3573   ,  0.10587  ,
       -0.71975  ,  0.98628  , -0.40722  ,  1.2808   ,  0.063544 ,
        0.12206  ,  0.18842  , -0.82301  , -0.53925  ,  0.42135  ,
        0.01962  ,  0.27123  , -0.21934  ,  0.60616  ,  1.2176   ,
       -1.0249   ,  0.41784  , -0.12338  ,  0.97374  , -0.9841

In [6]:
model_300d['bachelor']

array([-0.6253   , -0.1771   ,  0.27251  ,  0.39703  , -0.64292  ,
        0.34638  ,  0.20717  , -0.54641  , -0.067688 ,  0.2473   ,
        0.31673  ,  0.35247  , -0.081559 ,  0.39658  ,  0.79198  ,
        0.52533  ,  0.32633  , -0.26819  ,  0.2159   ,  0.33554  ,
       -0.38797  ,  0.35297  ,  0.21187  ,  0.68271  , -0.80815  ,
        0.74676  ,  0.21105  ,  0.22228  ,  0.079729 , -0.57853  ,
       -0.31118  , -0.41152  ,  0.41781  ,  0.15691  ,  0.0098227,
        0.7202   ,  0.3779   ,  0.086    , -0.1633   ,  0.088281 ,
       -0.15639  ,  0.087224 , -0.22681  ,  0.068579 ,  0.16802  ,
       -0.46802  ,  0.29859  ,  0.36217  ,  0.39329  , -0.85394  ,
       -0.59082  , -0.37417  ,  0.57294  ,  0.41029  ,  0.42961  ,
       -0.33777  ,  0.85083  , -0.71015  ,  0.21539  ,  0.13623  ,
       -0.46485  ,  0.5929   ,  0.34484  ,  0.65869  , -0.25801  ,
       -0.69305  ,  0.5626   , -0.12317  , -0.042026 , -0.24511  ,
       -0.029207 , -0.34903  , -0.1432   , -0.01761  , -0.5315

In [7]:
model['sister']

array([ 4.8565e-01, -1.7018e-01, -1.6144e-01, -2.1791e-01, -1.0460e-02,
        4.8495e-01,  6.7276e-02,  2.9462e-01,  3.1315e-01,  4.9928e-02,
        1.0945e-01,  3.9305e-01,  1.1124e-01,  5.0850e-01,  4.2969e-02,
       -8.1070e-01,  3.6790e-01, -3.0680e-01, -4.8652e-01,  1.2743e+00,
       -5.1865e-01,  2.0353e-01,  3.4956e-01,  1.0083e+00,  5.8926e-01,
        2.9031e-01, -8.2186e-02, -1.2684e+00,  4.7815e-01,  1.3295e+00,
        2.8491e-01,  6.9858e-01,  1.0986e+00, -1.1188e-01, -9.4430e-02,
        1.1840e-01,  4.6057e-01, -3.0710e-01,  6.3531e-01, -8.4389e-02,
       -1.4146e-01,  2.0203e-01,  1.9946e-01, -2.5948e-01,  6.6133e-02,
        2.1409e-01, -6.4537e-01,  2.4769e-01,  8.2065e-01, -2.1496e-02,
       -4.6824e-01,  3.6111e-01,  8.8937e-01,  6.2047e-01,  4.1908e-01,
       -2.1499e+00, -5.1015e-01,  3.7842e-01,  1.4951e-01,  7.4265e-01,
        1.5995e-01,  6.7740e-01,  1.2485e-01, -3.6054e-02,  3.3998e-01,
        1.7371e-03, -2.3472e-01,  8.9622e-01,  2.8156e-01,  3.90

In [8]:
model_300d['sister']

array([-4.7697e-01,  7.6817e-02, -4.7216e-01, -1.3498e-01,  1.9208e-01,
        6.3901e-02, -2.5592e-01,  2.1369e-01, -1.5501e-02, -1.0011e+00,
        4.7837e-01, -1.0016e-01,  3.7329e-01,  2.5675e-02,  1.7184e-01,
       -1.7170e-01,  1.9915e-01, -7.9400e-01,  1.7587e-01, -2.4506e-01,
       -3.3253e-01,  1.1413e-01, -4.6364e-01, -1.6337e-01,  2.3461e-01,
        2.3505e-01, -4.4175e-01,  1.5266e-01, -2.6450e-01, -1.6728e-01,
       -3.7696e-01,  3.4513e-01,  4.6997e-01,  6.2069e-01, -5.8430e-01,
        7.0751e-01,  2.2559e-02, -8.8599e-02, -1.0567e-02,  2.4054e-01,
       -8.0747e-02, -3.0653e-01,  5.1538e-01,  2.4590e-02,  1.3628e-01,
        1.6749e-01,  7.5620e-01,  5.3949e-01,  2.1593e-01, -4.0733e-01,
        1.7660e-01, -3.1790e-02,  3.6881e-01, -2.7978e-01, -4.9880e-01,
       -1.7215e-01,  3.2854e-02,  5.6740e-01,  3.2670e-01, -9.0277e-01,
        1.7485e-01,  2.9574e-01,  5.8915e-01,  3.7608e-01,  8.1893e-03,
       -2.6108e-01,  5.4040e-02, -1.3468e-01, -4.5592e-01,  4.61

In [23]:
model['knows']

array([ 0.31026  ,  0.39302  ,  0.98134  , -0.61414  , -0.5776   ,
       -0.022865 , -0.34611  , -0.11463  ,  0.38457  , -0.30959  ,
        0.21686  ,  0.27442  ,  0.093896 ,  0.014929 , -0.35822  ,
        0.023941 , -0.15049  ,  0.55903  , -0.30462  ,  0.8508   ,
       -0.36188  ,  0.27116  , -0.42309  , -0.87782  , -0.24462  ,
        0.22018  , -0.30799  , -0.87968  ,  0.14946  , -0.43172  ,
       -0.46944  ,  1.115    ,  0.39214  ,  0.044364 ,  0.30981  ,
        0.19762  , -0.45981  ,  0.16763  ,  0.38016  , -0.2242   ,
       -0.4728   ,  0.29229  ,  0.62778  , -0.68372  , -0.97454  ,
       -0.018768 ,  0.18173  , -0.50506  , -0.057814 , -0.52812  ,
        0.46425  , -0.34489  ,  0.30607  ,  0.64536  ,  0.18735  ,
       -1.854    ,  0.0099879,  0.12502  ,  0.30881  ,  0.7356   ,
        0.37825  ,  0.65349  , -0.16762  , -0.57033  ,  0.59705  ,
        0.18893  ,  0.96182  ,  0.10457  , -0.47462  ,  0.54552  ,
        0.47311  , -0.34428  , -0.078226 , -0.22945  ,  0.8619

In [24]:
model['causes']

array([-0.27155  ,  0.6229   , -0.174    ,  0.095774 ,  0.21727  ,
        0.18718  , -0.060586 , -0.41622  ,  0.4103   , -0.38533  ,
       -0.56599  , -0.011766 ,  0.35273  ,  0.093323 ,  0.36715  ,
       -0.24886  , -1.0586   , -0.41443  ,  0.15717  ,  0.27091  ,
       -0.069785 , -0.69661  , -0.25355  , -0.037957 ,  0.0342   ,
        0.68938  ,  0.17122  , -0.33451  ,  0.3867   , -0.031507 ,
        0.43334  ,  0.85433  ,  0.10439  ,  0.16353  , -0.63141  ,
       -0.20468  ,  0.2494   , -0.038951 , -0.27215  ,  0.721    ,
       -0.64843  ,  0.2111   , -0.4947   , -1.1552   ,  0.21294  ,
        0.26758  ,  0.11159  ,  0.36198  , -1.0189   , -0.46101  ,
        0.67484  ,  0.047161 , -0.37284  ,  0.56961  ,  0.48131  ,
       -1.3026   ,  0.1731   , -0.20101  ,  1.2521   ,  0.9539   ,
       -0.018293 ,  1.099    ,  0.38064  ,  0.007869 ,  1.2553   ,
        0.23543  , -0.065692 , -0.94835  ,  0.88404  , -0.95202  ,
       -0.29859  , -0.43715  , -0.16622  ,  0.43485  ,  0.5609

### Compare combinations of words and their corresponding words in 100d and 300d respectively

Example: "man" + "unmarried" = "bachelor"

First we will see what words are most similar to "man" + "unmarried", using the 100d and 300d models respectively. 
Then we will calculate the similarity between "man" + "unmarried" and "bachelor"

In [9]:
def definition(x1, x2):
    result = model.most_similar(positive=[x1, x2])
    return result

In [10]:
def definition_300d(x1, x2):
    result = model_300d.most_similar(positive=[x1, x2])
    return result

In [57]:
# Use 100d model and cosine_similarity to calculate similarity between a defining 
# phrase and the word.
#
# You can use doc2vec for this but there isn't any good pre-trained doc2vec models 
# and it takes too long to train my own model so I just averaged the two defining 
# words. This should make some sense.
# 
# The Phrases model for word2vec doesn't support extracting individual word vectors

def calculate_similarity(x1, x2, x3):
    phrase = ((model[x1] + model[x2])/2).reshape(1, -1)
    word = model[x3].reshape(1, -1)
    result = cosine_similarity(phrase, word)
    return result

### 'man' + 'unmarried' = 'bachelor'

In [11]:
definition('man', 'unmarried')

[('woman', 0.7986403107643127),
 ('child', 0.7451243996620178),
 ('young', 0.7399294972419739),
 ('mother', 0.7386444807052612),
 ('daughter', 0.7077805399894714),
 ('pregnant', 0.7069243788719177),
 ('husband', 0.7020798921585083),
 ('wife', 0.6987427473068237),
 ('married', 0.6970356702804565),
 ('couple', 0.6967206597328186)]

In [12]:
definition_300d('man', 'unmarried')

[('woman', 0.6883418560028076),
 ('young', 0.5598209500312805),
 ('person', 0.5512592196464539),
 ('men', 0.5502604842185974),
 ('girl', 0.5314196944236755),
 ('father', 0.5271446704864502),
 ('mother', 0.5231339931488037),
 ('boy', 0.5209730267524719),
 ('husband', 0.5178412795066833),
 ('married', 0.5143395066261292)]

In [55]:
calculate_similarity('man', 'unmarried', 'bachelor')

array([[0.24648492]], dtype=float32)

In [34]:
definition('man', 'married')

[('wife', 0.8625936508178711),
 ('father', 0.8617380261421204),
 ('daughter', 0.8578408360481262),
 ('mother', 0.8562560081481934),
 ('husband', 0.8406583666801453),
 ('woman', 0.8323313593864441),
 ('son', 0.8255986571311951),
 ('brother', 0.8143599629402161),
 ('friend', 0.8068794012069702),
 ('couple', 0.7818995118141174)]

In [13]:
definition_300d('man', 'married')

[('wife', 0.7292513251304626),
 ('woman', 0.723426342010498),
 ('father', 0.6977101564407349),
 ('husband', 0.6957255601882935),
 ('daughter', 0.695209264755249),
 ('mother', 0.6820749640464783),
 ('son', 0.6648902893066406),
 ('whom', 0.6425804495811462),
 ('who', 0.6358213424682617),
 ('brother', 0.6327465176582336)]

In [58]:
calculate_similarity('man', 'married', 'husband')

array([[0.8438991]], dtype=float32)

### 'man' + 'sibling' = 'brother' and variations

In [35]:
deifnition('man', 'sibling')

[('woman', 0.7361669540405273),
 ('boy', 0.7141108512878418),
 ('sister', 0.7016093730926514),
 ('mother', 0.7012602686882019),
 ('father', 0.7009888887405396),
 ('brother', 0.7004384398460388),
 ('girl', 0.6949325203895569),
 ('old', 0.6894643306732178),
 ('cousin', 0.689385712146759),
 ('husband', 0.6875988841056824)]

In [15]:
definition_300d('man', 'sibling')

[('brother', 0.6034399271011353),
 ('woman', 0.5631415843963623),
 ('boy', 0.56154465675354),
 ('father', 0.5563958883285522),
 ('person', 0.545726478099823),
 ('siblings', 0.5419252514839172),
 ('son', 0.5360830426216125),
 ('sister', 0.5275253653526306),
 ('another', 0.523565948009491),
 ('cousin', 0.5221625566482544)]

In [59]:
calculate_similarity('man', 'sibling', 'brother')

array([[0.71456873]], dtype=float32)

In [38]:
definition('man', 'siblings')

[('mother', 0.831328272819519),
 ('father', 0.8232245445251465),
 ('brother', 0.8065662980079651),
 ('daughter', 0.7940962314605713),
 ('daughters', 0.7939693331718445),
 ('couple', 0.7888948321342468),
 ('parents', 0.7833898663520813),
 ('son', 0.777420163154602),
 ('woman', 0.776724636554718),
 ('boy', 0.7763882279396057)]

In [16]:
definition_300d('man', 'siblings')

[('brother', 0.7179766893386841),
 ('father', 0.7076143622398376),
 ('mother', 0.6765961050987244),
 ('boy', 0.6628767848014832),
 ('son', 0.6588661670684814),
 ('daughters', 0.6247597336769104),
 ('daughter', 0.6227795481681824),
 ('woman', 0.6163303852081299),
 ('parents', 0.6153721809387207),
 ('sons', 0.6127268075942993)]

In [63]:
calculate_similarity('man', 'siblings', 'brother')

array([[0.80870104]], dtype=float32)

In [40]:
definition('boys', 'siblings')

[('girls', 0.8442649841308594),
 ('daughters', 0.8137844800949097),
 ('parents', 0.8068130016326904),
 ('sisters', 0.7994521856307983),
 ('children', 0.7879793643951416),
 ('wives', 0.7606745958328247),
 ('sons', 0.7523584961891174),
 ('kids', 0.7394457459449768),
 ('mothers', 0.7384538054466248),
 ('cousins', 0.7275663614273071)]

In [17]:
definition_300d('boys', 'siblings')

[('girls', 0.7515750527381897),
 ('parents', 0.7100813984870911),
 ('children', 0.6979014873504639),
 ('daughters', 0.6948514580726624),
 ('sisters', 0.6566587686538696),
 ('kids', 0.6551515460014343),
 ('boy', 0.6459047198295593),
 ('sons', 0.6436418890953064),
 ('younger', 0.630037784576416),
 ('grandparents', 0.6026526689529419)]

In [61]:
calculate_similarity('boys', 'siblings', 'brother')

array([[0.6346437]], dtype=float32)

In [18]:
definition('male', 'sibling')

[('female', 0.792033314704895),
 ('offspring', 0.6855543851852417),
 ('siblings', 0.6600726842880249),
 ('adult', 0.6374537944793701),
 ('older', 0.6364030241966248),
 ('younger', 0.6293515563011169),
 ('teenage', 0.6042073965072632),
 ('child', 0.5940395593643188),
 ('age', 0.5822728872299194),
 ('heterosexual', 0.5779051184654236)]

In [19]:
definition_300d('male', 'sibling')

[('female', 0.6721418499946594),
 ('siblings', 0.5821443200111389),
 ('offspring', 0.5680257678031921),
 ('younger', 0.5557777881622314),
 ('adult', 0.5555258989334106),
 ('males', 0.5307093262672424),
 ('older', 0.5281656980514526),
 ('females', 0.47854527831077576),
 ('spouse', 0.46651503443717957),
 ('teenage', 0.46035030484199524)]

In [65]:
calculate_similarity('male', 'sibling', 'brother')

array([[0.45691442]], dtype=float32)

In [20]:
definition('male', 'siblings')

[('female', 0.8215169906616211),
 ('younger', 0.7605322003364563),
 ('daughters', 0.7517505288124084),
 ('older', 0.7497366666793823),
 ('offspring', 0.7340948581695557),
 ('children', 0.7283878326416016),
 ('wives', 0.7223488092422485),
 ('parents', 0.7122332453727722),
 ('child', 0.7094624042510986),
 ('mothers', 0.7060766220092773)]

In [21]:
definition_300d('male', 'siblings')

[('female', 0.6994714736938477),
 ('younger', 0.6667450070381165),
 ('daughters', 0.6269418597221375),
 ('older', 0.6267310380935669),
 ('parents', 0.6101293563842773),
 ('offspring', 0.6061515808105469),
 ('adult', 0.581416130065918),
 ('children', 0.5795009136199951),
 ('cousins', 0.5776026248931885),
 ('sibling', 0.5571399927139282)]

In [66]:
calculate_similarity('male', 'siblings', 'brother')

array([[0.5659557]], dtype=float32)

### 'female' + 'siblings' = 'sisters' and variations

In [41]:
definition('female', 'siblings')

[('male', 0.8359555006027222),
 ('daughters', 0.7697874307632446),
 ('younger', 0.7622777819633484),
 ('children', 0.7363816499710083),
 ('older', 0.733374297618866),
 ('sisters', 0.7206032276153564),
 ('parents', 0.7201247215270996),
 ('child', 0.7192729115486145),
 ('wives', 0.7162964344024658),
 ('mothers', 0.7024577856063843)]

In [22]:
definition_300d('female', 'siblings')

[('male', 0.7708662152290344),
 ('daughters', 0.6533423066139221),
 ('younger', 0.64931720495224),
 ('parents', 0.6057436466217041),
 ('older', 0.6045342683792114),
 ('children', 0.597601056098938),
 ('mother', 0.5867751240730286),
 ('offspring', 0.5863634943962097),
 ('adult', 0.5797701478004456),
 ('child', 0.5736057758331299)]

In [68]:
calculate_similarity('female', 'siblings', 'sisters')

array([[0.69978017]], dtype=float32)

In [28]:
definition('female', 'sibling')

[('male', 0.8382307887077332),
 ('siblings', 0.6623550653457642),
 ('offspring', 0.6500701308250427),
 ('adult', 0.6476441621780396),
 ('younger', 0.6384185552597046),
 ('teenage', 0.6355264782905579),
 ('woman', 0.6274023056030273),
 ('older', 0.6271515488624573),
 ('child', 0.6109919548034668),
 ('sister', 0.6051296591758728)]

In [29]:
definition_300d('female', 'sibling')

[('male', 0.7697254419326782),
 ('adult', 0.5595094561576843),
 ('siblings', 0.5565780401229858),
 ('offspring', 0.5531947016716003),
 ('younger', 0.5409057140350342),
 ('older', 0.5084829330444336),
 ('youngest', 0.49774616956710815),
 ('woman', 0.48929962515830994),
 ('spouse', 0.48802831768989563),
 ('child', 0.48531606793403625)]

In [71]:
calculate_similarity('female', 'sibling', 'sister')

array([[0.5975423]], dtype=float32)

In [42]:
definition('woman', 'siblings')

[('mother', 0.8779049515724182),
 ('daughters', 0.849395215511322),
 ('daughter', 0.834620475769043),
 ('parents', 0.8141969442367554),
 ('sister', 0.8087382912635803),
 ('wife', 0.8057864308357239),
 ('girl', 0.8014374375343323),
 ('child', 0.7983404994010925),
 ('children', 0.7874230742454529),
 ('couple', 0.785481333732605)]

In [23]:
definition_300d('woman', 'siblings')

[('mother', 0.8026262521743774),
 ('daughter', 0.7382708787918091),
 ('daughters', 0.7369828820228577),
 ('girl', 0.7010666131973267),
 ('grandmother', 0.692963719367981),
 ('wife', 0.6885506510734558),
 ('parents', 0.684693455696106),
 ('sister', 0.6800177693367004),
 ('child', 0.6582188606262207),
 ('husband', 0.6524820327758789)]

In [72]:
calculate_similarity('woman', 'siblings', 'sister')

array([[0.8058896]], dtype=float32)

In [30]:
definition('woman', 'sibling')

[('mother', 0.7573694586753845),
 ('sister', 0.7480304837226868),
 ('girl', 0.7454620599746704),
 ('daughter', 0.7285175919532776),
 ('husband', 0.7076904773712158),
 ('child', 0.7028422951698303),
 ('female', 0.6914132833480835),
 ('wife', 0.6902039647102356),
 ('male', 0.6832150816917419),
 ('boy', 0.6797403693199158)]

In [31]:
definition_300d('woman', 'sibling')

[('mother', 0.6591885089874268),
 ('sister', 0.6344946622848511),
 ('girl', 0.6248164772987366),
 ('daughter', 0.6147672533988953),
 ('child', 0.5877892374992371),
 ('siblings', 0.574110209941864),
 ('grandmother', 0.5471194386482239),
 ('male', 0.5452283024787903),
 ('spouse', 0.5451955199241638),
 ('female', 0.5398475527763367)]

In [73]:
calculate_similarity('woman', 'sibling', 'sister')

array([[0.7543833]], dtype=float32)

In [43]:
definition('girl', 'siblings')

[('daughters', 0.8481532335281372),
 ('mother', 0.8339701890945435),
 ('daughter', 0.8201621770858765),
 ('sister', 0.8141985535621643),
 ('boy', 0.8040753602981567),
 ('parents', 0.8025711178779602),
 ('child', 0.7809332609176636),
 ('girls', 0.7796710133552551),
 ('sisters', 0.7791102528572083),
 ('children', 0.7709638476371765)]

In [24]:
definition_300d('girl', 'siblings')

[('boy', 0.7384527325630188),
 ('daughters', 0.7363393306732178),
 ('mother', 0.7348366975784302),
 ('parents', 0.7229806184768677),
 ('daughter', 0.7228127717971802),
 ('sister', 0.6973134279251099),
 ('girls', 0.6810939311981201),
 ('child', 0.6802092790603638),
 ('grandmother', 0.6732290983200073),
 ('sisters', 0.6646929383277893)]

In [74]:
calculate_similarity('girl', 'siblings', 'sisters')

array([[0.7677466]], dtype=float32)

In [26]:
definition('girl', 'sibling')

[('sister', 0.7638739943504333),
 ('boy', 0.7581804394721985),
 ('daughter', 0.7243503332138062),
 ('teenage', 0.7229381203651428),
 ('mother', 0.7229083180427551),
 ('woman', 0.7120601534843445),
 ('siblings', 0.6974316835403442),
 ('child', 0.6950110793113708),
 ('toddler', 0.6934158802032471),
 ('baby', 0.6900186538696289)]

In [27]:
definition_300d('girl', 'sibling')

[('sister', 0.6665235757827759),
 ('boy', 0.6638965010643005),
 ('siblings', 0.6382166743278503),
 ('child', 0.6252896785736084),
 ('daughter', 0.6155028343200684),
 ('mother', 0.6058690547943115),
 ('teenage', 0.592934787273407),
 ('niece', 0.5739392638206482),
 ('girls', 0.5708860754966736),
 ('baby', 0.5577054619789124)]

In [75]:
calculate_similarity('girl', 'sibling', 'sister')

array([[0.7725565]], dtype=float32)

### 'girl' + 'children' = 'daughter' and variations

In [44]:
definition('girl', 'children')

[('child', 0.886604905128479),
 ('girls', 0.8535959124565125),
 ('boy', 0.8525450229644775),
 ('mother', 0.8281201720237732),
 ('woman', 0.8268085718154907),
 ('parents', 0.8134887218475342),
 ('boys', 0.7949939370155334),
 ('baby', 0.7723804116249084),
 ('daughter', 0.7696700692176819),
 ('pregnant', 0.7669316530227661)]

In [32]:
definition_300d('girl', 'children')

[('child', 0.7879396080970764),
 ('girls', 0.7792677879333496),
 ('boy', 0.7619722485542297),
 ('parents', 0.7411299347877502),
 ('kids', 0.7217090129852295),
 ('mother', 0.6994401216506958),
 ('boys', 0.6920749545097351),
 ('daughter', 0.6760267019271851),
 ('woman', 0.6596529483795166),
 ('daughters', 0.644815981388092)]

In [76]:
calculate_similarity('girl', 'children', 'daughter')

array([[0.7682451]], dtype=float32)

In [45]:
definition('woman', 'children')

[('child', 0.8859902620315552),
 ('mother', 0.8509006500244141),
 ('girl', 0.8303909301757812),
 ('parents', 0.8068933486938477),
 ('women', 0.7791352868080139),
 ('she', 0.7788202166557312),
 ('pregnant', 0.7787449955940247),
 ('boy', 0.7738879323005676),
 ('her', 0.769166111946106),
 ('young', 0.7673245072364807)]

In [33]:
definition_300d('woman', 'children')

[('child', 0.7599184513092041),
 ('mother', 0.7502972483634949),
 ('girl', 0.7349914908409119),
 ('parents', 0.694279134273529),
 ('daughter', 0.6775768399238586),
 ('her', 0.6636809706687927),
 ('women', 0.6613084673881531),
 ('she', 0.6520648002624512),
 ('girls', 0.6514615416526794),
 ('young', 0.647993803024292)]

In [77]:
calculate_similarity('woman', 'children', 'daughter')

array([[0.76445735]], dtype=float32)

In [46]:
definition('female', 'children')

[('male', 0.8574326038360596),
 ('child', 0.8380101323127747),
 ('women', 0.8125495314598083),
 ('young', 0.7888067960739136),
 ('girls', 0.7795611023902893),
 ('woman', 0.776369035243988),
 ('adult', 0.761245846748352),
 ('mothers', 0.7439872026443481),
 ('parents', 0.7428076267242432),
 ('pregnant', 0.7315382957458496)]

In [34]:
definition_300d('female', 'children')

[('male', 0.7566388249397278),
 ('child', 0.6960158944129944),
 ('women', 0.6858314275741577),
 ('young', 0.661391019821167),
 ('girls', 0.6555755138397217),
 ('adult', 0.6462522745132446),
 ('parents', 0.6341460347175598),
 ('woman', 0.6337342262268066),
 ('adults', 0.6003085374832153),
 ('mothers', 0.5944299101829529)]

In [78]:
calculate_similarity('female', 'children', 'daughter')

array([[0.64307594]], dtype=float32)

In [48]:
definition('female', 'offsprings')

[('male', 0.7095398306846619),
 ('offspring', 0.688175618648529),
 ('siblings', 0.600884199142456),
 ('spouses', 0.6006006598472595),
 ('wives', 0.5958591103553772),
 ('husbands', 0.5817289352416992),
 ('progeny', 0.5592339634895325),
 ('protagonists', 0.5539281368255615),
 ('genitalia', 0.5478838086128235),
 ('girlfriends', 0.5273348689079285)]

In [35]:
definition_300d('female', 'offsprings')

[('male', 0.6583625674247742),
 ('offspring', 0.5315585136413574),
 ('gametes', 0.43372949957847595),
 ('progeny', 0.41627082228660583),
 ('gonads', 0.411285936832428),
 ('females', 0.40803220868110657),
 ('genitalia', 0.40111616253852844),
 ('unmarried', 0.3996923267841339),
 ('protagonists', 0.3902529776096344),
 ('trainees', 0.38945841789245605)]

In [79]:
calculate_similarity('female', 'offsprings', 'daughter')

array([[0.39938354]], dtype=float32)

In [36]:
definition('woman', 'offsprings')

[('siblings', 0.6576163172721863),
 ('husbands', 0.6404382586479187),
 ('wives', 0.6365442872047424),
 ('daughters', 0.6234209537506104),
 ('offspring', 0.617749810218811),
 ('spouses', 0.588084876537323),
 ('pregnant', 0.5867756009101868),
 ('mother', 0.5764126777648926),
 ('female', 0.5692703723907471),
 ('male', 0.5688695907592773)]

In [37]:
definition_300d('woman', 'offsprings')

[('female', 0.4957338869571686),
 ('mother', 0.47116518020629883),
 ('pregnant', 0.47105303406715393),
 ('girl', 0.4570048749446869),
 ('male', 0.45187920331954956),
 ('daughter', 0.4516623914241791),
 ('daughters', 0.4442852735519409),
 ('unmarried', 0.4414706230163574),
 ('offspring', 0.4405021667480469),
 ('wife', 0.4342300593852997)]

In [80]:
calculate_similarity('woman', 'offsprings', 'daughter')

array([[0.69001687]], dtype=float32)

### 'woman' + 'spouses' = 'wives'/'wife', 'man' + 'spouses' = 'husband' and variations

In [49]:
definition('woman', 'spouses')

[('parents', 0.7681692242622375),
 ('mother', 0.7526997327804565),
 ('pregnant', 0.7449538707733154),
 ('child', 0.7423197031021118),
 ('spouse', 0.7361156344413757),
 ('wives', 0.7301071882247925),
 ('couple', 0.7288706302642822),
 ('husbands', 0.7234518527984619),
 ('children', 0.7203859686851501),
 ('wife', 0.7164435982704163)]

In [38]:
definition_300d('woman', 'spouses')

[('spouse', 0.6913601756095886),
 ('wives', 0.6204648017883301),
 ('husbands', 0.618625283241272),
 ('mother', 0.6058530807495117),
 ('wife', 0.6057422161102295),
 ('pregnant', 0.5965089797973633),
 ('couple', 0.586215615272522),
 ('parents', 0.5861546993255615),
 ('husband', 0.5838871598243713),
 ('women', 0.5832949876785278)]

In [81]:
calculate_similarity('woman', 'spouses', 'wives')

array([[0.7186432]], dtype=float32)

In [82]:
calculate_similarity('woman', 'spouses', 'wife')

array([[0.7275724]], dtype=float32)

In [53]:
definition('male', 'spouses')

[('female', 0.8126022219657898),
 ('wives', 0.7350186109542847),
 ('husbands', 0.7245387434959412),
 ('mothers', 0.7060080170631409),
 ('couples', 0.6785614490509033),
 ('offspring', 0.6780828237533569),
 ('unmarried', 0.6756021976470947),
 ('spouse', 0.6748630404472351),
 ('women', 0.6671302914619446),
 ('siblings', 0.6540225148200989)]

In [39]:
definition_300d('male', 'spouses')

[('female', 0.721327006816864),
 ('husbands', 0.6353976130485535),
 ('wives', 0.6263058185577393),
 ('spouse', 0.6175006031990051),
 ('couples', 0.5744408369064331),
 ('unmarried', 0.5724306702613831),
 ('heterosexual', 0.5606091618537903),
 ('males', 0.5482465028762817),
 ('mothers', 0.5427603125572205),
 ('sex', 0.5306220054626465)]

In [87]:
calculate_similarity('male', 'spouses', 'husbands')

array([[0.71036553]], dtype=float32)

In [89]:
calculate_similarity('male', 'spouse', 'husband')

array([[0.6500168]], dtype=float32)

In [90]:
calculate_similarity('male', 'spouses', 'husband')

array([[0.5309439]], dtype=float32)

In [84]:
definition('man', 'spouses')

[('woman', 0.7600047588348389),
 ('parents', 0.7566503286361694),
 ('couple', 0.7525718808174133),
 ('wives', 0.7497987151145935),
 ('friends', 0.7447857856750488),
 ('child', 0.7329953908920288),
 ('young', 0.7296339273452759),
 ('mother', 0.720712423324585),
 ('whom', 0.7118040919303894),
 ('person', 0.710668683052063)]

In [85]:
definition_300d('man', 'spouses')

[('woman', 0.6131700873374939),
 ('spouse', 0.6114341616630554),
 ('wives', 0.5958060026168823),
 ('husbands', 0.5797724723815918),
 ('person', 0.5757161378860474),
 ('elderly', 0.5619040727615356),
 ('whom', 0.5615295171737671),
 ('couple', 0.557274341583252),
 ('who', 0.5519247651100159),
 ('men', 0.5494856238365173)]

In [88]:
calculate_similarity('man', 'spouses', 'husbands')

array([[0.70117164]], dtype=float32)

### 'baby' + 'duck' = 'duckling' and variations

In [55]:
definition('baby', 'duck')

[('cat', 0.7357245087623596),
 ('rabbit', 0.7031602263450623),
 ('dog', 0.7006856799125671),
 ('boy', 0.671177327632904),
 ('pet', 0.6643341779708862),
 ('chicken', 0.6521880030632019),
 ('cow', 0.6426712274551392),
 ('pig', 0.6388294100761414),
 ('girl', 0.624531090259552),
 ('goose', 0.6224647760391235)]

In [43]:
definition_300d('baby', 'duck')

[('babies', 0.5189098715782166),
 ('chicken', 0.5070104598999023),
 ('boy', 0.5019789934158325),
 ('newborn', 0.49996501207351685),
 ('girl', 0.47992628812789917),
 ('infant', 0.4641655683517456),
 ('pig', 0.45999667048454285),
 ('dog', 0.45428943634033203),
 ('rabbit', 0.44439950585365295),
 ('goose', 0.44183123111724854)]

In [91]:
calculate_similarity('baby', 'duck', 'duckling')

array([[0.31431836]], dtype=float32)

In [57]:
definition('small', 'duck')

[('large', 0.7329149842262268),
 ('little', 0.6936571598052979),
 ('tiny', 0.6864888668060303),
 ('big', 0.6588542461395264),
 ('few', 0.6588490009307861),
 ('fish', 0.6484575271606445),
 ('one', 0.6448154449462891),
 ('cat', 0.6425182223320007),
 ('short', 0.6358485817909241),
 ('chicken', 0.6342458128929138)]

In [44]:
definition_300d('small', 'duck')

[('large', 0.60807865858078),
 ('tiny', 0.5721938014030457),
 ('little', 0.5320278406143188),
 ('smaller', 0.502325177192688),
 ('few', 0.49825578927993774),
 ('chicken', 0.4941973090171814),
 ('big', 0.483223557472229),
 ('sized', 0.48181411623954773),
 ('fish', 0.47811469435691833),
 ('larger', 0.47321608662605286)]

In [92]:
calculate_similarity('small', 'duck', 'duckling')

array([[0.14753377]], dtype=float32)

In [59]:
definition('little', 'duck')

[('bit', 0.7100890278816223),
 ('too', 0.7056362628936768),
 ('just', 0.69776850938797),
 ('big', 0.685026228427887),
 ('cat', 0.6744759678840637),
 ('dog', 0.6696208715438843),
 ('even', 0.6634716391563416),
 ('so', 0.663071870803833),
 ('rabbit', 0.6600933074951172),
 ('like', 0.6579574346542358)]

In [45]:
definition_300d('little', 'duck')

[('bit', 0.6069000363349915),
 ('just', 0.5427685379981995),
 ('too', 0.5328499674797058),
 ('lame', 0.5064114332199097),
 ('even', 0.5063546895980835),
 ('much', 0.4996291399002075),
 ('kind', 0.4988514184951782),
 ('lot', 0.4984596371650696),
 ('big', 0.49657073616981506),
 ('actually', 0.496439129114151)]

In [93]:
calculate_similarity('little', 'duck', 'duckling')

array([[0.2061248]], dtype=float32)

In [104]:
definition('child', 'duck')

[('baby', 0.7387073040008545),
 ('boy', 0.7287305593490601),
 ('dog', 0.694217324256897),
 ('girl', 0.6903077960014343),
 ('woman', 0.6817472577095032),
 ('cat', 0.6749307513237),
 ('children', 0.669579029083252),
 ('mother', 0.6604345440864563),
 ('man', 0.6539339423179626),
 ('bird', 0.6318632960319519)]

In [105]:
definition_300d('child', 'duck')

[('boy', 0.5751781463623047),
 ('baby', 0.5685468316078186),
 ('children', 0.5620506405830383),
 ('girl', 0.5435641407966614),
 ('mother', 0.5174210667610168),
 ('parents', 0.4905431866645813),
 ('infant', 0.4820095896720886),
 ('daughter', 0.47889819741249084),
 ('kids', 0.4774121940135956),
 ('woman', 0.4766649901866913)]

In [106]:
calculate_similarity('child', 'duck', 'duckling')

array([[0.24638756]], dtype=float32)

### 'small' + 'cat' and variations

In [60]:
definition('small', 'cat')

[('dog', 0.7789637446403503),
 ('large', 0.7542371153831482),
 ('tiny', 0.7404225468635559),
 ('little', 0.7348185181617737),
 ('big', 0.7019434571266174),
 ('dogs', 0.6901535391807556),
 ('like', 0.6812320351600647),
 ('few', 0.6762623190879822),
 ('one', 0.6740338802337646),
 ('larger', 0.6732674241065979)]

In [46]:
definition_300d('small', 'cat')

[('large', 0.6367977857589722),
 ('tiny', 0.6292120814323425),
 ('dog', 0.6141706109046936),
 ('little', 0.5644365549087524),
 ('cats', 0.5567792654037476),
 ('big', 0.5449622869491577),
 ('smaller', 0.5314121842384338),
 ('sized', 0.5182141065597534),
 ('pet', 0.5116186738014221),
 ('like', 0.5096837878227234)]

In [97]:
calculate_similarity('small', 'cat', 'kitten')

array([[0.34473705]], dtype=float32)

In [61]:
definition('baby', 'cat')

[('dog', 0.8127073049545288),
 ('boy', 0.7638528943061829),
 ('girl', 0.7336031198501587),
 ('pet', 0.7272219061851501),
 ('rabbit', 0.6875886917114258),
 ('cats', 0.6796918511390686),
 ('dogs', 0.6777850389480591),
 ('puppy', 0.6669325232505798),
 ('kid', 0.664391279220581),
 ('mom', 0.6632115244865417)]

In [47]:
definition_300d('baby', 'cat')

[('dog', 0.639026939868927),
 ('cats', 0.609463632106781),
 ('pet', 0.5913442373275757),
 ('girl', 0.5715005397796631),
 ('boy', 0.5681838393211365),
 ('newborn', 0.5611903071403503),
 ('babies', 0.550870954990387),
 ('dogs', 0.5117906928062439),
 ('child', 0.5101261138916016),
 ('mom', 0.5076110363006592)]

In [96]:
calculate_similarity('baby', 'cat', 'kitten')

array([[0.57189286]], dtype=float32)

In [102]:
definition('child', 'cat')

[('boy', 0.814297616481781),
 ('dog', 0.8051776885986328),
 ('baby', 0.7981764078140259),
 ('girl', 0.793245255947113),
 ('mother', 0.7382760047912598),
 ('children', 0.7323399186134338),
 ('woman', 0.7310007214546204),
 ('man', 0.7049850225448608),
 ('animal', 0.6739353537559509),
 ('pet', 0.6734364628791809)]

In [103]:
definition_300d('child', 'cat')

[('dog', 0.6304463148117065),
 ('boy', 0.6217886209487915),
 ('girl', 0.6180351376533508),
 ('baby', 0.5958205461502075),
 ('children', 0.5954334139823914),
 ('mother', 0.5814193487167358),
 ('cats', 0.5577535629272461),
 ('daughter', 0.5479570627212524),
 ('parents', 0.5415760278701782),
 ('pet', 0.5365710854530334)]

In [101]:
calculate_similarity('child', 'cat', 'kitten')

array([[0.49331015]], dtype=float32)

### 'baby' + 'dog' and variations

In [62]:
definition('baby', 'dog')

[('cat', 0.8454183340072632),
 ('boy', 0.7890797257423401),
 ('dogs', 0.752583384513855),
 ('pet', 0.7509226202964783),
 ('girl', 0.7379759550094604),
 ('puppy', 0.6998087167739868),
 ('mom', 0.68051677942276),
 ('child', 0.6800344586372375),
 ('babies', 0.6709420084953308),
 ('kid', 0.6680909991264343)]

In [40]:
definition_300d('baby', 'dog')

[('dogs', 0.6589652299880981),
 ('cat', 0.6271884441375732),
 ('pet', 0.6137261390686035),
 ('boy', 0.5887221693992615),
 ('babies', 0.5866448879241943),
 ('girl', 0.5715811848640442),
 ('newborn', 0.5535208582878113),
 ('mom', 0.5532939434051514),
 ('puppy', 0.547976553440094),
 ('child', 0.5454639792442322)]

In [98]:
calculate_similarity('baby', 'dog', 'puppy')

array([[0.69795763]], dtype=float32)

In [41]:
definition('small', 'dog')

[('cat', 0.7745434641838074),
 ('dogs', 0.7584885358810425),
 ('large', 0.7511091828346252),
 ('little', 0.73993980884552),
 ('one', 0.699674129486084),
 ('big', 0.6948302984237671),
 ('few', 0.6936855912208557),
 ('tiny', 0.6898406744003296),
 ('like', 0.6831692457199097),
 ('well', 0.6819196343421936)]

In [42]:
definition_300d('small', 'dog')

[('dogs', 0.6499183177947998),
 ('large', 0.6301786303520203),
 ('tiny', 0.6044901609420776),
 ('little', 0.5914078950881958),
 ('smaller', 0.5697966814041138),
 ('cat', 0.5677505731582642),
 ('big', 0.5529771447181702),
 ('few', 0.5376969575881958),
 ('like', 0.5277330875396729),
 ('pet', 0.5271344184875488)]

In [99]:
calculate_similarity('small', 'dog', 'puppy')

array([[0.529829]], dtype=float32)

In [107]:
definition('child', 'dog')

[('boy', 0.8168787956237793),
 ('girl', 0.7750166058540344),
 ('cat', 0.7632631659507751),
 ('baby', 0.7616083025932312),
 ('children', 0.7390168905258179),
 ('woman', 0.7302547693252563),
 ('dogs', 0.7273513674736023),
 ('mother', 0.7245290279388428),
 ('man', 0.7153592705726624),
 ('animal', 0.6984661221504211)]

In [108]:
definition_300d('child', 'dog')

[('dogs', 0.6415126919746399),
 ('boy', 0.6307135820388794),
 ('children', 0.6303172707557678),
 ('girl', 0.606192409992218),
 ('mother', 0.6040725111961365),
 ('parents', 0.5956936478614807),
 ('baby', 0.5901288986206055),
 ('cat', 0.5764565467834473),
 ('kids', 0.5733127593994141),
 ('animal', 0.5565695762634277)]

In [109]:
calculate_similarity('child', 'dog', 'puppy')

array([[0.591485]], dtype=float32)

In [73]:
def display_pca_scatterplot(model, words=None, sample=0):
    if words == None:
        if sample > 0:
            words = np.random.choice(list(model.key_to_index.keys()), sample)
        else:
            words = [ word for word in model.vocab ]
        
    word_vectors = np.array([model[w] for w in words])

    twodim = PCA().fit_transform(word_vectors)[:,:2]
    
    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)

In [74]:
display_pca_scatterplot(model, 
                        ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                         'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                         'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
                         'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute'])

<IPython.core.display.Javascript object>

In [68]:
display_pca_scatterplot(model, sample=300)

<IPython.core.display.Javascript object>

# BERT language modeling

Using pre-trained BERT langauge model to create word vectors for words and check their cosine similarity 

### Preparing Word Vectors

words = 'man', 'married', 'unmarried', 'woman', 'female', 'male', 'girls', 'boys', 'girl', 'boy', 'spouse', 'sibling', 'siblings', 'offspring', 'wives', 'wife', 'husband', 'husbands', 'brother', 'brothers', 'sister', 'sisters', 'daughter', 'daughters', 'child', 'children', 'bachelor', 'baby', 'small', 'little', 'duck', 'cat', 'kitten', 'dog', 'puppy'

In [112]:
#!pip install torch
!pip install transformers

Collecting torch
  Downloading torch-1.10.0-cp38-cp38-win_amd64.whl (226.6 MB)
Installing collected packages: torch
Successfully installed torch-1.10.0
Collecting transformers

ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

huggingface-hub 0.0.19 requires packaging>=20.9, but you'll have packaging 20.4 which is incompatible.



  Downloading transformers-4.12.0-py3-none-any.whl (3.1 MB)
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
Installing collected packages: huggingface-hub, tokenizers, sacremoses, transformers
Successfully installed huggingface-hub-0.0.19 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.0


In [113]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import nltk
import torch

In [114]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=28.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466062.0), HTML(value='')))




In [115]:
# create vocab
texts = ['man', 'married', 'unmarried', 'woman', 'female', 'male', 'girls', 'boys', 'girl', 'boy', 'spouse', 'spouses', 
         'sibling', 'siblings', 'offspring', 'offsprings', 'wives', 'wife', 'husband', 'husbands', 'brother', 'brothers',
         'sister', 'sisters', 'daughter', 'daughters', 'child', 'children', 'bachelor', 'bachelors', 'baby', 'small', 
         'little', 'duck', 'duckling', 'cat', 'kitten', 'dog', 'puppy']

In [116]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [121]:
# Getting embeddings for the target word
target_word_embeddings = []

for text in texts:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    
    # Find the position text in list of tokens
    word_embedding = None
    if text in tokenized_text:
        word_index = tokenized_text.index(text)
        # Get the embedding for text
        word_embedding = list_token_embeddings[word_index]

    target_word_embeddings.append(word_embedding)

In [194]:
model_bert = dict(zip(texts, target_word_embeddings))

In [195]:
keys = list(model_bert)
nonekeys = []
for key in keys:
    if model_bert[key] == None:
        nonekeys.append(key)
        del model_bert[key]

In [196]:
len(model_bert)

35

In [197]:
nonekeys

['spouses', 'offsprings', 'bachelors', 'duckling']

In [198]:
model_bert.keys()

dict_keys(['man', 'married', 'unmarried', 'woman', 'female', 'male', 'girls', 'boys', 'girl', 'boy', 'spouse', 'sibling', 'siblings', 'offspring', 'wives', 'wife', 'husband', 'husbands', 'brother', 'brothers', 'sister', 'sisters', 'daughter', 'daughters', 'child', 'children', 'bachelor', 'baby', 'small', 'little', 'duck', 'cat', 'kitten', 'dog', 'puppy'])

### Compare similarity

word = 'man', 'married', 'unmarried', 'woman', 'female', 'male', 'girls', 'boys', 'girl', 'boy', 'spouse', 'sibling', 'siblings', 'offspring', 'wives', 'wife', 'husband', 'husbands', 'brother', 'brothers', 'sister', 'sisters', 'daughter', 'daughters', 'child', 'children', 'bachelor', 'baby', 'small', 'little', 'duck', 'cat', 'kitten', 'dog', 'puppy'

#### BERT model performs much better than GloVe in general

In [149]:
def calculate_similarity_bert(x1, x2, x3):
    phrase = (np.array(model_bert[x1]) + np.array(model_bert[x2])).reshape(1, -1)
    word = np.array(model_bert[x3]).reshape(1, -1)
    result = cosine_similarity(phrase, word)
    return result

In [150]:
calculate_similarity_bert('man', 'unmarried', 'bachelor')

array([[0.72141254]])

In [157]:
calculate_similarity_bert('man', 'married', 'husband')

array([[0.81838927]])

In [158]:
calculate_similarity_bert('man', 'sibling', 'brother')

array([[0.90269145]])

In [159]:
calculate_similarity_bert('man', 'siblings', 'brother')

array([[0.89054641]])

In [160]:
calculate_similarity_bert('boys', 'siblings', 'brother')

array([[0.68358772]])

In [161]:
calculate_similarity_bert('male', 'sibling', 'brother')

array([[0.73682338]])

In [162]:
calculate_similarity_bert('male', 'siblings', 'brother')

array([[0.70625922]])

In [163]:
calculate_similarity_bert('female', 'siblings', 'sisters')

array([[0.77218837]])

In [164]:
calculate_similarity_bert('female', 'sibling', 'sister')

array([[0.73355772]])

In [165]:
calculate_similarity_bert('woman', 'siblings', 'sister')

array([[0.83401117]])

In [166]:
calculate_similarity_bert('woman', 'sibling', 'sister')

array([[0.86178458]])

In [167]:
calculate_similarity_bert('girl', 'siblings', 'sisters')

array([[0.88138312]])

In [168]:
calculate_similarity_bert('girl', 'sibling', 'sister')

array([[0.86372976]])

In [169]:
calculate_similarity_bert('girl', 'children', 'daughter')

array([[0.79813281]])

In [170]:
calculate_similarity_bert('woman', 'children', 'daughter')

array([[0.80072848]])

In [171]:
calculate_similarity_bert('female', 'children', 'daughter')

array([[0.62771365]])

In [172]:
calculate_similarity_bert('female', 'offspring', 'daughter')

array([[0.66243071]])

In [173]:
calculate_similarity_bert('woman', 'offspring', 'daughter')

array([[0.85383232]])

In [174]:
calculate_similarity_bert('woman', 'spouse', 'wives')

array([[0.8179141]])

In [175]:
calculate_similarity_bert('woman', 'spouse', 'wife')

array([[0.90737613]])

In [176]:
calculate_similarity_bert('male', 'spouse', 'husbands')

array([[0.69979336]])

In [177]:
calculate_similarity_bert('male', 'spouse', 'husband')

array([[0.75144243]])

In [178]:
calculate_similarity_bert('man', 'spouse', 'husband')

array([[0.82231791]])

In [179]:
calculate_similarity_bert('small', 'cat', 'kitten')

array([[0.85065687]])

In [180]:
calculate_similarity_bert('baby', 'cat', 'kitten')

array([[0.88181678]])

In [181]:
calculate_similarity_bert('child', 'cat', 'kitten')

array([[0.81294964]])

In [182]:
calculate_similarity_bert('baby', 'dog', 'puppy')

array([[0.827689]])

In [183]:
calculate_similarity_bert('small', 'dog', 'puppy')

array([[0.8285944]])

In [184]:
calculate_similarity_bert('child', 'dog', 'puppy')

array([[0.71537523]])

# Quantify the complexity of word vectors

### Setting up the environment again

In [204]:
glove_file = datapath("glove.6B\glove.6B.100d.txt")
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


(400000, 100)

In [205]:
glove_file_300d = datapath("glove.6B\glove.6B.300d.txt")
word2vec_glove_file_300d = get_tmpfile("glove.6B.300d.word2vec.txt")
glove2word2vec(glove_file_300d, word2vec_glove_file_300d)

  glove2word2vec(glove_file_300d, word2vec_glove_file_300d)


(400000, 300)

In [206]:
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
model_300d = KeyedVectors.load_word2vec_format(word2vec_glove_file_300d)

### Functions that calculate the complexity metrics

In [217]:
# sum of the word vector

def sum_complexity(x1):
    models = ['BERT', 'GloVe_100d', 'GloVe_300d']
    data = [sum([abs(x) for x in model_bert[x1]]), sum(model[x1]), sum(model_300d[x1])]
    return dict(zip(models, data))

In [218]:
# count the number of features that has significant information
# threshold = how far away a feature is from 0 (0 <= threshold <= 1). default = 0.5

def above_zero_complexity(x1, threshold = 0.5):
    models = ['BERT', 'GloVe_100d', 'GloVe_300d']
    bert = len([x for x in model_bert[x1] if abs(x) >= threshold])/len(model_bert[x1])
    glove_100d = len([x for x in model[x1] if x >= threshold])/len(model[x1])
    glove_300d = len([x for x in model_300d[x1] if x >= threshold])/len(model_300d[x1])
    data = [bert, glove_100d, glove_300d]
    return dict(zip(models, data))

### Results

In [219]:
for word in model_bert.keys():
    print(word)
    print("Sum Complexity: \n", sum_complexity(word))
    print("Above Zero Complexity: (threshold = 0.5)\n", above_zero_complexity(word))
    print("Above Zero Complexity: (threshold = 0.3)\n", above_zero_complexity(word, 0.3))
    print()

man
Sum Complexity: 
 {'BERT': 332.63076185714453, 'GloVe_100d': 3.922419093782082, 'GloVe_300d': 4.971497090766206}
Above Zero Complexity: (threshold = 0.5)
 {'BERT': 0.3424479166666667, 'GloVe_100d': 0.18, 'GloVe_300d': 0.04666666666666667}
Above Zero Complexity: (threshold = 0.3)
 {'BERT': 0.57421875, 'GloVe_100d': 0.31, 'GloVe_300d': 0.17}

married
Sum Complexity: 
 {'BERT': 344.30526093184017, 'GloVe_100d': -4.701409709639847, 'GloVe_300d': 1.218420646211598}
Above Zero Complexity: (threshold = 0.5)
 {'BERT': 0.3723958333333333, 'GloVe_100d': 0.15, 'GloVe_300d': 0.10666666666666667}
Above Zero Complexity: (threshold = 0.3)
 {'BERT': 0.5833333333333334, 'GloVe_100d': 0.31, 'GloVe_300d': 0.21}

unmarried
Sum Complexity: 
 {'BERT': 324.8027933984995, 'GloVe_100d': -1.1166526627494022, 'GloVe_300d': 0.37002692744135857}
Above Zero Complexity: (threshold = 0.5)
 {'BERT': 0.3502604166666667, 'GloVe_100d': 0.18, 'GloVe_300d': 0.10666666666666667}
Above Zero Complexity: (threshold = 0.3)
