### GloVe Embeddings
### Trained on Wikipedia 2014 corpus of 6 billion words; embedding size = 50 (the `dim=50` vectors loaded below)

In [1]:
import torch
import torchtext

In [2]:
# Load the pretrained GloVe vectors: the "6B" release, 50 dimensions per word.
glove = torchtext.vocab.GloVe(name="6B", dim=50)

# Sanity check: indexing by a word returns that word's embedding tensor.
glove['cat']

tensor([ 0.4528, -0.5011, -0.5371, -0.0157,  0.2219,  0.5460, -0.6730, -0.6891,
         0.6349, -0.1973,  0.3368,  0.7735,  0.9009,  0.3849,  0.3837,  0.2657,
        -0.0806,  0.6109, -1.2894, -0.2231, -0.6158,  0.2170,  0.3561,  0.4450,
         0.6089, -1.1633, -1.1579,  0.3612,  0.1047, -0.7832,  1.4352,  0.1863,
        -0.2611,  0.8328, -0.2312,  0.3248,  0.1449, -0.4455,  0.3350, -0.9595,
        -0.0975,  0.4814, -0.4335,  0.6945,  0.9104, -0.2817,  0.4164, -1.2609,
         0.7128,  0.2378])

In [3]:
# Distance measurement: Euclidean (L2) gap between two word embeddings.

x, y = glove['cat'], glove['dog']
(y - x).norm()

tensor(1.8846)

In [4]:
# Cosine Similarity

# Toy case: two parallel vectors, each built directly as a (1, 3) batch.
# This is not the last expression of the cell, so its value is not displayed.
x = torch.tensor([[1., 1., 1.]])
y = torch.tensor([[2., 2., 2.]])
torch.cosine_similarity(x, y)

# Same measure on real embeddings; a leading batch axis is added so the
# comparison runs along the feature dimension.
x, y = glove['cat'], glove['dog']
torch.cosine_similarity(x[None], y[None])

tensor([0.9218])

In [5]:
# Word Similarity: L2 distance from one reference word to several candidates
# (a smaller number means the embeddings are closer).

word = 'cat'
other = ['dog', 'bike', 'kitten', 'puppy', 'kite', 'computer', 'neuron']
for w in other:
    dist = (glove[word] - glove[w]).norm()
    print(w, dist.item())

dog 1.8846031427383423
bike 5.048375129699707
kitten 3.5068609714508057
puppy 3.0644655227661133
kite 4.210376262664795
computer 6.030652046203613
neuron 6.228669166564941


In [6]:
def print_closest_words(vec, n=5, embeddings=None):
    """Print the ``n`` vocabulary words whose embeddings lie closest to ``vec``.

    Closeness is the Euclidean (L2) distance between ``vec`` and every row of
    the embedding table. One ``word distance`` pair is printed per line,
    nearest first.

    The single top-ranked entry is always skipped. When ``vec`` is itself a
    vocabulary vector that entry is the query word at distance 0; when ``vec``
    is an arithmetic combination of vectors, the true nearest word is dropped
    as well.

    Args:
        vec: Query tensor, broadcastable against ``embeddings.vectors``.
        n: How many words to print.
        embeddings: Object exposing ``vectors`` (2-D tensor, one row per word)
            and ``itos`` (sequence mapping row index to word). Defaults to the
            notebook-level ``glove``.
    """
    if embeddings is None:
        embeddings = glove
    dists = torch.norm(embeddings.vectors - vec, dim=1)
    # Rank in native code instead of sorting a Python list of tuples; the
    # stable sort keeps equal distances in vocabulary order.
    ranked, order = torch.sort(dists, stable=True)
    # Start at rank 1: rank 0 is deliberately skipped (see docstring).
    for idx, difference in zip(order[1:n + 1].tolist(), ranked[1:n + 1].numpy()):
        print(embeddings.itos[idx], difference)

In [7]:
# Ten nearest neighbours of 'cat'; the helper skips the top-ranked entry,
# so 'cat' itself is not listed.
print_closest_words(glove["cat"], n=10)

dog 1.8846031
rabbit 2.4572797
monkey 2.8102052
cats 2.8972247
rat 2.9455352
beast 2.9878407
monster 3.0022194
pet 3.0396757
snake 3.0617998
puppy 3.0644655


In [15]:
# Nearest neighbours of 'nurse', using the helper's default n.
print_closest_words(glove['nurse'])

doctor 3.1274529
dentist 3.1306612
nurses 3.26872
pediatrician 3.3212206
counselor 3.3987114


In [16]:
# Nearest neighbours of 'computer', using the helper's default n.
print_closest_words(glove['computer'])

computers 2.4362664
software 2.926823
technology 3.190351
electronic 3.5067408
computing 3.5999784


In [17]:
# Query the midpoint between 'happy' and 'sad'. The midpoint is equidistant
# from both words, but the helper always drops the top-ranked entry, so only
# one of the two endpoints can appear in the output.
print_closest_words((glove['happy'] + glove['sad']) / 2)

happy 1.9199749
feels 2.3604643
sorry 2.4984782
hardly 2.52593
imagine 2.5652788


In [18]:
# Query the midpoint between 'lake' and 'building' (not a vocabulary vector).
print_closest_words((glove['lake'] + glove['building']) / 2)

surrounding 3.0698414
nearby 3.1112068
bridge 3.1585503
along 3.1610188
shore 3.1618817


In [9]:
# King/Queen analogy: subtract the 'man' vector from 'king', add 'woman',
# and list the words nearest to the result.
# NOTE(review): the helper drops the top-ranked entry, so the actual nearest
# word (presumably 'king') is not shown — confirm if that matters.

print_closest_words(glove['king'] - glove['man'] + glove['woman'])


queen 2.8391209
prince 3.6610038
elizabeth 3.7152522
daughter 3.8317878
widow 3.8493774


In [10]:
# Reverse direction of the analogy: queen - woman + man.
print_closest_words(glove['queen'] - glove['woman'] + glove['man'])
# Same idea with a different pair: king - prince + princess.
print_closest_words(glove['king'] - glove['prince'] + glove['princess'])

king 2.8391209
prince 3.2508988
crown 3.4485192
knight 3.5587437
coronation 3.6198905
queen 3.1845968
king 3.9103293
bride 4.285721
lady 4.299571
sister 4.421178


In [22]:
def print_closest_words(vec, n=1, embeddings=None):
    """Print the ``n`` vocabulary words whose embeddings lie closest to ``vec``.

    Redefines the earlier helper of the same name with a default of ``n=1``,
    so each call that omits ``n`` prints a single best match.

    Closeness is the Euclidean (L2) distance between ``vec`` and every row of
    the embedding table. One ``word distance`` pair is printed per line,
    nearest first.

    The single top-ranked entry is always skipped. When ``vec`` is itself a
    vocabulary vector that entry is the query word at distance 0; when ``vec``
    is an arithmetic combination of vectors, the true nearest word is dropped
    as well.

    Args:
        vec: Query tensor, broadcastable against ``embeddings.vectors``.
        n: How many words to print.
        embeddings: Object exposing ``vectors`` (2-D tensor, one row per word)
            and ``itos`` (sequence mapping row index to word). Defaults to the
            notebook-level ``glove``.
    """
    if embeddings is None:
        embeddings = glove
    dists = torch.norm(embeddings.vectors - vec, dim=1)
    # Rank in native code instead of sorting a Python list of tuples; the
    # stable sort keeps equal distances in vocabulary order.
    ranked, order = torch.sort(dists, stable=True)
    # Start at rank 1: rank 0 is deliberately skipped (see docstring).
    for idx, difference in zip(order[1:n + 1].tolist(), ranked[1:n + 1].numpy()):
        print(embeddings.itos[idx], difference)


# Each call relies on the n=1 default defined just above, so it prints one
# line; the output lines therefore appear in the same order as these calls.

# Family-relation analogies.
print_closest_words(glove['uncle'] - glove['man'] + glove['woman'])
print_closest_words(glove['grandmother'] - glove['mother'] + glove['father'])
print_closest_words(glove['old'] - glove['young'] + glove['father'])
# Shift 'programmer' along the bad -> good direction and back.
print_closest_words(glove['programmer'] - glove['bad'] + glove['good'])
print_closest_words(glove['programmer'] - glove['good'] + glove['bad'])
# Swap man <-> woman on occupation words, in both directions.
print_closest_words(glove['doctor'] - glove['man'] + glove['woman'])
print_closest_words(glove['doctor'] - glove['woman'] + glove['man'])
print_closest_words(glove['programmer'] - glove['man'] + glove['woman'])
print_closest_words(glove['programmer'] - glove['woman'] + glove['man'])
print_closest_words(glove['engineer'] - glove['man'] + glove['woman'])
print_closest_words(glove['engineer'] - glove['woman'] + glove['man'])

grandmother 2.323353
uncle 2.0784423
father 4.0326614
versatile 4.381561
hacker 3.8383653
nurse 3.1355345
man 3.9335632
prodigy 3.6688528
setup 4.002241
technician 3.6926973
builder 4.3523865
