# Importing Important Libraries

In [1]:
!pip install -q gensim

In [2]:
import os
import numpy as np
import torch
from gensim.models import Word2Vec
import gensim.downloader as api

ModuleNotFoundError: No module named 'torch'

# One-Hot Encoding

In [None]:
#Defining a vocabulary of words for one-hot encoding
VOCAB = ["cat", "dog", "car", "bus", "train"]
VOCAB_SIZE = len(VOCAB)

In [None]:
#Create a dictionary which stores a unique integer ID for each word
word_to_index = {}
id = 0 #First word gets an ID of 0

for word in VOCAB:

    #Assign ID to word
    word_to_index[word] = id
    print(f"ID of '{word}' is {word_to_index[word]}")

    # Increment ID for the next word
    id += 1

ID of 'cat' is 0
ID of 'dog' is 1
ID of 'car' is 2
ID of 'bus' is 3
ID of 'train' is 4


In [None]:
# Creating a one-hot encoded vector for each word
vectors_onehot = {} #Dictionary to store one-hot encoded representation

for word in VOCAB:
    #Create a vector of size VOCAB_SIZE with all elements set to 0
    vec = torch.zeros(VOCAB_SIZE)

    #Get the word's integer ID and set the vector component at that index equal to 1, rest remain equal to 0
    #Eg: For a word with ID = 0, its one-hot encoded representation should be [1, 0, 0, 0, 0] (assuming 5 words in vocab)
    word_id = word_to_index[word]
    vec[word_id] = 1.0

    #Store one-hot encoded vector to dictionary
    vectors_onehot[word] = vec.view(1, -1)

    #View one-hot encoded vectors
    print(f"One-hot encoded representation for '{word}' is {vectors_onehot[word]}")

One-hot encoded representation for 'cat' is tensor([[1., 0., 0., 0., 0.]])
One-hot encoded representation for 'dog' is tensor([[0., 1., 0., 0., 0.]])
One-hot encoded representation for 'car' is tensor([[0., 0., 1., 0., 0.]])
One-hot encoded representation for 'bus' is tensor([[0., 0., 0., 1., 0.]])
One-hot encoded representation for 'train' is tensor([[0., 0., 0., 0., 1.]])


# Word2Vec Representation

In [None]:
#Defining a set of sentences for our Word2Vec model to learn representations from
SENTENCES = [
    ["the", "dog", "chased", "the", "cat"],
    ["a", "cat", "is", "a", "pet"],
    ["dogs", "and", "cats", "are", "animals"],
    ["the", "bus", "stopped", "at", "the", "station"],
    ["a", "train", "carries", "many", "passengers"],
    ["cars", "and", "buses", "are", "used", "for", "transport"],
    ["the", "dog", "ran", "beside", "the", "car"],
]

In [None]:
#Create a Word2Vec model and train it to learn representations using SENTENCES
w2v_model = Word2Vec(
    sentences=SENTENCES, #List of sentences to learn from
    vector_size=10, #Size of the embedding vectors
    min_count=1, #Ignores words with frequency less than min_count
    sg=1 #Uses skip-gram training algorithm (replace with 0 to use CBOW)
)

In [None]:
#Dictionary to store word2vec representations of each word
vectors_w2v = {}
for word in VOCAB:
    vectors_w2v[word] = torch.tensor(w2v_model.wv[word]).view(1, -1)
    print(f"Word2Vec representation for '{word}' is {vectors_w2v[word]}")

Word2Vec representation for 'cat' is tensor([[-0.0960,  0.0501, -0.0876, -0.0439, -0.0004, -0.0030, -0.0766,  0.0961,
          0.0498,  0.0923]])
Word2Vec representation for 'dog' is tensor([[-0.0816,  0.0449, -0.0413,  0.0082,  0.0851, -0.0446,  0.0451, -0.0679,
         -0.0355,  0.0940]])
Word2Vec representation for 'car' is tensor([[-0.0158,  0.0032, -0.0414, -0.0769, -0.0149,  0.0247, -0.0090,  0.0553,
         -0.0274,  0.0227]])
Word2Vec representation for 'bus' is tensor([[-0.0464, -0.0316,  0.0931,  0.0087,  0.0749, -0.0607,  0.0516,  0.0992,
         -0.0846, -0.0514]])
Word2Vec representation for 'train' is tensor([[-0.0371, -0.0875,  0.0544,  0.0651, -0.0079, -0.0671, -0.0709, -0.0250,
          0.0514, -0.0367]])


# GloVe Embeddings

In [None]:
#Load a pre-trained GloVe model
glove = api.load("glove-wiki-gigaword-50")

In [None]:
#Dictionary to store GloVe representations of each word
vectors_glove = {}

#Get the GloVe embedding for each word and store it in the dictionary
for word in VOCAB:
  vectors_glove[word] = torch.tensor(glove[word]).view(1, -1)
  print(f"GloVe representation for '{word}' is {vectors_glove[word]}")

GloVe representation for 'cat' is tensor([[ 0.4528, -0.5011, -0.5371, -0.0157,  0.2219,  0.5460, -0.6730, -0.6891,
          0.6349, -0.1973,  0.3368,  0.7735,  0.9009,  0.3849,  0.3837,  0.2657,
         -0.0806,  0.6109, -1.2894, -0.2231, -0.6158,  0.2170,  0.3561,  0.4450,
          0.6089, -1.1633, -1.1579,  0.3612,  0.1047, -0.7832,  1.4352,  0.1863,
         -0.2611,  0.8328, -0.2312,  0.3248,  0.1449, -0.4455,  0.3350, -0.9595,
         -0.0975,  0.4814, -0.4335,  0.6945,  0.9104, -0.2817,  0.4164, -1.2609,
          0.7128,  0.2378]])
GloVe representation for 'dog' is tensor([[ 0.1101, -0.3878, -0.5762, -0.2771,  0.7052,  0.5399, -1.0786, -0.4015,
          1.1504, -0.5678,  0.0039,  0.5288,  0.6456,  0.4726,  0.4855, -0.1841,
          0.1801,  0.9140, -1.1979, -0.5778, -0.3799,  0.3361,  0.7720,  0.7556,
          0.4551, -1.7671, -1.0503,  0.4257,  0.4189, -0.6833,  1.5673,  0.2768,
         -0.6171,  0.6464, -0.0770,  0.3712,  0.1308, -0.4514,  0.2540, -0.7439,
         -0.

# Evaluation of Representations (Cosine Similarity)

In [None]:
#Defining pairs of words whose embeddings we will compare (evaluate) with respect to each other
pairs = [
  # Animal v/s animal
  ("cat", "dog"),

  # Animal v/s vehicle
  ("cat", "car"),
  ("cat", "bus"),
  ("cat", "train"),
  ("dog", "car"),
  ("dog", "bus"),
  ("dog", "train"),

  # Vehicle v/s vehicle
  ("car", "bus"),
  ("car", "train"),
  ("bus", "train"),

  # Word compared to itself
  ("cat", "cat"),
  ("dog", "dog"),
  ("car", "car"),
  ("bus", "bus"),
  ("train", "train"),
]

In [None]:
#Define a function that evaluates pair-wise similarities given a dictionary of representation vectors
def evaluate_similarity(vectors):
  for w1, w2 in pairs:
    v1 = vectors[w1] #Representation of first word
    v2 = vectors[w2] #Representation of second word

    #Compute cosine similarity of both vectors and print it out
    sim = torch.nn.functional.cosine_similarity(v1, v2).item()
    print(f"s({w1}, {w2}) = {sim:.3f}")

## Evaluating One-Hot Encoded Vectors
We see that when a word is compared to itself, its similarity is always 1 (a word is identical to itself).

However, it has 0 similarity whenever compared to a different word. This indicates a lack of semantic meaning in one-hot representations.

In [None]:
evaluate_similarity(vectors_onehot)

s(cat, dog) = 0.000
s(cat, car) = 0.000
s(cat, bus) = 0.000
s(cat, train) = 0.000
s(dog, car) = 0.000
s(dog, bus) = 0.000
s(dog, train) = 0.000
s(car, bus) = 0.000
s(car, train) = 0.000
s(bus, train) = 0.000
s(cat, cat) = 1.000
s(dog, dog) = 1.000
s(car, car) = 1.000
s(bus, bus) = 1.000
s(train, train) = 1.000


## Evaluating Word2Vec Encoded Vectors
Here, different words have non-zero similarities when compared to each other denoting that these representations store some semantic information about words in them.

In [None]:
evaluate_similarity(vectors_w2v)

s(cat, dog) = 0.250
s(cat, car) = 0.615
s(cat, bus) = -0.196
s(cat, train) = -0.159
s(dog, car) = -0.041
s(dog, bus) = 0.036
s(dog, train) = -0.212
s(car, bus) = -0.014
s(car, train) = -0.575
s(bus, train) = 0.138
s(cat, cat) = 1.000
s(dog, dog) = 1.000
s(car, car) = 1.000
s(bus, bus) = 1.000
s(train, train) = 1.000


## Evaluating GloVe Vectors
Here, different words have non-zero similarities when compared to each other and these similarity values even correspond to our intuitive sense of how similar two words might be.

In [None]:
evaluate_similarity(vectors_glove)

s(cat, dog) = 0.922
s(cat, car) = 0.364
s(cat, bus) = 0.307
s(cat, train) = 0.316
s(dog, car) = 0.464
s(dog, bus) = 0.406
s(dog, train) = 0.418
s(car, bus) = 0.821
s(car, train) = 0.766
s(bus, train) = 0.902
s(cat, cat) = 1.000
s(dog, dog) = 1.000
s(car, car) = 1.000
s(bus, bus) = 1.000
s(train, train) = 1.000
