[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/xiptos/is_notes/blob/main/word_embeddings_glove.ipynb)

# Set up the dependencies and download the embeddings

We start by importing the dependencies and downloading the _GloVe_ embeddings.

In [None]:
import os
import urllib.request
import zipfile

# Directory for GloVe
glove_dir = "glove"
zip_path = os.path.join(glove_dir, "glove.6B.zip")

# Files we expect after extraction
expected_files = [
    "glove.6B.50d.txt",
    "glove.6B.100d.txt",
    "glove.6B.200d.txt",
    "glove.6B.300d.txt",
]

# 1. Create directory if missing
os.makedirs(glove_dir, exist_ok=True)

# 2. Check the extracted files first: if they are all present there is no
#    reason to fetch the archive again, even if the ZIP was deleted afterwards.
missing_files = [
    f for f in expected_files
    if not os.path.exists(os.path.join(glove_dir, f))
]

if not missing_files:
    print("GloVe text files already extracted — skipping extraction.")
else:
    # 3. Make sure the ZIP archive is available
    if not os.path.exists(zip_path):
        print("GloVe ZIP not found. Downloading…")
        url = "https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip"
        # Download to a temporary name and rename only on success, so an
        # interrupted download never leaves a truncated file at zip_path
        # that a later run would mistake for a complete archive.
        partial_path = zip_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # after a failure or interrupt this removes the leftover.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")
    else:
        print("ZIP file already exists — skipping download.")

    # 4. Extract the archive into the GloVe directory
    print("Extracting GloVe files…")
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(glove_dir)
    print("Extraction complete.")

## Load and use the embeddings

Load the 50-dimensional word embeddings.

In [None]:
import numpy as np

def load_glove(path):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are converted to a float array.

    Args:
        path: Path to a UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping each token (str) to a 1-D ``numpy`` array of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    word2vec = {}
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.split()
            # Blank or whitespace-only lines (e.g. an extra trailing newline)
            # hold no entry; skipping them avoids an IndexError on parts[0].
            if not parts:
                continue
            word2vec[parts[0]] = np.array(parts[1:], dtype=float)
    return word2vec

# Read the 50-dimensional vectors and report how many tokens were loaded.
glove50_path = "glove/glove.6B.50d.txt"
glove50 = load_glove(glove50_path)
print("Loaded vocabulary size:", len(glove50))

## Build the matrix and index

In [None]:
import torch

# Build a matrix and index mapping: row i of the matrix holds the vector of
# vocab[i], and word_to_idx maps each word back to its row number.
vocab = list(glove50.keys())
word_to_idx = {w: i for i, w in enumerate(vocab)}

embedding_matrix = torch.tensor(
    np.stack([glove50[w] for w in vocab]),
    dtype=torch.float
)

# Scale every row to unit L2 length so that a plain dot product between rows
# is their cosine similarity. F.normalize divides by max(norm, eps), so an
# all-zero row stays zero instead of turning into NaN (which a bare
# division by the norm would produce).
embedding_matrix = torch.nn.functional.normalize(embedding_matrix, p=2, dim=1)
print(embedding_matrix.shape)

## Some auxiliary functions

Helper functions to look up a word's vector and to find the words most similar to it.

In [None]:
def get_vector(word):
    """Return the row of ``embedding_matrix`` that belongs to ``word``.

    Args:
        word: Token to look up in ``word_to_idx``.

    Returns:
        The tensor row stored at the word's index.

    Raises:
        ValueError: If ``word`` is not a key of ``word_to_idx``.
    """
    row = word_to_idx.get(word)
    if row is None:
        raise ValueError(f"'{word}' not in vocabulary")
    return embedding_matrix[row]

def most_similar(word, top_k=10):
    """Print the ``top_k`` vocabulary words whose vectors best match ``word``.

    The score is the dot product between the word's vector and every row of
    ``embedding_matrix``; this equals cosine similarity when the rows are
    unit-length. The query word itself is left out of the listing.

    Args:
        word: Query token; looked up through ``get_vector``.
        top_k: Maximum number of neighbours to print. Fewer are printed when
            the vocabulary is smaller than that.

    Returns:
        None. Results are written to standard output.

    Raises:
        ValueError: Propagated from ``get_vector`` for an unknown word.
    """
    v = get_vector(word)
    sims = torch.mv(embedding_matrix, v)
    # Ask for one extra hit because the query word is filtered out below.
    # Clamp to the number of rows: torch.topk raises if k exceeds it, which
    # would otherwise happen on small vocabularies or large top_k values.
    k = max(0, min(top_k + 1, sims.numel()))
    best = torch.topk(sims, k).indices.tolist()
    best = [i for i in best if vocab[i] != word][:top_k]
    print(f"\nWords most similar to '{word}':")
    for i in best:
        print(f"{vocab[i]:>10s}   (cosine={sims[i]:.3f})")

In [None]:
# Try the neighbour search on a few sample words.
for sample_word in ("king", "paris", "apple"):
    most_similar(sample_word)

## Find analogies

In [None]:
def analogy(a, b, c, top_k=5):
    """Print candidate answers to the analogy "a is to b as c is to ?".

    The query vector is ``vec(b) - vec(a) + vec(c)``, rescaled to unit
    length, and is scored against every row of ``embedding_matrix`` with a
    dot product. The three input words are excluded from the answers.

    Args:
        a: First word of the known pair.
        b: Second word of the known pair.
        c: Word whose counterpart is being searched for.
        top_k: Maximum number of candidates to print. Fewer are printed when
            the vocabulary is too small to supply that many.

    Returns:
        None. Results are written to standard output. If any of the three
        words is missing from ``word_to_idx``, a message naming the first
        missing word is printed and nothing else happens.
    """
    for w in [a, b, c]:
        if w not in word_to_idx:
            print(f"'{w}' not in vocab")
            return
    va = get_vector(a)
    vb = get_vector(b)
    vc = get_vector(c)

    query = vb - va + vc
    # Normalize
    query = query / query.norm()

    sims = torch.mv(embedding_matrix, query)
    exclude = {word_to_idx[w] for w in [a, b, c]}
    # Over-fetch by three so that dropping the input words still leaves
    # top_k candidates. Clamp to the number of rows: torch.topk raises if k
    # exceeds it, which would otherwise happen on small vocabularies.
    k = max(0, min(top_k + 3, sims.numel()))
    best = torch.topk(sims, k).indices.tolist()
    best = [i for i in best if i not in exclude][:top_k]

    print(f"\nAnalogy: {a} → {b} :: {c} → ?")
    for i in best:
        print(f"{vocab[i]:>10s}   (cosine={sims[i]:.3f})")

In [None]:
# Run a few classic analogy queries.
for example_triple in (
    ("man", "king", "woman"),
    ("paris", "france", "rome"),
    ("big", "bigger", "small"),
):
    analogy(*example_triple)