# NLP: Embeddings

Welcome to the embedding portion of this session!

## Part 1: word2vec word embeddings

We will start off by using word2vec to take words and embed them into a vector space.

In [53]:
import spacy

# Load spaCy's large English pipeline; it is reused by every word-level cell below
word_embedder = spacy.load("en_core_web_lg")

# A short test sentence made of four plain words
sentence = "hello world word embedding"

# Calling the pipeline on a string returns a document we can index token by token
embedded_sentence = word_embedder(sentence)

# Pull out the first token and print its embedding vector
first_token = embedded_sentence[0]
print(first_token.vector)

[ 0.25233    0.10176   -0.67485    0.21117    0.43492    0.16542
  0.48261   -0.81222    0.041321   0.78502   -0.077857  -0.66324
  0.1464    -0.29289   -0.25488    0.019293  -0.20265    0.98232
  0.028312  -0.081276  -0.1214     0.13126   -0.17648    0.13556
 -0.16361   -0.22574    0.055006  -0.20308    0.20718    0.095785
  0.22481    0.21537   -0.32982   -0.12241   -0.40031   -0.079381
 -0.19958   -0.015083  -0.079139  -0.18132    0.20681   -0.36196
 -0.30744   -0.24422   -0.23113    0.09798    0.1463    -0.062738
  0.42934   -0.078038  -0.19627    0.65093   -0.22807   -0.30308
 -0.12483   -0.17568   -0.14651    0.15361   -0.29518    0.15099
 -0.51726   -0.033564  -0.23109   -0.7833     0.018029  -0.15719
  0.02293    0.49639    0.029225   0.05669    0.14616   -0.19195
  0.16244    0.23898    0.36431    0.45263    0.2456     0.23803
  0.31399    0.3487    -0.035791   0.56108   -0.25345    0.051964
 -0.10618   -0.30962    1.0585    -0.42025    0.18216   -0.11256
  0.40576    0.11784 

### Word Embeddings, how can we use them?

Here we will use similarity in the embedding space to find, from a list of words, the one that is closest to a given word.

Feel free to play around with the word list and the example word to see how these interact with each other

In [None]:
import numpy as np

# A single space-separated string of roughly 100 random candidate words
# (kept as one string so it can be handed to the embedder in a single call)
random_words = "acorn breeze candle drift ember falcon glint harbor ink jumble kernel latch mirth nudge orbit prism quill rumble swoop tangle umber vortex whisk yonder zeal brisk cradle dusk echo fable gloom hush ivory jigsaw knack lush minglenook oath plume quirk riddle snag throb urge verge wisp yarn zest apex blunt clasp dabble elbow flick grin hinge imprint jest knot moat nibble opal paddle quench ripple smirk trudge utter vane waddle yell zany alloy beacon clutch drape fray grasp hurl inlet jolts kink loop mural nestle ogle peep quest rove snip trill unzip vault wane yelp zipper glimmer sprout gleam twig bask clink rippled"

# Embed all candidate words in one pass.
# NOTE: find_most_similar_word below embeds its own input again, so it does
# not read this module-level variable.
random_word_embeddings = word_embedder(random_words)

# Define a function to find the most similar word
def find_most_similar_word(word, words_to_match, word_embedding_tool):
    """Return the candidate in ``words_to_match`` most similar to ``word``.

    Args:
        word: The query word (a string).
        words_to_match: A string of whitespace-separated candidate words.
        word_embedding_tool: A callable (for example a loaded spaCy pipeline)
            that maps a string to a document of tokens. The document must
            support ``len()``, indexing, iteration and ``similarity(token)``;
            each token must expose ``text``.

    Returns:
        The text of the candidate token with the highest similarity to
        ``word``. Ties resolve to the earliest candidate.

    Raises:
        ValueError: If ``words_to_match`` produces no tokens.
    """
    # Embed the input word
    word_embedding = word_embedding_tool(word)

    # Embed the candidates that were actually passed in, rather than relying
    # on a module-level variable that happens to hold the same words.
    candidate_tokens = word_embedding_tool(words_to_match)
    if len(candidate_tokens) == 0:
        raise ValueError("words_to_match must contain at least one word")

    # Similarity of the query against every candidate token
    similarities = [word_embedding.similarity(token) for token in candidate_tokens]

    # Position of the best-scoring candidate
    most_similar_index = int(np.argmax(similarities))

    # Read the word straight from the winning token. Re-splitting the input
    # string on spaces could disagree with the tokenizer's own boundaries and
    # return the wrong word (or raise IndexError).
    return candidate_tokens[most_similar_index].text

# Try it out: which of the random words sits closest to "pen"?
input_word = "pen"
most_similar_word = find_most_similar_word(
    input_word,
    random_words,
    word_embedder,
)
print(f"The most similar word to '{input_word}' is '{most_similar_word}'.")

The most similar word to 'pen' is 'ink'.


  similarity = word_embedding.similarity(random_word_embeddings[i])


### A cool property about word2vec embeddings

Here we are going to see how the differences between these embeddings themselves carry meaning. The implication is that there is some dimension-like element in the embedding space that encodes certain properties.

In [76]:
def compute_distance(word_1, word_2, word_embedding_tool):
    """Return the element-wise difference between two word embeddings.

    Despite the name, the result is a difference *vector*
    (``word_1`` minus ``word_2``), not a scalar distance.

    Args:
        word_1: First word; its vector is the minuend.
        word_2: Second word; its vector is subtracted from the first.
        word_embedding_tool: Callable mapping a string to an object that
            exposes a ``vector`` attribute.

    Returns:
        ``vector(word_1) - vector(word_2)``.
    """
    # Embed both words in order, keeping only their vectors
    first_vector, second_vector = (
        word_embedding_tool(text).vector for text in (word_1, word_2)
    )

    return first_vector - second_vector

# Build one difference vector per word pair:
#   king - queen     (royal pair)
#   boy - girl       (kid pair)
#   spaceship - dog  (unrelated pair, used as a baseline)
royal_difference, kid_difference, random_difference = (
    compute_distance(first_word, second_word, word_embedder)
    for first_word, second_word in [
        ("king", "queen"),
        ("boy", "girl"),
        ("spaceship", "dog"),
    ]
)

# Helper for comparing the direction of two vectors (e.g. two difference vectors)
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First 1-D vector (array-like).
        b: Second 1-D vector of the same length (array-like).

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero length.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)

    # A zero-length vector has no direction; dividing by a zero norm would
    # produce NaN and a RuntimeWarning, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return np.dot(a, b) / norm_product

# How well does the king-queen direction line up with the boy-girl direction?
royal_kid_similarity = cosine_similarity(royal_difference, kid_difference)
print(f"The cosine similarity between the kid and royal differences is: {royal_kid_similarity}")

# Baseline: the same comparison against an unrelated word pair
royal_random_similarity = cosine_similarity(royal_difference, random_difference)
print(f"The cosine similarity between the random and royal differences is: {royal_random_similarity}")

# For reference, compare the two royal word vectors directly
king_vector = word_embedder("king")[0].vector
queen_vector = word_embedder("queen")[0].vector
royal_similarity = cosine_similarity(king_vector, queen_vector)
print(f"The cosine similarity between the royal words is: {royal_similarity}")

The cosine similarity between the kid and royal differences is: 0.47515803575515747
The cosine similarity between the random and royal differences is: -0.0818493589758873
The cosine similarity between the royal words is: 0.7252610921859741


## Part 2: Sentence Embeddings

Now we will use small transformer models (the same kind of architecture that state-of-the-art models like ChatGPT are built on!) to play with sentence embeddings.

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load a small pre-trained sentence-embedding model
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Candidate sentences we will search over
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    "All that glitters is not gold.",
    "The pen is mightier than the sword."
]

# The query: a paraphrase of one of the candidates
example_sentence = "A long journey starts with one step."

# Encode the candidates and the query as tensors
sentence_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
example_embedding = sentence_model.encode(example_sentence, convert_to_tensor=True)

# Score the query against every candidate sentence
cosine_similarities = util.cos_sim(example_embedding, sentence_embeddings)

# argmax without a dim argument indexes the flattened tensor; with a single
# query this is presumably the position of the best candidate in `sentences`
most_similar_idx = torch.argmax(cosine_similarities).item()
most_similar_sentence = sentences[most_similar_idx]

print(f"The most similar sentence to '{example_sentence}' is '{most_similar_sentence}'.")

The most similar sentence to 'A long journey starts with one step.' is 'A journey of a thousand miles begins with a single step.'.


In [92]:
from datasets import load_dataset

# How many sentences to embed; a subset keeps this cell reasonably fast.
# The comparison cell indexes ds['train'] by position, which lines up because
# we always take the *first* NUM_SENTENCES rows.
NUM_SENTENCES = 10000

# Now load a much larger dataset of sentences
ds = load_dataset("agentlans/high-quality-english-sentences")

# Reuse the `sentence_model` loaded in the previous cell: it is the same
# 'all-MiniLM-L6-v2' model, so loading it a second time only costs time.
sentence_embeddings = sentence_model.encode(
    ds['train'][:NUM_SENTENCES]['text'],
    convert_to_tensor=True,
)

We split out the comparison cell because the embedding of 10k sentences can take a little time

In [93]:
# Query sentence to look up in the larger corpus
example_sentence = "A long journey starts with one step."

# Encode the query with the same model used for the corpus
example_embedding = sentence_model.encode(example_sentence, convert_to_tensor=True)

# Score the query against every embedded corpus sentence
cosine_similarities = util.cos_sim(example_embedding, sentence_embeddings)

# Position of the best match; this indexes the same leading slice of
# ds['train'] that was embedded
most_similar_idx = torch.argmax(cosine_similarities).item()

# Fetch the matching row and pull out its text
best_row = ds['train'][most_similar_idx]
most_similar_sentence = best_row['text']

print(f"The most similar sentence to '{example_sentence}' is '{most_similar_sentence}'.")

The most similar sentence to 'A long journey starts with one step.' is 'Journeys are a very important part of our faith tradition, too.'.
