# Sentence Embeddings
Just like how words can be converted to vectors, so can sentences

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import gensim
import gensim.downloader
import json

from utils import *

## Sentence Embeddings
There are a variety of ways to embed entire sentences. 

- take the average of all of the words' embedding
- use a more sophisticated embedding model like BERT

### Average the word embeddings
The simplest way is to take the embedding of each word then average them together. This seems silly, but it is pretty effective. 

In [2]:
# Use small local model 
model_name = 'word2vec-google-news-300'
w2v_model = load_model(model_name)

loaded from binary: models/word2vec-google-news-300.bin


In [3]:
def embed_sentence_average(sentence, w2v_model):
    """Creates sentence embedding based on the average of the words' embeddings"""
    # split sentence into words
    words = sentence.split()
    
    # make array of embeddings, one row for each word
    embeddings = np.array([w2v_model[w.lower()] for w in words if w in w2v_model])
    
    # average over the rows (axis = 0)
    average_embedding = embeddings.mean(axis = 0)
    
    return average_embedding

def cosine_similarity(A, B):
    """Calculates the cosine similarity of 2 vectors A,B"""
    dot_product = np.dot(A, B)
    magnitude_A = np.linalg.norm(A)
    magnitude_B = np.linalg.norm(B)
    return dot_product / (magnitude_A * magnitude_B)

In [4]:
sentence = "I love cookies"
embedding = embed_sentence_average(sentence, w2v_model)
embedding.shape

(300,)

In [5]:
sentence_1 = "I love cookies"
sentence_2 = "cookies are my favorite"

embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

0.7167495

In [6]:
sentence_1 = "I love cookies"
sentence_2 = "that car is speeding in the school zone"

embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

0.1749812

In [7]:
# note that we can also use gensim's n_similarity() funtion to do the same
w2v_model.n_similarity(sentence_1.split(),sentence_2.split())

0.24969348

In [8]:
# Let's visualize defferent sentence embeddings in a PCA-reduced 3d space
sentences = [
    "cookies are my favorite",
    "cookies are a glorious dessert",
    "my, how I love biscuits",
    "I am a huge cookie lover",
    "Cookies are delicious",
    "summer vacation is coming",
    "where should we go on vacation",
    "the motorcycle can go very fast", 
    "that car speeding in the school zone"
]
embeddings = [embed_sentence_average(sentence, w2v_model) for sentence in sentences]
embeddings_reduced = reduce_dimensions(embeddings)
plot_embeddings(embeddings_reduced, sentences, title="Average sentence embeddings (reduced to 3D)")

In [9]:
# similarity between one word and many others
sentence = "I love cookies"
embedding = embed_sentence_average(sentence, w2v_model)



print(f"Similarity between {sentence} and :")
for other_sentence in sentences:
    other_embedding = embed_sentence_average(other_sentence, w2v_model)
    similarity = cosine_similarity(embedding, other_embedding)
    print(f"\t{similarity:.2f} : {other_sentence}")

Similarity between I love cookies and :
	0.72 : cookies are my favorite
	0.66 : cookies are a glorious dessert
	0.79 : my, how I love biscuits
	0.73 : I am a huge cookie lover
	0.67 : Cookies are delicious
	0.19 : summer vacation is coming
	0.31 : where should we go on vacation
	0.29 : the motorcycle can go very fast
	0.17 : that car speeding in the school zone


### Limitations of average embeddings
A drawback is that this method does not take into account the order of the words, which is very important to the meaning.

In [10]:
sentence_1 = "I love cats and hate dogs"
sentence_2 = "I love dogs and hate cats"
w2v_model.n_similarity(sentence_1.split(),sentence_2.split())

1.0

In [11]:
embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

1.0

## Advanced methods
We can use a language model to predict the next word in a sentence. 

Here we will a sentence transformer embedding model that can be run locally using the Transformers library. The process is a little involved and we won't get into it here. This will produce a 768-dimension embedding vector.

In [12]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [13]:
sentence = "I love cats and hate dogs"


embeddings1 = model.encode(sentence)
embeddings1.shape
# similarities = model.similarity(embeddings1, embeddings2)

(384,)

In [14]:
sentence_1 = "I love cats and hate dogs"
# sentence_2 = "I don't like dogs and love cats"
# sentence_2 = "I don't like dogs and love cats"
sentence_2 = "eat my shorts"
# sentence_2 = "I love dogs and hate cats"

embedding_1 = model.encode(sentence_1)
embedding_2 = model.encode(sentence_1)

similarity = model.similarity(embedding_1, embedding_2)
similarity

tensor([[1.0000]])

In [15]:
# similarity between one word and many others
sentence = "I love cookies"
embedding = model.encode(sentence)

print(f"Similarity between {sentence} and :")
for other_sentence in sentences:
    other_embedding = model.encode(other_sentence)
    similarity = cosine_similarity(embedding, other_embedding)
    print(f"\t{similarity:.2f} : {other_sentence}")

Similarity between I love cookies and :
	0.88 : cookies are my favorite
	0.73 : cookies are a glorious dessert
	0.60 : my, how I love biscuits
	0.77 : I am a huge cookie lover
	0.82 : Cookies are delicious
	0.11 : summer vacation is coming
	0.10 : where should we go on vacation
	0.07 : the motorcycle can go very fast
	0.01 : that car speeding in the school zone


## Retrieval
The concept of similarity is useful in the context of retrieval augmented generation (RAG). 
Rag is useful when you have a lot of documents that you would like to ask questions of with an LLM. Typicaly the LLM has no knowledge of your documents so you need to *retrieve* the relevant info from your documents and send it to the LLM

The idea is you ask a question, embed the question, then retrieve your documents that are closest to 

In [16]:
def retrieve_most_similar(query, names, descriptions, embeddings, n=5):
    """
    Retrieves the most similar items to a given query based on their embeddings.

    Parameters:
    query (str): The query to compare against.
    names (List[str]): The names of the items.
    descriptions (List[str]): The descriptions of the items.
    embeddings (List[np.ndarray]): The embeddings of the items.
    n (int, optional): The number of most similar items to return. Defaults to 5.

    Returns:
    pd.DataFrame: A DataFrame sorted by similarity to the query, containing the top n most similar items.
    """
    embedded_query = model.encode(query)
    similarities = []
    for embedding in embeddings:
        similarities.append(cosine_similarity(embedded_query, embedding))
    df = pd.DataFrame({
        "similarity": similarities,
        "name": names,
        "description": descriptions
    })
    return df.sort_values("similarity", ascending = False).head(n)

### Example
Let's load some data about pokemon characters from a json file that I asked Claude-3.5 Sonnet to generate

In [17]:
with open("data/pokemon.json") as f:
    pokemon_info = json.load(f)

# look at the first one
pokemon_info[0]

{'name': 'Pikachu',
 'description': "Pikachu is an iconic Electric-type Pokemon resembling a yellow mouse. It has red cheeks that store electricity and a lightning bolt-shaped tail. Pikachu is known for its ability to generate powerful electric shocks. It's the mascot of the Pokemon franchise and a popular starter Pokemon for many trainers."}

In [18]:
names = [item["name"] for item in pokemon_info]
descriptions = [item["description"] for item in pokemon_info]
embeddings = [model.encode(item["description"]) for item in pokemon_info]

In [19]:
query = "which pokemon can fly?"
print(f"query: '{query}'")
df_similar = retrieve_most_similar(query, names, descriptions, embeddings, n=5)
df_similar

query: 'which pokemon can fly?'


Unnamed: 0,similarity,name,description
1,0.481414,Charizard,Charizard is a powerful Fire/Flying-type Pokem...
26,0.461601,Blaziken,Blaziken is a Fire/Fighting-type Pokemon that ...
14,0.461312,Magikarp,Magikarp is a Water-type Pokemon infamous for ...
4,0.449819,Gyarados,Gyarados is a fearsome Water/Flying-type Pokem...
15,0.449307,Dragonite,Dragonite is a powerful Dragon/Flying-type Pok...


In [20]:
for i, description in enumerate(df_similar["description"], start=1):
    print(i, description, "\n")

1 Charizard is a powerful Fire/Flying-type Pokemon that evolves from Charmeleon. It resembles a large, orange dragon with wings and a flame burning at the tip of its tail. Charizard can breathe intense flames and fly at great speeds. It's known for its fierce battles and loyalty to its trainer. 

2 Blaziken is a Fire/Fighting-type Pokemon that evolves from Combusken. It has a bird-like appearance with powerful legs and fiery wrists. Blaziken is known for its incredible jumping ability and powerful kicks that can shatter skyscrapers. It's often seen as a representation of determination and fighting spirit. 

3 Magikarp is a Water-type Pokemon infamous for being weak and useless in battle. It resembles a large, orange fish and is known for its inability to learn many moves. However, Magikarp evolves into the powerful Gyarados, embodying the theme of hidden potential. It's often used as an example of how seemingly weak Pokemon can become strong. 

4 Gyarados is a fearsome Water/Flying-typ

#### So these results aren't the best, but we'll do better in a later notebook on RAG