# Sentence Embeddings
Just like how words can be converted to vectors, so can sentences

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import gensim
import gensim.downloader
import json

from utils import *

## Sentence Embeddings
There are a variety of ways to embed entire sentences. 

- take the average of all of the words' embedding
- use a more sophisticated embedding model like BERT

### Average the word embeddings
The simplest way is to take the embedding of each word then average them together. This seems silly, but it is pretty effective. 

In [4]:
# Use small local model 
model_name = 'word2vec-google-news-300'
# model_name = 'glove-twitter-100'
w2v_model = load_model(model_name)

loaded from binary: models/word2vec-google-news-300.bin


In [20]:
def embed_sentence_average(sentence, w2v_model):
    """Creates sentence embedding based on the average of the words' embeddings"""
    # split sentence into words
    words = sentence.split()
    
    # make array of embeddings, one row for each word
    embeddings = np.array([w2v_model[w.lower()] for w in words if w in w2v_model])
    
    # average over the rows (axis = 0)
    average_embedding = embeddings.mean(axis = 0)
    
    return average_embedding

def cosine_similarity(A, B):
    """Calculates the cosine similarity of 2 vectors A,B"""
    dot_product = np.dot(A, B)
    magnitude_A = np.linalg.norm(A)
    magnitude_B = np.linalg.norm(B)
    return dot_product / (magnitude_A * magnitude_B)

In [21]:
sentence = "I love cookies"
embedding = embed_sentence_average(sentence, w2v_model)
embedding.shape

(300,)

In [22]:
sentence_1 = "I love cookies"
sentence_2 = "cookies are my favorite"

embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

0.7167495

In [23]:
sentence_1 = "I love cookies"
sentence_2 = "that car is speeding in the school zone"

embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

0.1749812

In [24]:
# note that we can also use gensim's n_similarity() funtion to do the same
w2v_model.n_similarity(sentence_1.split(),sentence_2.split())

0.24969348

In [45]:
# Let's visualize defferent sentence embeddings in a PCA-reduced 3d space
sentences = [
    "cookies are my favorite",
    "cookies are a glorious dessert",
    "my, how I love biscuits",
    "I am a huge cookie lover",
    "Cookies are delicious",
    # "my dad ate all the pie", 
    # "could you make me some sandwiches", 
    # "did you eat all the dessert", 
    # "where is the remote", 
    "the tv is on",
    "summer vacation is coming",
    "where should we go on vacation",
    "the motorcycle can go very fast", 
    "that car speeding in the school zone"
]
embeddings = [embed_sentence_average(sentence, w2v_model) for sentence in sentences]
embeddings_reduced = reduce_dimensions(embeddings)
plot_embeddings(embeddings_reduced, sentences, title="Average sentence embeddings (reduced to 3D)")

In [46]:
# similarity between one word and many others
sentence = "I love cookies"
embedding = embed_sentence_average(sentence, w2v_model)



print(f"Similarity between {sentence} and :")
for other_sentence in sentences:
    other_embedding = embed_sentence_average(other_sentence, w2v_model)
    similarity = cosine_similarity(embedding, other_embedding)
    print(f"\t{similarity:.2f} : {other_sentence}")

Similarity between I love cookies and :
	0.72 : cookies are my favorite
	0.66 : cookies are a glorious dessert
	0.79 : my, how I love biscuits
	0.73 : I am a huge cookie lover
	0.67 : Cookies are delicious
	0.31 : the tv is on
	0.19 : summer vacation is coming
	0.31 : where should we go on vacation
	0.29 : the motorcycle can go very fast
	0.17 : that car speeding in the school zone


### Limitations of average embeddings
A drawback is that this method does not take into account the order of the words, which is very important to the meaning.

In [27]:
sentence_1 = "I love cats and hate dogs"
sentence_2 = "I love dogs and hate cats"
w2v_model.n_similarity(sentence_1.split(),sentence_2.split())

1.0

In [28]:
embedding_1 = embed_sentence_average(sentence_1, w2v_model)
embedding_2 = embed_sentence_average(sentence_2, w2v_model)

cosine_similarity(embedding_1, embedding_2)

1.0

## Advanced methods
We can use a language model to predict the next word in a sentence. 

Here we will a sentence transformer embedding model that can be run locally using the Transformers library. The process is a little involved and we won't get into it here. This will produce a 768-dimension embedding vector.

In [55]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [56]:
sentence = "I love cats and hate dogs"


embeddings1 = model.encode(sentence)
embeddings1.shape
# similarities = model.similarity(embeddings1, embeddings2)

(384,)

In [57]:
sentence_1 = "I love cats and hate dogs"
# sentence_2 = "I don't like dogs and love cats"
# sentence_2 = "I don't like dogs and love cats"
sentence_2 = "eat my shorts"
# sentence_2 = "I love dogs and hate cats"

embedding_1 = model.encode(sentence_1)
embedding_2 = model.encode(sentence_1)

similarity = model.similarity(embedding_1, embedding_2)
similarity

tensor([[1.0000]])

In [47]:
# similarity between one word and many others
sentence = "I love cookies"
embedding = model.encode(sentence)

print(f"Similarity between {sentence} and :")
for other_sentence in sentences:
    other_embedding = model.encode(other_sentence)
    similarity = cosine_similarity(embedding, other_embedding)
    print(f"\t{similarity:.2f} : {other_sentence}")

Similarity between I love cookies and :
	0.88 : cookies are my favorite
	0.73 : cookies are a glorious dessert
	0.60 : my, how I love biscuits
	0.77 : I am a huge cookie lover
	0.82 : Cookies are delicious
	-0.04 : the tv is on
	0.11 : summer vacation is coming
	0.10 : where should we go on vacation
	0.07 : the motorcycle can go very fast
	0.01 : that car speeding in the school zone


## Retrieval
The concept of similarity is useful in the context of retrieval augmented generation (RAG). 
Rag is useful when you have a lot of documents that you would like to ask questions of with an LLM. Typicaly the LLM has no knowledge of your documents so you need to *retrieve* the relevant info from your documents and send it to the LLM

The idea is you ask a question, embed the question, then retrieve your documents that are closest to 

In [20]:
pokemon_info = [
{
"name": "Pikachu",
"description": "Pikachu is an iconic Electric-type Pokémon resembling a yellow mouse. It has red cheeks that store electricity and a lightning bolt-shaped tail. Pikachu is known for its ability to generate powerful electric shocks. It's the mascot of the Pokémon franchise and a popular starter Pokémon for many trainers."
},
{
"name": "Charizard",
"description": "Charizard is a powerful Fire/Flying-type Pokémon that evolves from Charmeleon. It resembles a large, orange dragon with wings and a flame burning at the tip of its tail. Charizard can breathe intense flames and fly at great speeds. It's known for its fierce battles and loyalty to its trainer."
},
{
"name": "Mewtwo",
"description": "Mewtwo is a legendary Psychic-type Pokémon created through genetic manipulation. It has a humanoid appearance with feline features and immense psychic powers. Mewtwo is known for its intelligence, powerful abilities, and complex backstory. It's often portrayed as a conflicted and misunderstood character in the Pokémon universe."
},
{
"name": "Bulbasaur",
"description": "Bulbasaur is a Grass/Poison-type Pokémon that's one of the original starter choices. It has a plant bulb on its back that grows as it evolves. Bulbasaur can use its vines for various tasks and absorb sunlight for energy. It's known for its gentle nature and versatility in battle."
},
{
"name": "Gyarados",
"description": "Gyarados is a fearsome Water/Flying-type Pokémon that evolves from the seemingly weak Magikarp. It resembles a large, serpentine dragon with a gaping mouth. Gyarados is known for its destructive power and ability to cause massive storms. It represents the concept of perseverance and hidden potential."
},
{
"name": "Snorlax",
"description": "Snorlax is a large, round Normal-type Pokémon known for its sleeping habits. It spends most of its time eating and sleeping, only moving to find more food. Despite its lazy appearance, Snorlax is incredibly strong and can be a formidable opponent in battle. It's often used as a roadblock in Pokémon games."
},
{
"name": "Eevee",
"description": "Eevee is a Normal-type Pokémon famous for its multiple evolution possibilities. It has a fox-like appearance with large ears and a fluffy tail. Eevee can evolve into eight different forms, each representing a different type. This unique trait makes it a popular and versatile Pokémon among trainers."
},
{
"name": "Gengar",
"description": "Gengar is a Ghost/Poison-type Pokémon with a mischievous nature. It has a round body, red eyes, and a wide, toothy grin. Gengar is known for its ability to hide in shadows and create illusions. It's often associated with pranks and haunting, making it a favorite among ghost-type enthusiasts."
},
{
"name": "Meowth",
"description": "Meowth is a Normal-type Pokémon resembling a cat with a coin-like charm on its forehead. It's known for its love of shiny objects and its ability to speak human language in the anime. Meowth is often associated with Team Rocket and is famous for its cunning nature and get-rich-quick schemes."
},
{
"name": "Jigglypuff",
"description": "Jigglypuff is a Normal/Fairy-type Pokémon known for its singing ability. It has a round, pink body with large blue eyes. Jigglypuff's song can put anyone to sleep, which it often uses in battle. It becomes frustrated when others fall asleep during its performances and draws on their faces."
},
{
"name": "Alakazam",
"description": "Alakazam is a Psychic-type Pokémon with extraordinary mental capabilities. It has a humanoid appearance with a large mustache and holds two spoons to focus its psychic powers. Alakazam is said to have an IQ of 5,000 and can remember everything since its birth. It's known for its powerful psychic attacks and strategic battling."
},
{
"name": "Machamp",
"description": "Machamp is a muscular Fighting-type Pokémon with four arms. It's known for its incredible strength and martial arts skills. Machamp can throw a thousand punches in two seconds and move mountains with its bare hands. It's often seen as a symbol of physical power in the Pokémon world."
},
{
"name": "Lapras",
"description": "Lapras is a Water/Ice-type Pokémon resembling a plesiosaur with a shell on its back. It's known for its gentle nature and ability to understand human speech. Lapras can ferry people across bodies of water and is often associated with legends and folklore. It's highly intelligent and has a beautiful singing voice."
},
{
"name": "Ditto",
"description": "Ditto is a Normal-type Pokémon with the unique ability to transform into any other Pokémon. In its natural state, it appears as a pink blob with a simple face. Ditto can copy not only the appearance but also the abilities of other Pokémon. This makes it invaluable for breeding and a tricky opponent in battles."
},
{
"name": "Magikarp",
"description": "Magikarp is a Water-type Pokémon infamous for being weak and useless in battle. It resembles a large, orange fish and is known for its inability to learn many moves. However, Magikarp evolves into the powerful Gyarados, embodying the theme of hidden potential. It's often used as an example of how seemingly weak Pokémon can become strong."
},
{
"name": "Dragonite",
"description": "Dragonite is a powerful Dragon/Flying-type Pokémon with a friendly appearance. It resembles a large, orange dragon with small wings. Despite its intimidating size, Dragonite is known for its gentle and helpful nature. It can fly around the world in just 16 hours and is said to live in the sea."
},
{
"name": "Mew",
"description": "Mew is a Mythical Psychic-type Pokémon said to contain the genetic code of all Pokémon. It has a small, pink body with large eyes and a long tail. Mew is known for its playful nature and ability to learn any move. It's extremely rare and is often the subject of scientific research in the Pokémon world."
},
{
"name": "Articuno",
"description": "Articuno is a Legendary Ice/Flying-type Pokémon resembling a majestic blue bird. It's said to control ice and can create blizzards by flapping its wings. Articuno is one of the three Legendary birds of Kanto and is associated with winter and freezing temperatures. Its appearance is said to bring good luck to those who see it."
},
{
"name": "Scyther",
"description": "Scyther is a Bug/Flying-type Pokémon known for its speed and sharp blades. It has a green, insectoid body with scythe-like forearms. Scyther is a skilled hunter that can slice through thick logs with its blades. It's often found in grasslands and is respected for its ninja-like agility."
},
{
"name": "Cubone",
"description": "Cubone is a Ground-type Pokémon known for wearing a skull helmet. It carries a bone as a weapon and is often associated with loneliness and loss. Cubone is said to be mourning its deceased mother, whose skull it wears. Despite its sad backstory, it's a loyal and determined Pokémon in battle."
},
{
"name": "Vaporeon",
"description": "Vaporeon is a Water-type evolution of Eevee. It has a fish-like appearance with fins and a mermaid-like tail. Vaporeon can melt into water, becoming invisible, and has the ability to control water molecules. It's often found near clean water sources and is known for its graceful swimming abilities."
},
{
"name": "Metagross",
"description": "Metagross is a powerful Steel/Psychic-type Pokémon with a body made of steel. It has four legs and a massive, disc-like body with a large X on its face. Metagross is known for its high intelligence, possessing four brains that can perform complex calculations. It's often seen as one of the strongest non-legendary Pokémon."
},
{
"name": "Lucario",
"description": "Lucario is a Fighting/Steel-type Pokémon known for its ability to sense and manipulate aura. It has a canine-like appearance with spike protrusions on its body. Lucario is highly intelligent and can communicate telepathically with humans. It's often associated with concepts of loyalty and justice in the Pokémon world."
},
{
"name": "Gardevoir",
"description": "Gardevoir is a Psychic/Fairy-type Pokémon known for its elegance and protective nature. It has a humanoid appearance with a flowing gown-like lower body. Gardevoir can create small black holes and is said to be able to see the future. It forms a strong bond with its trainer and will protect them at all costs."
},
{
"name": "Tyranitar",
"description": "Tyranitar is a Rock/Dark-type Pokémon resembling a large, armored tyrannosaurus. It's known for its immense strength and ability to topple mountains. Tyranitar's body is as hard as rock, and it's often compared to Godzilla in appearance. It's considered a pseudo-legendary Pokémon due to its power and rarity."
},
{
"name": "Rayquaza",
"description": "Rayquaza is a Legendary Dragon/Flying-type Pokémon that resembles a large, green serpentine dragon. It's said to live in the ozone layer and has the ability to calm conflicts between Kyogre and Groudon. Rayquaza is known for its power over the weather and its role as a guardian of the sky."
},
{
"name": "Blaziken",
"description": "Blaziken is a Fire/Fighting-type Pokémon that evolves from Combusken. It has a bird-like appearance with powerful legs and fiery wrists. Blaziken is known for its incredible jumping ability and powerful kicks that can shatter skyscrapers. It's often seen as a representation of determination and fighting spirit."
},
{
"name": "Wobbuffet",
"description": "Wobbuffet is a Psychic-type Pokémon known for its defensive abilities. It has a blue, blob-like body with squinty eyes and a black tail with eye-like markings. Wobbuffet doesn't generally attack first but instead counters enemy moves. It's famous for its catchphrase 'Wobbuffet!' in the anime and its comical personality."
},
{
"name": "Absol",
"description": "Absol is a Dark-type Pokémon often associated with disasters. It has a quadruped form with a scythe-like growth on its head. Absol is known for appearing before natural disasters, leading to it being mistakenly viewed as a harbinger of doom. In reality, it tries to warn people of impending danger."
}
]
for item in pokemon_info:
    item['description'] = item["description"].replace("é","e")
with open("data/pokemon.json", "w") as f:
    json.dump(pokemon_info, f, indent = 2)

In [21]:
def retrieve_most_similar(query, names, descriptions, embeddings, n=5):
    """
    Retrieves the most similar items to a given query based on their embeddings.

    Parameters:
    query (str): The query to compare against.
    names (List[str]): The names of the items.
    descriptions (List[str]): The descriptions of the items.
    embeddings (List[np.ndarray]): The embeddings of the items.
    n (int, optional): The number of most similar items to return. Defaults to 5.

    Returns:
    pd.DataFrame: A DataFrame sorted by similarity to the query, containing the top n most similar items.
    """
    embedded_query = model.encode(query)
    similarities = []
    for embedding in embeddings:
        similarities.append(cosine_similarity(embedded_query, embedding))
    df = pd.DataFrame({
        "similarity": similarities,
        "name": names,
        "description": descriptions
    })
    return df.sort_values("similarity", ascending = False).head(n)

In [22]:
with open("data/pokemon.json") as f:
    pokemon_info = json.load(f)

pokemon_info[0]

{'name': 'Pikachu',
 'description': "Pikachu is an iconic Electric-type Pokemon resembling a yellow mouse. It has red cheeks that store electricity and a lightning bolt-shaped tail. Pikachu is known for its ability to generate powerful electric shocks. It's the mascot of the Pokemon franchise and a popular starter Pokemon for many trainers."}

In [23]:
names = [item["name"] for item in pokemon_info]
descriptions = [item["description"] for item in pokemon_info]
embeddings = [model.encode(item["description"]) for item in pokemon_info]

In [26]:
query = "which pokemon can fly?"
print(f"query: '{query}'")
df_similar = retrieve_most_similar(query, names, descriptions, embeddings, n=5)
df_similar

query: 'which pokemon can fly?'


Unnamed: 0,similarity,name,description
1,0.481414,Charizard,Charizard is a powerful Fire/Flying-type Pokem...
26,0.461601,Blaziken,Blaziken is a Fire/Fighting-type Pokemon that ...
14,0.461312,Magikarp,Magikarp is a Water-type Pokemon infamous for ...
4,0.449819,Gyarados,Gyarados is a fearsome Water/Flying-type Pokem...
15,0.449307,Dragonite,Dragonite is a powerful Dragon/Flying-type Pok...


In [27]:
for i, description in enumerate(df_similar["description"], start=1):
    print(i, description, "\n")

1 Charizard is a powerful Fire/Flying-type Pokemon that evolves from Charmeleon. It resembles a large, orange dragon with wings and a flame burning at the tip of its tail. Charizard can breathe intense flames and fly at great speeds. It's known for its fierce battles and loyalty to its trainer. 

2 Blaziken is a Fire/Fighting-type Pokemon that evolves from Combusken. It has a bird-like appearance with powerful legs and fiery wrists. Blaziken is known for its incredible jumping ability and powerful kicks that can shatter skyscrapers. It's often seen as a representation of determination and fighting spirit. 

3 Magikarp is a Water-type Pokemon infamous for being weak and useless in battle. It resembles a large, orange fish and is known for its inability to learn many moves. However, Magikarp evolves into the powerful Gyarados, embodying the theme of hidden potential. It's often used as an example of how seemingly weak Pokemon can become strong. 

4 Gyarados is a fearsome Water/Flying-typ

In [82]:
# embeddings = [model.encode(info) for pokemon, info in pokemon_info.items()]
# embeddings_reduced = reduce_dimensions(embeddings)
# plot_embeddings(embeddings_reduced, pokemon_info.keys(), title="Pokemon (reduced to 3D)")