# Getting Started With Text Embeddings w/ Mistral

In [1]:
import sys
#!{sys.executable} -m pip install -U google-generativeai

In [2]:
#import google.generativeai as palm
import os
import pprint
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#palm.configure(api_key=os.environ['API_KEY'])

In [3]:
from dotenv import load_dotenv, find_dotenv     

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

In [4]:
"""
for model in palm.list_models():
  if 'embedText' in model.supported_generation_methods:
    print(model.name)
"""   

"\nfor model in palm.list_models():\n  if 'embedText' in model.supported_generation_methods:\n    print(model.name)\n"

In [6]:
import os
import numpy as np

from mistralai.client import MistralClient

import functools


@functools.lru_cache(maxsize=None)
def _mistral_client(api_key):
    """Return a Mistral client for `api_key`, building it only once per distinct key."""
    return MistralClient(api_key=api_key)


def get_text_embedding(txt):
    """Return the `mistral-embed` embedding for `txt`.

    The API key is read from the `MISTRAL_API_KEY` environment variable on
    every call; the client itself is reused across calls instead of being
    rebuilt for each request.

    Args:
        txt: The text passed as `input` to the embeddings endpoint. The cells
            in this notebook always pass a single string.

    Returns:
        The `embedding` attribute of the first item in the response's `data`
        list. Only that first item is returned, even if the response holds more.
    """
    client = _mistral_client(os.getenv("MISTRAL_API_KEY"))
    response = client.embeddings(model="mistral-embed", input=txt)
    return response.data[0].embedding

#### Use the embeddings model


In [9]:
# Compare a single word against a full sentence that contains it.
x = 'life'
close_to_x = 'What is the meaning of life?'

# One embedding request per text.
embedding_x = get_text_embedding(x)
embedding_close_to_x = get_text_embedding(close_to_x)

# Show the dimensionality and a short preview instead of the whole vector.
print(f"Length = {len(embedding_x)}")
print(embedding_x[:10])


Length = 1024
[-0.04510498046875, 0.0157623291015625, 0.0043182373046875, -0.0145416259765625, 0.0033359527587890625, 0.00618743896484375, 0.033355712890625, -0.00968170166015625, -0.000301361083984375, -0.0233306884765625]


In [10]:
# Same preview for the sentence embedding: dimensionality, then the first ten values.
print(f"Length = {len(embedding_close_to_x)}", embedding_close_to_x[:10], sep="\n")


Length = 1024
[-0.03485107421875, 0.027435302734375, 0.0308685302734375, 0.0004267692565917969, 0.0021762847900390625, -0.0298919677734375, 0.05352783203125, -0.0126953125, -0.00418853759765625, -0.035736083984375]


In [12]:
# Dot product of the two embedding vectors, used here as the similarity score.
# NOTE(review): a dot product equals cosine similarity only when both vectors
# are unit-length — presumably true for mistral-embed output; confirm against
# the Mistral embeddings documentation.
# (The PaLM version indexed each response with ['embedding'] before the dot product.)
similar_measure = np.dot(embedding_x, embedding_close_to_x)
print(similar_measure)

0.7719042309013275


#### Similarity

- Calculate the similarity between two sentences as a number between 0 and 1.
- Try out your own sentences and check if the similarity calculations match your intuition.


In [24]:
"""
emb_1 = embedding_model.get_embeddings(
    ["What is the meaning of life?"]) # 42!

emb_2 = embedding_model.get_embeddings(
    ["How does one spend their time well on Earth?"])

emb_3 = embedding_model.get_embeddings(
    ["Would you like a salad?"])

vec_1 = [emb_1[0].values]
vec_2 = [emb_2[0].values]
vec_3 = [emb_3[0].values]

emb_1 = palm.generate_embeddings(model=model, text=["What is the meaning of life?"])
emb_2 = palm.generate_embeddings(model=model, text=["How does one spend their time well on Earth?"])
emb_3 = palm.generate_embeddings(model=model, text=["Would you like a salad?"])

vec_1 = emb_1['embedding']
vec_2 = emb_2['embedding']
vec_3 = emb_3['embedding']
"""

text1="What is the meaning of life?"
text2="How does one spend their time well on Earth?"
text3="Would you like a salad?"

vec_1 = get_text_embedding(text1)
vec_2 = get_text_embedding(text2)
vec_3 = get_text_embedding(text3)
print(f"Length = {len(vec_1)}")


Length = 1024


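- Note: `cosine_similarity` expects either a 2D numpy array or a list of lists, so a single embedding (a flat Python list) must be wrapped in another list before it is passed to that function, as in the snippet below. The Mistral cells in this notebook compare the flat vectors with `np.dot` instead, so no wrapping is needed there.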
```Python
vec_1 = [emb_1[0].values]
```


In [21]:
# Toy example of sklearn's pairwise cosine_similarity on lists of lists.
# The result has one row per vector in X and one column per vector in Y;
# the all-zero first row of X scores 0 against everything.
from sklearn.metrics.pairwise import cosine_similarity
X = [[0, 0, 0], [1, 1, 1]]
Y = [[1, 0, 0], [1, 1, 0]]
cosine_similarity(X, Y)

array([[0.        , 0.        ],
       [0.57735027, 0.81649658]])

In [26]:
"""
print(cosine_similarity(vec_1, vec_2)) 
print(cosine_similarity(vec_2, vec_3))
print(cosine_similarity(vec_1, vec_3])
"""
print(np.dot(vec_1, vec_2)) 
print(np.dot(vec_2, vec_3)) 
print(np.dot(vec_1, vec_3)) 


0.7440927744532928
0.6568434339229725
0.6933973190670031


#### From word to sentence embeddings
- One possible way to calculate sentence embeddings from word embeddings is to take the average of the word embeddings.
- This ignores word order and context, so two sentences with different meanings, but the same set of words will end up with the same sentence embedding.


In [29]:
in_1 = "The kids play in the park."
in_2 = "The play was for kids in the park."

- Remove stop words like ["the", "in", "for", "an", "is"] and punctuation.

In [30]:
in_pp_1 = ["kids", "play", "park"]
in_pp_2 = ["play", "kids", "park"]

- Generate one embedding for each word.  So this is a list of three lists.

In [11]:
"""
embeddings_1 = [palm.generate_embeddings(model=model, text=s)['embedding'] for s in in_pp_1]
print(len(embeddings_1))

emb_array_1 = np.stack(embeddings_1)
print(emb_array_1.shape)
"""

3
(3, 768)


In [31]:
# Embed each pre-processed word of the first sentence (one request per word)
# and report how many rows came back.
embeddings_1 = np.array([get_text_embedding(word) for word in in_pp_1])
print(embeddings_1.shape[0])

# Stack the per-word rows into a single 2D array: (words, embedding size).
emb_array_1 = np.stack(embeddings_1)
print(emb_array_1.shape)

3
(3, 1024)


- Use numpy to convert this list of lists into a 2D array of 3 rows and 1024 columns (one row per word, one column per embedding dimension).

In [32]:
"""
embeddings_2 = [palm.generate_embeddings(model=model, text=s)['embedding'] for s in in_pp_2]
emb_array_2 = np.stack(embeddings_2)
print(emb_array_1.shape)
"""

embeddings_2 = np.array([get_text_embedding(s) for s in in_pp_2])
print(len(embeddings_2))

emb_array_2 = np.stack(embeddings_2)
print(emb_array_2.shape)

3
(3, 1024)


- Take the average embedding across the 3 word embeddings 
- You'll get a single embedding of length 1024.


In [34]:
# Average over the word axis (axis 0) to collapse the per-word rows into one vector.
emb_1_mean = np.mean(emb_array_1, axis=0)
print(emb_1_mean.shape)

(1024,)


In [35]:
# Same word-axis average for the second sentence's word embeddings.
emb_2_mean = np.mean(emb_array_2, axis=0)

- Check to see that taking an average of word embeddings results in two sentence embeddings that are identical.

In [36]:
# Print the first four components of each averaged embedding for a visual comparison.
for mean_embedding in (emb_1_mean, emb_2_mean):
    print(mean_embedding[:4])

[-0.03903707  0.02631632  0.01692963 -0.00691509]
[-0.03903707  0.02631632  0.01692963 -0.00691509]


#### Get sentence embeddings from the model.
- These sentence embeddings account for word order and context.
- Verify that the sentence embeddings are not the same.

In [37]:
# Echo the two original sentences, one per line.
print(in_1, in_2, sep="\n")

The kids play in the park.
The play was for kids in the park.


In [17]:
"""
embedding_1 = palm.generate_embeddings(model=model, text=in_1)
embedding_2 = palm.generate_embeddings(model=model, text=in_2)
vector_1 = embedding_1['embedding']
print(vector_1[:4])
vector_2 = embedding_2['embedding']
print(vector_2[:4])
"""

In [18]:
"""
vector_1 = embedding_1['embedding']
print(vector_1[:4])
vector_2 = embedding_2['embedding']
print(vector_2[:4])
"""

[0.0040104063, -0.02063808, -0.0028912085, -0.007481416]
[-0.0154303685, -0.012839607, 0.012309532, -0.00071919535]


In [38]:
# Embed the full sentences directly (one request each) rather than averaging words.
vector_1 = get_text_embedding(in_1)
vector_2 = get_text_embedding(in_2)

# Preview the first four components of each sentence embedding.
for sentence_vector in (vector_1, vector_2):
    print(sentence_vector[:4])

[-0.0149078369140625, 0.00988006591796875, 0.041595458984375, -0.027313232421875]
[-0.052001953125, 0.032867431640625, 0.054046630859375, -0.007633209228515625]
