# Getting Started With Text Embeddings

In [2]:
import sys
!{sys.executable} -m pip install -U google-generativeai


Collecting google-generativeai
  Obtaining dependency information for google-generativeai from https://files.pythonhosted.org/packages/f4/0a/d14c0482986fb488e4833399a29e972d63635d1f27b198e32fb28101f585/google_generativeai-0.2.1-py3-none-any.whl.metadata
  Downloading google_generativeai-0.2.1-py3-none-any.whl.metadata (3.1 kB)
Collecting google-ai-generativelanguage==0.3.3 (from google-generativeai)
  Obtaining dependency information for google-ai-generativelanguage==0.3.3 from https://files.pythonhosted.org/packages/8f/66/b5a83cdb4c8cb1d0f64243ae61bfa4e970b5740c2999dfe1bb96725599ab/google_ai_generativelanguage-0.3.3-py3-none-any.whl.metadata
  Downloading google_ai_generativelanguage-0.3.3-py3-none-any.whl.metadata (5.1 kB)
Downloading google_generativeai-0.2.1-py3-none-any.whl (130 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m754.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading google_ai_generativelanguage-0.3.3-py3-none-

In [11]:
import google.generativeai as palm
import os
import pprint
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hand the API key to the client library. The key is read from the API_KEY
# environment variable, so it never appears in the notebook itself; a missing
# variable raises KeyError here.
palm.configure(
    api_key=os.environ["API_KEY"],
)


In [4]:
# Print the name of every listed model whose supported generation methods
# include 'embedText'.
# The loop variable is deliberately not called `model`: later cells bind
# `model` to the embedding model's name string, and re-running this cell
# would otherwise silently rebind it to the last listed model entry.
for available_model in palm.list_models():
    if 'embedText' in available_model.supported_generation_methods:
        print(available_model.name)
    

models/embedding-gecko-001


#### Use the embeddings model


In [5]:
# Name of the embedding model used for every request in this notebook.
model = "models/embedding-gecko-001"

# A single word and a full sentence that is related to it.
x = 'life'
close_to_x = 'What is the meaning of life?'

# One request per text; each response exposes its vector under 'embedding'.
embedding_x = palm.generate_embeddings(text=x, model=model)
embedding_close_to_x = palm.generate_embeddings(text=close_to_x, model=model)

# Report the size of the embedding of `x` and preview its first 10 values.
vector = embedding_x['embedding']
print(f"Length = {len(vector)}")
print(vector[:10])


Length = 768
[-0.0062485174, 0.015260352, -0.030612998, 0.053223047, 0.014430896, -0.05429975, 0.045275558, 0.021513924, -0.06517641, 0.019092383]


In [6]:
# Same inspection for the sentence embedding: its length, then the first
# 10 values on the following line.
vector = embedding_close_to_x['embedding']
print(f"Length = {len(vector)}", vector[:10], sep="\n")


Length = 768
[0.020220786, 0.022316508, -0.009204807, 0.004977066, 0.016092807, -0.018982107, 0.043249473, 0.026188457, -0.04345435, 0.023770839]


In [22]:
# Display the raw response dict. Its 'embedding' key holds the complete
# vector, so the output of this cell is long.
embedding_close_to_x


{'embedding': [0.020220786,
  0.022316508,
  -0.009204807,
  0.004977066,
  0.016092807,
  -0.018982107,
  0.043249473,
  0.026188457,
  -0.04345435,
  0.023770839,
  0.0070713465,
  0.013504822,
  0.027093675,
  -0.0010874213,
  -0.05329074,
  -0.034527313,
  -0.02143942,
  -0.019982468,
  0.036332387,
  -6.272615e-05,
  -0.09729496,
  -0.04436393,
  -0.01800044,
  -0.03490506,
  0.00077523215,
  -0.116615035,
  0.018457556,
  -0.02432723,
  -0.043237586,
  -0.016221896,
  0.015136606,
  0.008927828,
  -0.02670597,
  -0.004168643,
  -0.0037328538,
  0.08499492,
  -0.013948273,
  0.025831716,
  0.0075642345,
  0.016611703,
  0.08692311,
  -0.03519171,
  0.04700779,
  0.002084103,
  -0.03109261,
  -0.010730861,
  0.008774334,
  -0.0024247786,
  -0.06268653,
  -0.019765627,
  -0.0014091614,
  -0.0044126306,
  -0.0024397536,
  0.06306274,
  0.011103913,
  0.021858923,
  -0.031723406,
  0.01418032,
  -0.031356424,
  0.016775755,
  -0.017747268,
  -0.0017708481,
  -0.038333207,
  -0.0874839

In [10]:
# Dot product of the word embedding and the sentence embedding, printed as a
# single similarity score.
vec_x = embedding_x['embedding']
vec_close_to_x = embedding_close_to_x['embedding']
similar_measure = np.dot(vec_x, vec_close_to_x)
print(similar_measure)


0.642187941111088


'\nemb_1 = embedding_model.get_embeddings(\n    ["What is the meaning of life?"]) # 42!\n\nemb_2 = embedding_model.get_embeddings(\n    ["How does one spend their time well on Earth?"])\n\nemb_3 = embedding_model.get_embeddings(\n    ["Would you like a salad?"])\n\nvec_1 = [emb_1[0].values]\nvec_2 = [emb_2[0].values]\nvec_3 = [emb_3[0].values]\n'

#### Similarity

- Calculate the similarity between two sentences as a number between 0 and 1.
- Try out your own sentences and check if the similarity calculations match your intuition.


In [15]:
"""
emb_1 = embedding_model.get_embeddings(
    ["What is the meaning of life?"]) # 42!

emb_2 = embedding_model.get_embeddings(
    ["How does one spend their time well on Earth?"])

emb_3 = embedding_model.get_embeddings(
    ["Would you like a salad?"])

vec_1 = [emb_1[0].values]
vec_2 = [emb_2[0].values]
vec_3 = [emb_3[0].values]
"""

emb_1 = palm.generate_embeddings(model=model, text=["What is the meaning of life?"])
emb_2 = palm.generate_embeddings(model=model, text=["How does one spend their time well on Earth?"])
emb_3 = palm.generate_embeddings(model=model, text=["Would you like a salad?"])

vec_1 = emb_1['embedding']
vec_2 = emb_2['embedding']
vec_3 = emb_3['embedding']


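- Note: the `cosine_similarity` function expects either a 2D numpy array or a list of lists. That is why each sentence is passed to `generate_embeddings` inside a list: the returned `'embedding'` is then a list of embeddings (a list of lists) rather than a single flat list.
```Python
emb_1 = palm.generate_embeddings(model=model, text=["What is the meaning of life?"])
vec_1 = emb_1['embedding']
```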


In [16]:
# Pairwise cosine similarity for the three sentence embeddings, printed in the
# order (1, 2), (2, 3), (1, 3).
for left, right in ((vec_1, vec_2), (vec_2, vec_3), (vec_1, vec_3)):
    print(cosine_similarity(left, right))


[[0.65584864]]
[[0.51988917]]
[[0.54102993]]


#### From word to sentence embeddings
- One possible way to calculate sentence embeddings from word embeddings is to take the average of the word embeddings.
- This ignores word order and context, so two sentences with different meanings but the same set of words will end up with the same sentence embedding.


In [17]:
in_1 = "The kids play in the park."
in_2 = "The play was for kids in the park."


- Remove stop words like ["the", "in", "for", "an", "is", "was"] and punctuation.

In [18]:
in_pp_1 = ["kids", "play", "park"]
in_pp_2 = ["play", "kids", "park"]


- Generate one embedding for each word.  So this is a list of three lists.

In [34]:
# Request one embedding per word of the first sentence, then keep only the
# vectors from the responses.
responses_1 = [palm.generate_embeddings(model=model, text=word) for word in in_pp_1]
embeddings_1 = [response['embedding'] for response in responses_1]
print(len(embeddings_1))

# Stack the per-word vectors into one 2D array (one row per word).
emb_array_1 = np.stack(embeddings_1)
print(emb_array_1.shape)


3
(3, 768)


- Use numpy to convert this list of lists into a 2D array of 3 rows and 768 columns.

In [35]:
# Request one embedding per word of the second sentence and stack the vectors
# into a 2D array (one row per word).
embeddings_2 = [palm.generate_embeddings(model=model, text=s)['embedding'] for s in in_pp_2]
emb_array_2 = np.stack(embeddings_2)
# Report the shape of the array built in this cell; the original printed
# emb_array_1.shape here, a copy-paste slip from the previous cell.
print(emb_array_2.shape)


(3, 768)


- Take the average embedding across the 3 word embeddings 
- You'll get a single embedding of length 768.


In [36]:
# Average over the word axis (axis 0) to collapse the per-word rows into a
# single vector for the first sentence.
emb_1_mean = np.mean(emb_array_1, axis=0)
print(emb_1_mean.shape)


(768,)


In [37]:
# Same averaging over the word axis for the second sentence.
emb_2_mean = np.mean(emb_array_2, axis=0)


- Check to see that taking an average of word embeddings results in two sentence embeddings that are identical.

In [38]:
# Preview the first four components of each averaged embedding, one per line,
# so the two can be compared by eye.
print(emb_1_mean[:4], emb_2_mean[:4], sep="\n")


[-0.00388558 -0.00526848  0.0058703   0.03315523]
[-0.00388558 -0.00526848  0.0058703   0.03315523]


#### Get sentence embeddings from the model.
- These sentence embeddings account for word order and context.
- Verify that the sentence embeddings are not the same.

In [39]:
# Show the two original (unprocessed) sentences again, one per line.
for sentence in (in_1, in_2):
    print(sentence)


The kids play in the park.
The play was for kids in the park.


In [42]:
# Embed each full sentence with the model, instead of averaging word vectors.
embedding_1 = palm.generate_embeddings(
    text=in_1,
    model=model,
)
embedding_2 = palm.generate_embeddings(
    text=in_2,
    model=model,
)


In [43]:
# Pull the vectors out of both responses, then preview the first four values
# of each (sentence 1 first, sentence 2 second).
vector_1 = embedding_1['embedding']
vector_2 = embedding_2['embedding']
for sentence_vector in (vector_1, vector_2):
    print(sentence_vector[:4])


[0.0040104063, -0.02063808, -0.0028912085, -0.007481416]
[-0.0154303685, -0.012839607, 0.012309532, -0.00071919535]
