# Text Embeddings Example

In [1]:
import os
import pandas as pd
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from openai.embeddings_utils import cosine_similarity

In [2]:
# Set the database path

data_path = os.getenv("AI_DATASETS_PATH")
full_path = os.path.join(data_path, "genai_datasets/data.xlsx")

# Initialize the OpenAI Key
OpenAI.openai_api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings()

In [4]:
# read the dataset
df = pd.read_excel(full_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Words
0,0,Schhol
1,1,College
2,2,Car
3,3,Bike
4,4,Apple


In [5]:
# Now we will add one more column and apply embeddings for each entry, and then we create a csv
df['embedding'] = df['Words'].apply(lambda entries: embeddings.embed_query(entries))
df.to_csv('word_embeddings.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Words,embedding
0,0,Schhol,"[0.006822214127775622, 0.007156308832786324, -..."
1,1,College,"[0.008567630960204773, -0.009737508048176517, ..."
2,2,Car,"[-0.004958966229655546, -0.012322071811467243,..."
3,3,Bike,"[0.0054756435753999765, -0.0136203037636246, -..."
4,4,Apple,"[0.014477049403261352, -0.003934278727982006, ..."


In [7]:
# Now lets get embeddings for our text
our_text = "mango"
text_embedding = embeddings.embed_query(our_text)
print(text_embedding)

[-0.008513468245413543, -0.026108840258799254, 0.009153774861767678, -0.02085570973334096, 0.0054458751812192846, 0.009787548250316346, -0.03311301367186194, -0.01835981927126934, -0.004361274149497306, -0.03167558807066145, 0.0029238504109419545, 0.008990431128115128, -0.004155461175480254, 0.019862580875814738, -0.01957509650063269, 0.004358007069933291, 0.030969945450962405, -0.004305737056538023, 0.012538254154574989, -0.02000632306340576, -0.00983981779805033, 0.006762424892147873, -0.0011964918711162267, -0.019274543807194587, 0.0004426611375203088, 0.00260696371666762, 0.027781478824802665, -0.011773805965368794, 0.008003836429716936, 0.0067166885722193545, 0.043619274722778524, -0.014112886852915897, -0.006200522597394715, -0.005053850779246707, -0.008258651871903956, -0.0016081177027350109, -0.002551426856538979, -0.011675800097706291, -0.0017559436233657313, -0.016164481502635823, 0.01357711933202973, -0.021456815120217174, -0.018255280175801372, -0.015785524487596743, -0.017

#### We can determine how similar a word is to other words in our dataframe after we have a vector representing that word. By computing the cosine similarity of the word vector for our search term to each word embedding in our dataframe.

In [8]:
df['similarity_score'] = df['embedding'].apply(lambda vectors: cosine_similarity(vectors, text_embedding))

In [10]:
df.sort_values('similarity_score', ascending=False, inplace=True)
df

Unnamed: 0.1,Unnamed: 0,Words,embedding,similarity_score
6,6,Banana,"[-0.012999765903696864, -0.01998321619391984, ...",0.88614
5,5,Orange,"[0.013119571182465328, -0.028533135200008802, ...",0.843678
4,4,Apple,"[0.014477049403261352, -0.003934278727982006, ...",0.802333
3,3,Bike,"[0.0054756435753999765, -0.0136203037636246, -...",0.783956
1,1,College,"[0.008567630960204773, -0.009737508048176517, ...",0.770327
2,2,Car,"[-0.004958966229655546, -0.012322071811467243,...",0.77004
0,0,Schhol,"[0.006822214127775622, 0.007156308832786324, -...",0.764293


In [15]:
# to get the largest value
df.nlargest(1, 'similarity_score').iloc[0]

Unnamed: 0                                                          6
Words                                                          Banana
embedding           [-0.012999765903696864, -0.01998321619391984, ...
similarity_score                                              0.88614
Name: 6, dtype: object