## Get embeddings

+ The function `get_embedding` will give us an embedding for an input text.
+ 2 ways to create text embeddings using OpenAI's API: a direct call and a function with retry logic. 
+ Both approaches use the same model to transform the text into a numerical vector, which represents the semantic content of the input text.

In [None]:
# Imports
import os
import openai
import pandas as pd
from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap.umap_ import UMAP
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Add the path to the constants file to the system path
import sys
sys.path.append('../../')
from constants import RANDOM_STATE, OPENAI_API_KEY

# OpenAI API Key
openai.api_key = OPENAI_API_KEY
# OpenAI GPT Model parameters
OPENAI_EMBEDDING_MODEL_ID = "text-embedding-ada-002"
TEMPERATURE = 0



  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
embedding = openai.Embedding.create(
    input="Toy Story (1995)", model=OPENAI_EMBEDDING_MODEL_ID
)["data"][0]["embedding"]
len(embedding)

1536

In [3]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=OPENAI_EMBEDDING_MODEL_ID) -> list[float]:
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]


embedding = get_embedding("Your text goes here", model=OPENAI_EMBEDDING_MODEL_ID)
print(len(embedding))


1536


In [4]:
# Embedding Function with Retry Logic
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embeddings(texts: list[str], model=OPENAI_EMBEDDING_MODEL_ID) -> list[list[float]]:
    return [item["embedding"] for item in openai.Embedding.create(input=texts, model=model)["data"]]

# Data Overview

In [5]:
# Read the dataset
dataset_path = "../data/ml-latest-small/merged_data.csv"
movie_data = pd.read_csv(dataset_path)
print(movie_data.info())
display(movie_data.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3476 entries, 0 to 3475
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3476 non-null   int64  
 1   imdbId   3476 non-null   int64  
 2   tmdbId   3476 non-null   float64
 3   title    3476 non-null   object 
 4   genres   3476 non-null   object 
 5   userId   3476 non-null   int64  
 6   rating   3476 non-null   float64
 7   tag      3476 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 217.4+ KB
None


Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,tag
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun


In [6]:
# Get embeddings in batches
batch_size = 100  # Define your batch size
title_df = movie_data[['title']]
embeddings = []

for i in range(0, len(title_df), batch_size):
    batch_texts = title_df["title"].iloc[i:i+batch_size].tolist()
    embeddings.extend(get_embeddings(batch_texts))

title_df["embedding"] = embeddings
title_df = title_df.reset_index(drop=True)
display(title_df.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_df["embedding"] = embeddings


Unnamed: 0,title,embedding
0,Toy Story (1995),"[-0.0019398860167711973, -0.037929877638816833..."
1,Toy Story (1995),"[-0.0019398860167711973, -0.037929877638816833..."
2,Toy Story (1995),"[-0.0019398860167711973, -0.037929877638816833..."
3,Jumanji (1995),"[0.00010395599383627996, -0.023625079542398453..."
4,Jumanji (1995),"[0.00010395599383627996, -0.023625079542398453..."
5,Jumanji (1995),"[0.00010395599383627996, -0.023625079542398453..."
6,Jumanji (1995),"[0.00010395599383627996, -0.023625079542398453..."
7,Grumpier Old Men (1995),"[0.00777850579470396, -0.04650207981467247, 0...."
8,Grumpier Old Men (1995),"[0.00777850579470396, -0.04650207981467247, 0...."
9,Father of the Bride Part II (1995),"[0.0103604756295681, -0.03069864958524704, -0...."


In [7]:
%%time

# Cluster the title data
kmeans = KMeans(n_clusters=3, n_init=3) 
kmeans.fit(title_df["embedding"].tolist())

CPU times: user 1.63 s, sys: 506 ms, total: 2.14 s
Wall time: 883 ms


In [8]:
# Reduce dimensions of embedded text title data
reducer = UMAP()
embeddings_2d = reducer.fit_transform(title_df["embedding"].tolist())



In [None]:
# Visualize the clusters
fig = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], color=kmeans.labels_)
fig.show()

# References

+ https://platform.openai.com/docs/guides/embeddings

+ https://www.datacamp.com/tutorial/introduction-to-text-embeddings-with-the-open-ai-api