# Embeddings Part 3
# Going Through the Use Cases
Here we show some representative use cases. We will use the Amazon fine-food reviews dataset for the following examples.


In [None]:
# import our packages
import os
import pandas as pd
import numpy as np
from openai import OpenAI

In [None]:
# Create the OpenAI client that the chat and embedding calls below reuse.
# NOTE(review): OpenAI() is called with no arguments, so credentials are presumably
# picked up from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

First, let's see what answering a question looks like without using RAG.

In [None]:
from openai import OpenAI

# Baseline without retrieval: the question is sent to the chat model as-is,
# with no review text added to the prompt.
query = "What people hated their food?"

# temperature=0 is requested so the baseline answer varies as little as possible.
response = client.chat.completions.create(
    model="gpt-4-turbo",
    temperature=0,
    messages=[
        dict(role="system", content="You answer questions about Amazon fine food reviews."),
        dict(role="user", content=query),
    ],
)

# Show the text of the first (and only requested) completion.
print(response.choices[0].message.content)

## Obtaining the Embeddings
The dataset contains a total of 568,454 food reviews that Amazon users left up to October 2012. We will use a subset of the 1,000 most recent reviews for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text). For example:

| PRODUCT ID  | USER ID        | SCORE | SUMMARY             | TEXT                                           |
|-------------|----------------|-------|---------------------|------------------------------------------------|
| B001E4KFG0  | A3SGXH7AUHU8GW | 5     | Good Quality Dog Food | I have bought several of the Vitality canned... |
| B00813GRG4  | A1D87F6ZCVE5NK | 1     | Not as Advertised   | Product arrived labeled as Jumbo Salted Peanut... |



We will combine the review summary and review text into a single combined text. The model will encode this combined text and output a single vector embedding.

In [None]:
# utility function to get the file size
def get_file_size(file_path):
    """Return the size of the file at ``file_path`` in megabytes.

    One megabyte is counted as 1024 * 1024 bytes. Raises ``OSError`` if the
    file cannot be stat'ed (for example, when it does not exist).
    """
    bytes_per_mb = 1024 * 1024
    return os.stat(file_path).st_size / bytes_per_mb

# main utility function to get the embeddings
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` produced by ``model``.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    One embeddings request is issued per call, through the notebook-level ``client``.
    """
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input=[flattened], model=model)
    # A single input was sent, so the only result is at index 0.
    return response.data[0].embedding

# utility function to get the embeddings with reduced dimensions
def get_embedding_reduced_dims(text, model="text-embedding-3-large", dimensions=1024):
    """Return a shortened embedding vector for ``text``.

    Parameters:
    text (str): The text to embed. Newlines are replaced with spaces first.
    model (str): Embedding model to use.
    dimensions (int): Length of the embedding requested from the API. Defaults to
                      1024, the value that used to be hard-coded here.

    Returns:
    The embedding of ``text`` as returned by the API for the single input sent.
    """
    # Replace newlines in the text with spaces for consistent formatting
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model, dimensions=dimensions)
    # A single input was sent, so the only result is at index 0.
    return response.data[0].embedding

# utility function to get the embeddings from the large model (note: it currently also requests 1024 dimensions)
def get_embedding_large(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text`` from the large model, requested at 1024 dimensions.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    NOTE(review): this sends exactly the same request as ``get_embedding_reduced_dims``
    (``dimensions=1024``), so the "large" CSV built from it is not full-size. The search
    helper in this notebook embeds its query at 1024 dimensions against that CSV, so the
    two must be changed together if the size is ever changed here.
    """
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input=[flattened], model=model, dimensions=1024)
    # A single input was sent, so the only result is at index 0.
    return response.data[0].embedding

In [None]:

# Location of the 1k-review sample used throughout the notebook
input_data_path = "./EmbeddingsDemoAssets/fine_food_reviews_1k.csv"

# Load the CSV (its first column becomes the index), keep only the columns we need,
# then discard rows with missing values and rows that are exact duplicates.
df = (
    pd.read_csv(input_data_path, index_col=0)[
        ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
    ]
    .dropna()
    .drop_duplicates()
)

# Build the text that gets embedded: "Title: <summary>; Content: <review body>"
df["combined"] = "Title: " + df["Summary"].str.strip() + "; Content: " + df["Text"].str.strip()

# Preview the first 5 rows of the prepared dataframe
df.head(5)


### Small Model Embedding
Let's start by embedding the reviews with the small model.

In [None]:

# Embed every combined review text with the small model (one embeddings request per row)
df["embedding"] = df["combined"].apply(get_embedding)

# Persist the dataframe, embeddings included, as CSV without the index column
df.to_csv("./EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_1k.csv", index=False)

# Preview the first 5 rows, now with the 'embedding' column
df.head(5)


In [None]:
# Report the on-disk size of the small-model embeddings CSV
file_path = './EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_1k.csv'
# get_file_size returns megabytes (bytes / (1024 * 1024))
size_mb = get_file_size(file_path)
# Printed with two decimal places
print(f"The size of the file is {size_mb:.2f} MB.")

### Large Model Embedding
Let's embed using the large model next.

In [None]:

# Embed every combined review text with the large model (one embeddings request per row).
# This overwrites the 'embedding' column that held the small-model vectors.
df["embedding"] = df["combined"].apply(get_embedding_large)

# Persist the dataframe, embeddings included, as CSV without the index column
df.to_csv("./EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_large_1k.csv", index=False)

# Preview the first 5 rows
df.head(5)

In [None]:
# Report the on-disk size of the large-model embeddings CSV
file_path = './EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_large_1k.csv'
# get_file_size returns megabytes (bytes / (1024 * 1024))
size_mb = get_file_size(file_path)
# Printed with two decimal places
print(f"The size of the file is {size_mb:.2f} MB.")

### Loading the Data
Here we show how to load the data into a dataframe from a CSV file to make it ready to be used again when needed. 

In [None]:

import ast

# Load the CSV file into a pandas DataFrame
df_load = pd.read_csv('./EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_large_1k.csv')

# The CSV stores each embedding as the text of a Python list ("[0.01, -0.02, ...]").
# Parse it with ast.literal_eval rather than eval: literal_eval only accepts Python
# literals, so a modified CSV cannot execute arbitrary code when it is loaded.
# The parsed lists are then converted to numpy arrays so the column is ready to use
# in any subsequent analysis or processing step.
df_load['embedding'] = df_load['embedding'].apply(ast.literal_eval).apply(np.array)

# Display the first 5 rows of the loaded dataframe
df_load.head(5)

### Reducing the Dimensions
Using larger embeddings, for example storing them in a vector store for retrieval, generally costs more and consumes more compute, memory and storage than using smaller embeddings.

Both of our new embedding models were trained with a technique that allows developers to trade off performance against the cost of using embeddings. Specifically, developers can shorten embeddings (i.e. remove some numbers from the end of the sequence) without the embedding losing its concept-representing properties.

In [None]:

# Location of the 1k-review sample (same source file as before)
input_data_path = "./EmbeddingsDemoAssets/fine_food_reviews_1k.csv"

# Load the CSV (its first column becomes the index), keep only the columns we need,
# then discard rows with missing values and rows that are exact duplicates.
df_reduced_dims = (
    pd.read_csv(input_data_path, index_col=0)[
        ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
    ]
    .dropna()
    .drop_duplicates()
)

# Build the text that gets embedded: "Title: <summary>; Content: <review body>"
df_reduced_dims["combined"] = (
    "Title: "
    + df_reduced_dims["Summary"].str.strip()
    + "; Content: "
    + df_reduced_dims["Text"].str.strip()
)

# Preview the first 5 rows of the prepared dataframe
df_reduced_dims.head(5)

In [None]:

# Apply the `get_embedding_reduced_dims` function to each entry in the 'combined' column and store the results
df_reduced_dims['embedding'] = df_reduced_dims['combined'].apply(get_embedding_reduced_dims)

# Save the dataframe with embeddings to a CSV file, omitting the index
df_reduced_dims.to_csv('./EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_reduced_dims_1k.csv', index=False)

# Display the first 5 rows of the dataframe that was just embedded
# (previously this previewed `df`, the dataframe from the earlier section)
df_reduced_dims.head(5)


In [None]:

# Report the on-disk size of the reduced-dimension embeddings CSV
file_path = './EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_reduced_dims_1k.csv'
# get_file_size returns megabytes (bytes / (1024 * 1024))
size_mb = get_file_size(file_path)
# Printed with two decimal places
print(f"The size of the file is {size_mb:.2f} MB.")

## Question Answering Using Embeddings-Based Search
There are many common cases where the model is not trained on data which contains key facts and information you want to make accessible when generating responses to a user query. One way of solving this is to put additional information into the context window of the model. This is effective in many use cases but leads to higher token costs. The other way is to use RAG to obtain the information. 

In [None]:

import ast

# Load the CSV file into a pandas DataFrame
df_search = pd.read_csv('./EmbeddingsDemoAssets/fine_food_reviews_with_embeddings_large_1k.csv')

# The CSV stores each embedding as the text of a Python list ("[0.01, -0.02, ...]").
# Parse it with ast.literal_eval rather than eval: literal_eval only accepts Python
# literals, so a modified CSV cannot execute arbitrary code when it is loaded.
# The parsed lists are then converted to numpy arrays so the column is ready to use
# for the similarity search below.
df_search['embedding'] = df_search['embedding'].apply(ast.literal_eval).apply(np.array)

# Display the first 5 rows of the loaded dataframe
df_search.head(5)

In [None]:
# Import necessary libraries
from scipy.spatial import distance
import pandas as pd

# Define a function that ranks strings from a pandas DataFrame based on their relatedness to a given query string.
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - distance.cosine(x, y),
    top_n: int = 100,
    threshold: float = 0.001,  # Minimum score difference to consider for ranking
    embedding_model: str = "text-embedding-3-large",
    dimensions: int = 1024,
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """
    Retrieve the top 'n' strings related to a query string from a DataFrame, based on a custom relatedness function.

    Parameters:
    query (str): The string to compare other strings against.
    df (pd.DataFrame): DataFrame with a 'combined' column (the strings) and an 'embedding'
                       column (one vector per string).
    relatedness_fn (callable): A function that computes the relatedness score between two embeddings. By default,
                               it uses the cosine similarity between embeddings.
    top_n (int): The maximum number of related strings to return.
    threshold (float): A string is kept only if its score differs from the score of the previously
                       kept string by more than this value, so entries with (near-)identical
                       scores are reported once.
    embedding_model (str): Model used to embed the query. It must be the model that produced
                           df['embedding'].
    dimensions (int): Embedding length requested for the query. It must equal the length of the
                      vectors stored in df['embedding'].

    Returns:
    tuple[tuple[str, ...], tuple[float, ...]]: Two tuples of equal length:
                                   1. Up to 'top_n' strings, most related first.
                                   2. Their corresponding relatedness scores.
                                   Both are empty when 'df' has no rows or 'top_n' is not positive.
    """
    # Nothing can be returned in these cases, so skip the embeddings request entirely.
    # (An empty frame previously failed when unpacking the empty result list.)
    if top_n <= 0 or len(df) == 0:
        return (), ()

    # Embed the query with the same model and size as the stored embeddings.
    query_embedding_response = client.embeddings.create(
        model=embedding_model,
        input=query,
        dimensions=dimensions,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Score every string against the query. Zipping the two columns avoids building
    # a Series per row, which is what iterrows() does.
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["combined"], df["embedding"])
    ]

    # Sort by relatedness score in descending order (most related first).
    strings_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)

    # Keep a result only if its score is more than `threshold` away from the last kept score.
    # `None` marks "nothing kept yet"; a numeric sentinel such as -1 could collide with a
    # genuine score from a custom relatedness function and wrongly drop the best hit.
    filtered = []
    last_score = None
    for text, score in strings_and_relatednesses:
        if last_score is None or abs(score - last_score) > threshold:
            filtered.append((text, score))
            last_score = score
            if len(filtered) >= top_n:
                break

    # Defensive: with a non-empty frame at least one item is always kept.
    if not filtered:
        return (), ()

    # Unzip the (string, score) pairs into two parallel tuples.
    strings, relatednesses = zip(*filtered)
    return strings, relatednesses



In [None]:
# Search 'df_search' for the reviews most related to the phrase "dislike the food",
# using the previously defined 'strings_ranked_by_relatedness' function.

# top_n=5 limits the result to at most five reviews; the function returns the review texts
# and their relatedness scores as two parallel sequences.
strings, relatednesses = strings_ranked_by_relatedness("dislike the food", df_search, top_n=5)

# Walk the two sequences in lockstep so each review is shown next to its score.
for string, relatedness in zip(strings, relatednesses):
    # Print a visual separator, then the score. The '=' in '{relatedness=:.3f}' makes the
    # f-string print the variable name too, e.g. "relatedness=0.512" (three decimal places).
    print(f"\n========================\n{relatedness=:.3f}")
    
    # 'display' is provided by Jupyter/IPython environments; it is not imported in this
    # notebook, so this cell is expected to run inside a notebook. It shows the review text.
    display(string)
