In [None]:
import os
import openai
import chromadb
import numpy as np
import pandas as pd
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) into the process
# environment, so that OPENAI_API_KEY can be read from os.environ further down.
load_dotenv()

# Let's use Chroma to implement an in-memory vector store.
# This example will use generated data about cars for embedding storage and retrieval.

# Step 1: Install necessary packages (to be run in a notebook cell).
# This notebook calls `openai.ChatCompletion`, which is the pre-1.0 OpenAI SDK
# interface, and it also imports numpy, pandas and python-dotenv:
# %pip install "openai<1.0" chromadb numpy pandas python-dotenv

"""
The objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:
1. Generated data about cars.
2. OpenAI's Embedding model to convert car descriptions to vectors.
3. Chroma as the in-memory vector database to store and retrieve relevant vectors.
"""

"\nThe objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:\n1. Generated data about cars.\n2. OpenAI's Embedding model to convert car descriptions to vectors.\n3. Chroma as the in-memory vector database to store and retrieve relevant vectors.\n"

In [2]:
# Step 2: Setting up the OpenAI key
# Note: You need an OpenAI API key to proceed. It is read from the OPENAI_API_KEY
# environment variable instead of being hard-coded in the notebook; the lookup
# raises KeyError if that variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [3]:
# Step 3: Generate data for the cars
"""
To simulate a real dataset, we'll generate information about cars. Each car will have a name, price, engine type, and description.
The following data is used to showcase how information is stored and used in the vector database for retrieval.
"""

# Here we define a small set of initial car data manually.
# This will serve as our base dataset to which we will later add more generated cars.
#
# Every field is stored as a string. Later cells embed the "description" text and
# keep "name", "price" and "engine" as metadata next to each embedding.
# NOTE: a later cell extends this list in place (`cars += ...`), so re-running that
# cell without re-running this one appends another batch of generated cars.
cars = [
    {
        "name": "Superfast Coupe 2024",
        "price": "$80,000",
        "engine": "V8 5.0L",
        "description": "The Superfast Coupe 2024 offers a V8 engine, exceptional acceleration, and high-speed performance. It is ideal for sports car enthusiasts seeking adrenaline and sleek design."
    },
    {
        "name": "EcoDrive Hatchback 2024",
        "price": "$25,000",
        "engine": "Electric",
        "description": "The EcoDrive Hatchback 2024 is a fully electric vehicle, designed for urban environments with excellent range efficiency, a compact form factor, and environment-friendly features."
    },
    {
        "name": "Family SUV XL 2024",
        "price": "$45,000",
        "engine": "V6 3.5L",
        "description": "The Family SUV XL 2024 is a spacious and versatile SUV that provides comfort, safety, and excellent driving dynamics for long journeys."
    },
    {
        "name": "Luxury Sedan Prime 2024",
        "price": "$70,000",
        "engine": "Hybrid",
        "description": "The Luxury Sedan Prime 2024 offers a combination of luxury, efficiency, and hybrid technology, ensuring a comfortable and refined ride for passengers."
    },
]


In [4]:
# Step 3a: Generate more car data using GPT-3
"""
To enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.
The generated data will include car name, price, engine type, and a detailed description.
We will use a prompt to guide the model to return data in a consistent and predictable format.
"""

"\nTo enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.\nThe generated data will include car name, price, engine type, and a detailed description.\nWe will use a prompt to guide the model to return data in a consistent and predictable format.\n"

In [5]:
import openai

# Field labels requested in the prompt, in the order they appear in each car dict.
_CAR_FIELDS = ("name", "price", "engine", "description")


def _parse_car_details(car_details):
    """Turn one "Label: value" reply into a car dict; raise ValueError if a field is missing."""
    parsed = {}
    for raw_line in car_details.splitlines():
        # Split on the FIRST colon only, so values that themselves contain ": "
        # (e.g. "Description: Trims: base, sport") are kept whole.
        label, separator, value = raw_line.partition(':')
        key = label.strip().lower()
        value = value.strip()
        # Blank lines, preambles and unknown labels are ignored; the first
        # occurrence of each expected label wins.
        if not separator or not value or key not in _CAR_FIELDS or key in parsed:
            continue
        parsed[key] = value

    missing = [field for field in _CAR_FIELDS if field not in parsed]
    if missing:
        raise ValueError(f"missing fields: {', '.join(missing)}")
    return {field: parsed[field] for field in _CAR_FIELDS}


def generate_car_data(num_cars=50):
    """
    Generates car data using OpenAI's chat completion API (gpt-3.5-turbo).

    One request is sent per car. Each reply is parsed by field label
    (Name / Price / Engine / Description) rather than by line position, so
    blank lines between fields or colons inside a value no longer corrupt or
    drop an otherwise valid entry. Replies that still lack one of the four
    fields are reported and skipped, so the result may hold fewer than
    ``num_cars`` entries.

    Args:
        num_cars (int): Number of car entries to request.
    Returns:
        List[dict]: A list of dictionaries with the keys "name", "price",
        "engine" and "description" (all string values).
    """
    car_data = []
    for i in range(num_cars):
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates car details in a structured format."},
                {"role": "user", "content": (
                    "Generate details for a car including the following fields: \n"
                    "Name: <Car Name>\n"
                    "Price: <Car Price>\n"
                    "Engine: <Engine Type>\n"
                    "Description: <Car Description>\n"
                    "Please provide each field in the exact order and format as shown above."
                )}
            ],
            max_tokens=150
        )
        car_details = response['choices'][0]['message']['content']
        try:
            car_data.append(_parse_car_details(car_details))
        except ValueError:
            # Best effort: one malformed reply should not abort the whole batch.
            print(f"Error parsing car details for iteration {i}, skipping entry.")
    return car_data

# Append the generated cars to the existing cars list.
# We request 5 generated cars here. Replies that cannot be parsed are skipped by
# generate_car_data, so after one run on top of the 4 hand-written cars the dataset
# holds at most 9 cars that will be used for embedding storage and retrieval.
# Note: `+=` extends `cars` in place, so re-running this cell appends another batch.
cars += generate_car_data(5)


In [6]:
# Step 4: Initialize Chroma DB and prepare embeddings
# We will use OpenAI embeddings to convert the car descriptions into vectors, which can be easily stored and queried in Chroma.
# Chroma will serve as our in-memory vector database, allowing us to perform fast similarity searches for the car descriptions.

# Set up Chroma and OpenAI embedding function
client = chromadb.Client()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=openai.api_key)

# Create (or reuse) the collection for storing car embeddings.
# A Chroma collection is similar to a table in a database.
# In this collection, we will store embeddings representing each car's description along with metadata such as car name, price, and engine type.
# get_or_create_collection is used instead of create_collection so that re-running
# this cell does not fail when the collection already exists in the running kernel.
car_collection = client.get_or_create_collection(name="car_collection", embedding_function=openai_ef)

# Prepare the car data for the Chroma collection
car_ids = [str(i) for i in range(len(cars))]
car_descriptions = [car["description"] for car in cars]
car_metadata = [{"name": car["name"], "price": car["price"], "engine": car["engine"]} for car in cars]

# Storing the car data in Chroma involves specifying unique IDs for each car, the descriptions to embed, and relevant metadata.
# The metadata will be useful for presenting information to the user when we query the database.
# upsert (rather than add) inserts new ids and overwrites ids that are already present,
# so a re-run refreshes the stored records instead of colliding with them.
# NOTE: ids left over from an earlier, longer `cars` list are not removed by this cell.
car_collection.upsert(ids=car_ids, metadatas=car_metadata, documents=car_descriptions)


In [13]:
# Display all data in the collection as a pandas DataFrame.
# We retrieve the metadata, documents and embeddings from the Chroma collection in a
# single `get` call and display them as a pandas DataFrame.
# This provides a clear view of the data stored in the collection.
all_metadata = car_collection.get(ids=car_ids, include=["embeddings", "metadatas", "documents"])
car_data = pd.DataFrame(all_metadata['metadatas'])
# Take the descriptions from the same response as the metadata so every row is built
# from one stored record, instead of assuming `get` returns rows in `car_ids` order.
car_data["description"] = all_metadata['documents']
# Normalise to one plain Python list per row; this works whether the client hands the
# embeddings back as nested lists or as a 2-D NumPy array (TODO confirm for your
# Chroma version), which pandas would refuse to assign as a single column.
car_data["embedding"] = np.asarray(all_metadata['embeddings']).tolist()
print("\nFirst 5 Cars Entry in the Collection:")
car_data.head()


First 5 Cars Entry in the Collection:


Unnamed: 0,engine,name,price,description,embedding
0,V8 5.0L,Superfast Coupe 2024,"$80,000","The Superfast Coupe 2024 offers a V8 engine, e...","[-0.03257284313440323, -0.002681222977116704, ..."
1,Electric,EcoDrive Hatchback 2024,"$25,000",The EcoDrive Hatchback 2024 is a fully electri...,"[0.0016342909075319767, -0.0038865383248776197..."
2,V6 3.5L,Family SUV XL 2024,"$45,000",The Family SUV XL 2024 is a spacious and versa...,"[0.00976359099149704, -0.010079585947096348, 0..."
3,Hybrid,Luxury Sedan Prime 2024,"$70,000",The Luxury Sedan Prime 2024 offers a combinati...,"[0.012194924987852573, 0.0030802427791059017, ..."
4,Inline-4,Toyota Camry,"$25,000",The Toyota Camry is a reliable and comfortable...,"[0.010860085487365723, -0.006394834257662296, ..."


In [None]:
# Step 5: Querying Chroma for information
# In this step, we will demonstrate how to query the Chroma collection to find relevant cars.
# We will use a natural language prompt to find cars that match specific requirements.
# The embeddings will allow us to determine the similarity between the query and the car descriptions.

# Example query prompts
prompts = [
    "I want a comfortable car for my family with good safety features.",
    "Looking for a sporty car with high speed and acceleration.",
    "Find me an eco-friendly electric car for urban use.",
    "I need a luxury sedan with a hybrid engine for a comfortable commute.",
    "Suggest an affordable car with great fuel efficiency for a student."
]

# Seeded so that Restart & Run All picks the same prompt every time;
# set PROMPT_SEED = None to get a different prompt on each run.
PROMPT_SEED = 42
# str(...) converts NumPy's string scalar into a plain Python str before it is sent to Chroma.
prompt = str(np.random.default_rng(PROMPT_SEED).choice(prompts))
# Show which query was used, so the recommendation below can be interpreted.
print(f"Query: {prompt}")

# Retrieve the top match from Chroma collection.
# Using the `query` method, we search for the most relevant car based on the given prompt.
# The query will return the car description that is most similar to the provided input.
results = car_collection.query(query_texts=[prompt], n_results=1)

# Display the result.
# Once we get the result, we extract the metadata for the recommended car, such as its name, price, and engine type.
# We then print out the recommended car's details for the user.
# results["metadatas"] holds one list of matches per query text; we sent one query and asked for one match.
result_metadata = results["metadatas"][0][0]
result_name = result_metadata["name"]
result_price = result_metadata["price"]
result_engine = result_metadata["engine"]

print(f"Recommended Car: {result_name}\nPrice: {result_price}\nEngine: {result_engine}")

# This result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.
# For a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.
# (Kept as comments: a bare string as the last expression of a cell is echoed as raw repr output.)


Recommended Car: Family SUV XL 2024
Price: $45,000
Engine: V6 3.5L


'\nThis result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.\nFor a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.\n'

In [9]:
# Step 6: Enriching the output using an LLM
"""
We will now use the LLM to generate a personalized message to the car buyer, leveraging the retrieved car's metadata.
This step showcases how to combine retrieval with generation for a more tailored experience.
"""
def generate_message_for_buyer(car_name, car_price, car_engine):
    """
    Ask the chat model for a short sales message about one recommended car.

    Args:
        car_name (str): Name of the recommended car.
        car_price (str): Price of the recommended car.
        car_engine (str): Engine type of the recommended car.
    Returns:
        str: The model's reply with surrounding whitespace stripped.
    """
    # Build the user request first so the API call below stays compact.
    buyer_request = (
        f"Create a friendly and engaging message for a car buyer interested in the {car_name}. \n"
        f"Mention its price of {car_price} and highlight the {car_engine} engine's benefits."
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant to sell a car."},
        {"role": "user", "content": buyer_request},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=150,
    )

    # Only the first choice is used.
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

# Generate the enriched message from the metadata of the retrieved car.
enriched_message = generate_message_for_buyer(result_name, result_price, result_engine)
print("\nPersonalized Message:")
print(enriched_message)

# This step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.
# The message can be further personalized based on additional user preferences or context.
# (Kept as comments: a bare string as the last expression of a cell is echoed as raw repr output.)


Personalized Message:
🚗 Are you looking for a reliable and spacious SUV to drive your family around in style? Look no further than our incredible Family SUV XL 2024! Priced at $45,000, this top-of-the-line vehicle offers all the space and safety features you need for your loved ones.

💪 Powered by a robust V6 3.5L engine, this SUV delivers exceptional performance and efficiency to make every drive smooth and enjoyable. With this powerful engine, you'll have the confidence to tackle any road trip or daily commute with ease.

Don't miss out on this amazing opportunity to own the perfect family SUV! Contact us now to schedule a test drive and experience the luxury and comfort of the Family SUV XL 2024


'\nThis step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.\nThe message can be further personalized based on additional user preferences or context.\n'