In [1]:
import os
import openai
import chromadb
import numpy as np
import pandas as pd
from chromadb.utils import embedding_functions
from IPython.display import Image, display
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment
# so the OpenAI key can be read from os.environ in the next cell.
load_dotenv()

# Let's use Chroma to implement an in-memory vector store.
# This example will use generated data about cars for embedding storage and retrieval.

# Step 1: Install necessary packages (run once in its own notebook cell).
# NOTE: this notebook calls openai.ChatCompletion and openai.Image, i.e. the
# pre-1.0 interface of the openai package, so the version is pinned below 1.0.
# %pip install "openai<1.0" chromadb python-dotenv pandas numpy

"""
The objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:
1. Generated data about cars.
2. OpenAI's Embedding model to convert car descriptions to vectors.
3. Chroma as the in-memory vector database to store and retrieve relevant vectors.
"""

"\nThe objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:\n1. Generated data about cars.\n2. OpenAI's Embedding model to convert car descriptions to vectors.\n3. Chroma as the in-memory vector database to store and retrieve relevant vectors.\n"

In [2]:
# Step 2: Setting up OpenAI key
# Note: You need an OpenAI API key to proceed. Do not paste it into the notebook:
# export OPENAI_API_KEY in your shell or put it in a .env file (loaded by
# load_dotenv() in the first cell). A missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [3]:
# Step 3: Generate data for the cars
"""
To simulate a real dataset, we'll generate information about cars. Each car will have a name, price, engine type, and description.
The following data is used to showcase how information is stored and used in the vector database for retrieval.
"""

# Here we define a small set of initial car data manually.
# This will serve as our base dataset to which we will later add more generated cars.
#
# Schema of each entry (all keys are required by the later cells):
#   name, price, engine, country, manufacturer -> str (price is a formatted string such as "$25,000")
#   year                                       -> int
#   description                                -> str; this is the text that gets embedded,
#                                                 the other fields are stored as Chroma metadata.
cars = [
    {
        "name": "Toyota Corolla 2024",
        "price": "$25,000",
        "engine": "1.8L Inline-4",
        "year": 2024,
        "country": "Japan",
        "manufacturer": "Toyota",
        "description": "The Toyota Corolla 2024 offers reliability, fuel efficiency, and a comfortable ride, making it an ideal choice for families and urban commuters."
    },
    {
        "name": "Ford Mustang GT 2024",
        "price": "$45,000",
        "engine": "5.0L V8",
        "year": 2024,
        "country": "United States",
        "manufacturer": "Ford",
        "description": "The Ford Mustang GT 2024 delivers exhilarating performance with its powerful V8 engine, sporty handling, and iconic design."
    },
    {
        "name": "BMW 3 Series 2024",
        "price": "$42,000",
        "engine": "2.0L Turbo Inline-4",
        "year": 2024,
        "country": "Germany",
        "manufacturer": "BMW",
        "description": "The BMW 3 Series 2024 is a luxury sedan offering a perfect balance of performance, style, and advanced technology."
    },
    {
        "name": "Hyundai Ioniq 5 2024",
        "price": "$39,000",
        "engine": "Electric",
        "year": 2024,
        "country": "South Korea",
        "manufacturer": "Hyundai",
        "description": "The Hyundai Ioniq 5 2024 is a fully electric SUV with a futuristic design, exceptional range, and innovative features."
    },
]


In [4]:
# Step 3a: Generate more car data using an OpenAI chat model (gpt-3.5-turbo)
"""
To enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.
The generated data will include car name, price, engine type, and a detailed description.
We will use a prompt to guide the model to return data in a consistent and predictable format.
"""

"\nTo enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.\nThe generated data will include car name, price, engine type, and a detailed description.\nWe will use a prompt to guide the model to return data in a consistent and predictable format.\n"

In [5]:
def _parse_car_entry(raw_text):
    """
    Parse one model reply made of "Label: value" lines into a car dictionary.

    Fields are located by their label rather than by line position, so blank
    lines, extra lines, or a different field order do not break parsing, and
    values that themselves contain ": " are kept intact.

    Args:
        raw_text (str): Text returned by the model for a single car.
    Returns:
        dict: Keys name, price, engine, year (int), country, manufacturer, description.
    Raises:
        ValueError: If a required field is missing or empty, or the year is not an integer.
    """
    field_labels = ("name", "price", "engine", "year", "country", "manufacturer", "description")
    parsed = {}
    for line in raw_text.splitlines():
        # Split on the first colon only so the value may contain further colons.
        label, separator, value = line.partition(":")
        label = label.strip().lower()
        value = value.strip()
        # Keep the first occurrence of each expected label; skip everything else.
        if separator and value and label in field_labels and label not in parsed:
            parsed[label] = value

    missing = [label for label in field_labels if label not in parsed]
    if missing:
        raise ValueError(f"missing fields: {', '.join(missing)}")

    parsed["year"] = int(parsed["year"])  # raises ValueError for e.g. "2023 model"
    return {label: parsed[label] for label in field_labels}


def generate_car_data(num_cars=50, existing_names=None, max_attempts=None):
    """
    Generates car data using OpenAI's gpt-3.5-turbo chat model.

    One API request is made per candidate car. Replies that cannot be parsed
    or whose name is already known are reported and skipped.

    Args:
        num_cars (int): Number of car entries to generate.
        existing_names (set[str] | None): Names that must not be produced again.
            A set passed here is updated in place with every accepted name;
            any other iterable is copied into a new set. None starts empty.
        max_attempts (int | None): Upper bound on API requests. None (default)
            keeps requesting until `num_cars` unique cars were collected.
    Returns:
        List[dict]: A list of dictionaries containing car details. May hold
        fewer than `num_cars` entries only when `max_attempts` was reached.
    """
    if existing_names is None:
        existing_names = set()
    elif not isinstance(existing_names, set):
        existing_names = set(existing_names)

    car_data = []
    attempts = 0
    while len(car_data) < num_cars:
        if max_attempts is not None and attempts >= max_attempts:
            print(f"Stopping after {attempts} attempts with {len(car_data)} of {num_cars} cars generated.")
            break
        attempts += 1

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": (
                    "You are an expert assistant tasked with creating unique and realistic car entries. "
                    "Each car entry must have a unique name and be described in detail."
                    "Create cars for countries like Japan, Germany, South Korea, and the United States."
                )},
                {"role": "user", "content": (
                    "Generate details for a car including the following fields: \n"
                    "Name: <Unique Car Name>\n"
                    "Price: <Car Price>\n"
                    "Engine: <Engine Type>\n"
                    "Year: <Manufacturing Year>\n"
                    "Country: <Country of Origin>\n"
                    "Manufacturer: <Car Manufacturing Company>\n"
                    "Description: <Car Description>\n"
                    "Ensure that the car name is not a duplicate and provide realistic and varied entries."
                )}
            ],
            max_tokens=200
        )
        raw_entry = response['choices'][0]['message']['content']

        try:
            car = _parse_car_entry(raw_entry)
        except ValueError:
            # Covers missing fields as well as a non-numeric year.
            print("Error parsing car details for this iteration, skipping entry.")
            continue

        if car['name'] in existing_names:
            print(f"Duplicate car detected: {car['name']}. Skipping entry. Current Number of cars: {len(car_data)}.")
            continue

        existing_names.add(car['name'])
        car_data.append(car)
    return car_data

# Append the generated cars to the existing cars list.
# The 4 hand-written cars are topped up to a dataset of 30 cars (26 generated on a
# fresh run) that will be used for embedding storage and retrieval.
# Only the missing number of cars is requested, so re-running this cell in a live
# kernel does not keep growing the list, and the names already in `cars` are passed
# in so the generator cannot return a car that duplicates a seed entry.
TARGET_CAR_COUNT = 30

cars_still_needed = TARGET_CAR_COUNT - len(cars)
if cars_still_needed > 0:
    cars += generate_car_data(
        cars_still_needed,
        existing_names={car["name"] for car in cars},
    )


Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 7.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 12.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GTX-500. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT-R. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT-R. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT-2000. Skipping entry. Current Number of cars: 14.
Duplicate car detected: Sakura GT-R. Skipping entry. Current Number of cars: 16.
Duplicate car detected: Sakura GT. Skipping entry. Current Number of cars: 16.
Duplicate car detected: Sakura GT1. S

In [6]:
# Step 4: Initialize Chroma DB and prepare embeddings
# We use OpenAI embeddings to convert the car descriptions into vectors, which can be
# stored and queried in Chroma. Chroma serves as our in-memory vector database,
# allowing fast similarity searches over the car descriptions.

# Set up Chroma and OpenAI embedding function
client = chromadb.Client()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai.api_key,
    model_name="text-embedding-ada-002",
)

# A Chroma collection is similar to a table in a database. It stores one embedding per
# car description along with metadata such as car name, price, and engine type.
# get_or_create_collection (instead of create_collection) lets this cell be re-run in a
# live kernel without failing because "car_collection" already exists.
car_collection = client.get_or_create_collection(name="car_collection", embedding_function=openai_ef)

# Prepare ids, documents (the text to embed) and metadata in matching order.
METADATA_FIELDS = ("name", "price", "engine", "year", "country", "manufacturer")
car_ids = [str(position) for position in range(len(cars))]
car_descriptions = [car["description"] for car in cars]
car_metadata = [{field: car[field] for field in METADATA_FIELDS} for car in cars]

# Each car needs a unique ID, the description to embed, and the metadata shown to the
# user at query time. upsert (instead of add) overwrites entries with the same id, so
# re-running the cell refreshes the stored cars rather than colliding with them.
car_collection.upsert(ids=car_ids, metadatas=car_metadata, documents=car_descriptions)


In [7]:
# Display all data in the collection as a pandas DataFrame
# We retrieve everything stored in the Chroma collection and show it as a DataFrame,
# which provides a clear view of the data stored in the collection.

# Retrieve all metadata, embeddings, and documents (descriptions) from the Chroma collection
all_metadata = car_collection.get(ids=car_ids, include=["embeddings", "documents", "metadatas"])

# Create a DataFrame from the metadata and attach the description and vector per row
car_data = pd.DataFrame(all_metadata["metadatas"])
car_data["description"] = all_metadata["documents"]
# list(...) yields one vector per row whether Chroma returns a list of lists or a 2-D array.
car_data["embedding"] = list(all_metadata["embeddings"])

# "\n" (the original "\S" was an invalid escape that printed a literal backslash)
print("\nSample 5 Cars Entry in the Collection:")
# Never ask for more rows than exist, so small collections do not raise.
car_data.sample(n=min(5, len(car_data)))

\Sample 5 Cars Entry in the Collection:


Unnamed: 0,country,engine,manufacturer,name,price,year,description,embedding
17,Japan,2.0L Turbocharged Inline-4,Sakura Motors,Sakura GTX,"$35,000",2023,The Sakura GTX is a sleek and sporty coupe tha...,"[-0.008231259882450104, 0.03001203015446663, -..."
7,Japan,V6 Turbocharged,Sunrise Motors,Typhoon Racer,"$35,000",2022,The Typhoon Racer is a sleek and sporty coupe ...,"[-0.03214795142412186, 0.0006777484668418765, ..."
1,United States,5.0L V8,Ford,Ford Mustang GT 2024,"$45,000",2024,The Ford Mustang GT 2024 delivers exhilarating...,"[-0.03326338902115822, -0.011538316495716572, ..."
3,Japan,V6 Turbocharged,Sakura Motors,Sakura GTX-700,"$35,000",2022,The Sakura GTX-700 is a sleek and sporty coupe...,"[-0.01174828503280878, 0.024122612550854683, -..."
24,Japan,2.0L Turbocharged Inline-4,Sakura Motors,Sakura GT2000,"$45,000",2022,The Sakura GT2000 is a sleek and sporty coupe ...,"[-0.006631840020418167, 0.023130804300308228, ..."


In [8]:
# Step 5: Querying Chroma for information
"""
In this step, we will demonstrate how to query the Chroma collection to find relevant cars.
We will use a natural language prompt to find cars that match specific requirements.
The embeddings will allow us to determine the similarity between the query and the car descriptions.
"""

# Candidate user requests; one of them is drawn at random for the demo
prompts = [
    "I want a comfortable car for my family with good safety features.",
    "Looking for a sporty car with high speed and acceleration.",
    "Find me an eco-friendly electric car for urban use in the United States.",
    "I need a luxury sedan with a hybrid engine for a comfortable commute.",
    "Suggest an affordable car with great fuel efficiency for a student.",
]

prompt = np.random.choice(prompts)
print(f"The user's prompt is: {prompt}")

# Ask Chroma for the single closest description to the prompt
"""
Using the `query` method, we search for the most relevant car based on the given prompt.
The query will return the car description that is most similar to the provided input.
"""
results = car_collection.query(n_results=1, query_texts=[prompt])

# Unpack the metadata of the best match and show it
"""
Once we get the result, we extract the metadata for the recommended car, such as its name, price, and engine type.
We then print out the recommended car's details for the user.
"""
result_metadata = results["metadatas"][0][0]
(
    result_name,
    result_price,
    result_engine,
    result_year,
    result_manufacture,
    result_country,
) = (
    result_metadata[field]
    for field in ("name", "price", "engine", "year", "manufacturer", "country")
)

print(
    f"Recommended Car: {result_name}\n"
    f"Price: {result_price}\n"
    f"Engine: {result_engine}\n"
    f"Year: {result_year}\n"
    f"Manufacturer: {result_manufacture}\n"
    f"Country: {result_country}"
)

"""
This result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.
For a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.
"""


The user's prompt is: Find me an eco-friendly electric car for urban use in the United States.
Recommended Car: Hyundai Ioniq 5 2024
Price: $39,000
Engine: Electric
Year: 2024
Manufacturer: Hyundai
Country: South Korea


'\nThis result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.\nFor a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.\n'

In [9]:
# Additional query for metadata (Country-specific search)
"""
In this step, we demonstrate querying not only for text similarity but also filtering results based on metadata like country of origin.
"""
# Country the matches must come from (replace with the desired country)
country_filter = "United States"

# Same prompt as before, but restricted to cars whose metadata matches the country
results = car_collection.query(
    where={"country": country_filter},
    n_results=3,
    query_texts=[prompt],
)

# Print every filtered match, numbered from 1
for rank, match in enumerate(results["metadatas"][0], start=1):
    print(
        f"\nResult {rank}:\n"
        f"Name: {match['name']}\n"
        f"Price: {match['price']}\n"
        f"Engine: {match['engine']}\n"
        f"Year: {match['year']}\n"
        f"Country: {match['country']}"
    )



Result 1:
Name: Phantom X
Price: $45,000
Engine: V8 twin-turbo
Year: 2022
Country: United States

Result 2:
Name: Phoenix S1
Price: $45,000
Engine: 2.0-liter turbocharged inline-4
Year: 2022
Country: United States

Result 3:
Name: Ford Mustang GT 2024
Price: $45,000
Engine: 5.0L V8
Year: 2024
Country: United States


In [10]:
# Step 6: Enriching the output using an LLM
"""
We will now use the LLM to generate a personalized message to the car buyer, leveraging the retrieved car's metadata.
This step showcases how to combine retrieval with generation for a more tailored experience.
"""
def generate_message_for_buyer(car_name, car_price, car_engine, car_year, car_manufacturer, car_country):
    """
    Generates a personalized message for the car buyer.

    Sends one chat completion request to gpt-3.5-turbo whose prompt embeds the
    given car attributes, and returns the model's reply with surrounding
    whitespace removed.

    Args:
        car_name (str): Name of the recommended car.
        car_price (str): Price of the recommended car (e.g. "$39,000").
        car_engine (str): Engine type of the recommended car.
        car_year (int | str): Year of the recommended car.
        car_manufacturer (str): Manufacturer of the recommended car.
        car_country (str): Country of origin of the recommended car.
    Returns:
        str: A personalized message for the buyer.
    """
    buyer_request = (
        f"Create a friendly and engaging message for a car buyer interested in the {car_name}. \n"
        # Fixed wording: the original read "from a give {car_country}".
        f"The user wants a car from {car_country}; add relevant information about this country of origin.\n"
        f"Mention its price of {car_price} and highlight the {car_engine} engine's benefits, the performance, and other key features.\n"
        f"Highlight the Year {car_year} and the Car Producer/Manufacturer {car_manufacturer} as key points."
    )
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant to sell a car."},
            {"role": "user", "content": buyer_request},
        ],
        max_tokens=300
    )
    return response['choices'][0]['message']['content'].strip()

# Build the buyer-facing text from the metadata of the top match found earlier
enriched_message = generate_message_for_buyer(
    car_name=result_name,
    car_price=result_price,
    car_engine=result_engine,
    car_year=result_year,
    car_manufacturer=result_manufacture,
    car_country=result_country,
)
print("\nPersonalized Message:", enriched_message, sep="\n")

"""
This step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.
The message can be further personalized based on additional user preferences or context.
"""


Personalized Message:
🌟🚗 Welcome to the future of driving with the 2024 Hyundai Ioniq 5! 🚗🌟

Experience the perfect blend of style, performance, and sustainability with this stunning electric vehicle straight from the tech-savvy streets of South Korea. Priced at $39,000, the Hyundai Ioniq 5 takes driving to a whole new level by offering a smooth and silent ride powered by its cutting-edge electric engine.

In the 2024 model, Hyundai has raised the bar even higher, ensuring that you not only drive a car but immerse yourself in a state-of-the-art driving experience. From its sleek design to its powerful performance capabilities, the Ioniq 5 is designed to make heads turn and hearts race wherever you go.

Key features include advanced safety technologies, a spacious and luxurious interior, and a range that will take you further than ever before on a single charge. Let this futuristic wonder be your companion on all your journeys, offering you a greener and more efficient way to travel.



'\nThis step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.\nThe message can be further personalized based on additional user preferences or context.\n'

In [11]:
def generate_image_for_car(car_name, car_year, car_manufacturer, car_price=None):
    """
    Generates an image for the recommended car via the OpenAI image API.

    Args:
        car_name (str): Name of the car.
        car_year (int | str): Year of the car.
        car_manufacturer (str): Manufacturer of the car.
        car_price (str | None): Price to show on the plate next to the car name.
            When omitted, the plate carries the car name only. (The original
            prompt inserted the year where the price was meant to appear.)
    Returns:
        str: URL of the generated image.
    """
    plate_text = f"the car name {car_name}"
    if car_price is not None:
        plate_text = f"the car price {car_price} and {plate_text}"

    image_prompt = (
        f"A full exterior view of a car from year {car_year}, manufacturer {car_manufacturer} and name {car_name} parked in a showroom. "
        "Show the car from the side angle, including wheels, headlights, and body details. "
        "The image should depict a realistic, high-quality photo in natural daylight. "
        f"Add a plate with {plate_text}."
    )
    response = openai.Image.create(
        prompt=image_prompt,
        n=1,
        size="512x512"  # Specify the smaller image size
    )
    return response["data"][0]["url"]

# Show the buyer message again so text and picture appear together
print("\nPersonalized Message:", enriched_message, sep="\n")

# Request a picture of the recommended car and render it inline
image_url = generate_image_for_car(
    car_name=result_name,
    car_year=result_year,
    car_manufacturer=result_manufacture,
)

print("\nGenerated Image:")
car_image = Image(url=image_url)
display(car_image)

"""
This step demonstrates how to use the retrieved car data to generate both an enriched, human-like response and a visual representation of the car, enhancing user engagement.
The message and image can be further personalized based on additional user preferences or context.
"""


Personalized Message:
🌟🚗 Welcome to the future of driving with the 2024 Hyundai Ioniq 5! 🚗🌟

Experience the perfect blend of style, performance, and sustainability with this stunning electric vehicle straight from the tech-savvy streets of South Korea. Priced at $39,000, the Hyundai Ioniq 5 takes driving to a whole new level by offering a smooth and silent ride powered by its cutting-edge electric engine.

In the 2024 model, Hyundai has raised the bar even higher, ensuring that you not only drive a car but immerse yourself in a state-of-the-art driving experience. From its sleek design to its powerful performance capabilities, the Ioniq 5 is designed to make heads turn and hearts race wherever you go.

Key features include advanced safety technologies, a spacious and luxurious interior, and a range that will take you further than ever before on a single charge. Let this futuristic wonder be your companion on all your journeys, offering you a greener and more efficient way to travel.



'\nThis step demonstrates how to use the retrieved car data to generate both an enriched, human-like response and a visual representation of the car, enhancing user engagement.\nThe message and image can be further personalized based on additional user preferences or context.\n'