In [1]:
import os
import openai
import chromadb
import numpy as np
import pandas as pd
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

load_dotenv()

# Let's use Chroma to implement an in-memory vector store.
# This example will use generated data about cars for embedding storage and retrieval.

# Step 1: Install necessary packages (to be run in a notebook cell)
# !pip install openai chromadb

"""
The objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:
1. Generated data about cars.
2. OpenAI's Embedding model to convert car descriptions to vectors.
3. Chroma as the in-memory vector database to store and retrieve relevant vectors.
"""

"\nThe objective of this notebook is to demonstrate how to create a Retrieval-Augmented Generation (RAG) system using:\n1. Generated data about cars.\n2. OpenAI's Embedding model to convert car descriptions to vectors.\n3. Chroma as the in-memory vector database to store and retrieve relevant vectors.\n"

In [2]:
# Step 2: Setting up OpenAI key
# Note: You need an OpenAI API key to proceed. Set it below.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [None]:
# Step 3: Generate data for the cars
"""
To simulate a real dataset, we'll generate information about cars. Each car will have a name, price, engine type, and description.
The following data is used to showcase how information is stored and used in the vector database for retrieval.
"""

# Here we define a small set of initial car data manually.
# This will serve as our base dataset to which we will later add more generated cars.
cars = [
    {
        "name": "Toyota Corolla 2024",
        "price": "$25,000",
        "engine": "1.8L Inline-4",
        "year": 2024,
        "country": "Japan",
        "manufacturer": "Toyota",
        "description": "The Toyota Corolla 2024 offers reliability, fuel efficiency, and a comfortable ride, making it an ideal choice for families and urban commuters."
    },
    {
        "name": "Ford Mustang GT 2024",
        "price": "$45,000",
        "engine": "5.0L V8",
        "year": 2024,
        "country": "United States",
        "manufacturer": "Ford",
        "description": "The Ford Mustang GT 2024 delivers exhilarating performance with its powerful V8 engine, sporty handling, and iconic design."
    },
    {
        "name": "BMW 3 Series 2024",
        "price": "$42,000",
        "engine": "2.0L Turbo Inline-4",
        "year": 2024,
        "country": "Germany",
        "manufacturer": "BMW",
        "description": "The BMW 3 Series 2024 is a luxury sedan offering a perfect balance of performance, style, and advanced technology."
    },
    {
        "name": "Hyundai Ioniq 5 2024",
        "price": "$39,000",
        "engine": "Electric",
        "year": 2024,
        "country": "South Korea",
        "manufacturer": "Hyundai",
        "description": "The Hyundai Ioniq 5 2024 is a fully electric SUV with a futuristic design, exceptional range, and innovative features."
    },
]


In [4]:
# Step 3a: Generate more car data using GPT-3
"""
To enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.
The generated data will include car name, price, engine type, and a detailed description.
We will use a prompt to guide the model to return data in a consistent and predictable format.
"""

"\nTo enhance the dataset, we will generate additional car entries using OpenAI's GPT-3 model.\nThe generated data will include car name, price, engine type, and a detailed description.\nWe will use a prompt to guide the model to return data in a consistent and predictable format.\n"

In [None]:
def generate_car_data(num_cars=50):
    """
    Generates car data using OpenAI's GPT-3 model.
    Args:
        num_cars (int): Number of car entries to generate.
    Returns:
        List[dict]: A list of dictionaries containing car details.
    """
    if existing_names is None:
        existing_names = set()
    car_data = []
    while len(car_data) < num_cars:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": (
                    "You are an expert assistant tasked with creating unique and realistic car entries. "
                    "Each car entry must have a unique name and be described in detail."
                )},
                {"role": "user", "content": (
                    "Generate details for a car including the following fields: \n"
                    "Name: <Unique Car Name>\n"
                    "Price: <Car Price>\n"
                    "Engine: <Engine Type>\n"
                    "Year: <Manufacturing Year>\n"
                    "Country: <Country of Origin>\n"
                    "Manufacturer: <Car Manufacturing Company>\n"
                    "Description: <Car Description>\n"
                    "Ensure that the car name is not a duplicate and provide realistic and varied entries."
                )}
            ],
            max_tokens=200
        )
        car_details = response['choices'][0]['message']['content'].strip().split('\n')
        try:
            car = {
                "name": car_details[0].split(': ')[1],
                "price": car_details[1].split(': ')[1],
                "engine": car_details[2].split(': ')[1],
                "year": int(car_details[3].split(': ')[1]),
                "country": car_details[4].split(': ')[1],
                "manufacturer": car_details[5].split(': ')[1],
                "description": car_details[6].split(': ')[1]
            }
            if car['name'] not in existing_names:
                existing_names.add(car['name'])
                car_data.append(car)
            else:
                print(f"Duplicate car name detected: {car['name']}. Skipping entry.")
        except IndexError:
            print(f"Error parsing car details for this iteration, skipping entry.")
            continue
    return car_data

# Append the generated cars to the existing cars list
"""
Here, we append the 50 generated cars to our initial dataset.
This results in a dataset of 54 cars that will be used for embedding storage and retrieval.
"""
cars += generate_car_data(50)


In [6]:
# Step 4: Initialize Chroma DB and prepare embeddings
"""
We will use OpenAI embeddings to convert the car descriptions into vectors, which can be easily stored and queried in Chroma.
Chroma will serve as our in-memory vector database, allowing us to perform fast similarity searches for the car descriptions.
"""

# Set up Chroma and OpenAI embedding function
client = chromadb.Client()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=openai.api_key)

# Create a new collection for storing car embeddings
"""
A Chroma collection is similar to a table in a database.
In this collection, we will store embeddings representing each car's description along with metadata such as car name, price, and engine type.
"""
car_collection = client.create_collection(name="car_collection", embedding_function=openai_ef)

# Add car data to the Chroma collection
car_ids = [str(i) for i in range(len(cars))]
car_descriptions = [car["description"] for car in cars]
car_metadata = [{"name": car["name"], "price": car["price"], "engine": car["engine"]} for car in cars]

"""
Adding the car data to Chroma involves specifying unique IDs for each car, the descriptions to embed, and relevant metadata.
The metadata will be useful for presenting information to the user when we query the database.
"""
car_collection.add(ids=car_ids, metadatas=car_metadata, documents=car_descriptions)


In [12]:
# Display all data in the collection as a pandas DataFrame
"""
We will now retrieve all metadata from the Chroma collection and display it as a pandas DataFrame.
This provides a clear view of the data stored in the collection.
"""
# Retrieve all metadata, embeddings, and documents (descriptions) from the Chroma collection
all_metadata = car_collection.get(ids=car_ids, include=["embeddings", "documents", "metadatas"])

# Create a DataFrame from the metadata
car_data = pd.DataFrame(all_metadata['metadatas'])
car_data["description"] = all_metadata['documents']
car_data["embedding"] = all_metadata['embeddings'] 
print("\Sample 5 Cars Entry in the Collection:")
car_data.sample(n=5)

\Sample 5 Cars Entry in the Collection:


Unnamed: 0,engine,name,price,description,embedding
53,3.0L V6 Twin Turbo,Solaris GT,"$75,000",The Solaris GT is a sleek and powerful sports ...,"[0.008844923228025436, 0.00681699812412262, 0...."
32,3.0L twin-turbocharged V6,Sparrow GT,"$45,000",The Sparrow GT is a sleek and sporty coupe des...,"[-0.020500145852565765, 0.011505590751767159, ..."
11,V8 Twin-Turbo,Embera GT,"$78,000",The Embera GT is a high-performance sports car...,"[-0.02033199556171894, 0.005416034255176783, -..."
10,V6 Turbocharged,Aurora S1,"$35,000",The Aurora S1 is a sleek and modern sports car...,"[0.00041681830771267414, 0.01504009123891592, ..."
0,V8 5.0L,Superfast Coupe 2024,"$80,000","The Superfast Coupe 2024 offers a V8 engine, e...","[-0.03257284313440323, -0.002681222977116704, ..."


In [13]:
car_data

Unnamed: 0,engine,name,price,description,embedding
0,V8 5.0L,Superfast Coupe 2024,"$80,000","The Superfast Coupe 2024 offers a V8 engine, e...","[-0.03257284313440323, -0.002681222977116704, ..."
1,Electric,EcoDrive Hatchback 2024,"$25,000",The EcoDrive Hatchback 2024 is a fully electri...,"[0.001607043668627739, -0.0038725833874195814,..."
2,3.0L V6 twin-turbo,Serenity GT,"$85,000",The Serenity GT is a luxury sports car designe...,"[0.001486198278144002, 0.011765601113438606, 0..."
3,3.0L V6 Twin-Turbo,Solaris GT,"$45,000",The Solaris GT is a high-performance luxury se...,"[0.009452015161514282, 0.0034284137655049562, ..."
4,V6 3.5L twin-turbo,Crimson Thunder,"$45,000",The Crimson Thunder is a sleek and powerful sp...,"[-0.01616011932492256, -0.003087409073486924, ..."
5,3.0L turbocharged V6,Thunderbolt X5,"$45,000",The Thunderbolt X5 is a sleek and powerful spo...,"[-0.015341528691351414, 0.008179505355656147, ..."
6,3.0L twin-turbo V6,Hyperion X1,"$70,000",The Hyperion X1 is a sleek and futuristic spor...,"[0.018868761137127876, 0.017414266243577003, -..."
7,Electric,Electra GT,"$45,000","The Electra GT is a sleek, all-electric sports...","[-0.015263844281435013, 0.009929985739290714, ..."
8,3.5L V6 twin-turbo,Inferno GT,"$85,000",The Inferno GT is a high-performance sports ca...,"[-0.012796686962246895, 0.013538618572056293, ..."
9,Twin-turbo V6,Aurora GT,"$45,000",The Aurora GT is a sleek and stylish sports ca...,"[-0.011618737131357193, 0.013283236883580685, ..."


In [8]:
# Step 5: Querying Chroma for information
"""
In this step, we will demonstrate how to query the Chroma collection to find relevant cars.
We will use a natural language prompt to find cars that match specific requirements.
The embeddings will allow us to determine the similarity between the query and the car descriptions.
"""

# Example query prompt
prompts = [
    "I want a comfortable car for my family with good safety features.",
    "Looking for a sporty car with high speed and acceleration.",
    "Find me an eco-friendly electric car for urban use.",
    "I need a luxury sedan with a hybrid engine for a comfortable commute.",
    "Suggest an affordable car with great fuel efficiency for a student."
]

prompt = np.random.choice(prompts)
print(f"The user's prompt is: {prompt}")

# Retrieve the top match from Chroma collection
"""
Using the `query` method, we search for the most relevant car based on the given prompt.
The query will return the car description that is most similar to the provided input.
"""
results = car_collection.query(query_texts=[prompt], n_results=1)

# Display the result
"""
Once we get the result, we extract the metadata for the recommended car, such as its name, price, and engine type.
We then print out the recommended car's details for the user.
"""
result_metadata = results["metadatas"][0][0]
result_name = result_metadata["name"]
result_price = result_metadata["price"]
result_engine = result_metadata["engine"]

print(f"Recommended Car: {result_name}\nPrice: {result_price}\nEngine: {result_engine}")

"""
This result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.
For a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.
"""


The user's prompt is: Find me an eco-friendly electric car for urban use.
Recommended Car: EcoDrive Hatchback 2024
Price: $25,000
Engine: Electric


'\nThis result demonstrates how the RAG approach, combining embeddings and natural language queries, can provide users with relevant and personalized insights.\nFor a user looking for a specific type of car, such as one that is comfortable for a family, our system can find the best match from the dataset.\n'

In [9]:
# Step 6: Enriching the output using an LLM
"""
We will now use the LLM to generate a personalized message to the car buyer, leveraging the retrieved car's metadata.
This step showcases how to combine retrieval with generation for a more tailored experience.
"""
def generate_message_for_buyer(car_name, car_price, car_engine):
    """
    Generates a personalized message for the car buyer.
    Args:
        car_name (str): Name of the recommended car.
        car_price (str): Price of the recommended car.
        car_engine (str): Engine type of the recommended car.
    Returns:
        str: A personalized message for the buyer.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant to sell a car."},
            {"role": "user", "content": (
                f"Create a friendly and engaging message for a car buyer interested in the {car_name}. \n"
                f"Mention its price of {car_price} and highlight the {car_engine} engine's benefits."
            )}
        ],
        max_tokens=150
    )
    return response['choices'][0]['message']['content'].strip()

# Generate the enriched message
enriched_message = generate_message_for_buyer(result_name, result_price, result_engine)
print("\nPersonalized Message:")
print(enriched_message)

"""
This step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.
The message can be further personalized based on additional user preferences or context.
"""


Personalized Message:
🌿🚗 Hello there! Are you looking for a stylish and eco-friendly ride? Look no further than the EcoDrive Hatchback 2024! Priced at just $25,000, this sleek car comes equipped with an Electric engine that not only saves you money on gas but also helps the environment by reducing emissions. Say goodbye to frequent trips to the gas station and hello to smooth, quiet rides with the EcoDrive Hatchback 2024. Don't miss out on this opportunity to drive into a more sustainable future! 🌿🚗 #EcoDrive2024 #ElectricEngine #GreenDriving


'\nThis step demonstrates how to use the retrieved car data to generate an enriched, human-like response that enhances user engagement.\nThe message can be further personalized based on additional user preferences or context.\n'