# **Building a Custom GPT on Vehicle Data Using Retrieval-Augmented Generation (RAG)**

## Installations

In [None]:
#pip install pinecone-client sentence-transformers
#pip install --upgrade pinecone-client
#pip install datasets
#pip install openai
#pip install langchain
#pip install tiktoken

Example request to the VinAudit vehicle specifications API (a CARFAX-style vehicle-data service) using its demo key.

In [None]:
import requests
# Specifications endpoint queried with the public demo key and a sample VIN.
api_spec = (
    'https://specifications.vinaudit.com/v3/specifications'
    '?format=json&key=VA_DEMO_KEY&vin=1NXBR32E85Z505904'
)


In [None]:
# Fetch the sample specification record.
# The cell previously also requested `url_veh`, a name that is never defined
# in this notebook, so it raised NameError on a fresh kernel; that call is gone.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
spec_response = requests.get(api_spec, timeout=30)

In [None]:
# Show the decoded JSON payload on HTTP 200; otherwise report the status code.
status = spec_response.status_code
if status != 200:
    print("Failed to retrieve data from the API. Status code:", status)
else:
    data = spec_response.json()
    print(data)


{'input': {'key': 'VA_DEMO_KEY', 'vin': '1NXBR32E85Z505904', 'format': 'json'}, 'selections': {'trims': [{'id': '2005_toyota_corolla_ce', 'name': 'CE', 'selected': 1, 'styles': []}]}, 'attributes': {'year': '2005', 'make': 'Toyota', 'model': 'Corolla', 'trim': 'CE', 'style': '', 'type': 'Sedan', 'size': 'Compact', 'category': 'Compact Car', 'made_in': 'Japan', 'made_in_city': 'OBU', 'doors': '4-Door', 'fuel_type': 'Regular Unleaded', 'fuel_capacity': '13.20 gallons', 'city_mileage': '30 - 32 miles/gallon', 'highway_mileage': '38 - 41 miles/gallon', 'engine': '1.8-L L-4 DOHC 16V', 'engine_size': '1.8', 'engine_size_units': '', 'engine_cylinders': '4', 'transmission': '5-Speed Manual', 'transmission_type': 'Manual', 'transmission_speeds': '5-Speed', 'drivetrain': 'Front-Wheel Drive', 'anti_brake_system': 'Non-ABS 4-Wheel ABS', 'steering_type': 'Rack & Pinion', 'curb_weight': '2590 pounds', 'gross_vehicle_weight_rating': '', 'overall_height': '58.50 inches', 'overall_length': '178.30 inch

# **DATA GENERATION**

#### Generate random records that follow the structure of the vehicle specifications API response shown above.

In [None]:
import random
import json

def generate_vin():
    """Return a random 17-character VIN-like string.

    Each character is drawn independently from the uppercase letters
    (without I, O and Q) and the digits 0-9. No check digit is computed.
    """
    alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
    picked = []
    for _position in range(17):
        picked.append(random.choice(alphabet))
    return "".join(picked)

def random_attribute(attributes):
    """Pick one element of ``attributes`` at random using ``random.choice``."""
    chosen = random.choice(attributes)
    return chosen

# Candidate values for every randomly drawn attribute.
years = [*range(1995, 2023)]  # 1995 through 2022 inclusive
makes = [
    "Toyota",
    "Honda",
    "Ford",
    "Nissan",
    "Chevrolet",
]
models = [
    "Corolla",
    "Civic",
    "Fusion",
    "Altima",
    "Malibu",
]
trims = ["Base", "LX", "EX", "SE", "SLE"]
types = ["Sedan", "SUV", "Truck", "Coupe"]
sizes = ["Compact", "Midsize", "Fullsize"]
categories = [
    "Economy Car",
    "Compact Car",
    "Sports Car",
    "Luxury Car",
]
made_ins = [
    "USA",
    "Japan",
    "Germany",
    "South Korea",
    "Mexico",
]
doors = ["2-Door", "4-Door"]
fuel_types = [
    "Regular Unleaded",
    "Premium Unleaded",
    "Diesel",
    "Electric",
]
transmissions = ["Manual", "Automatic", "CVT"]
drivetrains = [
    "Front-Wheel Drive",
    "Rear-Wheel Drive",
    "All-Wheel Drive",
    "Four-Wheel Drive",
]
steering_types = ["Rack & Pinion", "Recirculating Ball"]

# Generate 20,000 synthetic records shaped like the API response.
#
# Seed the global RNG so "Restart Kernel -> Run All" reproduces the same
# records, and therefore the same VINs, which are later used as vector IDs.
random.seed(42)

# `makes` and `models` are index-aligned (Toyota/Corolla, Honda/Civic, ...).
# Drawing them as a pair avoids impossible vehicles such as a "Nissan Civic".
make_model_pairs = list(zip(makes, models))

records = []
for _ in range(20000):
    year = random_attribute(years)
    make, model = random_attribute(make_model_pairs)
    trim = random_attribute(trims)
    # Draw the engine facts once so that `engine`, `engine_size` and
    # `engine_cylinders` always agree within a record. They used to be drawn
    # independently, producing e.g. an "L-8" engine with 6 cylinders.
    engine_size = random.choice([1.4, 1.6, 2.0, 2.5, 3.0, 3.5])
    engine_cylinders = random.choice([4, 6, 8])
    record = {
        'input': {
            'key': 'VA_DEMO_KEY',
            'vin': generate_vin(),
            'format': 'json'
        },
        'selections': {
            'trims': [{
                'id': f'{year}_{make.lower()}_{model.lower()}_{trim.lower()}',
                'name': trim,
                'selected': 1,
                'styles': []
            }]
        },
        'attributes': {
            'year': str(year),
            'make': make,
            'model': model,
            'trim': trim,
            'style': '',
            'type': random_attribute(types),
            'size': random_attribute(sizes),
            'category': random_attribute(categories),
            'made_in': random_attribute(made_ins),
            'made_in_city': '',
            'doors': random_attribute(doors),
            'fuel_type': random_attribute(fuel_types),
            'fuel_capacity': f"{random.uniform(10, 20):.2f} gallons",
            'city_mileage': f"{random.randint(15, 30)} - {random.randint(31, 45)} miles/gallon",
            'highway_mileage': f"{random.randint(20, 35)} - {random.randint(36, 50)} miles/gallon",
            'engine': f"{engine_size}-L L-{engine_cylinders} DOHC 16V",
            'engine_size': str(engine_size),
            'engine_cylinders': str(engine_cylinders),
            'transmission': random_attribute(transmissions),
            'transmission_type': '',
            'transmission_speeds': f"{random.choice(['5-Speed', '6-Speed', '8-Speed'])}",
            'drivetrain': random_attribute(drivetrains),
            'anti_brake_system': random.choice(['Non-ABS', '4-Wheel ABS']),
            'steering_type': random_attribute(steering_types),
            'curb_weight': f"{random.randint(2000, 5000)} pounds",
            'gross_vehicle_weight_rating': '',
            'overall_height': f"{random.uniform(50.0, 70.0):.2f} inches",
            'overall_length': f"{random.uniform(150.0, 200.0):.2f} inches",
            'overall_width': f"{random.uniform(60.0, 80.0):.2f} inches",
            'wheelbase_length': f"{random.uniform(90.0, 120.0):.2f} inches",
            'standard_seating': random.choice(['4', '5', '7']),
            'invoice_price': f"${random.randint(15000, 30000)}",
            'delivery_charges': f"${random.randint(500, 1500)}",
            'manufacturer_suggested_retail_price': f"${random.randint(20000, 40000)}"
        },
        'success': True,
        'error': ''
    }
    records.append(record)



In [None]:
# Pretty-print the first five generated records as a sanity check
sample_records = records[:5]
print(json.dumps(sample_records, indent=4))

[
    {
        "input": {
            "key": "VA_DEMO_KEY",
            "vin": "ZMNSDBSFSJ6WSC5GG",
            "format": "json"
        },
        "selections": {
            "trims": [
                {
                    "id": "2005_nissan_altima_ex",
                    "name": "EX",
                    "selected": 1,
                    "styles": []
                }
            ]
        },
        "attributes": {
            "year": "2005",
            "make": "Nissan",
            "model": "Altima",
            "trim": "EX",
            "style": "",
            "type": "Sedan",
            "size": "Midsize",
            "category": "Compact Car",
            "made_in": "Germany",
            "made_in_city": "",
            "doors": "2-Door",
            "fuel_type": "Electric",
            "fuel_capacity": "11.55 gallons",
            "city_mileage": "21 - 35 miles/gallon",
            "highway_mileage": "26 - 48 miles/gallon",
            "engine": "1.6-L L-8 DOHC 16V",
    

In [None]:
def extract_keys_and_indices(obj, dict_keys_set, list_indices_set, current_path=None):
    """
    Recursively collect dictionary keys and positions inside dict-bearing lists.

    Walks ``obj`` depth-first. Every key of every nested dict is added to
    ``dict_keys_set``. For each list holding at least one dict, one label of
    the form ``"<parent key>[<index>]"`` is added to ``list_indices_set`` per
    element. A list with no enclosing key (for example a top-level list) uses
    an empty parent name, giving labels such as ``"[0]"``.

    Args:
        obj: Nested structure of dicts, lists and scalar values.
        dict_keys_set: Set updated in place with the keys found.
        list_indices_set: Set updated in place with the list labels found.
        current_path: Keys leading to ``obj``; ``None`` means the root.

    Returns:
        None. Results are reported through the two sets.
    """
    # A None default replaces the shared mutable `[]` default.
    path = [] if current_path is None else current_path
    if isinstance(obj, dict):
        for key, value in obj.items():
            dict_keys_set.add(key)
            # Build a new list so sibling branches never share path state.
            extract_keys_and_indices(value, dict_keys_set, list_indices_set, path + [key])
    elif isinstance(obj, list):
        # Loop-invariant, so computed once instead of once per element.
        holds_dicts = any(isinstance(elem, dict) for elem in obj)
        # Guard against an empty path: indexing [-1] on it raised IndexError.
        parent = path[-1] if path else ""
        for i, item in enumerate(obj):
            if holds_dicts:
                list_indices_set.add(f"{parent}[{i}]")
            extract_keys_and_indices(item, dict_keys_set, list_indices_set, path)

# Accumulators filled in place while walking every generated record.
unique_dict_keys, unique_list_indices = set(), set()

for record in records:
    extract_keys_and_indices(record, unique_dict_keys, unique_list_indices)

# Sorted copies give a stable, readable ordering.
unique_dict_keys_list = sorted(unique_dict_keys)
unique_list_indices_list = sorted(unique_list_indices)

print("Unique Dictionary Keys:", unique_dict_keys_list)
print("Unique List Indices:", unique_list_indices_list)


Unique Dictionary Keys: ['anti_brake_system', 'attributes', 'category', 'city_mileage', 'curb_weight', 'delivery_charges', 'doors', 'drivetrain', 'engine', 'engine_cylinders', 'engine_size', 'error', 'format', 'fuel_capacity', 'fuel_type', 'gross_vehicle_weight_rating', 'highway_mileage', 'id', 'input', 'invoice_price', 'key', 'made_in', 'made_in_city', 'make', 'manufacturer_suggested_retail_price', 'model', 'name', 'overall_height', 'overall_length', 'overall_width', 'selected', 'selections', 'size', 'standard_seating', 'steering_type', 'style', 'styles', 'success', 'transmission', 'transmission_speeds', 'transmission_type', 'trim', 'trims', 'type', 'vin', 'wheelbase_length', 'year']
Unique List Indices: ['trims[0]']


In [None]:
# Display the sorted list of every dictionary key found across the records.
unique_dict_keys_list

['anti_brake_system',
 'attributes',
 'category',
 'city_mileage',
 'curb_weight',
 'delivery_charges',
 'doors',
 'drivetrain',
 'engine',
 'engine_cylinders',
 'engine_size',
 'error',
 'format',
 'fuel_capacity',
 'fuel_type',
 'gross_vehicle_weight_rating',
 'highway_mileage',
 'id',
 'input',
 'invoice_price',
 'key',
 'made_in',
 'made_in_city',
 'make',
 'manufacturer_suggested_retail_price',
 'model',
 'name',
 'overall_height',
 'overall_length',
 'overall_width',
 'selected',
 'selections',
 'size',
 'standard_seating',
 'steering_type',
 'style',
 'styles',
 'success',
 'transmission',
 'transmission_speeds',
 'transmission_type',
 'trim',
 'trims',
 'type',
 'vin',
 'wheelbase_length',
 'year']

In [None]:
def record_to_string(record):
    """Flatten a record's attributes into one space-separated string.

    Values are emitted in alphabetical order of their attribute names, so the
    layout is the same for every record. Empty values are kept, which is why
    consecutive spaces can appear in the result.

    Args:
        record: Mapping with an ``'attributes'`` dict.

    Returns:
        str: The attribute values joined by single spaces.
    """
    attributes = record['attributes']
    # Sorting the record's own keys gives the same order as filtering the
    # notebook's global sorted key list, without depending on that global
    # having been built by an earlier cell.
    # str() keeps the join from raising TypeError on a non-string value.
    return " ".join(str(attributes[key]) for key in sorted(attributes))

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the generated records in a single-column Hugging Face Dataset and
# expose it under one split named "all".
records_column = {"records": records}
dataset = Dataset.from_dict(records_column)
dataset_dict = DatasetDict({"all": dataset})


In [None]:
# One flattened text per record, in the same order as `records`.
record_strings = list(map(record_to_string, records))

# **DATA PREPROCESSING**

## Building a Vector Database using Pinecone and OpenAI Embeddings

In [None]:
import getpass
import os

import openai

# Never commit API keys to a notebook. Read the key from the environment and
# fall back to an interactive prompt whose input is not echoed or saved.
# The key that used to be hardcoded here is exposed and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [None]:
from posix import environ
import pinecone
from pinecone import Pinecone, ServerlessSpec

In [None]:
import getpass
import os

# Read the Pinecone key from the environment, or prompt for it without echo,
# rather than embedding it in the notebook. The key that used to be hardcoded
# here is exposed and should be rotated.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
)

In [None]:
# Width of the vectors stored in the index; it must equal the output size of
# the embedding model used to populate it.
vector_dimension = 1536
# Name of the Pinecone index this notebook creates and queries.
index_name = "vehicle-index-try-second"

In [None]:
# Create the serverless index only when it is not already present.
existing_indexes = pc.list_indexes().names()
if index_name in existing_indexes:
    print(f"Index {index_name} already exists.")
else:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")
    pc.create_index(
        name=index_name,
        dimension=vector_dimension,
        metric="cosine",
        spec=serverless_spec,
    )
    print(f"Index {index_name} created.")


Index vehicle-index-try-second created.


In [None]:
import time
# Wait until Pinecone reports the index as ready instead of sleeping a fixed
# second: a freshly created index can take longer than that to accept
# requests. The deadline keeps the cell from spinning forever.
ready_deadline = time.monotonic() + 300
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(f"Index {index_name} was not ready after 300 seconds.")
    time.sleep(1)

index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [None]:
import getpass
import os

# Do not store the key in the notebook. Keep a value already present in the
# environment, otherwise prompt for one without echoing it.
# The key that used to be hardcoded here is exposed and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Read the key back from the environment and report whether it is present.
openai_api_key = os.getenv("OPENAI_API_KEY")
key_status = (
    "OPENAI_API_KEY is not set."
    if openai_api_key is None
    else "OPENAI_API_KEY is set."
)
print(key_status)

OPENAI_API_KEY is set.


In [None]:
from langchain.chat_models import ChatOpenAI

In [None]:
# Chat model used to answer the retrieval-augmented prompts.
chat_settings = {
    "openai_api_key": openai_api_key,
    "model": 'gpt-3.5-turbo',
}
chat = ChatOpenAI(**chat_settings)


  warn_deprecated(


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model used for both the indexed documents and the queries.
embedding_model_name = "text-embedding-ada-002"
embed_model = OpenAIEmbeddings(model=embedding_model_name)

  warn_deprecated(


In [None]:
import tiktoken

In [None]:
def get_openai_embeddings(batch_texts, model="text-embedding-ada-002"):
    """Embed a batch of texts through ``openai.Embedding.create``.

    The default model now matches the one the rest of the notebook uses
    (``text-embedding-ada-002``). The previous hardcoded
    ``text-similarity-babbage-001`` was a different model from the one used to
    populate and query the index.

    NOTE(review): ``openai.Embedding`` looks like the pre-1.0 OpenAI SDK
    interface; confirm the installed ``openai`` version still provides it.

    Args:
        batch_texts: Sequence of strings to embed.
        model: Name of the embedding model, passed as ``engine``.

    Returns:
        list: One embedding per input text, in input order; an empty list
        when ``batch_texts`` is empty (no API call is made).
    """
    # Skip the network round-trip for an empty batch.
    if len(batch_texts) == 0:
        return []
    response = openai.Embedding.create(
        input=batch_texts,
        engine=model
    )
    return [item['embedding'] for item in response['data']]

In [None]:
# Sanity check: `records` must be a list whose elements are all dicts.
assert isinstance(records, list), "Data structure issue."
assert all(isinstance(entry, dict) for entry in records), "Data structure issue."

In [None]:
import time
from tqdm import tqdm

batch_size = 350          # records embedded and upserted per iteration
rate_limit_seconds = 20   # pause between iterations
requests_made_today = 0   # embedding calls issued so far in this run
daily_limit = 200         # presumably a requests-per-day quota - TODO confirm

for i in tqdm(range(0, len(record_strings), batch_size)):
    # Slice the records and their pre-computed texts for the current batch
    batch_records = records[i:i+batch_size]
    batch_texts = record_strings[i:i+batch_size]
    # Generate embeddings for the current batch. The whole-batch debug prints
    # were dropped because they flooded the output with thousands of lines.
    embeds = embed_model.embed_documents(batch_texts)
    # The VIN serves as the unique vector ID for each record in the batch
    ids = [record['input']['vin'] for record in batch_records]
    # Prepare metadata: every attribute plus the embedded text itself,
    # reusing the text from `record_strings` instead of rebuilding it
    metadata = [
        {**record['attributes'], 'text': text}
        for record, text in zip(batch_records, batch_texts)
    ]
    vectors = list(zip(ids, embeds, metadata))
    # `index` is already bound to the target index, so the stray `index_name`
    # keyword is no longer passed.
    index.upsert(vectors=vectors)

    # Count one request per iteration. Adding len(batch_texts) pushed the
    # counter to 350 after the first batch, past `daily_limit`, so the loop
    # stopped with only 350 records indexed.
    requests_made_today += 1
    if requests_made_today >= daily_limit:
        print("Approaching daily limit, consider pausing or stopping the operation.")
        break

    time.sleep(rate_limit_seconds)


  0%|          | 0/58 [00:00<?, ?it/s]

['4-Wheel ABS Compact Car 21 - 35 miles/gallon 4942 pounds $1255 2-Door Rear-Wheel Drive 1.6-L L-8 DOHC 16V 6  11.55 gallons Electric  26 - 48 miles/gallon $23106 Germany  Nissan $26560 Altima 69.64 inches 195.77 inches 71.87 inches Midsize 7 Rack & Pinion  Automatic 5-Speed  EX Sedan 113.33 inches 2005', 'Non-ABS Luxury Car 25 - 40 miles/gallon 4629 pounds $824 4-Door Front-Wheel Drive 3.5-L L-4 DOHC 16V 4  15.96 gallons Regular Unleaded  29 - 45 miles/gallon $25669 Germany  Ford $20059 Malibu 66.76 inches 189.87 inches 76.59 inches Midsize 4 Rack & Pinion  CVT 6-Speed  LX Coupe 100.40 inches 2019', 'Non-ABS Luxury Car 20 - 40 miles/gallon 4758 pounds $579 2-Door Front-Wheel Drive 3.0-L L-6 DOHC 16V 4  15.57 gallons Regular Unleaded  29 - 39 miles/gallon $27957 Mexico  Ford $26266 Corolla 67.48 inches 164.96 inches 62.87 inches Compact 7 Recirculating Ball  CVT 6-Speed  SLE Coupe 119.21 inches 1996', 'Non-ABS Luxury Car 27 - 31 miles/gallon 4316 pounds $562 4-Door Four-Wheel Drive 1.4

  0%|          | 0/58 [00:06<?, ?it/s]

Approaching daily limit, consider pausing or stopping the operation.





In [None]:
# Re-check the index statistics to see how many vectors were upserted.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 350}},
 'total_vector_count': 350}

In [None]:
# Import under an alias. The bare name `Pinecone` is already bound to the
# client class imported from the `pinecone` package; rebinding it to the
# LangChain vector store broke re-running the cell that builds the client.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Metadata field that holds the raw text of each record
text_field = "text"
# initialize the vector store object
vectorstore = PineconeVectorStore(
    index, embed_model.embed_query, text_field
)



## **Building the chatbot**

In [None]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

In [None]:
# Display the vector store object to confirm it was constructed.
vectorstore

<langchain_community.vectorstores.pinecone.Pinecone at 0x78ffb971bd30>

In [None]:
# Sample question, reused by the prompt-augmentation cell further down.
query = "tell me about sports car"
# Retrieve the three stored records most similar to the question.
vectorstore.similarity_search(query, k=3)

[Document(page_content='Non-ABS Sports Car 19 - 32 miles/gallon 2170 pounds $1436 2-Door All-Wheel Drive 1.6-L L-8 DOHC 16V 8  10.58 gallons Electric  32 - 42 miles/gallon $24627 Germany  Nissan $24812 Civic 60.41 inches 197.48 inches 69.20 inches Midsize 4 Recirculating Ball  Manual 6-Speed  SE Coupe 91.25 inches 1999', metadata={'anti_brake_system': 'Non-ABS', 'category': 'Sports Car', 'city_mileage': '19 - 32 miles/gallon', 'curb_weight': '2170 pounds', 'delivery_charges': '$1436', 'doors': '2-Door', 'drivetrain': 'All-Wheel Drive', 'engine': '1.6-L L-8 DOHC 16V', 'engine_cylinders': '8', 'engine_size': '', 'fuel_capacity': '10.58 gallons', 'fuel_type': 'Electric', 'gross_vehicle_weight_rating': '', 'highway_mileage': '32 - 42 miles/gallon', 'invoice_price': '$24627', 'made_in': 'Germany', 'made_in_city': '', 'make': 'Nissan', 'manufacturer_suggested_retail_price': '$24812', 'model': 'Civic', 'overall_height': '60.41 inches', 'overall_length': '197.48 inches', 'overall_width': '69.2

In [None]:
def augment_prompt(query: str, k: int = 3) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Looks up the ``k`` most similar entries in the module-level
    ``vectorstore`` and places their text ahead of the query.

    Args:
        query: The user's question.
        k: Number of results to retrieve. Defaults to 3, the value that was
            previously hardcoded.

    Returns:
        str: The prompt containing the retrieved contexts and the query.
    """
    # get the top-k results from the knowledge base
    results = vectorstore.similarity_search(query, k=k)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    Query: {query}"""
    return augmented_prompt

In [None]:
# Show the full prompt the chat model would receive for the sample query.
augmented_example = augment_prompt(query)
print(augmented_example)

Using the contexts below, answer the query.

    Contexts:
    Non-ABS Sports Car 19 - 32 miles/gallon 2170 pounds $1436 2-Door All-Wheel Drive 1.6-L L-8 DOHC 16V 8  10.58 gallons Electric  32 - 42 miles/gallon $24627 Germany  Nissan $24812 Civic 60.41 inches 197.48 inches 69.20 inches Midsize 4 Recirculating Ball  Manual 6-Speed  SE Coupe 91.25 inches 1999
Non-ABS Sports Car 21 - 44 miles/gallon 2513 pounds $697 2-Door All-Wheel Drive 2.5-L L-4 DOHC 16V 6  11.67 gallons Electric  20 - 45 miles/gallon $23841 Japan  Nissan $23702 Corolla 57.64 inches 172.53 inches 71.38 inches Compact 4 Recirculating Ball  Automatic 6-Speed  Base Coupe 117.77 inches 1998
Non-ABS Sports Car 16 - 33 miles/gallon 2483 pounds $902 4-Door Four-Wheel Drive 1.6-L L-8 DOHC 16V 8  17.28 gallons Electric  21 - 43 miles/gallon $27392 USA  Toyota $29426 Corolla 61.76 inches 183.21 inches 66.51 inches Compact 7 Rack & Pinion  CVT 8-Speed  Base SUV 113.74 inches 2012

    Query: tell me about sports car


In [None]:
# Seed conversation as (message class, text) pairs, in chat order.
conversation_seed = [
    (SystemMessage, "You are a helpful assistant."),
    (HumanMessage, "Hi AI, how are you today?"),
    (AIMessage, "I'm great thank you. How can I help you?"),
    (HumanMessage, "I'd like to understand nlp."),
]
messages = [message_type(content=text) for message_type, text in conversation_seed]

In [None]:
# Wrap the retrieval-augmented question as the newest human turn, append it
# to the seed conversation, and print the model's reply.
question = "what are the diiferent sports cars names"
prompt = HumanMessage(content=augment_prompt(question))
conversation = messages + [prompt]
res = chat(conversation)
print(res.content)

Based on the provided contexts, the different sports car names are:

1. Altima
2. Civic
3. Fusion
