In [16]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import chromadb

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to('cuda')  # Move model to GPU

# Example texts
texts = ["What are the signs that your car needs an oil change?",
         "How often should you check and replace your air filter?"]

# Generate embeddings
def generate_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to('cuda')  # Move input to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Get the embeddings and move to CPU

embeddings = generate_embeddings(texts)



In [17]:
# Initialize ChromaDB
client = chromadb.Client()

# Create a collection
collection = client.get_or_create_collection("car_maintenance")

# Generate unique IDs for each document
ids = [str(i) for i in range(len(texts))]  # Simple numeric IDs
print(ids)

# Add the embeddings to the collection
collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),  # Convert to list for storage
    ids=ids  # Include the unique IDs
)

print("Embeddings stored in ChromaDB.")


Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 0
Add of existing embedding ID: 1


['0', '1']
Embeddings stored in ChromaDB.


In [18]:
query ="Does my car needs oil change?"
query_embeddings = generate_embeddings(query)
query_result = collection.query(query_embeddings, n_results=1,)
print(query_result)

{'ids': [['0']], 'embeddings': None, 'documents': [['What are the signs that your car needs an oil change?']], 'uris': None, 'data': None, 'metadatas': [[None]], 'distances': [[6.679023742675781]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
