In [1]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Column configuration for the product CSV:
#   columns_to_emebd    - fields whose text is joined into the content that gets embedded
#   columns_to_metadata - fields copied into each Document's metadata
columns_to_emebd = [
    "Description",
    "Features",
]
columns_to_metadata = [
    "Product Name",
    "Price",
    "Rating",
    "Description",
    "Features",
]

In [2]:
import os

import dotenv

# Load variables from the .env file, then read the OpenAI key from the
# environment (the lookup yields None when the variable is not defined).
dotenv.load_dotenv()
my_variable_value = os.getenv('OPENAI_API_KEY')

In [3]:
# Confirm the key was loaded without echoing the secret into the saved notebook output.
# NOTE: an earlier revision of this cell displayed the raw key; treat that key as compromised and revoke it.
"OPENAI_API_KEY loaded" if my_variable_value else "OPENAI_API_KEY missing"

'OPENAI_API_KEY loaded'

In [5]:
# Turn each CSV row into a Document: the columns listed in columns_to_emebd become
# the text that will be embedded, and the columns listed in columns_to_metadata are
# kept as metadata, so that the documents can be chunked into pieces afterwards.
docs = []
with open('TestListings.csv', newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
        # csv.DictReader fills the missing fields of a short row with None;
        # treat those as empty text instead of failing on None.strip().
        to_embed = "\n".join(
            f"{col.strip()}: {(row[col] or '').strip()}"
            for col in columns_to_emebd
            if col in row
        )
        docs.append(Document(page_content=to_embed, metadata=to_metadata))

In [6]:
# Inspect the Documents built from the CSV rows (embedded text plus metadata).
docs

[Document(page_content="Description: Elevate your music experience with our Ultimate Wireless Bluetooth Earbuds. These earbuds are designed for audiophiles who demand premium sound quality and convenience. With advanced Bluetooth 5.0 technology, you'll enjoy seamless connectivity and crystal-clear audio.\nFeatures: - High-fidelity sound with deep bass and clear treble. - Noise-canceling technology for an immersive listening experience. - 20 hours of playtime on a single charge. - Ergonomic design for a secure and comfortable fit. - Touch controls for easy playback and calls. - IPX7 waterproof rating for workouts and outdoor activities. - Built-in microphone for hands-free calls.", metadata={'Product Name': 'Ultimate Wireless Bluetooth Earbuds', 'Price': '79.99', 'Rating': '4.4', 'Description': "Elevate your music experience with our Ultimate Wireless Bluetooth Earbuds. These earbuds are designed for audiophiles who demand premium sound quality and convenience. With advanced Bluetooth 5

In [7]:
# Split each document on newlines, using chunk_size=500 characters as measured
# by len and no overlap between neighbouring chunks.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
documents = splitter.split_documents(docs)

In [8]:
# Each chunk now carries its product's metadata (Product Name, Price, Rating,
# Description, Features) alongside the piece of text it was split into.
documents

[Document(page_content="Description: Elevate your music experience with our Ultimate Wireless Bluetooth Earbuds. These earbuds are designed for audiophiles who demand premium sound quality and convenience. With advanced Bluetooth 5.0 technology, you'll enjoy seamless connectivity and crystal-clear audio.", metadata={'Product Name': 'Ultimate Wireless Bluetooth Earbuds', 'Price': '79.99', 'Rating': '4.4', 'Description': "Elevate your music experience with our Ultimate Wireless Bluetooth Earbuds. These earbuds are designed for audiophiles who demand premium sound quality and convenience. With advanced Bluetooth 5.0 technology, you'll enjoy seamless connectivity and crystal-clear audio.", 'Features': '- High-fidelity sound with deep bass and clear treble. - Noise-canceling technology for an immersive listening experience. - 20 hours of playtime on a single charge. - Ergonomic design for a secure and comfortable fit. - Touch controls for easy playback and calls. - IPX7 waterproof rating fo

In [9]:
# Generate embeddings from the chunked documents and store them in a Chroma vector database.
# The single embeddings_model instance is reused instead of constructing a second OpenAIEmbeddings.
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings_model)

