# Store Embeddings in Pinecone

This notebook demonstrates how to load precomputed embeddings from a .pkl file using pandas and upsert them into a Pinecone index.

In [6]:
import pandas as pd
from pinecone import Pinecone
from tqdm import tqdm

# Constants
INPUT_PKL_FILE = "../data/horizon_projects_embeddings.pkl"
INDEX_NAME = "projects-text-embedding-3-small"  # Name of the Pinecone index to target
BATCH_SIZE = 100  # Adjust this based on your needs and Pinecone's limits

# Initialize Pinecone
# No api_key is passed here, so the client is expected to read it from the
# PINECONE_API_KEY environment variable; set it before running this cell.
pc = Pinecone()
index = pc.Index(INDEX_NAME)

# Load data from .pkl file
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(INPUT_PKL_FILE)

# Convert Timestamp to ISO format string
# (a missing date does not become a string here; it stays a null value)
df['contentUpdateDate'] = df['contentUpdateDate'].dt.strftime('%Y-%m-%dT%H:%M:%S')

# Prepare data for Pinecone
ids = df['id'].astype(str).tolist()
embeddings = df['ada_embedding'].tolist()

# Pinecone does not accept null metadata values, so omit any missing field from
# a record instead of sending NaN/None (which would fail the whole batch).
# Assumes the metadata columns hold scalar values such as strings -- TODO confirm.
metadatas = [
    {field: value for field, value in record.items() if pd.notna(value)}
    for record in df[['title', 'objective', 'contentUpdateDate']].to_dict('records')
]

# Upload the vectors to Pinecone one batch at a time
full_batches, leftover = divmod(len(ids), BATCH_SIZE)
total_batches = full_batches + (1 if leftover != 0 else 0)  # a partial final batch still counts

for start in tqdm(range(0, len(ids), BATCH_SIZE), total=total_batches, desc="Adding batches"):
    stop = min(start + BATCH_SIZE, len(ids))

    # Each vector is sent as an (id, embedding, metadata) tuple
    batch = list(zip(ids[start:stop], embeddings[start:stop], metadatas[start:stop]))

    # Upsert the batch to Pinecone
    index.upsert(vectors=batch)

print("Data uploaded to Pinecone index: projects-text-embedding-3-small")

Adding batches: 100%|██████████| 488/488 [10:05<00:00,  1.24s/it]

Data uploaded to Pinecone index: projects-text-embedding-3-small



