# Convert .pkl File to Chroma DB using pandas

This notebook demonstrates how to load data from a .pkl file using pandas and save it into a Chroma database.

In [1]:
import pandas as pd
import chromadb
from tqdm import tqdm

# Define input and output.
# Every tunable used by the cells below lives here so the notebook runs
# top-to-bottom on a fresh kernel.
INPUT_PKL_FILE = "../data/horizon_projects_embeddings.pkl"
# Same store and collection that the "Load Existing Chroma DB" cell reopens.
OUTPUT_PERSIST_DIRECTORY = "../chroma_db"
OUTPUT_COLLECTION_NAME = "horizon_projects"
# Number of records sent per collection.add() call. Chroma enforces a
# per-call maximum; 1000 is a conservative choice -- tune as needed.
BATCH_SIZE = 1000

# Load data from .pkl file.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
df = pd.read_pickle(INPUT_PKL_FILE)

# Convert Timestamp to ISO format string so the value can be stored as plain
# text metadata later on.
df['contentUpdateDate'] = df['contentUpdateDate'].dt.strftime('%Y-%m-%dT%H:%M:%S')

# Display the first few rows of the dataframe
print(df.head())

          id                                              title  \
0  101006382  Mission-Oriented SwafS to Advance Innovation t...   
1     633080  Monitoring Atmospheric Composition and Climate...   
2     633212                    Aging Lungs in European Cohorts   
3     879534  The Enterprise Europe Network Baden-Wuerttembe...   
4     743826  The Enterprise Europe Network Baden-Wuerttembe...   

                                           objective    contentUpdateDate  \
0  While most SwafS initiatives have contributed ...  2024-07-22T12:39:54   
1  MACC-III is the last of the pre-operational st...  2022-08-16T16:46:44   
2  This programme of work will advance the unders...  2023-10-25T16:11:30   
3  BW-KAM 5 will implement tested and tailored in...  2022-10-28T14:08:00   
4  By providing Key Account Management and Enhanc...  2022-08-15T13:07:16   

   title_length  objective_length  \
0            64              1448   
1            51              1932   
2            31        

In [None]:
# Open the persistent Chroma store and fetch the target collection,
# creating the collection if it is not there yet.
client = chromadb.PersistentClient(path=OUTPUT_PERSIST_DIRECTORY)
collection = client.get_or_create_collection(name=OUTPUT_COLLECTION_NAME)

# Build three parallel lists (same order, same length) from the dataframe:
# string ids, embedding vectors, and one metadata dict per row.
metadata_columns = ['title', 'objective', 'contentUpdateDate']
ids = df['id'].astype(str).tolist()
embeddings = df['ada_embedding'].tolist()
metadatas = df[metadata_columns].to_dict('records')

# Ceiling division: one extra batch when the last one is only partly filled.
full_batches, leftover = divmod(len(ids), BATCH_SIZE)
total_batches = full_batches + (1 if leftover else 0)

# Write the records to the collection one slice at a time.
for start in tqdm(range(0, len(ids), BATCH_SIZE), total=total_batches, desc="Adding batches"):
    # A slice end past the list length is clamped, so the final (short)
    # batch needs no special handling.
    stop = start + BATCH_SIZE
    collection.add(
        ids=ids[start:stop],
        embeddings=embeddings[start:stop],
        metadatas=metadatas[start:stop],
    )

print(f"Data saved to Chroma DB in {OUTPUT_PERSIST_DIRECTORY}")

### Load Existing Chroma DB

In [None]:
import chromadb
from chromadb.config import Settings

# Where the previously written Chroma DB lives, and which collection to open
PERSIST_DIRECTORY = "../chroma_db"
COLLECTION_NAME = "horizon_projects"

# Connect to the on-disk store
client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)

# Look up the collection by name; get_collection does not create it
collection = client.get_collection(name=COLLECTION_NAME)

print(f"Successfully loaded collection: {COLLECTION_NAME}")

# Report how many records the collection currently holds
item_count = collection.count()
print(f"Number of items in the collection: {item_count}")