In [1]:
from dotenv import load_dotenv
# Populate the process environment from .env before any client is created.
# Later cells read PINECONE_API_KEY and PINECONE_INDEX_NAME via os.getenv,
# so those presumably live in the same .env file -- confirm.
load_dotenv()  # This will load VOYAGE_API_KEY from .env


True

In [2]:
import pandas as pd

# Read the CSV file of posts (path is relative to the working directory).
# Later cells read these columns: 'id', 'title' and 'content' when building
# the text to embed, and 'when', 'utime', 'first', 'last' and 'forum' when
# building the Pinecone metadata.
df = pd.read_csv('posts.csv')

In [44]:
import voyageai
import asyncio
import time
from tqdm.notebook import tqdm
import pickle
import os
from markdownify import markdownify as md

async def process_batch(vo, batch, batch_idx, posts_to_embed, existing_embeddings, BATCH_SIZE):
    """Embed one batch of texts and store each vector alongside its post.

    Args:
        vo: Client exposing an awaitable ``embed(texts, model=...)`` whose
            result has ``embeddings`` and ``total_tokens`` attributes.
        batch: The texts sent in this request.
        batch_idx: Zero-based position of the batch; ``batch_idx * BATCH_SIZE``
            is the index in ``posts_to_embed`` of the batch's first post.
        posts_to_embed: Indexable collection of post mappings, aligned with
            the texts that were split into batches.
        existing_embeddings: Dict updated in place. Each embedded post is
            stored under ``"embedding-<id>"`` as the post's fields plus an
            ``'embedding'`` entry.
        BATCH_SIZE: Number of texts per full batch.

    Returns:
        ``result.total_tokens`` on success. If anything raises, the error is
        printed and 0 is returned; entries written before the failure remain.
    """
    try:
        response = await vo.embed(batch, model="voyage-3")

        # Number the returned vectors by their absolute position in posts_to_embed.
        first_post = batch_idx * BATCH_SIZE
        post_count = len(posts_to_embed)
        for post_idx, vector in enumerate(response.embeddings, start=first_post):
            if post_idx >= post_count:
                # More vectors than remaining posts: nothing left to attach them to.
                break
            post = posts_to_embed[post_idx]
            record = {**post, 'embedding': vector}
            existing_embeddings[f"embedding-{post['id']}"] = record

        print(f"Completed batch {batch_idx} with {len(batch)} items")
        return response.total_tokens

    except Exception as e:
        # Best effort: report the failure and let the other batches continue.
        print(f"Error on batch {batch_idx}: {e}")
        return 0

async def get_embeddings(df, batch_size=100, max_concurrent=5,
                         progress_path='embeddings_progress.pkl'):
    """Embed every post in ``df`` that does not yet have a saved embedding.

    Progress is persisted to ``progress_path`` after each group of concurrent
    batches, so an interrupted run can be resumed by calling this again.
    Batches that fail are reported by ``process_batch`` and their posts stay
    un-embedded; they are picked up on the next run because only posts already
    present in the progress file are skipped.

    Args:
        df: DataFrame of posts; the 'id', 'title' and 'content' columns are read.
        batch_size: Number of texts sent per embedding request.
        max_concurrent: Number of embedding requests awaited together.
        progress_path: Pickle file holding the ``"embedding-<id>"`` -> record dict.

    Returns:
        The dict of all embeddings (previously saved plus newly created).

    Raises:
        ValueError: If ``batch_size`` or ``max_concurrent`` is less than 1.
    """
    if batch_size < 1 or max_concurrent < 1:
        raise ValueError("batch_size and max_concurrent must be >= 1")

    vo = voyageai.AsyncClient()

    # Load existing embeddings if any.
    # NOTE(security): pickle.load can execute arbitrary code -- only point
    # progress_path at a file written by this notebook.
    existing_embeddings = {}
    if os.path.exists(progress_path):
        with open(progress_path, 'rb') as f:
            existing_embeddings = pickle.load(f)
        print(f"Loaded {len(existing_embeddings)} existing embeddings")

    # Keep only the posts that have no saved embedding yet.
    posts_to_embed = [
        row for _, row in df.iterrows()
        if f"embedding-{row['id']}" not in existing_embeddings
    ]
    skipped = len(df) - len(posts_to_embed)

    if skipped:
        print(f"Skipped {skipped} posts; already embedded")

    if not posts_to_embed:
        print("No new posts to embed")
        return existing_embeddings

    # Format text same as original (<h1>title</h1>content), then convert the
    # HTML to markdown before embedding.
    texts = [
        md(f"<h1>{post['title']}</h1>{post['content']}")
        for post in posts_to_embed
    ]

    # Split into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

    print(f"Starting embedding of {len(texts)} texts in {len(batches)} batches...")

    total_tokens = 0
    tmp_path = f"{progress_path}.tmp"

    # Process batches in groups of max_concurrent; i is the index of the
    # group's first batch, so i + j is each batch's absolute index.
    for i in range(0, len(batches), max_concurrent):
        batch_group = batches[i:i + max_concurrent]
        tasks = [
            process_batch(vo, batch, i + j, posts_to_embed, existing_embeddings, batch_size)
            for j, batch in enumerate(batch_group)
        ]

        # Wait for current group to complete
        results = await asyncio.gather(*tasks)
        total_tokens += sum(results)

        # Save progress after each group. Write to a temp file and swap it in
        # so an interruption mid-dump cannot corrupt the existing progress file.
        with open(tmp_path, 'wb') as f:
            pickle.dump(existing_embeddings, f)
        os.replace(tmp_path, progress_path)

        # Rate limiting delay between groups
        await asyncio.sleep(0.2)

    print(f"Total tokens used: {total_tokens}")
    return existing_embeddings

# Run the embedding job. The result maps "embedding-<id>" to the post's fields
# plus its 'embedding'. Top-level await relies on the notebook kernel's event loop.
embeddings = await get_embeddings(df)


Loaded 33064 existing embeddings
Skipped 33064 posts; already embedded
No new posts to embed


In [43]:
# One-off repair of the progress file: drop any entry whose value is not a
# dict, since the upload step indexes each value by field name.
import os
import pickle

# NOTE(security): pickle.load can execute arbitrary code -- only load a file
# written by this notebook.
with open('embeddings_progress.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

# Collect the offending keys first so the dict is not resized while iterating.
stale_keys = [key for key, value in embeddings_dict.items() if not isinstance(value, dict)]
for key in stale_keys:
    del embeddings_dict[key]

# Keep track of how many we deleted
deleted_count = len(stale_keys)

print(f"Deleted {deleted_count} entries with non-dict values")

# Save back to the pickle file only when something changed. Write to a temp
# file and swap it in so an interruption cannot corrupt the saved embeddings.
if deleted_count:
    with open('embeddings_progress.pkl.tmp', 'wb') as f:
        pickle.dump(embeddings_dict, f)
    os.replace('embeddings_progress.pkl.tmp', 'embeddings_progress.pkl')


Deleted 2 entries with non-dict values


In [45]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
from tqdm import tqdm
import json
import os

# Pinecone client (gRPC variant, per the import above). os.getenv returns None
# when PINECONE_API_KEY is unset, so the environment must be loaded first.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

async def send_embeddings_to_pinecone(embeddings_dict, namespace="rr-posts",
                                      poster="RogerRabbit", batch_size=100, delay=0.2):
    """Upsert every saved embedding into the configured Pinecone index.

    Args:
        embeddings_dict: Mapping whose values are post records containing
            'id', 'embedding', 'title', 'when', 'utime', 'first', 'last',
            'content' and 'forum'. The keys are not used.
        namespace: Pinecone namespace to upsert into.
        poster: Value stored in each vector's 'poster' metadata field.
        batch_size: Number of vectors per upsert request.
        delay: Seconds to sleep after each upsert request.

    Returns:
        None. Progress and failures are printed; failed batches are listed by
        their vector index range at the end and are not retried.

    Raises:
        RuntimeError: If PINECONE_INDEX_NAME is not set in the environment.
        ValueError: If ``batch_size`` is less than 1.
    """
    index_name = os.getenv("PINECONE_INDEX_NAME")
    if not index_name:
        # Fail fast with a clear message instead of passing None to the client.
        raise RuntimeError("PINECONE_INDEX_NAME is not set")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Get the index
    index = pc.Index(index_name)

    # Convert embeddings dict to list of vectors. Metadata is copied verbatim
    # from the post record, with no size check here; a record the service
    # rejects (e.g. oversized 'content') surfaces as an upsert error for its batch.
    metadata_fields = ("title", "when", "utime", "first", "last", "content", "forum")
    vectors = [
        {
            "id": str(post['id']),
            "values": post['embedding'],
            "metadata": {
                **{field: post[field] for field in metadata_fields},
                "poster": poster,
            },
        }
        for post in embeddings_dict.values()
    ]

    # Batch upsert
    failed_upserts = []

    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        try:
            index.upsert(
                vectors=batch,
                namespace=namespace
            )
            print(f"Upserted {i + len(batch)}/{len(vectors)} vectors")
        except Exception as e:
            # Best effort: remember the failed range and keep going.
            failed_upserts.append(f"{i}-{i + len(batch)}")
            print(f"Error on batch {i}-{i + len(batch)}: {e}")

        # Short pause between requests (0.2s by default)
        await asyncio.sleep(delay)

    if failed_upserts:
        print(f"Failed to upsert in {len(failed_upserts)} requests: {', '.join(failed_upserts)}")

# Load your embeddings from the pickle file.
# NOTE: `pickle` is not imported in this cell; it comes from the imports of
# the embedding cell, so that cell must have been run in this kernel first.
# NOTE(security): pickle.load can execute arbitrary code -- only load a file
# produced by this notebook.
with open('embeddings_progress.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

# Send to Pinecone (top-level await; relies on the notebook kernel's event loop)
await send_embeddings_to_pinecone(embeddings_dict)


<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'di