In [65]:
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import os
from tqdm import tqdm
from openai import OpenAI
import time

# Load variables from a local .env file into the process environment first,
# so the API key lookup below can see them.
load_dotenv()

# OpenAI client shared by every embedding call in this notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [57]:
# Paths are resolved relative to the parent of the current working directory.
root_dir = Path.cwd().parent
processed_dir = root_dir / "data" / "artifacts" / "processed"

# Instrument table read from EQ.csv; later cells embed and index its rows.
eq_csv_path = processed_dir / "EQ.csv"
eq_df = pd.read_csv(eq_csv_path)


In [58]:
# Index settings used by the cells below.
index_name = "groww-instruments-eq"
dimension = 1024  # embedding size requested from OpenAI

# Pinecone client; the API key is read from the environment.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)

In [59]:
# Names of the indexes that already exist in this Pinecone project.
existing_indexes = [index_info.name for index_info in pc.list_indexes()]

In [60]:
# Create the index only when it is missing, so re-running this cell is safe.
if index_name in existing_indexes:
    print(f"Index {index_name} already exists")
else:
    print(f"Creating new index: {index_name}")
    # The vectors stored in this index are produced by OpenAI embeddings in
    # this notebook, so a plain dense index is required. An integrated index
    # (create_index_for_model) would bind the index to a different embedding
    # model, and text queries embedded by that model would not share a vector
    # space with the stored OpenAI vectors.
    pc.create_index(
        name=index_name,
        dimension=dimension,  # must equal the `dimensions` requested from OpenAI
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

Creating new index: groww-instruments-eq


In [61]:
# Handle to the index named by index_name; later cells upsert vectors through
# it and read its statistics.
index = pc.Index(index_name)

In [62]:
def get_openai_embedding(text: str, model: str = "text-embedding-3-large"):
    """
    Embed a single piece of text through the OpenAI embeddings endpoint.

    The vector size is taken from the notebook-level ``dimension`` setting
    and the request goes through the notebook-level ``client``.

    Args:
        text: Text to embed
        model: OpenAI embedding model (default: text-embedding-3-large)

    Returns:
        The embedding of ``text`` (first item of the API response)

    Raises:
        Exception: any error raised while requesting or reading the
            embedding is printed and then re-raised unchanged.
    """
    try:
        request_kwargs = {
            "input": text,
            "model": model,
            "dimensions": dimension,
        }
        api_result = client.embeddings.create(**request_kwargs)
        vector = api_result.data[0].embedding
    except Exception as err:
        # Report the failure for the notebook reader, then let it propagate.
        print(f"Error generating embedding: {err}")
        raise
    return vector

# Smoke test: embed one symbol and report the size of the returned vector.
test_text = "RELIANCE"
test_embedding = get_openai_embedding(test_text)
embedding_length = len(test_embedding)
print(f"Test embedding generated: {embedding_length} dimensions")

Test embedding generated: 1024 dimensions


In [63]:
def create_searchable_text(row):
    """Create a searchable text string from row data"""
    parts = []
    
    if pd.notna(row.get('name')):
        parts.append(str(row['name']))
    if pd.notna(row.get('trading_symbol')):
        parts.append(f"Symbol: {row['trading_symbol']}")
    if pd.notna(row.get('groww_symbol')):
        parts.append(f"Groww: {row['groww_symbol']}")
    if pd.notna(row.get('exchange')):
        parts.append(f"Exchange: {row['exchange']}")
    
    return " | ".join(parts)

# Build the text to embed for every row and store it as a new column.
searchable_texts = eq_df.apply(create_searchable_text, axis=1)
eq_df['searchable_text'] = searchable_texts

print("Created searchable text for all records")
print("\nSample searchable texts:")
sample_texts = eq_df['searchable_text'].head(3).tolist()
print(sample_texts)

Created searchable text for all records

Sample searchable texts:
['Sec Re NCD 10.50% Sr 6 | Symbol: 1050ISFL28 | Groww: NSE-1050ISFL28 | Exchange: NSE', '1015EFL29 | Symbol: 1015ECL29 | Groww: NSE-1015ECL29 | Exchange: NSE', 'UCL-10.25%-24-10-26-NCD | Symbol: 1025UCL26A | Groww: NSE-1025UCL26A | Exchange: NSE']


In [None]:
# Generate embeddings in batches with rate limiting, then upsert them to Pinecone.
batch_size = 100          # maximum vectors sent in one Pinecone upsert request
openai_batch_size = 100   # texts sent in one OpenAI embeddings request
embedding_model = "text-embedding-3-large"  # same default as get_openai_embedding
core_metadata_fields = [
    'trading_symbol', 'groww_symbol', 'name',
    'exchange', 'instrument_type', 'segment',
]


def _metadata_text(value):
    """Stringify a cell for Pinecone metadata; missing values become ''."""
    return '' if pd.isna(value) else str(value)


def _build_vector_record(row_idx, row, embedding):
    """Assemble the Pinecone record (id, values, metadata) for one instrument row."""
    # Core fields are always present so every record has the same base schema.
    metadata = {field: _metadata_text(row.get(field)) for field in core_metadata_fields}
    metadata['searchable_text'] = row['searchable_text']

    # Every other non-null column is carried along as a string.
    for col, value in row.items():
        if col != 'searchable_text' and pd.notna(value):
            metadata[col] = str(value)

    return {
        # ID format is kept unchanged so re-runs overwrite existing records
        # instead of creating duplicates.
        'id': f"eq_{row_idx}_{row.get('trading_symbol', row_idx)}",
        'values': embedding,
        'metadata': metadata,
    }


uploaded_count = 0    # vectors confirmed sent to Pinecone
skipped_count = 0     # rows with nothing to embed
failed_batches = []   # (start, end) row ranges whose processing raised

print(f"🔄 Generating embeddings for {len(eq_df)} records using OpenAI...")

for batch_start in tqdm(range(0, len(eq_df), openai_batch_size), desc="Processing batches"):
    batch_end = min(batch_start + openai_batch_size, len(eq_df))
    batch_df = eq_df.iloc[batch_start:batch_end]

    # The embeddings endpoint rejects empty input strings; one empty row would
    # otherwise fail the whole request, so such rows are dropped up front.
    has_text = batch_df['searchable_text'].ne('')
    skipped_count += int((~has_text).sum())
    batch_df = batch_df[has_text]
    if batch_df.empty:
        continue

    try:
        # Generate embeddings for the batch
        response = client.embeddings.create(
            input=batch_df['searchable_text'].tolist(),
            model=embedding_model,
            dimensions=dimension,  # keep in sync with the index dimension
        )

        # Pair each embedding with its row by the index the API reports.
        embeddings = [
            item.embedding
            for item in sorted(response.data, key=lambda item: item.index)
        ]
        if len(embeddings) != len(batch_df):
            raise ValueError(
                f"expected {len(batch_df)} embeddings, received {len(embeddings)}"
            )

        # The buffer is rebuilt for every batch, so a failed batch can never
        # leak half-processed vectors into the next upsert request.
        vectors_to_upsert = [
            _build_vector_record(row_idx, row, embedding)
            for (row_idx, row), embedding in zip(batch_df.iterrows(), embeddings)
        ]

        # Upsert to Pinecone in chunks of at most batch_size vectors
        for chunk_start in range(0, len(vectors_to_upsert), batch_size):
            chunk = vectors_to_upsert[chunk_start:chunk_start + batch_size]
            index.upsert(vectors=chunk)
            uploaded_count += len(chunk)

        # Small delay to respect rate limits
        time.sleep(0.1)

    except Exception as e:
        # Best effort: record the failure and continue with the next batch.
        print(f"Error processing batch {batch_start}-{batch_end}: {e}")
        failed_batches.append((batch_start, batch_end))

# Report what was actually uploaded rather than the size of the input table.
print(f"\nSuccessfully uploaded {uploaded_count} of {len(eq_df)} vectors to Pinecone using OpenAI embeddings")
if skipped_count:
    print(f"Skipped {skipped_count} records with empty searchable text")
if failed_batches:
    print(f"Failed batches (row ranges, possibly partially uploaded): {failed_batches}")

🔄 Generating embeddings for 12586 records using OpenAI...


Processing batches: 100%|██████████| 126/126 [07:26<00:00,  3.54s/it]



✅ Successfully uploaded 12586 vectors to Pinecone using OpenAI embeddings


In [67]:
# Fetch the index statistics; the next cell prints the vector count,
# dimension, fullness and namespaces from this object.
stats = index.describe_index_stats()

In [68]:
# Print a short, indented summary of the index after the upload.
index_summary = (
    ("Index Name", index_name),
    ("Total Vectors", stats.total_vector_count),
    ("Dimension", stats.dimension),
    ("Index Fullness", stats.index_fullness),
    ("Namespaces", list(stats.namespaces.keys())),
)
for label, value in index_summary:
    print(f"   {label}: {value}")

   Index Name: groww-instruments-eq
   Total Vectors: 12586
   Dimension: 1024
   Index Fullness: 0.0
   Namespaces: ['']
