In [1]:
import os
import logging
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ReadTheDocsLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [2]:
# Load environment variables from .env file
load_dotenv()

True

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [4]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
INDEX_DIMENSION=os.getenv("INDEX_DIMENSION")
INDEX_METRIC=os.getenv("INDEX_METRIC")
PINECONE_CLOUD=os.getenv("PINECONE_CLOUD")
PINECONE_REGION=os.getenv("PINECONE_REGION")
EMBED_MODEL=os.getenv("EMBED_MODEL")

In [5]:
PINECONE_INDEX_NAME

'langchain-doc-index-zachary'

In [6]:
def initialize_pinecone():
    try:
        pc = Pinecone(api_key=PINECONE_API_KEY)
        logging.info("Pinecone initialized successfully.")
        return pc
    except Exception as e:
        logging.error(f"Failed to initialize Pinecone: {e}")
        raise

In [7]:
from pinecone import ServerlessSpec
import time
import os

def check_or_create_index(pc):
    try:
        # Define ServerlessSpec
        spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
        
        # Check if index exists
        if PINECONE_INDEX_NAME in pc.list_indexes().names():
            logging.info(f"Index '{PINECONE_INDEX_NAME}' exists, deleting the existing index...")
            pc.delete_index(PINECONE_INDEX_NAME)
        
        # Create new index
        logging.info(f"Creating new index '{PINECONE_INDEX_NAME}' with dimension {INDEX_DIMENSION} and metric {INDEX_METRIC}.")
        pc.create_index(
            PINECONE_INDEX_NAME,
            dimension=int(INDEX_DIMENSION),  # Dimensionality of the embeddings
            metric=INDEX_METRIC,
            spec=spec
        )

        # Wait for the index to be ready
        logging.info(f"Waiting for index '{PINECONE_INDEX_NAME}' to be initialized...")
        while not pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
            time.sleep(1)
        
        logging.info(f"Index '{PINECONE_INDEX_NAME}' is ready.")
        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        
        # Wait a moment for connection to establish
        time.sleep(1)

        # Return index and print index stats
        logging.info("Index stats:")
        logging.info(index.describe_index_stats())
        return index

    except Exception as e:
        logging.error(f"Failed to check or create index: {e}")
        raise


In [8]:
pc = initialize_pinecone()

2024-09-13 11:27:58,011 - INFO - Pinecone initialized successfully.


In [9]:
index = check_or_create_index(pc)

2024-09-13 11:27:59,685 - INFO - Creating new index 'langchain-doc-index-zachary' with dimension 1536 and metric cosine.
2024-09-13 11:28:10,876 - INFO - Waiting for index 'langchain-doc-index-zachary' to be initialized...
2024-09-13 11:28:10,958 - INFO - Index 'langchain-doc-index-zachary' is ready.
2024-09-13 11:28:11,960 - INFO - Index stats:
2024-09-13 11:28:12,159 - INFO - {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import logging

def initialize_openai_embeddings():
    try:
        # Load the model name and API key from environment variables
        model_name = EMBED_MODEL
        openai_api_key = OPENAI_API_KEY

        # Validate the presence of API key
        if not openai_api_key:
            raise ValueError("OpenAI API key not found in environment variables. Make sure 'OPENAI_API_KEY' is set.")

        # Initialize OpenAIEmbeddings
        embed = OpenAIEmbeddings(
            model=model_name,
            openai_api_key=openai_api_key
        )

        logging.info(f"OpenAI Embeddings initialized with model '{model_name}' successfully.")
        return embed

    except Exception as e:
        logging.error(f"Failed to initialize OpenAI Embeddings: {e}")
        raise

In [11]:
initialize_openai_embeddings()

  embed = OpenAIEmbeddings(
2024-09-13 11:35:52,721 - INFO - OpenAI Embeddings initialized with model 'text-embedding-3-small' successfully.


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1151a9b50>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x11508cd90>, model='text-embedding-3-small', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)