In [1]:
from dotenv import load_dotenv
load_dotenv() # This line is crucial for loading  .env file

import os
import logging
from typing import List

# prepare_chunks.py is a separate file that handles reading, cleaning, and chunking transcripts.
from prepare_chunks import read_and_chunk_transcripts

# LangChain Imports
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# Pinecone Imports
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from pinecone import PineconeApiException

# --- Configure logging ---
# Root logger emits INFO and above; every record is prefixed with its
# timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# --- Initialize OpenAI client (for embeddings) ---
# OpenAIEmbeddings is handed the key separately further down; checking it here
# surfaces a missing key before any other work is done.
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
except Exception as e:
    logging.error(f"Error initializing OpenAI API key: {e}")
    # SystemExit is raised directly instead of calling the site-provided
    # exit(): that helper is meant for interactive shells and, inside a
    # notebook kernel, does not stop the remaining code from running.
    raise SystemExit("Exiting: Failed to set OpenAI API key.") from e

if not openai.api_key:
    logging.error("Configuration Error: OPENAI_API_KEY environment variable not set.")
    raise SystemExit("Exiting: OpenAI API key is missing. Please set OPENAI_API_KEY environment variable.")


# --- Initialize LangChain's OpenAIEmbeddings ---
# Builds the embedding model used for every upload below. Only the constructor
# sits inside the try so the handler reports construction failures and nothing
# else.
try:
    embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY"))
except Exception as e:
    logging.error(f"Error initializing OpenAIEmbeddings: {e}")
    # Raise SystemExit rather than calling exit(): the site helper is intended
    # for interactive shells and does not reliably halt a notebook cell.
    raise SystemExit("Exiting: Failed to initialize OpenAIEmbeddings. Check API key.") from e
else:
    logging.info("Initialized OpenAIEmbeddings model.")

# --- Pinecone Configuration ---
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# This value is also passed as the ServerlessSpec region (with cloud="aws")
# when the index is created, so it must be an AWS region such as "us-east-1".
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "financial-literacy-chatbot" # Your chosen Pinecone index name

# Both settings are required; stop early with a clear message if either is
# missing or empty.
if not (PINECONE_API_KEY and PINECONE_ENVIRONMENT):
    logging.error("Pinecone API key or environment not set. Please add PINECONE_API_KEY and PINECONE_ENVIRONMENT to your .env file.")
    # SystemExit is raised directly: the site-provided exit() is meant for
    # interactive shells and does not reliably stop a notebook cell.
    raise SystemExit("Exiting: Pinecone credentials missing.")

try:
    # NOTE(review): `environment` is kept as in the original call; confirm the
    # installed pinecone client still accepts/uses it.
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
except Exception as e:
    logging.error(f"Error initializing Pinecone client: {e}")
    raise SystemExit("Exiting: Failed to initialize Pinecone. Check API key and environment.") from e
else:
    logging.info("Initialized Pinecone client.")

# --- Data Loading, Cleaning, and Chunking (using prepare_chunks.py) ---
# Reads the transcripts through prepare_chunks.read_and_chunk_transcripts and
# wraps each returned chunk in a LangChain Document. The script stops if
# nothing was read or if any step fails.
chunks: List[Document] = [] # Initialize chunks list as list of Document
try:
    raw_chunks = read_and_chunk_transcripts('transcripts/')
    if not raw_chunks:
        logging.warning("No chunks read from 'transcripts/'. Ensure files exist and content is processed.")
        # SystemExit is not an Exception subclass, so it passes through the
        # handler below. It replaces the site-provided exit(), which does not
        # reliably halt execution inside a notebook kernel.
        raise SystemExit("Exiting: No transcript chunks found.")

    # Convert the raw chunks to LangChain Document objects
    chunks = [Document(page_content=chunk) for chunk in raw_chunks]
    logging.info(f"Successfully loaded and converted {len(chunks)} text chunks to LangChain Documents.")
except Exception as e:
    logging.error(f"Error reading, cleaning, or chunking transcripts: {e}")
    raise SystemExit("Exiting: Failed to process transcripts. Check 'prepare_chunks.py' and 'transcripts/' directory.") from e

# --- Pinecone Index Setup and Embedding ---
# Ensures the index exists (creating it when missing), then embeds and uploads
# the transcript chunks only if the index currently holds no vectors.
if __name__ == "__main__":
    logging.info("Starting Pinecone data loading process...")
    try:
        # Entries returned by list_indexes() are matched whether they are plain
        # dicts or objects exposing a `.name` attribute.
        index_exists = any(
            (idx_info.get('name') if isinstance(idx_info, dict) else getattr(idx_info, 'name', None)) == INDEX_NAME
            for idx_info in pc.list_indexes()
        )

        if not index_exists:
            logging.info(f"Creating Pinecone index: {INDEX_NAME} with ServerlessSpec (cloud='aws', region='{PINECONE_ENVIRONMENT}').")
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=1536, # Dimension for text-embedding-ada-002
                    metric='cosine',
                    spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT)
                )
            except PineconeApiException as e_create_api:
                # 409 Conflict means the index already exists, which is fine.
                # Any other API error is a real failure and propagates to the
                # outer handler.
                if e_create_api.status != 409:
                    raise
                logging.warning(f"Pinecone index '{INDEX_NAME}' already exists (caught 409 Conflict during create_index). Proceeding to connect.")
            else:
                logging.info(f"Pinecone index '{INDEX_NAME}' created successfully.")

        # Reaching this point means the index was found, was just created, or
        # already existed (409). Previously a successful creation left the
        # "exists" flag False, so a first run exited here without uploading.
        logging.info(f"Connecting to Pinecone index: {INDEX_NAME}.")
        index = pc.Index(INDEX_NAME)

        # Upload only into an empty index so re-running does not duplicate vectors.
        vector_count = index.describe_index_stats().total_vector_count
        if vector_count == 0:
            logging.info("Existing Pinecone index is empty. Proceeding with initial embedding upload.")
            BATCH_SIZE = 100 # Adjust batch size based on your data and Pinecone limits
            total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
            for batch_number, start in enumerate(range(0, len(chunks), BATCH_SIZE), start=1):
                batch = chunks[start:start + BATCH_SIZE]
                logging.info(f"Processing batch {batch_number}/{total_batches} ({len(batch)} documents)...")
                PineconeVectorStore.from_documents(
                    documents=batch,
                    embedding=embeddings_model,
                    index_name=INDEX_NAME
                )
                logging.info(f"Uploaded batch starting with document {start} to Pinecone.")
            # Stats are fetched again so the log shows the post-upload count.
            logging.info(f"Finished uploading all embeddings to Pinecone index '{INDEX_NAME}'. Total vectors now: {index.describe_index_stats().total_vector_count}")
        else:
            logging.info(f"Pinecone index '{INDEX_NAME}' already contains {vector_count} vectors. Skipping embedding upload.")

        logging.info("Pinecone data loading process complete. Index is ready.")

    except Exception as e:
        logging.error(f"Critical error during Pinecone index setup and data loading: {e}")
        # SystemExit is raised directly instead of calling the site-provided
        # exit(), which is meant for interactive shells and does not reliably
        # stop a notebook cell.
        raise SystemExit("Exiting: Failed to set up Pinecone index or load data. Ensure network connectivity, correct API key/environment, and valid Pinecone environment (e.g., 'us-west-2').") from e



# Diagnostic summary of the Pinecone configuration. Only whether the key is
# set is reported: echoing any part of the secret would leave it in saved
# notebook output, and slicing the value fails if the variable is unset.
print("API Key set:", bool(os.getenv("PINECONE_API_KEY")))
print("Environment:", os.getenv("PINECONE_ENVIRONMENT"))
print("Available Indexes:", pc.list_indexes())





2025-06-30 17:49:29,048 - INFO - Initialized OpenAIEmbeddings model.
2025-06-30 17:49:29,054 - INFO - Initialized Pinecone client.
2025-06-30 17:49:31,365 - INFO - Successfully read and cleaned 76 transcript files.
2025-06-30 17:49:31,368 - INFO - Splitting text into chunks with size 500 and overlap 100...
2025-06-30 17:49:32,389 - INFO - Successfully loaded and converted 2139 text chunks to LangChain Documents.
2025-06-30 17:49:32,392 - INFO - Starting Pinecone data loading process...


✅ Loaded 2139 chunks from transcripts.


2025-06-30 17:49:34,214 - INFO - Connecting to Pinecone index: financial-literacy-chatbot.
2025-06-30 17:49:36,787 - INFO - Pinecone index 'financial-literacy-chatbot' already contains 2139 vectors. Skipping embedding upload.
2025-06-30 17:49:36,788 - INFO - Pinecone data loading process complete. Index is ready.


API Key (start): pcsk_4rM ...
Environment: us-east-1
Available Indexes: [{
    "name": "financial-literacy-chatbot",
    "metric": "cosine",
    "host": "financial-literacy-chatbot-okahzuc.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "question-answering",
    "metric": "cosine",
    "host": "question-answering-okahzuc.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "abstractive-question-answering",
  