In [4]:
from dotenv import load_dotenv
load_dotenv() # This line is crucial for loading your .env file

import os
import numpy as np
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity # Still needed if you want to test similarity directly, but not for Pinecone retrieval
# Assuming prepare_chunks.py is a separate file that you control
from prepare_chunks import read_and_chunk_transcripts
import logging
import re # For parsing numbers from input string
from typing import List

# LangChain Imports
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.tools import Tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings # Added OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.documents import Document # To work with LangChain's Document objects

# Pinecone Imports
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from pinecone import PineconeApiException # Corrected import path for PineconeApiException

# --- Configure logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Initialize OpenAI client ---
try:
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    if not client.api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set.")
except ValueError as e:
    logging.error(f"Configuration Error: {e}")
    exit("Exiting: OpenAI API key is missing. Please set OPENAI_API_KEY environment variable.")

# --- Initialize LangChain's OpenAIEmbeddings ---
# This is the embedding model used by LangChain to convert text to vectors for Pinecone
try:
    embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY"))
except Exception as e:
    logging.error(f"Error initializing OpenAIEmbeddings: {e}")
    exit("Exiting: Failed to initialize OpenAIEmbeddings. Check API key.")

# --- Pinecone Configuration ---
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT") # e.g., "us-east-1" or "gcp-starter"
INDEX_NAME = "financial-literacy-chatbot" # Your chosen Pinecone index name

if not PINECONE_API_KEY or not PINECONE_ENVIRONMENT:
    logging.error("Pinecone API key or environment not set. Please add PINECONE_API_KEY and PINECONE_ENVIRONMENT to your .env file.")
    exit("Exiting: Pinecone credentials missing.")

try:
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    logging.info("Initialized Pinecone client.")
except Exception as e:
    logging.error(f"Error initializing Pinecone client: {e}")
    exit("Exiting: Failed to initialize Pinecone. Check API key and environment.")

# --- Data Loading, Cleaning, and Chunking (using prepare_chunks.py) ---
chunks = [] # Initialize chunks list
try:
    raw_chunks = read_and_chunk_transcripts('transcripts/')
    if not raw_chunks:
        logging.warning("No chunks read from 'transcripts/'. Ensure files exist and content is processed.")
        exit("Exiting: No transcript chunks found.")

    # Convert raw string chunks to LangChain Document objects
    chunks = [Document(page_content=chunk) for chunk in raw_chunks]
    logging.info(f"Successfully loaded and converted {len(chunks)} text chunks to LangChain Documents.")
except Exception as e:
    logging.error(f"Error reading, cleaning, or chunking transcripts: {e}")
    exit("Exiting: Failed to process transcripts. Check 'prepare_chunks.py' and 'transcripts/' directory.")

# --- Pinecone Index Setup and Embedding ---
vectorstore = None
retriever = None # Initialize retriever outside try block for wider scope

try:
    # Check if index exists by listing all indexes
    existing_indexes = pc.list_indexes()
    
    # Robust way to check if index exists, accounting for different Pinecone client versions
    index_exists = False
    for idx_info in existing_indexes:
        if isinstance(idx_info, dict) and idx_info.get('name') == INDEX_NAME:
            index_exists = True
            break
        elif hasattr(idx_info, 'name') and idx_info.name == INDEX_NAME:
            index_exists = True
            break

    if not index_exists:
        logging.info(f"Creating Pinecone index: {INDEX_NAME} with ServerlessSpec (cloud='aws', region='{PINECONE_ENVIRONMENT}').")
        try:
            pc.create_index(
                name=INDEX_NAME,
                dimension=1536, # Dimension for text-embedding-ada-002
                metric='cosine',
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT)
            )
            logging.info(f"Pinecone index '{INDEX_NAME}' created successfully.")
        except PineconeApiException as e_create_api:
            if e_create_api.status == 409: # Catch ALREADY_EXISTS specifically
                logging.warning(f"Pinecone index '{INDEX_NAME}' already exists (caught 409 Conflict during create_index). Proceeding to connect.")
                index_exists = True # Mark as existing
            else:
                raise e_create_api # Re-raise if it's another API error

    if index_exists:
        logging.info(f"Connecting to Pinecone index: {INDEX_NAME}.")
        index = pc.Index(INDEX_NAME)

        # Check if the index is empty. If so, upload embeddings.
        if index.describe_index_stats().total_vector_count == 0:
            logging.info("Existing Pinecone index is empty. Proceeding with initial embedding upload.")
            BATCH_SIZE = 100
            for i in range(0, len(chunks), BATCH_SIZE):
                batch = chunks[i:i + BATCH_SIZE]
                logging.info(f"Processing batch {i//BATCH_SIZE + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE} ({len(batch)} documents)...")
                PineconeVectorStore.from_documents(
                    documents=batch,
                    embedding=embeddings_model,
                    index_name=INDEX_NAME
                )
                logging.info(f"Uploaded batch starting with document {i} to Pinecone.")
            logging.info(f"Finished uploading all embeddings to Pinecone index '{INDEX_NAME}'. Total vectors now: {index.describe_index_stats().total_vector_count}")
        else:
            logging.info(f"Pinecone index '{INDEX_NAME}' already contains {index.describe_index_stats().total_vector_count} vectors. Skipping embedding upload.")
        
        # Initialize the PineconeVectorStore for retrieval
        vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings_model)
        retriever = vectorstore.as_retriever()
        logging.info("Pinecone vector store and retriever initialized.")

    else: # This block would only be hit if create_index truly failed and it didn't exist
        logging.error(f"Pinecone index '{INDEX_NAME}' could not be created or connected to.")
        exit("Exiting: Pinecone index setup failed.")

except Exception as e:
    logging.error(f"Critical error during Pinecone index setup: {e}")
    exit("Exiting: Failed to set up Pinecone index. Ensure network connectivity, correct API key/environment, and valid Pinecone environment (e.g., 'us-west-2').")


# --- Modified function to be used as a LangChain Tool ---
# Renamed to clarify its role with the vector database
def query_knowledge_base(query: str, top_k: int = 5) -> str:
    """
    Retrieves relevant documents from the Pinecone knowledge base and uses an LLM
    to answer the query. This function is designed to be called as a LangChain Tool.

    Args:
        query (str): The user's question about financial literacy.
        top_k (int): The number of top relevant documents to retrieve.

    Returns:
        str: The AI's answer based on the retrieved context, or a standardized error message.
    """
    logging.info(f"Tool 'query_knowledge_base' received query: '{query}'")

    if not query.strip():
        return "Please provide a non-empty question for the knowledge base tool."

    # Ensure retriever is available before trying to use it
    if retriever is None:
        logging.error("Retriever is not initialized. Cannot query knowledge base.")
        return "The knowledge base is not available right now. Please try again later."

    try:
        # Use the LangChain retriever connected to Pinecone
        # Ensure top_k is passed to the retriever
        retrieved_docs: List[Document] = retriever.get_relevant_documents(query) # top_k is handled by retriever default or config

        if not retrieved_docs:
            return "No sufficient information found in the knowledge base for that query."

        context_parts = []
        for i, doc in enumerate(retrieved_docs):
            # Use page_content from the Document object
            context_parts.append(f"<DOCUMENT_START id={i}>\n{doc.page_content}\n<DOCUMENT_END>")
        context = "\n\n".join(context_parts) # Corrected line: Removed the extraneous character

        logging.info(f"\n🔎 Retrieved top {len(retrieved_docs)} documents from Pinecone for tool:\n{context[:500]}...\n") # Log truncated context

        # Prompt for GPT, specifically tailored for the tool's interaction with the LLM
        tool_llm_prompt = f"""You are a financial literacy expert. Your goal is to answer questions using ONLY the information provided in the following retrieved documents.
If the answer is not directly available or cannot be reasonably inferred from the context, state that you cannot answer based on the provided information.

<RETRIEVED_DOCUMENTS>
{context}
</RETRIEVED_DOCUMENTS>

Question: {query}

Answer:
"""
        logging.info("Calling LLM from within 'query_knowledge_base' tool...")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": tool_llm_prompt}],
            temperature=0.5,
            max_tokens=700,
        )
        answer = response.choices[0].message.content.strip()
        logging.info("LLM response received by tool.")
        
        # Standardize "I don't know" responses from the LLM
        if "i apologize, but i don't have enough information" in answer.lower() or \
           "cannot answer based on the provided information" in answer.lower() or \
           "no sufficient information found" in answer.lower():
            return "No sufficient information found in the knowledge base to answer that."
        return answer
    except Exception as e:
        logging.error(f"Error calling LLM API or retrieving from Pinecone within tool: {e}")
        return "Internal tool error: Could not generate an answer."

# --- Calculation Tools (remain largely unchanged) ---
def recommend_savings(input_str: str) -> str:
    """
    Provides savings recommendations. If monthly income and spending are provided,
    it calculates a recommended 20% savings. Otherwise, it gives general guidelines
    and prompts for input.
    Input format expected: "income=<amount>, spending=<amount>" or just a general query.
    """
    logging.info(f"Tool 'recommend_savings' called with input: {input_str}")
    income = None
    spending = None

    income_match = re.search(r"income[=\s]*(\d[\d,\.]*)", input_str, re.IGNORECASE)
    spending_match = re.search(r"spending[=\s]*(\d[\d,\.]*)", input_str, re.IGNORECASE)

    if income_match:
        try: income = float(income_match.group(1).replace(',', ''))
        except ValueError: pass
    if spending_match:
        try: spending = float(spending_match.group(1).replace(',', ''))
        except ValueError: pass

    if income is not None and spending is not None:
        if income < 0 or spending < 0: return "Income and spending must be non-negative values."
        if spending > income: return "Your spending seems to exceed your income. While saving is important, focusing on reducing spending or increasing income might be your first step."

        recommended_savings = 0.20 * income
        needs_wants_budget = 0.80 * income

        return (
            f"Based on your monthly income of ${income:,.2f} and spending of ${spending:,.2f}:\n"
            f"Following the 50/30/20 rule, a recommended monthly savings amount (including debt repayment) is ${recommended_savings:,.2f} (20% of income).\n"
            f"This would leave ${needs_wants_budget:,.2f} for your needs and wants.\n"
            "Remember, consistency is key, and even small amounts add up over time."
        )
    else:
        return (
            "To give you a personalized savings recommendation, I need your monthly income and average monthly spending. "
            "A common guideline is the 50/30/20 rule: 50% for needs, 30% for wants, and 20% for savings and debt repayment. "
            "Please provide your monthly income and spending, for example: 'my income is 3000 and spending is 2000'."
        )

def get_budgeting_templates(input_str: str = "") -> str:
    """Describes common budgeting templates and methods."""
    logging.info(f"Tool 'get_budgeting_templates' called with input: {input_str}")
    return (
        "Budgeting templates can help you organize your finances. Common methods include:\n"
        "- **Spreadsheets:** Excel or Google Sheets offer great flexibility for custom budgets. "
        "You can find many free templates online.\n"
        "- **Budgeting Apps:** Apps like Mint, YNAB (You Need A Budget), or Personal Capital offer "
        "features like transaction tracking, goal setting, and visual reports.\n"
        "- **Pen and Paper:** A simple notebook can also work for tracking income and expenses.\n"
        "The key is to choose a method that you find easy to use and stick with."
    )

def get_expense_tracker_info(input_str: str = "") -> str:
    """Explains what an expense tracker is and its benefits."""
    logging.info(f"Tool 'get_expense_tracker_info' called with input: {input_str}")
    return (
        "An expense tracker helps you monitor where your money goes. Its benefits include:\n"
        "- **Understanding Spending Habits:** Reveals where you might be overspending.\n"
        "- **Budget Adherence:** Helps you stick to your budget and identify areas for adjustment.\n"
        "- **Financial Goal Achievement:** By seeing your spending, you can find more money for savings or debt.\n"
        "- **Tax Preparation:** Makes it easier to categorize expenses for tax purposes.\n"
        "Methods range from manual logging to using sophisticated apps."
    )

def calculate_debt_details(input_str: str) -> str:
    """
    Calculates estimated time to pay off a debt and total interest paid based on
    principal, annual interest rate, and monthly payment.
    Input format expected: "principal=<amount>, interest_rate=<percentage>, monthly_payment=<amount>"
    """
    logging.info(f"Tool 'calculate_debt_details' called with input: {input_str}")
    principal = None
    annual_interest_rate = None
    monthly_payment = None

    principal_match = re.search(r"principal[=\s]*(\d[\d,\.]*)", input_str, re.IGNORECASE)
    rate_match = re.search(r"interest_rate[=\s]*(\d[\d,\.]*)", input_str, re.IGNORECASE)
    payment_match = re.search(r"monthly_payment[=\s]*(\d[\d,\.]*)", input_str, re.IGNORECASE)

    if principal_match:
        try: principal = float(principal_match.group(1).replace(',', ''))
        except ValueError: pass
    if rate_match:
        try: annual_interest_rate = float(rate_match.group(1).replace(',', ''))
        except ValueError: pass
    if payment_match:
        try: monthly_payment = float(payment_match.group(1).replace(',', ''))
        except ValueError: pass

    if None in [principal, annual_interest_rate, monthly_payment]:
        return (
            "To calculate debt details, I need the loan principal, the annual interest rate (as a percentage), "
            "and your monthly payment. "
            "Please provide them, for example: 'principal=10000, interest_rate=5, monthly_payment=200'."
        )

    if principal <= 0 or annual_interest_rate < 0 or monthly_payment <= 0:
        return "All input values (principal, interest rate, monthly payment) must be positive."

    monthly_interest_rate = (annual_interest_rate / 100) / 12

    if monthly_payment <= (principal * monthly_interest_rate) and annual_interest_rate > 0:
        return "Your monthly payment is too low to ever pay off the principal, or just covers interest. You might need to increase your payment to see progress."

    remaining_principal = principal
    total_interest_paid = 0
    months = 0
    max_months = 600

    while remaining_principal > 0 and months < max_months:
        interest_for_month = remaining_principal * monthly_interest_rate
        principal_paid_this_month = monthly_payment - interest_for_month

        if principal_paid_this_month <= 0 and remaining_principal > 0:
            return "With these inputs, it seems your monthly payment is not sufficient to pay off the principal within a reasonable timeframe (e.g., it only covers interest). You may need to increase your payment."

        remaining_principal -= principal_paid_this_month
        total_interest_paid += interest_for_month
        months += 1

        if remaining_principal < 0.01:
            principal_paid_this_month += remaining_principal
            total_interest_paid -= remaining_principal
            remaining_principal = 0

    if remaining_principal > 0:
        return (
            f"It would take more than {max_months} months (50 years) to pay off a principal of ${principal:,.2f} "
            f"with an annual interest rate of {annual_interest_rate}% and a monthly payment of ${monthly_payment:,.2f}. "
            "You might consider increasing your monthly payment."
        )
    else:
        years = months / 12
        return (
            f"To pay off a principal of ${principal:,.2f} with an annual interest rate of {annual_interest_rate}% "
            f"and a monthly payment of ${monthly_payment:,.2f}:\n"
            f"- Estimated time to pay off: {months} months ({years:.1f} years)\n"
            f"- Estimated total interest paid: ${total_interest_paid:,.2f}"
        )

def get_investment_planning_advice(input_str: str = "") -> str:
    """Provides general advice on investment planning."""
    logging.info(f"Tool 'get_investment_planning_advice' called with input: {input_str}")
    return (
        "Investment planning involves setting financial goals and creating a strategy to achieve them through investments. Key aspects include:\n"
        "- **Define Your Goals:** What are you saving for? (e.g., retirement, down payment, education)\n"
        "- **Assess Risk Tolerance:** How comfortable are you with market fluctuations? This influences your asset allocation.\n"
        "- **Diversification:** Spreading investments across different asset classes (stocks, bonds, real estate) to reduce risk.\n"
        "- **Long-term vs. Short-term:** Tailor investments based on your timeline.\n"
        "- **Regular Contributions:** Consistency is often key to compounding returns.\n"
        "It's often recommended to consult a financial advisor for personalized investment planning."
    )


# --- Define LangChain Tools ---
tools = [
    Tool(
        name="FinancialLiteracyRetriever",
        func=query_knowledge_base, # Pointing to the new Pinecone-integrated function
        description="Useful for answering specific financial literacy questions by retrieving information from a comprehensive knowledge base about topics like 401k, IRA, credit scores, mortgages, etc. Input should be a concise financial literacy question.",
    ),
    Tool(
        name="SavingsAdvisor",
        func=recommend_savings,
        description="Calculates a recommended savings amount based on provided monthly income and spending (e.g., 'income=3000, spending=2000'). If numbers are not provided, it gives general savings guidelines and prompts for input. Use this when the user asks about how much they should save or for personalized savings recommendations.",
    ),
    Tool(
        name="BudgetingTemplateInfo",
        func=get_budgeting_templates,
        description="Provides information about different types of budgeting templates and methods. Use this when the user asks about budgeting templates, how to start a budget, or tools for budgeting.",
    ),
    Tool(
        name="ExpenseTrackerInfo",
        func=get_expense_tracker_info,
        description="Explains what an expense tracker is and its benefits. Use this when the user asks about tracking expenses or managing spending.",
    ),
    Tool(
        name="DebtCalculator",
        func=calculate_debt_details,
        description="Calculates the estimated time to pay off a debt and total interest paid. Requires specific inputs: 'principal=<amount>, interest_rate=<percentage>, monthly_payment=<amount>'. Use this when the user asks to calculate debt, loan payoff time, or total interest.",
    ),
    Tool(
        name="InvestmentPlanningAdvisor",
        func=get_investment_planning_advice,
        description="Offers general advice and principles for investment planning. Use this when the user asks about how to plan investments, investment strategies, or getting started with investing.",
    ),
]

# --- Setup LangChain Agent ---
logging.info("Setting up LangChain Agent...")

llm = ChatOpenAI(model="gpt-4o", temperature=0.7, openai_api_key=os.getenv("OPENAI_API_KEY"))
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Define the agent's prompt for create_openai_tools_agent
prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content="You are a friendly and helpful financial literacy chatbot. Your goal is to assist users with their questions about personal finance, investing, debt management, budgeting, and savings. You have several specialized tools to help you find information and provide advice. When a calculation is requested, ensure you ask for all necessary numerical inputs clearly, specifying the format (e.g., 'monthly income is 3000, spending is 2000' for savings, or 'principal=10000, interest_rate=5, monthly_payment=200' for debt calculation)."),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Create the OpenAI Tools agent
agent = create_openai_tools_agent(llm, tools, prompt)

# Create the Agent Executor
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, memory=memory, handle_parsing_errors=True)

logging.info("LangChain Agent setup complete.")

if __name__ == "__main__":
    print("Welcome to Financial Literacy Chatbot (LangChain Agent Demo). Type 'exit' to quit.")
    # Check if vectorstore is initialized before starting interaction
    if vectorstore is None:
        print("Chatbot cannot start: Pinecone vector store was not initialized. Check logs for errors.")
    else:
        while True:
            query = input("\nAsk your question: ")
            if query.lower() == "exit":
                break
            try:
                response = agent_executor.invoke({"input": query})
                print(f"\n💬 Chatbot:\n{response['output']}")
            except Exception as e:
                logging.error(f"Error during agent execution: {e}")
                print("I apologize, I encountered an error trying to answer your question. Please try again.")


2025-06-27 15:21:33,292 - INFO - Initialized Pinecone client.
2025-06-27 15:21:34,454 - INFO - Successfully read and cleaned 76 transcript files.
2025-06-27 15:21:34,456 - INFO - Splitting text into chunks with size 500 and overlap 100...
2025-06-27 15:21:34,704 - INFO - Successfully loaded and converted 2139 text chunks to LangChain Documents.


✅ Loaded 2139 chunks from transcripts.


2025-06-27 15:21:35,467 - INFO - Connecting to Pinecone index: financial-literacy-chatbot.
2025-06-27 15:21:36,549 - INFO - Pinecone index 'financial-literacy-chatbot' already contains 2139 vectors. Skipping embedding upload.
2025-06-27 15:21:36,551 - INFO - Pinecone vector store and retriever initialized.
2025-06-27 15:21:36,555 - INFO - Setting up LangChain Agent...
2025-06-27 15:21:36,624 - INFO - LangChain Agent setup complete.


Welcome to Financial Literacy Chatbot (LangChain Agent Demo). Type 'exit' to quit.


[1m> Entering new AgentExecutor chain...[0m


2025-06-27 15:23:16,885 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-27 15:23:17,154 - INFO - Tool 'query_knowledge_base' received query: 'tax implications of withdrawing from a 401(k) before age 59½'


[32;1m[1;3m
Invoking: `FinancialLiteracyRetriever` with `tax implications of withdrawing from a 401(k) before age 59½`


[0m

2025-06-27 15:23:17,463 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-06-27 15:23:18,514 - INFO - 
🔎 Retrieved top 4 documents from Pinecone for tool:
<DOCUMENT_START id=0>
then grows free of income and capital gains tax you can then draw 25 of that out tax-free and the rest will be taxed at your marginal income tax rate in retirement which is likely to be less than you're paying now the only downside the only downside to a pension is that you can't access them until you're 55 but you're already there that's not a problem at all on that original video that i made talking about how you should be investing in your 20s and 30s i got a lot of comm...

2025-06-27 15:23:18,515 - INFO - Calling LLM from within 'query_knowledge_base' tool...
2025-06-27 15:23:19,657 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-27 15:23:19,659 - INFO - LLM response received by tool.


[36;1m[1;3mNo sufficient information found in the knowledge base to answer that.[0m

2025-06-27 15:23:20,330 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mWithdrawing from a 401(k) before age 59½ typically comes with several tax implications:

1. **Early Withdrawal Penalty**: You may face a 10% early withdrawal penalty on the amount you withdraw.

2. **Income Tax**: The withdrawn amount is usually subject to ordinary income tax. This means it will be added to your taxable income for the year, potentially pushing you into a higher tax bracket.

3. **State Taxes**: Depending on your state, there might be additional state taxes on the withdrawal.

However, there are some exceptions to the early withdrawal penalty, such as certain medical expenses, disability, or a qualified domestic relations order. Always consult a tax professional for personalized advice and to explore if any exceptions apply to your situation.[0m

[1m> Finished chain.[0m

💬 Chatbot:
Withdrawing from a 401(k) before age 59½ typically comes with several tax implications:

1. **Early Withdrawal Penalty**: You may face a 10% early withdrawal penalty on the am