# In [None]:
import os
import json
from datetime import datetime
from transformers import pipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.tools import Tool
from langchain.agents import create_conversational_retrieval_agent

# In [None]:
# Set API keys
# SECURITY: credentials must never be written into source files. The tokens
# that used to be hardcoded here have been exposed and should be revoked and
# regenerated. Provide the values through the process environment (shell
# export, a git-ignored .env loader, or the notebook host's secret store)
# before running this cell.
import warnings

REQUIRED_ENV_VARS = ("HUGGINGFACE_HUB_TOKEN", "OPENAI_API_KEY")


def missing_env_vars(names, environ=None):
    """Return the entries of *names* that are unset or empty in *environ*.

    Args:
        names: Iterable of environment variable names to check.
        environ: Mapping to look the names up in; defaults to ``os.environ``.

    Returns:
        A list of the names whose value is missing or an empty string, in the
        order they were given.
    """
    env = os.environ if environ is None else environ
    return [name for name in names if not env.get(name)]


_missing_keys = missing_env_vars(REQUIRED_ENV_VARS)
if _missing_keys:
    # Warn instead of raising so that the libraries which actually need a
    # credential report their own, more specific error when they are used.
    warnings.warn(
        "Missing credentials in the environment: "
        + ", ".join(_missing_keys)
        + ". Set them before running the cells below.",
        stacklevel=2,
    )

# Paths and Model Names
# `pdf_path` is the document handed to PyPDFLoader; `model_name` is the
# sentence-transformers model id used for both embedding objects below.
pdf_path = 'test_cover.pdf'
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'

# Initialize embeddings and LLM
# NOTE(review): `embeddings_model` is not referenced anywhere else in the
# visible code, and it instantiates the same model id as `embeddings` —
# confirm whether a later cell needs it before removing.
embeddings_model = SentenceTransformer(model_name)
# LangChain embedding wrapper; this is the object passed to the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name=model_name)
# Chat model handed to the agent constructor.
llm = ChatOpenAI(model_name='gpt-3.5-turbo')

# Load and split PDF
# Read the PDF at `pdf_path` into LangChain documents, then cut them into
# overlapping chunks, splitting on newlines.
loader = PyPDFLoader(pdf_path)
raw_documents = loader.load()
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — TODO confirm.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separator="\n")
documents = text_splitter.split_documents(raw_documents)

# Create FAISS index
# Embed every chunk with `embeddings` and store the vectors in `db`, which
# `query_pdf` searches.
db = FAISS.from_documents(documents, embeddings)

# Sample Email Conversations
# Hard-coded sample thread searched by `query_email`. Each message is a dict
# with the keys "from" (str), "to" (list of str), "date" (str), "subject"
# (str) and "body" (str).
email_conversations = [
    {
        "from": "insurer@example.com",
        "to": ["client@example.com"],
        "date": "March 1, 2024, 10:00 AM",
        "subject": "Policy Information",
        "body": "Dear Client, \n\nYour policy number is PL123456. The net premium amount is $500. The issue date is 2024-02-25.\n\nBest Regards,\nInsurer"
    },
    {
        "from": "client@example.com",
        "to": ["insurer@example.com"],
        "date": "March 2, 2024, 11:00 AM",
        "subject": "Re: Policy Information",
        "body": "Dear Insurer, \n\nThank you for the information. Can you confirm if the cover note number is COV789012?\n\nBest,\nClient"
    }
]

# Queries
# Maps a short field label to the natural-language question sent to the agent
# for that field. The label is only used when printing the response.
queries = {
    "net_premium": "What is the net premium amount?",
    "issue_date": "What is the issue date of the document?",
    "covernote_number": "What is the cover note number which starts with 'COV'? Rule: It must start with 'COV'",
    "policy_number": "What is the policy number which starts with 'PL'? Rule: It must start with 'PL'"
}

# Tool A: Query Email Conversations
def query_email(query, emails=None):
    """Return the body of the email most relevant to *query*.

    Matching happens in two stages:

    1. If the query text occurs verbatim (case-insensitively) in an email
       body, the first such body is returned.
    2. Otherwise the query is reduced to keywords (lower-cased alphanumeric
       words minus common stopwords) and each email is scored by how many
       distinct keywords appear in its subject or body. The best-scoring
       email's body is returned; ties go to the earliest email.

    Stage 2 exists because a full natural-language question is practically
    never a literal substring of an email, so substring matching alone always
    reported "Not found".

    Args:
        query: The search text or question.
        emails: Optional list of email dicts (with "body" and optionally
            "subject"). Defaults to the module-level ``email_conversations``.

    Returns:
        The matching email body, or the string "Not found" when the query is
        blank or nothing matches.
    """
    if emails is None:
        emails = email_conversations
    if not query or not query.strip():
        # A blank query would otherwise be a substring of every body.
        return "Not found"

    needle = query.strip().lower()

    # Stage 1: literal substring match.
    for message in emails:
        body = message.get("body", "")
        if needle in body.lower():
            return body

    def _words(text):
        # Replace every non-alphanumeric character with a space, then split.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(cleaned.split())

    stopwords = {
        "a", "an", "and", "are", "be", "can", "do", "does", "for", "how",
        "if", "in", "is", "it", "must", "of", "on", "or", "the", "to", "was",
        "what", "when", "where", "which", "who", "with", "you",
    }
    keywords = _words(query) - stopwords
    if not keywords:
        return "Not found"

    # Stage 2: keyword overlap; strict '>' keeps the earliest email on ties.
    best_body, best_score = "Not found", 0
    for message in emails:
        body = message.get("body", "")
        haystack = _words(body) | _words(message.get("subject", ""))
        score = len(keywords & haystack)
        if score > best_score:
            best_body, best_score = body, score
    return best_body

# Agent-facing wrapper around `query_email`; the agent sees the tool under
# the name "EmailQueryTool" together with the description below.
email_tool = Tool(
    description="Queries email conversations for relevant information",
    func=query_email,
    name="EmailQueryTool",
)

# Tool B: RAG on PDF
def query_pdf(query):
    """Return the text of the indexed PDF chunks most similar to *query*.

    Runs a top-3 similarity search against the module-level ``db`` index.

    Args:
        query: The search text.

    Returns:
        A list with the ``page_content`` of each hit, or ``["Not found"]``
        when the search returns nothing.
    """
    matches = db.similarity_search(query, k=3)
    if not matches:
        return ["Not found"]
    return [match.page_content for match in matches]

# Agent-facing wrapper around `query_pdf`; the agent sees the tool under the
# name "PDFRAGTool" together with the description below.
pdf_tool = Tool(
    description="Performs Retrieval-Augmented Generation on PDF documents",
    func=query_pdf,
    name="PDFRAGTool",
)

# System prompt text for the assistant. It fixes the answer format: a single
# answer introduced by one '=' sign, the literal '404 Not Found' when no
# answer is found, and dates rewritten as YYYY-MM-DD.
systm_msg = """
You are a helpful assistant that will help the user to figure out the answer to the related query given the content-data. 
The content data will contain 3 choices for the most relevant data, you have to use a combination of the choices to answer the query.
Do not perform any mathematical operations on your own, all the data will be contained within the choices.
The answer will be singular. use an '=' sign before the answer.
If you cannot find the answer easily, say the following: '404 Not Found'.
There should be only one '=' sign in the answer if it is found. 
for any date that is found, convert it into the following format: YYYY-MM-DD.
Do not perform any mathematical operations on your own, all the data will be contained within the choices,
"""

# Create memory
# No explicit memory object is built here: create_conversational_retrieval_agent
# sets up its own chat-history memory internally.

# Create agent
# The system prompt has to be passed explicitly. Without `system_message` the
# helper uses its own generic default, and the answer-format rules written in
# `systm_msg` never reach the model.
agent_executor = create_conversational_retrieval_agent(
    llm=llm,
    tools=[email_tool, pdf_tool],
    system_message=SystemMessage(content=systm_msg),
    verbose=True,
)

# Example usage with queries
# The executor returned by create_conversational_retrieval_agent exposes two
# output keys by default ("output" and "intermediate_steps"). Chain.run()
# only supports chains with exactly one output key and raises ValueError
# otherwise, so call the executor with an input dict and read "output".
for key, query_text in queries.items():
    result = agent_executor.invoke({"input": query_text})
    response = result["output"]
    print(f"Response for '{key}': {response}")