In [None]:
# Install the packages this notebook uses into the kernel's environment.
# Versions are not pinned, so a fresh install may resolve to different releases over time.
%pip install langchain openai pypdf sentence-transformers chromadb
# -q reduces pip's output; -U upgrades langchain-openai if it is already installed.
%pip install -qU langchain-openai


In [None]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chains.base import Chain
from langchain_openai import ChatOpenAI


In [None]:
# Load environment variables via python-dotenv, then read the OpenAI key
# from the process environment (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")


In [None]:
# Folder that holds the source PDF files
folder_path = 'data'

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the ingestion order (and everything derived from it) reproducible.
# The extension check is case-insensitive so files named "*.PDF" are included.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))

# Load every PDF and gather whatever each loader returns into one flat list
documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(os.path.join(folder_path, pdf_file))
    documents.extend(loader.load())


In [None]:
# Break the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=768,    # maximum size of a single chunk
    chunk_overlap=56,  # overlap kept between neighbouring chunks
)
split_documents = text_splitter.split_documents(documents)


In [None]:
# Initialize the Chroma client that backs the vector store
chroma_client = chromadb.Client()

# Create (or reuse) the collection that holds the knowledge base
collection_name = "knowledge_base"
chroma_collection = chroma_client.get_or_create_collection(collection_name)

# Initialize embeddings model
embeddings_model = OpenAIEmbeddings(api_key=api_key)

# Build the vector store on the client and collection created above.
# Passing `client` and reusing `collection_name` ties the store to that
# collection instead of leaving the client unused and repeating the name
# as a separate string literal.
# NOTE(review): re-running this cell in the same kernel presumably adds the
# chunks to the existing collection again — confirm, and reset the collection
# first if duplicates matter.
vector_store = Chroma.from_documents(
    documents=split_documents,
    embedding=embeddings_model,
    collection_name=collection_name,
    client=chroma_client,
)


In [None]:
# Chat model that generates the answers
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    api_key=api_key,
)

# Retriever view over the vector store, created with its default settings
retriever = vector_store.as_retriever()

# Helpers and entry point implementing the Self-RAG logic
def _retrieve_documents(retriever, query):
    """Fetch documents for ``query``, preferring ``invoke`` over the deprecated method."""
    if hasattr(retriever, "invoke"):
        return retriever.invoke(query)
    return retriever.get_relevant_documents(query)


def _generate_text(llm, prompt):
    """Run ``prompt`` through ``llm`` and return the reply text."""
    if hasattr(llm, "invoke"):
        result = llm.invoke(prompt)
    else:
        result = llm.predict(prompt)
    # A reply object that carries its text in ``.content`` is unwrapped;
    # anything else (e.g. a plain string) is returned as-is.
    return getattr(result, "content", result)


def self_rag(query, retriever, llm, max_iterations=3, threshold=0.9):
    """
    Self-RAG pipeline with iterative refinement.

    Each round retrieves documents for the current query, asks the LLM to
    answer from them, and returns the answer as soon as it does not contain
    the phrase "insufficient information" (case-insensitive). Otherwise the
    LLM is asked for a follow-up query, which replaces the current query in
    the next round.

    Args:
        query: The question to answer.
        retriever: Object exposing ``invoke(query)`` (or the older
            ``get_relevant_documents(query)``) that returns items with a
            ``page_content`` attribute.
        llm: Object exposing ``invoke(prompt)`` (or the older
            ``predict(prompt)``) that returns the reply text or an object
            with a ``content`` attribute.
        max_iterations: Maximum number of retrieve/answer rounds (>= 1).
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        The first answer judged sufficient, or, when every round was judged
        insufficient, the last answer prefixed with
        "Final Response (after N iterations): ".

    Raises:
        ValueError: If ``max_iterations`` is smaller than 1 (previously this
            failed with an unbound-variable error at the final return).
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    for iteration in range(max_iterations):
        # Step 1: Retrieve documents based on the query or refined query
        retrieved_docs = _retrieve_documents(retriever, query)

        # Step 2: Generate a response using the retrieved documents
        input_context = "\n".join(doc.page_content for doc in retrieved_docs)
        prompt = (
            f"Context:\n{input_context}\n\n"
            f"Question: {query}\n\n"
            "Provide a detailed answer based on the context above. If insufficient information is available, specify what additional information is needed."
        )
        response = _generate_text(llm, prompt)

        # Step 3: Check if the response is satisfactory.
        # NOTE(review): this relies on the model echoing the exact phrase
        # "insufficient information"; the prompt does not strictly require it.
        if "insufficient information" not in response.lower():
            return response  # Exit early if the answer is sufficient

        # A refined query produced in the last round would never be used, so
        # skip that extra LLM call.
        if iteration == max_iterations - 1:
            break

        # Step 4: Refine query based on the response
        query_refinement_prompt = (
            f"Initial Query: {query}\n"
            f"Response: {response}\n\n"
            "What clarifying or follow-up query would help retrieve better context?"
        )
        refined_query = _generate_text(llm, query_refinement_prompt).strip()

        # Keep the current query when the model returns an empty refinement.
        query = refined_query or query

    # Return the last response if max iterations are reached
    return f"Final Response (after {max_iterations} iterations): {response}"

# Example question, answered through the Self-RAG loop and printed
query = "Can you tell me about flow trajectories in detail?"
response = self_rag(query=query, retriever=retriever, llm=llm)
print(response)


In [None]:
# retriever = vector_store.as_retriever()
# qa_chain = RetrievalQA.from_chain_type(
#     llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
#     chain_type="stuff",
#     retriever=retriever
# )


In [None]:
# query = "Can you tell me about flow trajectories in detail?"
# response = qa_chain({"query": query})
# print(response['result'])

In [None]:

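# # Initialize a Conversational Retrieval Chain (RAG with feedback loop) using LangChain.
# # NOTE: ConversationalRetrievalChain is not among the imports at the top of this notebook;
# # import it (e.g. `from langchain.chains import ConversationalRetrievalChain`) before re-enabling this cell.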
# qa_chain = ConversationalRetrievalChain.from_llm(
#     llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
#     retriever=retriever
# )

# # Self-assessment logic
# def self_rag_query(query):
#     # Initial response
#     response = qa_chain({"question": query, "chat_history": []})
#     print("Initial response:", response['answer'])

#     # Self-reflection
#     refinement_query = f"Based on the above answer, is there any missing or unclear information? If so, reframe the query to improve it. Original query: {query}"
#     reflection = qa_chain({"question": refinement_query, "chat_history": []})
#     print("Reflection:", reflection['answer'])

#     # Iterative improvement
#     if "no missing information" not in reflection['answer'].lower():
#         refined_query = reflection['answer']
#         improved_response = qa_chain({"question": refined_query, "chat_history": []})
#         return improved_response['answer']
#     return response['answer']

# # Example usage
# query = "Can you tell me about flow trajectories in detail?"
# final_response = self_rag_query(query)
# print("Final response:", final_response)
