 ## This implements the DEMO.png setup in this folder
 

In [1]:


%load_ext autoreload
%autoreload 2

# Imports for the Llama-3-8B Instruct RAG demo (this cell only imports; no model is loaded here)
import transformers
import sys
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import torch
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.vectorstores import VectorStoreRetriever
from typing import Union, List, Dict, Any, NoReturn
from advancedRagUtils import convert_score_to_float



In [2]:
# Test the AdaptiveRAGSupport
%load_ext autoreload
%autoreload 2
from AdvancedRAG.AdaptiveRAGSupport import AdaptiveRAGSupport, logger
from pprint import pprint
# Smoke test of AdaptiveRAGSupport: retrieve, answer, then run each grader once.
# NOTE(review): None is passed for all three components; presumably
# AdaptiveRAGSupport builds its own model/tokenizer/retriever in that case
# (the recorded output shows a checkpoint load) -- confirm in the class.
model = None
tokenizer = None
retriever = None
support = AdaptiveRAGSupport(model, tokenizer, retriever)

question = "agent memory" #"What is the best way to train a model?"
# format_as_single_text=True: the retrieved content is requested as one text blob.
docs_content = support.get_docs_content_by_query(question, support.DEFAULT_MAX_DOCS_RETURNED, format_as_single_text=True, verbose=False)
answer = support.answer_with_rag(question, docs_content, skip_prompt=True)
logger.warning(answer)
logger.warning(f"answer is good for the question? {support.answer_grader(question, answer, skip_prompt=True)}")
logger.warning(f"halucination grader, i.e., is the answer sustained by documents? {support.hallucination_grader(answer, docs_content, skip_prompt=True)}")
logger.warning(f"question category: {support.question_router(question, skip_prompt=True)}")

    

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Prepare model, tokenizer: 20.536 sec.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
USER_AGENT environment variable not set, consider setting it to identify your requests.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The context discusses the concept of generative agents, which combines LLM with memory, planning, and reflection mechanisms to enable agents to behave conditioned on past experience. In this context, memory is used to record a comprehensive list of agents' experience in natural language, allowing the agent to reflect on past events and guide its future behavior.
answer is good for the question? {'score': 'yes'}
halucination grader, i.e., is the answer sustained by documents? {'score': 'yes'}
question category: {'datasource': 'vectorstore'})


In [3]:
# Route a fully phrased question to see which datasource the router selects.
logger.warning(f"question category: {support.question_router('What are the types of agent memory?', skip_prompt=True)}")


question category: {'datasource': 'vectorstore'})


In [6]:

### Search
import os
from getpass import getpass

# Credentials must not live in the notebook: read the Tavily key from the
# environment and only prompt (without echo) when it is not set.
# Any key that was ever committed in this cell should be considered leaked and rotated.
if not os.environ.get("TAVILY_API_KEY"):
    os.environ["TAVILY_API_KEY"] = getpass("Tavily API key: ")

from langchain_community.tools.tavily_search import TavilySearchResults

# NOTE(review): confirm that `k` is the result-count argument honoured by the
# installed TavilySearchResults version.
web_search_tool = TavilySearchResults(k=3)

In [7]:
from typing import TypedDict
from langchain_core.documents import Document
from typing import List
from langgraph.graph import END, StateGraph

class AgentMemory(TypedDict):
    """
    State shared by the nodes of the agent graph.

    Each node receives this mapping and returns a dict with the keys it
    wants to add or overwrite.
    """

    question: str  # The last question asked by the user
    generation: str  # The last answer generated by the assistant
    web_search: bool  # Whether a web search should be run
    # Documents gathered so far. The nodes read `page_content` from each entry
    # and append `Document` instances, so the entries are Documents, not strings.
    documents: List["Document"]
    
    
    
def retrieve(state: AgentMemory):
    """
    Fetch the documents matching the current question from the vectorstore.

    Args:
        state: AgentMemory; only the "question" entry is read.
    Returns:
        dict: State update holding the retrieved "documents" and the unchanged "question".
    """
    user_question = state["question"]
    logger.warning(f"Retrieving documents for question: {user_question}")

    # format_as_single_text=False: the documents are requested as separate items
    # rather than one merged text.
    return {
        "documents": support.get_docs_content_by_query(
            user_question,
            support.DEFAULT_MAX_DOCS_RETURNED,
            format_as_single_text=False,
            verbose=False,
        ),
        "question": user_question,
    }
    
def generate(state: AgentMemory):
    """
    Produce an answer to the question from the documents held in the state.

    Args:
        state: AgentMemory; the "question" and "documents" entries are read.
    Returns:
        dict: State update passing "documents" and "question" through and adding the new "generation".
    """
    user_question, context_docs = state["question"], state["documents"]
    logger.warning(f"Generating answer for question: {user_question}")

    rag_answer = support.answer_with_rag(user_question, context_docs, skip_prompt=True)
    return {
        "documents": context_docs,
        "question": user_question,
        "generation": rag_answer,
    }

def grade_documents(state: AgentMemory):
    """
    Grade every retrieved document and keep only the ones that pass.

    Each document is scored on its own with `support.hallucination_grader`
    against the current generation. Documents graded above 0.5 are kept; as
    soon as one document fails, the web-search flag is raised.

    Args:
        state: AgentMemory; "question" and "documents" are read, plus "generation" when present.
    Returns:
        dict: State update with the filtered "documents", the "question" and
        the boolean "web_search" flag (False when every document passed).
    """
    question = state["question"]
    documents = state["documents"]
    # This node runs right after `retrieve`, so an answer may not exist yet;
    # `.get` avoids a KeyError when the key has never been written.
    # NOTE(review): grading against a missing generation does not measure
    # document relevance to the question -- confirm whether AdaptiveRAGSupport
    # offers a question-vs-document grader that should be used here instead.
    generation = state.get("generation")
    logger.warning(f"Grading documents for question: {question}")

    # Start from "no web search needed" so the flag is always defined, even
    # when every document turns out to be relevant (or the list is empty).
    needs_web_search = False
    filtered_docs = []
    for doc in documents:
        score = support.hallucination_grader(generation, doc.page_content, skip_prompt=True)
        grade = convert_score_to_float(score)

        # Document is relevant if the grade is above 0.5
        if grade > 0.5:
            filtered_docs.append(doc)
        else:
            # One rejected document is enough to request a web search
            needs_web_search = True

    return {"documents": filtered_docs, "question": question, "web_search": needs_web_search}

def web_search(state: AgentMemory):
    """
    Run a web search for the question and add the result to the documents.

    All hits returned by the search tool are merged into a single Document
    (their "content" fields joined with newlines), which is placed after any
    documents already in the state.

    Args:
        state: AgentMemory; "question" is read, plus "documents" when present.
    Returns:
        dict: State update with the extended "documents" list and the "question".
    """
    question = state["question"]
    # When the router sends the question straight to this node, no documents
    # have been stored yet, so the key may be absent or None.
    documents = state.get("documents")

    # Web search
    logger.warning(f"Running web search for question: {question}")
    web_search_docs = web_search_tool.invoke({"query": question, 'k': support.NUM_WEB_SEARCH_RESULTS})
    web_results = Document(page_content="\n".join(doc["content"] for doc in web_search_docs))

    # Build a new list instead of appending in place, so the list object held
    # by the incoming state is not modified behind the graph's back.
    documents = [*(documents or []), web_results]

    return {"documents": documents, "question": question}

# Conditional edges
def route_question(state: AgentMemory):
    """
    Decide whether the question goes to the vectorstore (RAG) or to web search.

    Args:
        state: AgentMemory; only the "question" entry is read.
    Returns:
        str: "websearch" when the router selects it, otherwise "vectorstore".
    """
    question = state["question"]
    logger.warning(f"Routing question: {question}")

    source = support.question_router(question, skip_prompt=True)
    datasource = source["datasource"]
    logger.warning(f"Question category: {datasource}")

    # Anything other than an explicit "websearch" falls back to the vectorstore.
    return "websearch" if datasource == "websearch" else "vectorstore"

def decide_if_websearch_is_needed(state: AgentMemory):
    """
    Choose the node that follows document grading.

    Args:
        state: AgentMemory; only the boolean "web_search" flag is read.
    Returns:
        str: "websearch" when the flag is set, otherwise "generate". These are
        the keys of the conditional-edge mapping registered for "grade_documents".
    """
    web_search = state["web_search"]
    logger.warning(f"Assessing the status of web search: {web_search}")
    assert isinstance(web_search, bool), "web_search should be a boolean"

    if web_search:
        logger.warning("---DECISION: Not all documents are relevant, running web search")
        return "websearch"

    logger.warning("---DECISION: ALL documents are relevant, go to generation")
    # Must match a key of the edge mapping; "vectorstore" is not one of them.
    return "generate"
    
def grade_generation_vs_documents_and_question(state: AgentMemory):
    """
    Check that the generation is grounded in the documents and answers the question.

    The hallucination check runs first; the usefulness check only runs when
    the generation is found to be grounded.

    Args:
        state: AgentMemory; "question", "documents" and "generation" are read.
    Returns:
        str: "useful" (grounded and answers the question), "not useful"
        (grounded but does not answer it) or "not supported" (not grounded).
    """
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    logger.warning(f"Hallucination check and then if the answer is useful: Grading generation vs documents and question")

    # Hallucination check: the grader compares the ANSWER with the documents,
    # so the generation (not the question) is what must be passed in.
    # NOTE(review): `documents` is passed as stored in the state; other call
    # sites hand the grader text -- confirm the grader accepts this form.
    score_hallucination = support.hallucination_grader(generation, documents, skip_prompt=True)
    grade_hallucination = convert_score_to_float(score_hallucination)
    if grade_hallucination <= 0.5:
        logger.warning(f"---DECISION: Generation is NOT grounded in the documents, RETRYING")
        return "not supported"

    # Answer is sustained by the documents, now check if the answer is useful
    logger.warning(f"---DECISION: Generation is grounded in the documents")

    score_useful = support.answer_grader(question, generation, skip_prompt=True)
    grade_useful = convert_score_to_float(score_useful)
    if grade_useful > 0.5:
        logger.warning(f"---DECISION: Generation is sustained by the documents and the answer is useful")
        return "useful"

    logger.warning(f"---DECISION: Generation does not address question or is not useful")
    return "not useful"
    
# Create the graph; AgentMemory is the state schema shared by all nodes.
workflow = StateGraph(AgentMemory)

# Register the functions that update the state as nodes.
workflow.add_node("websearch", web_search) # web search node
workflow.add_node("retrieve", retrieve) # retrieve node
workflow.add_node("generate", generate) # generate node
workflow.add_node("grade_documents", grade_documents) # grade documents node

# route_question, decide_if_websearch_is_needed and
# grade_generation_vs_documents_and_question are not registered as nodes:
# they return the name of the next step and are wired in as the conditional
# entry point / conditional edges of the graph.



## Building the graph


In [8]:
# Wire the graph: entry point, fixed edges and conditional edges.
# Entry point: route_question returns a string that selects the first node.
workflow.set_conditional_entry_point(
    route_question,
    {
        "websearch" : "websearch",
        "vectorstore" : "retrieve",
    },
) # Maps each string the routing function can return to the next node

# Retrieved documents are always graded next.
workflow.add_edge("retrieve", "grade_documents")
# After grading, either search the web or generate directly.
# NOTE(review): decide_if_websearch_is_needed must return exactly one of the
# keys below ("websearch" or "generate").
workflow.add_conditional_edges(
    "grade_documents",
    decide_if_websearch_is_needed,
    {
        "websearch" : "websearch",
        "generate" : "generate", 
    }
)

# Web search results always feed into generation.
workflow.add_edge("websearch", "generate")
# After generating, finish, regenerate, or fall back to web search.
# NOTE(review): "not supported" loops back to "generate"; no retry cap is
# visible here.
workflow.add_conditional_edges(
    "generate",
    grade_generation_vs_documents_and_question,
    {
      "not supported" : "generate",
        "useful": END,
        "not useful": "websearch",
    },
)

In [9]:
from pprint import pprint

# Compile the graph into a runnable application.
app = workflow.compile()

# Test
inputs = {"question": "What are the types of agent memory?"}

# Run the graph and report each node as it finishes. Every streamed item maps
# a node name to the state update that node produced.
final_state = {}
for step in app.stream(inputs):
    for node_name, node_state in step.items():
        pprint(f"Finished running: {node_name}:")
        final_state = node_state

# The last node to finish is expected to be "generate", whose update carries the answer.
pprint(final_state["generation"])


Routing question: What are the types of agent memory?
Question category: vectorstore
Retrieving documents for question: What are the types of agent memory?
Grading documents for question: What are the types of agent memory?


'Finished running: retrieve:'


Assessing the status of web search: True
---DECISION: Not all documents are relevant, running web search
Running web search for question: What are the types of agent memory?


'Finished running: grade_documents:'


Generating answer for question: What are the types of agent memory?
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


'Finished running: websearch:'


Hallucination check and then if the answer is useful: Grading generation vs documents and question
---DECISION: Generation is grounded in the documents
---DECISION: Generation is sustained by the documents and the answer is useful


'Finished running: generate:'
('According to the provided context, there are two categories of agent memory: '
 'short-term and long-term memories.')
