In [7]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python langchain-openai
# this notebook uses OpenAI for embeddings and LLM inferencing with GPT-4o
# by default, web search is turned off to save API calls when testing/evaluating

Collecting langchain_community
  Using cached langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting langchain
  Using cached langchain-0.1.20-py3-none-any.whl.metadata (13 kB)
Collecting langgraph
  Downloading langgraph-0.0.48-py3-none-any.whl.metadata (22 kB)
Collecting langchain-core<0.2,>=0.1 (from langchain-nomic)
  Using cached langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Using cached langchain_community-0.0.38-py3-none-any.whl (2.0 MB)
Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl (799 kB)
   ---------------------------------------- 0.0/799.0 kB ? eta -:--:--
    --------------------------------------- 10.2/799.0 kB ? eta -:--:--
    --------------------------------------- 10.2/799.0 kB ? eta -:--:--
   --- ----------------------------------- 71.7/799.0 kB 787.7 kB/s eta 0:00:01
   ------- -------------------------------- 143.4/799.0 kB 1.1 MB/s eta 

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
crewai-tools 0.1.7 requires chromadb<0.5.0,>=0.4.22, but you have chromadb 0.5.0 which is incompatible.
embedchain 0.1.100 requires chromadb<0.5.0,>=0.4.17, but you have chromadb 0.5.0 which is incompatible.
embedchain 0.1.100 requires tiktoken<0.6.0,>=0.5.2, but you have tiktoken 0.7.0 which is incompatible.
langchain-chroma 0.1.0 requires chromadb<0.5.0,>=0.4.0, but you have chromadb 0.5.0 which is incompatible.
langchain-openai 0.0.5 requires tiktoken<0.6.0,>=0.5.2, but you have tiktoken 0.7.0 which is incompatible.


In [8]:
import dotenv
import os

# Load variables from a .env file (if any) before the keys are read below.
dotenv.load_dotenv()

# Switch for the web-search branch: the graph code compares this against 1.
USE_WEB_SEARCH = 0

# Each key is None when the corresponding environment variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")

In [9]:
# Name of the chat model. NOTE(review): `llm` is rebound to a ChatOpenAI
# instance in a later cell, so this string is not what the chains end up using.
llm = "gpt-4o"

In [10]:
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from langchain_core.documents import Document

# Location of the JSONL corpus. The path is assembled with os.path.join so the
# notebook also runs on non-Windows hosts; on Windows the result is identical
# to the previously hard-coded "document_data\\ac2_all_documents.jsonl".
jsonl_file_path = os.path.join("document_data", "ac2_all_documents.jsonl")

# Holds the Document objects parsed from the JSONL file.
dataset = []

# Embedding model handed to the vector store when the index is built.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [11]:
# Load the dataset as a list of Document objects, one JSON object per line.
# - The list is rebuilt from scratch, so re-running this cell does not append
#   duplicates to an already populated `dataset`.
# - The file is decoded as UTF-8 instead of the platform default encoding
#   (assumes the corpus is UTF-8, as JSON normally is).
# - Blank lines are skipped rather than crashing json.loads.
with open(jsonl_file_path, "r", encoding="utf-8") as file:
    dataset = [
        Document(**json.loads(line))
        for line in file
        if line.strip()
    ]

print(f'Dataset loaded with {len(dataset)} documents.')

Dataset loaded with 499 documents.


In [12]:
# Build the vector index: split the documents with a tiktoken-based splitter,
# embed the chunks into a Chroma collection, and expose it as a retriever.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
doc_splits = text_splitter.split_documents(dataset)

vectorstore = Chroma.from_documents(
    collection_name="rag-chroma",
    embedding=embedding,
    documents=doc_splits,
)
retriever = vectorstore.as_retriever()

In [None]:
# Optional reset: uncomment to drop the "rag-chroma" collection before rebuilding the index.
#vectorstore.delete_collection()

In [16]:
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

# Chat model used by the chains in this notebook (temperature 0).
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Relevance-grader prompt. The Llama-3 chat-template markers
# (<|eot_id|>, <|start_header_id|>, ...) that used to trail the question were
# removed: they are not part of the OpenAI chat format and were sent to GPT-4o
# as literal text. The spelling of "preamble" and "explanation" was corrected
# to match the other grader prompts.
prompt = PromptTemplate(
    template="""You are a grader assessing relevance 
    of a retrieved document to a user question. If the document contains keywords related to the user question, 
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    """,
    input_variables=["question", "document"],
)

# Chain: prompt -> chat model -> parsed JSON dict with a 'score' key.
retrieval_grader = prompt | llm | JsonOutputParser()

# Smoke test: grade the second retrieved chunk for a sample query.
question = "arwic"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

{'score': 'yes'}


In [17]:
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Answer-generation prompt. It takes two variables: {context} (the retrieved
# material) and {question} (the user question). Note that `prompt` is rebound
# by every prompt cell in this notebook, so chains must be built right after
# the prompt they are meant to use.
prompt = PromptTemplate(
    template="""You are a chatbot assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question, and respond in a conversational and helpful tone. If 
    you don't know the answer, just say that you don't know. Do not tell the user about documents, provided text, or context information, 
    that information is provided to you by a separate system which was searched based on the user question.
    Be accurate, do not combine context information unless they are directly related (ex. from the same source).
    Context: {context} 
    Question: {question}
    Answer: """,
    input_variables=["question", "context"],
)


# Post-processing
def format_docs(docs):
    """Return the page_content of every document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)


# Generation chain: fill the prompt, call the chat model, return plain text.
rag_chain = prompt | llm | StrOutputParser()

# Smoke test. The retrieved Document objects are handed to the prompt as they
# are (format_docs is not applied here).
question = "around what level should i head to omishan?"
docs = retriever.invoke(question)
rag_inputs = {"question": question, "context": docs}
generation = rag_chain.invoke(rag_inputs)
print(generation)

You should head to Omishan around level 30. Many quests in Omishan, such as the Shreth-Hunter's Guide to Omishan and the Omishan Soulbound Weapon, have a level requirement of 30.


In [18]:
# Hallucination grader: asks the model whether {generation} is supported by
# the facts in {documents} and expects a JSON object with a single 'score' key
# whose value is 'yes' or 'no'.
prompt = PromptTemplate(
    template="""You are a grader assessing whether 
    an answer is grounded in / supported by a set of facts. Give a binary 'yes' or 'no' score to indicate 
    whether the answer is grounded in / supported by a set of facts. Provide the binary score as a JSON with a 
    single key 'score' and no preamble or explanation.
    Here are the facts:
    \n ------- \n
    {documents} 
    \n ------- \n
    Here is the answer you need to score as a 'yes' or 'no', attached to the json key 'score': {generation}""",
    input_variables=["generation", "documents"],
)

# Chain: prompt -> chat model -> parsed JSON.
hallucination_grader = prompt | llm | JsonOutputParser()
# Smoke test: reuses `docs` and `generation` left in the kernel by the
# generation cell, so that cell must have been run first.
graded = hallucination_grader.invoke({"documents": docs, "generation": generation})
print(graded)

{'score': 'yes'}


In [19]:
# Answer grader: asks the model whether {generation} is useful for resolving
# {question} and expects a JSON object with a single 'score' key whose value
# is 'yes' or 'no'.
prompt = PromptTemplate(
    template="""You are a grader assessing whether an 
    answer is useful to resolve a question. Give a binary score 'yes' or 'no' to indicate whether the answer is 
    useful to resolve a question. Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
    Here is the answer:
    \n ------- \n
    {generation} 
    \n ------- \n
    Here is the question: {question}""",
    input_variables=["generation", "question"],
)

# Chain: prompt -> chat model -> parsed JSON.
answer_grader = prompt | llm | JsonOutputParser()
# Smoke test: reuses `question` and `generation` from the generation cell; the
# bare expression makes the notebook display the parsed result.
answer_grader.invoke({"question": question, "generation": generation})

{'score': 'yes'}

In [21]:
# Question router: decides whether a question goes to the vector store or to
# web search and returns a JSON object with a single 'datasource' key.
# Changes from the previous version:
# - The Llama-3 chat-template markers (<|begin_of_text|>, <|start_header_id|>,
#   <|eot_id|>, ...) were removed; they are not part of the OpenAI chat format
#   and were sent to GPT-4o as literal text.
# - Prompt typos were fixed ("Return the a JSON", "premable", "explaination").
# - The smoke test no longer calls the retriever: the results were never used,
#   cost an embedding request each, and docs[1] could raise IndexError.
prompt = PromptTemplate(
    template="""You are an expert at routing a 
    user question to a vectorstore or web search. Use the vectorstore for questions on the Asheron's Call 2
    MMORPG or any associated topics (quests, monsters, etc...). You do not need to be stringent with the keywords 
    in the question related to these topics. Otherwise, use web-search. Give a binary choice 'web_search' 
    or 'vectorstore' based on the question. Return a JSON with a single key 'datasource' and 
    no preamble or explanation. Question to route: {question}""",
    input_variables=["question"],
)

question_router = prompt | llm | JsonOutputParser()
# this should return web_search
question = "samsung"
print("The result for an unrelated query is " + str(question_router.invoke({"question": question})))
# this should return vectorstore
question = "arwic"
print("The result for a related query is " + str(question_router.invoke({"question": question})))

The result for an unrelated query is {'datasource': 'web_search'}
The result for a related query is {'datasource': 'vectorstore'}


In [22]:
# setup search
from langchain_community.tools.tavily_search import TavilySearchResults

# Tavily search tool capped at three results per query. The cap is set through
# `max_results`; the previous `k=3` argument is not a field of
# TavilySearchResults, so the default result count was used instead.
web_search_tool = TavilySearchResults(max_results=3)

In [23]:
from typing_extensions import TypedDict
from typing import List

class GraphState(TypedDict):
    """State dictionary shared by the graph node functions in this notebook."""
    # The user question being answered.
    question: str
    # The answer text produced by the generate node.
    generation: str
    # "yes"/"no" flag written by grade_documents to request a web search.
    web_search: str
    # Documents gathered so far. NOTE(review): annotated as List[str], but the
    # node functions store Document objects here; confirm and tighten the type.
    documents: List[str]

In [24]:
from langchain.schema import Document

def retrieve(state):
    """Look up documents for the question through the module-level retriever.

    Args:
        state: Current graph state; must contain "question".

    Returns:
        Dict with the retrieved "documents" and the unchanged "question".
    """
    print("---RETRIEVE---")
    query = state["question"]
    hits = retriever.invoke(query)
    return {"documents": hits, "question": query}

def generate(state):
    """Produce an answer from the documents in the state with rag_chain.

    Args:
        state: Current graph state; must contain "question" and "documents".

    Returns:
        Dict with the same "documents" and "question" plus the new
        "generation" returned by the chain.
    """
    print("---GENERATE---")
    user_question = state["question"]
    context_docs = state["documents"]

    chain_inputs = {"context": context_docs, "question": user_question}
    answer = rag_chain.invoke(chain_inputs)
    return {
        "documents": context_docs,
        "question": user_question,
        "generation": answer,
    }

def grade_documents(state):
    """Keep only the documents the relevance grader scores as "yes".

    Args:
        state: Current graph state; must contain "question" and "documents"
            (objects exposing ``page_content``).

    Returns:
        Dict with the filtered "documents", the unchanged "question", and a
        "web_search" flag that is "yes" only when USE_WEB_SEARCH is 1 and no
        document survived the filter, otherwise "no".
    """
    print("---GRADE DOCUMENTS---")
    question = state["question"]
    candidates = state["documents"]

    relevant = []
    for candidate in candidates:
        verdict = retrieval_grader.invoke(
            {"question": question, "document": candidate.page_content}
        )
        # The comparison is case-insensitive; anything but "yes" is a reject.
        if verdict["score"].lower() != "yes":
            print("---GRADE: NOT RELEVANT---")
            continue
        print("---GRADE: RELEVANT---")
        relevant.append(candidate)

    needs_web_search = USE_WEB_SEARCH == 1 and not relevant
    return {
        "documents": relevant,
        "question": question,
        "web_search": "yes" if needs_web_search else "no",
    }

def web_search(state):
    """Run a web search for the question and add the result to the documents.

    Args:
        state: Current graph state; must contain "question". "documents" is
            optional and may be missing or None.

    Returns:
        Dict with "documents" (the previous documents followed by one new
        Document holding the joined search contents) and the unchanged
        "question".
    """
    print("---WEB SEARCH---")
    question = state["question"]
    # .get(): when the router sends a question straight to this node, no
    # earlier node has written "documents", so the key can be absent.
    documents = state.get("documents")

    docs = web_search_tool.invoke({"query": question})
    web_results = "\n".join([d["content"] for d in docs])
    web_results = Document(page_content=web_results)
    # Build a new list instead of appending to the one held in the state, so
    # the caller's list is not modified behind its back.
    documents = list(documents or []) + [web_results]
    return {"documents": documents, "question": question}

def route_question(state):
    """Pick the entry node for a question: web search or vector-store RAG.

    The router's choice of web search is honoured only when USE_WEB_SEARCH is
    1, in line with the other nodes that gate web search on that switch. Any
    other or unexpected router answer falls back to the vector store instead
    of returning None, which is not a valid edge name.

    Args:
        state: Current graph state; must contain "question".

    Returns:
        "websearch" or "vectorstore".
    """
    print("---ROUTE QUESTION---")
    question = state["question"]
    print(question)
    source = question_router.invoke({"question": question})
    print(source)
    datasource = source.get("datasource")
    print(datasource)

    if datasource == "web_search" and USE_WEB_SEARCH == 1:
        print("---ROUTE QUESTION TO WEB SEARCH---")
        return "websearch"

    if datasource == "web_search":
        print("---WEB SEARCH DISABLED, ROUTE QUESTION TO RAG---")
    elif datasource != "vectorstore":
        print("---UNKNOWN DATASOURCE, ROUTE QUESTION TO RAG---")
    else:
        print("---ROUTE QUESTION TO RAG---")
    return "vectorstore"
    
def decide_to_generate(state):
    """Choose the next node after document grading.

    Args:
        state: Current graph state; "web_search" is the "yes"/"no" flag
            written by grade_documents. A missing flag counts as "no".

    Returns:
        "websearch" when the flag is "yes" and USE_WEB_SEARCH is 1,
        otherwise "generate".
    """
    print("---ASSESS GRADED DOCUMENTS---")
    # Only the flag matters for this decision; the question and documents are
    # no longer read into unused locals.
    wants_web_search = state.get("web_search") == "yes"

    if wants_web_search and USE_WEB_SEARCH == 1:
        print("---DECISION: NO RELEVANT DOCS, USE WEB SEARCH---")
        return "websearch"

    print("---DECISION: GENERATE---")
    return "generate"

def _is_yes(score):
    """Return True when a grader result dict carries an affirmative 'score'."""
    # Case- and whitespace-insensitive, matching how grade_documents lowers
    # the grade before comparing; a missing key counts as "no".
    return str(score.get("score", "")).strip().lower() == "yes"


def grade_generation_v_documents_and_question(state):
    """Check the generation against the documents, then against the question.

    Args:
        state: Current graph state; must contain "question", "documents" and
            "generation".

    Returns:
        "not supported" when the hallucination grader does not answer yes,
        "useful" when both graders answer yes, and "not useful" when only the
        answer grader does not answer yes.
    """
    print("---CHECK HALLUCINATIONS---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    grounded = _is_yes(
        hallucination_grader.invoke(
            {"documents": documents, "generation": generation}
        )
    )
    if not grounded:
        print("---DECISION: GENERATION NOT GROUNDED IN DOCS---")
        return "not supported"

    print("---DECISION: ANSWER GROUNDED IN DOCS---")
    print("---GRADE ANSWER vs QUESTION---")
    # The answer grader is only consulted for generations that are grounded.
    answers_question = _is_yes(
        answer_grader.invoke({"question": question, "generation": generation})
    )
    if answers_question:
        print("---DECISION: ANSWERS QUESTION---")
        return "useful"

    print("---DECISION: ANSWER DOES NOT ADDRESS QUESSTION---")
    return "not useful"
    

In [25]:
from langgraph.graph import END, StateGraph

# Assemble the graph: route the question, retrieve and grade documents,
# optionally search the web, generate, then grade the generation.
workflow = StateGraph(GraphState)

# Nodes
workflow.add_node("websearch", web_search)
workflow.add_node("retrieve", retrieve)
workflow.add_node("grade_documents", grade_documents)
workflow.add_node("generate", generate)

# Entry point: route_question returns "vectorstore" or "websearch".
workflow.set_conditional_entry_point(
    route_question,
    {
        "vectorstore": "retrieve",
        "websearch": "websearch",
    },
)

workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "websearch": "websearch",
        "generate": "generate",
    },
)
workflow.add_edge("websearch", "generate")
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        # NOTE(review): no retry cap is visible here, so a generation that is
        # repeatedly judged "not supported" would be regenerated indefinitely.
        "not supported": "generate",
        "useful": END,
        # Honour the USE_WEB_SEARCH switch: with web search turned off, an
        # unhelpful answer ends the run instead of triggering a web search.
        # The switch is read when this cell runs, so re-run the cell after
        # changing it.
        "not useful": "websearch" if USE_WEB_SEARCH == 1 else END,
    },
)

In [26]:
# Compile the graph into a runnable application.
app = workflow.compile()

from pprint import pprint

question = "what do i need to do to get the arwic mines quest?"

inputs = {"question": question}

# Stream the run and remember the latest generation explicitly, instead of
# relying on the loop variable still being bound after the loop and on the
# last event happening to contain a "generation" key.
final_generation = None
for output in app.stream(inputs):
    for key, value in output.items():
        pprint(f"Finished running: {key}:")
        if isinstance(value, dict) and value.get("generation") is not None:
            final_generation = value["generation"]

if final_generation is None:
    print("No generation was produced.")
else:
    pprint(final_generation)

---ROUTE QUESTION---
what do i need to do to get the arwic mines quest?
{'datasource': 'vectorstore'}
vectorstore
---ROUTE QUESTION TO RAG---
---RETRIEVE---
'Finished running: retrieve:'
---GRADE DOCUMENTS---
---GRADE: RELEVANT---
---GRADE: RELEVANT---
---GRADE: NOT RELEVANT---
---GRADE: NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
'Finished running: grade_documents:'
---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: ANSWER GROUNDED IN DOCS---
---GRADE ANSWER vs QUESTION---
---DECISION: ANSWERS QUESTION---
'Finished running: generate:'
('To get the Arwic Mines Quest, you need to first complete the "Find the Arwic '
 'Mines Overseer" quest. Here’s a step-by-step guide:\n'
 '\n'
 '1. **Complete the Obelisk Search Quests**: You need to finish one of the '
 'three Obelisk Search Quests.\n'
 '2. **Talk to the Explorer**: After completing one of the Obelisk Search '
 'Quests, talk to the zone named Explorer.\n'
 '3. **Find the Arwic Mines Overseer**: The Explor