In [None]:
# Install required packages (uncomment the line below and run it once per environment)
# %pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain langgraph tavily-python

In [2]:
# Import required modules
import os
import dotenv

# Pull settings from a local .env file into the process environment
dotenv.load_dotenv()

# Tracing configuration; applied after the .env load, so these three values
# override anything the .env file set for the same keys
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "CRAG-LangGraph",
    }
)

In [3]:
# Import required modules
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Source articles to index
urls = [
    "https://div.beehiiv.com/p/advanced-rag-series-indexing",
    "https://div.beehiiv.com/p/advanced-rag-series-retrieval",
    "https://div.beehiiv.com/p/advanced-rag-series-generation-evaluation",
]

# Fetch each page; every loader call contributes one list of documents,
# so `docs` is a list of lists (one inner list per URL)
docs = [WebBaseLoader(page_url).load() for page_url in urls]

# Collapse the per-URL lists into one flat list of documents
docs_list = [page_doc for per_url_docs in docs for page_doc in per_url_docs]

# Splitter built through the tiktoken-based constructor:
# chunk size 500 with an overlap of 50 between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=50,
)

# Chunk every loaded document
doc_splits = text_splitter.split_documents(docs_list)

# Build the Chroma vector store from the document chunks.
#
# Each chunk is given a deterministic, position-based id. When no ids are
# supplied a fresh random id is generated per chunk on every call, so
# re-running this cell in the same kernel would add the same chunks to the
# "rag-chroma" collection a second time and the retriever would start
# returning duplicate hits. With stable ids a re-run overwrites the existing
# entries instead of appending new ones.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=OpenAIEmbeddings(),
    ids=[f"rag-chroma-{chunk_idx}" for chunk_idx in range(len(doc_splits))],
)

# Expose the store through the retriever interface (default search settings)
retriever = vectorstore.as_retriever()

In [4]:
### Retrieval Grader 

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Data model: the output schema handed to `llm.with_structured_output(...)`
# further down in this cell, so the grader returns a GradeDocuments instance
# (see the printed `binary_score='yes'` output) rather than free text.
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # NOTE(review): the class docstring and the Field description are presumably
    # forwarded to the model as part of the schema - treat them as prompt text
    # and edit with care; confirm against the structured-output docs.
    # The field is a plain `str`: nothing here restricts the value to 'yes'/'no',
    # that constraint is only expressed in the description and the system prompt.
    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")

# Grading instructions used as the system message
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

# Two-message chat prompt: the instructions above, then the document/question
# pair filled in through the {document} and {question} placeholders
grade_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
])

# Chat model at temperature 0, bound to the GradeDocuments schema
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Grader pipeline: fill the prompt, then call the structured-output LLM
retrieval_grader = grade_prompt | structured_llm_grader

# Smoke test: grade one retrieved chunk against a sample question
question = "chunk optimization"

# `invoke` is the Runnable entry point for retrievers; it replaces the
# deprecated `get_relevant_documents` and returns the same list of documents
docs = retriever.invoke(question)

# Grade the second hit (index 1); this raises IndexError if the retriever
# returns fewer than two documents
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

binary_score='yes'
