In [2]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [3]:
import os

# LANGCHAIN_* settings are written straight into the process environment.
os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': 'true',
        'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
        'LANGCHAIN_PROJECT': 'cortex',
    }
)

# API keys are read from the environment; they are never hardcoded here.
langchain_api_key = os.environ.get("LANGCHAIN_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

# Stop at the first key that is missing or empty; otherwise write the value
# back to the environment unchanged.
for _key_name, _key_value in (
    ("LANGCHAIN_API_KEY", langchain_api_key),
    ("GROQ_API_KEY", groq_api_key),
):
    if not _key_value:
        raise ValueError(f"{_key_name} is not set in the environment.")
    os.environ[_key_name] = _key_value

PART 15 - RE-RANKING

In [26]:
# Step 1: Load the blog content
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Parse only the elements whose CSS class is one of the three listed below.
post_sections_only = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections_only},
)

# Fetches the page over the network, so this line needs connectivity.
blog_docs = loader.load()

# Step 2: Split the document into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size and overlap handed to the tiktoken-based splitter factory.
splitter_settings = {"chunk_size": 300, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_settings)

# Chunks of the loaded blog post; this is what gets embedded and indexed later.
splits = text_splitter.split_documents(blog_docs)

# Step 3: Set up embeddings and index the documents
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma

# NOTE(review): the saved output of this cell is
#   TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'model_kwargs'
# although no `model_kwargs` is passed here. Presumably the installed
# langchain_community and sentence-transformers versions do not match -- verify
# the environment. Until this cell runs, `retriever` is never defined and the
# later RAG-Fusion cell fails with NameError.
model_name = "BAAI/bge-small-en"
encode_kwargs = dict(normalize_embeddings=True)  # normalize vectors at encode time

hf_embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

# Embed every chunk and store it in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embeddings)

# Step 4: Create a retriever from the vector store
retriever = vectorstore.as_retriever()


TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'model_kwargs'

In [21]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: prompt that asks the model to expand one question into several
# search queries. `{question}` is the only template variable.
template = (
    "\n"
    "You are a helpful assistant that generates multiple search queries based on a single input query.\n"
    "Generate multiple search queries related to: {question}\n"
    "Output (4 queries):\n"
)
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [22]:
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

def _split_queries(llm_output: str) -> list[str]:
    """Turn the model's newline-separated answer into a clean list of queries.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so a leading, trailing or spacer newline in the model output never
    becomes an empty query for the retriever.
    """
    return [line.strip() for line in llm_output.split("\n") if line.strip()]


# Chain: fill the RAG-Fusion prompt -> chat model -> plain string -> list of queries.
# NOTE(review): no model is named for ChatGroq, so whatever default the installed
# langchain_groq version provides is used -- confirm that default is still available.
generate_queries = (
    prompt_rag_fusion
    | ChatGroq(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [24]:
from langchain.load import dumps, loads
from langchain.prompts import ChatPromptTemplate

# Define reciprocal rank fusion function
def reciprocal_rank_fusion(results: list[list], k: int = 60) -> list[tuple]:
    """Fuse several ranked document lists into one list via reciprocal rank fusion.

    Every document contributes ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list. Contributions are
    summed per document and the documents are returned best-first.

    Args:
        results: One ranked list of documents per query. Documents are
            identified by their ``dumps`` serialization, so two entries count
            as the same document only if they serialize identically.
        k: Constant added to the rank before taking the reciprocal. Because
            ranks start at 0, ``k=0`` raises ``ZeroDivisionError`` for the
            first document of any list.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by score, highest
        first. Each document is rebuilt from its serialized form with ``loads``.
    """
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            # The serialized form doubles as a hashable identity for the document.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # sorted() is stable, so tied documents keep their first-seen order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked]

# Question used to exercise the RAG-Fusion retrieval chain
question = "What is task decomposition for LLM agents?"

# Generate queries -> run the retriever once per query -> fuse the ranked lists.
# Requires `retriever` from the indexing cell; if that cell did not complete,
# this line raises NameError.
retrieval_chain_rag_fusion = (
    generate_queries
    | retriever.map()
    | reciprocal_rank_fusion
)

# Run the chain end to end; the result is the fused list of (document, score) pairs.
docs = retrieval_chain_rag_fusion.invoke({"question": question})

# Report how many distinct documents survived the fusion step
print(f"Number of reranked documents: {len(docs)}")


NameError: name 'retriever' is not defined