In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain langchain-huggingface bs4 langsmith langchain_ollama

# RAG Overview

In [2]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate

from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings

from dotenv import load_dotenv
load_dotenv()

# llm = Ollama(model="deepseek-r1:1.5b")

# prompt = """
# Use ONLY the context below.
# If unsure, say "I don't know".
# Keep answers under 4 sentences.

# Context: {context}
# Question: {question}
# Answer:
# """
# QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt)

#### INDEXING ####

# Load the document.
# The page is later split into chunks -> each chunk is vector-embedded -> retrieved chunks are added to the prompt as context.
# WebBaseLoader() turns a web page into Document objects with two main fields:
# page_content (raw text) and metadata (additional information).
# Only the HTML elements carrying the listed CSS classes are parsed.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)
docs = loader.load()

# Split the document into chunks of up to 1000 characters, with 200 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed: create a vector embedding for each chunk and store it in Chroma, which runs locally.
# Alternative embedding (author's note: OpenAI usually embeds better than open-source counterparts):
#     embedding=OpenAIEmbeddings()
# Used here: mxbai-embed-large, a large embedding model from mixedbread.ai
# (author's note: SOTA as of March 2024).
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OllamaEmbeddings(model="mxbai-embed-large"),
)

# The retriever looks up stored chunks for a question; its results are what gets added to the prompt as context.
retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt for RAG, fetched with hub.pull() under the name "rlm/rag-prompt".
prompt = hub.pull("rlm/rag-prompt")

# Author's temperature guidance by use case.
# NOTE(review): the "default is 1.0" statement and the table below look like guidance
# written for a hosted API; confirm they apply to the local Ollama model used here.
# The default value of temperature is 1.0.
# USE CASE	TEMPERATURE
# Coding / Math   	0.0
# Data Cleaning / Data Analysis	1.0
# General Conversation	1.3
# Translation	1.3
# Creative Writing / Poetry	1.5
# temperature=1.3 matches the "General Conversation" / "Translation" rows of the table above.
llm = OllamaLLM(model="deepseek-r1:1.5b", temperature=1.3)

# Post-processing
def format_docs(docs):
    """Merge the text of several documents into one context string.

    Every item in ``docs`` must expose a ``page_content`` attribute. The
    texts are kept in their original order and separated by a blank line.
    An empty ``docs`` yields an empty string.
    """
    separator = "\n\n"
    texts = [document.page_content for document in docs]
    return separator.join(texts)

# Chain.
# Takes the input question
# -> "context": runs the retriever on the question and joins the retrieved documents with format_docs
# -> "question": forwards the question itself through RunnablePassthrough()
# -> fills both values into the prompt
# -> passes the prompt to the LLM
# -> parses the LLM output into a string
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
# Run the chain once on a sample question and print the generated answer.
print(rag_chain.invoke("What is Task Decomposition?"))


<think>
Alright, let me tackle this query step by step. The user is asking about Task Decomposition, specifically wanting an explanation in three sentences. They've provided a context that defines what Task Decomposition is and even includes some technical details.

First, I need to ensure I fully understand the main points of Task Decomposition. From the context, it seems like it's a technique where complex tasks are broken down into simpler parts for easier handling by AI models like LLMs. The technique helps models think step-by-step, improving performance on hard tasks by enhancing test-time computation.

Looking at the context again, I see that there's information about different methods such as CoT (Chain of Thought) and its extensions. YAO et al.'s work discusses this in depth. So while the user's main question is straightforward, the context provides additional insights into various techniques for decomposition.

I need to condense this information without missing any key point

# Indexing
We have (external) documents that we want to load into a 'retriever' (as in the code above). The goal of the retriever is to return the relevant chunks of the documents for the user's question. The relevance of a chunk can be calculated in multiple ways. For example:
- Statistical - bag of words, sparse representation; search: BM25
- Machine Learning - vector embedding, dense representation; search: KNN, HNSW

To rephrase, the documents are segmented into chunks, which are vector-embedded and indexed. The question is also vector-embedded. A similarity score is then computed between the question embedding and each indexed chunk embedding, and the most similar chunks are selected as the relevant ones.