## RAG FROM SCRATCH

### Environment

In [4]:
import os

In [None]:
# LangSmith Tracking
# os.environ['LANGSMITH_TRACING']='true'
# os.environ['LANGSMITH_ENDPOINT']="https://api.smith.langchain.com"
# os.environ['LANGSMITH_API_KEY']="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"
# os.environ['LANGSMITH_PROJECT']="rag-from-scratch"

## Part 1: Overview

In [10]:
import bs4
from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama, OllamaEmbeddings

##### INDEXING #####

# Load Documents
loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/"),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(
    documents=splits, embedding=OllamaEmbeddings(model='mxbai-embed-large:latest'))

retriever = vectorstore.as_retriever()

##### RETRIEVAL and GENBERATION #####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOllama(model="gemma3:4b")

# Post-processing


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# question
rag_chain.invoke("what is Task Decomposition")

'Task decomposition involves breaking down complex tasks into smaller, more manageable steps. This can be achieved through prompting an LLM with instructions like "Steps for XYZ" or by utilizing task-specific guidance. Alternatively, it can involve using external planners like classical planners or PDDL-based systems to handle long-horizon planning.'

## Part 2: Indexing

In [2]:
# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [3]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str)-> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

In [4]:
from langchain_ollama import OllamaEmbeddings
embd = OllamaEmbeddings(model='mxbai-embed-large:latest')
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

1024

In [5]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity", similarity)

Cosine Similarity 0.727381790126319


In [6]:
#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

In [8]:
# Index
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OllamaEmbeddings(model='mxbai-embed-large:latest'))

retriever = vectorstore.as_retriever()

## Part 3: Retrival 

In [9]:

# Index
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OllamaEmbeddings(model='mxbai-embed-large:latest'))


retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [12]:
docs = retriever.invoke("What is Task Decomposition?")
len(docs)

1


## Part 4: Generation

In [13]:
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [14]:
# LLM
llm = ChatOllama(model="gemma3:4b")

In [15]:
# Chain
chain = prompt | llm

In [16]:
# Run
chain.invoke({"context":docs,"question":"What is Task Decomposition?"})

AIMessage(content='Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\\\n1.", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.', additional_kwargs={}, response_metadata={'model': 'gemma3:4b', 'created_at': '2025-10-05T15:07:26.2939101Z', 'done': True, 'done_reason': 'stop', 'total_duration': 12894668300, 'load_duration': 9651560700, 'prompt_eval_count': 336, 'prompt_eval_duration': 605757100, 'eval_count': 57, 'eval_duration': 2636844200, 'model_name': 'gemma3:4b'}, id='run--f258001f-8a43-4d6d-ae1e-489b414adb59-0', usage_metadata={'input_tokens': 336, 'output_tokens': 57, 'total_tokens': 393})

In [17]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [18]:
prompt_hub_rag


ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [19]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

'Based on the document, Task Decomposition is the process of breaking down a complicated task into smaller and simpler steps. It can be done by:\n\n1.  LLMs with simple prompting like "Steps for XYZ.\\\\n1."\n2.  Using task-specific instructions (e.g., "Write a story outline.")\n3.  With human inputs.'