In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

from langchain_community.vectorstores import Chroma
import numpy as np
from typing import List


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
## create sample documents
sample_docs = [
    """
  Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. There are three main
    types of machine learning: supervised learning, unsupervised learning, and reinforcement
    learning. Supervised learning uses labeled data to train models, while unsupervised
    learning finds patterns in unlabeled data. Reinforcement learning learns through
    interaction with an environment using rewards and penalties.
    """,
    
    """
    Deep Learning and Neural Networks

    Deep learning is a subset of machine learning based on artificial neural networks.
    These networks are inspired by the human brain and consist of layers of interconnected
    nodes. Deep learning has revolutionized fields like computer vision, natural language
    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly
    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers
    excel at sequential data processing.
    """,
    
    """
    Natural Language Processing (NLP)

    NLP is a field of AI that focuses on the interaction between computers and human language.
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis,
    machine translation, and question answering. Modern NLP heavily relies on transformer
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand
    context and relationships between words in text.
    """
]
sample_docs

['\n  Machine Learning Fundamentals\n    Machine learning is a subset of artificial intelligence that enables systems to learn\n    and improve from experience without being explicitly programmed. There are three main\n    types of machine learning: supervised learning, unsupervised learning, and reinforcement\n    learning. Supervised learning uses labeled data to train models, while unsupervised\n    learning finds patterns in unlabeled data. Reinforcement learning learns through\n    interaction with an environment using rewards and penalties.\n    ',
 '\n    Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly\n    effective for image proc

In [5]:
## save sample documents to files
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"{temp_dir}/doc_{i}.txt","w") as f:
        f.write(doc)
print(f"Sample document create in : {temp_dir}")

Sample document create in : C:\Users\WIN11~1\AppData\Local\Temp\tmp7wu4vb4x


In [6]:
## save sample documents to files
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"{temp_dir}/doc_{i}.txt","w") as f:
        f.write(doc)

In [8]:
### 2. Document Loading
from langchain_community.document_loaders import DirectoryLoader,TextLoader
# Load documents from directory
loader = DirectoryLoader(
    "data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"First document preview:")
print(documents[0].page_content[:200] + "..")

Loaded 5 documents
First document preview:

  Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. There ar..


In [9]:
### Document Splitting
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Maximum size of each chunk
    chunk_overlap=50,  # Overlap between chunks to maintain context
    length_function=len,
    separators=[" "]  # Hierarchy of separators
)
chunks=text_splitter.split_documents(documents)

print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"Chunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 8 chunks from 5 documents
Chunk example:
Content: Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experienc...
Metadata: {'source': 'data\\text_files\\doc_0.txt'}


In [10]:
### Embedding Models
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
sample_text="MAchine LEarning is fascinating"
embeddings=OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000019765FEAE10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000019765FBBFE0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [11]:
vector=embeddings.embed_query(sample_text)
vector

[-0.022832060232758522,
 0.011113880202174187,
 0.0084602115675807,
 -0.029978575184941292,
 -0.005767131689935923,
 0.020467406138777733,
 -0.003254685318097472,
 0.01150142028927803,
 -0.02147895283997059,
 -0.04414023086428642,
 0.006433832924813032,
 0.04385121911764145,
 -0.01824725605547428,
 0.005139841232448816,
 -0.0012406229507178068,
 0.0039312392473220825,
 0.03694117069244385,
 0.009537442587316036,
 0.004381180740892887,
 -0.0077376775443553925,
 -0.021649733185768127,
 0.020966609939932823,
 -0.0054321386851370335,
 -0.04061952233314514,
 -0.007409253157675266,
 0.012361892499029636,
 0.014647725969552994,
 -0.03649451211094856,
 -0.013715000823140144,
 -0.002336739329621196,
 0.016841599717736244,
 -0.008920005522668362,
 -0.019271939992904663,
 -0.0407508946955204,
 -0.015199478715658188,
 -0.012171406298875809,
 0.0001829939428716898,
 0.0011905382853001356,
 -0.014844780787825584,
 -0.0009926626225933433,
 0.017235709354281425,
 0.01280198059976101,
 -0.0105818323791

In [12]:
### Initialize the ChromaDB Vector Store And Stores the chunks in Vector Representation
## Create a Chromdb vector store
persist_directory="./chroma_db"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"
)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 8 vectors
Persisted to: ./chroma_db


In [13]:
### Test Similarity Search
query="What are the types of machine learning?"
similar_docs=vectorstore.similarity_search(query,k=3)
similar_docs

[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications'),
 Document(metadata={'source': 'data\\text_files\\doc_0.txt'}, page_content='Machine Learning Fundamentals\n    Machine learning is a subset of artificial intelligence that enables systems to learn\n    and improve from experience without being explicitly programmed. There are three main\n    types of machine learning: supervised learning, unsupervised learning, and reinforcement\n    learning. Superv

In [14]:
query="what is NLP?"
similar_docs=vectorstore.similarity_search(query,k=3)
similar_docs

[Document(metadata={'source': 'data\\text_files\\doc_2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language.\n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis,\n    machine translation, and question answering. Modern NLP heavily relies on transformer\n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand\n    context and relationships between words in text.'),
 Document(metadata={'source': 'data\\text_files\\doc_1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Network

In [15]:
query="what is Deep Learning?"
similar_docs=vectorstore.similarity_search(query,k=3)
similar_docs

[Document(metadata={'source': 'data\\text_files\\doc_1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly\n    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers'),
 Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Lear

In [16]:
print(f"Query: {query}")
print(f"Top {len(similar_docs)} similar chunks:")
for i, doc in enumerate(similar_docs):
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content[:200] + "...")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")

Query: what is Deep Learning?
Top 3 similar chunks:
--- Chunk 1 ---
Deep Learning and Neural Networks

    Deep learning is a subset of machine learning based on artificial neural networks.
    These networks are inspired by the human brain and consist of layers of in...
Source: data\text_files\doc_1.txt
--- Chunk 2 ---
Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing...
Source: data\text_files\machine_learning.txt
--- Chunk 3 ---
Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. There are t...
Source: data\text_files\doc_0.txt


In [17]:
results_scores=vectorstore.similarity_search_with_score(query,k=3)
results_scores

[(Document(metadata={'source': 'data\\text_files\\doc_1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly\n    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers'),
  0.23752863705158234),
 (Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1.

In [18]:
#### Initialize LLM, RAG Chain, Prompt Template,Query the RAG system
from langchain_openai import ChatOpenAI
llm=ChatOpenAI(
    model_name="gpt-3.5-turbo")


In [19]:
test_response=llm.invoke("What is Large Language Models")
test_response

AIMessage(content='Large language models are artificial intelligence models that are trained on vast amounts of text data to understand and generate human-like text. These models use machine learning techniques to analyze and generate text in a way that simulates human language. These models have been used in various applications such as text generation, translation, summarization, and more. Some popular examples of large language models include GPT-3 (Generative Pre-trained Transformer), BERT (Bidirectional Encoder Representations from Transformers), and T5 (Text-to-Text Transfer Transformer).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 106, 'prompt_tokens': 12, 'total_tokens': 118, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_finger

In [20]:
from langchain.chat_models.base import init_chat_model
llm=init_chat_model("openai:gpt-3.5-turbo")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000019767E78A10>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000019767E2E390>, root_client=<openai.OpenAI object at 0x0000019767E78BC0>, root_async_client=<openai.AsyncOpenAI object at 0x0000019767E2CAA0>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [21]:
llm.invoke("What is AI")

AIMessage(content='AI, or artificial intelligence, refers to the simulation of human intelligence processes by machines, especially computer systems. This includes learning, reasoning, problem-solving, perception, and language understanding. AI technology has the ability to improve over time by learning from data and patterns, making it increasingly capable of performing tasks that typically require human intelligence.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 65, 'prompt_tokens': 10, 'total_tokens': 75, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CLnya6qbQI6qSH8lsWYL38vcOGiX3', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--754f760e-1b87-43b1-843c-ee7372be75d8-0'

In [22]:
### Modern RAG Chain

from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain


In [23]:
## Convert vector store to retriever
retriever=vectorstore.as_retriever(
    search_kwarg={"k":3} ## Retrieve top 3 relevant chunks
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000197681EF260>, search_kwargs={})

In [24]:
## Create a prompt template
from langchain_core.prompts import ChatPromptTemplate
system_prompt="""You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.
    Context: {context}"""
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [25]:
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer the question.\n    If you don't know the answer, just say that you don't know.\n    Use three sentences maximum and keep the answer concise.\n    Context: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [26]:
### Create a document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer the question.\n    If you don't know the answer, just say that you don't know.\n    Use three sentences maximum and keep the answer concise.\n    Context: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000019767E78A10>, async_client=

In [28]:
### Create The Final RAG Chain
from langchain.chains import create_retrieval_chain
rag_chain=create_retrieval_chain(retriever,document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000197681EF260>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer the question.\n    If yo

In [30]:
response=rag_chain.invoke({"input":"What is Deep LLearning?"})
response

{'input': 'What is Deep LLearning?',
 'context': [Document(metadata={'source': 'data\\text_files\\doc_1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly\n    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers'),
  Document(metadata={'source': 'data\\text_files\\doc_0.txt'}, page_content='Machine Learning Fundamentals\n    Machine learning is a subset of artificial intelligence that enables systems to learn\n    and improve from experience without being explicitly programmed. There are three main\n    types of machine learning: supervised learning, unsupervised learning, and reinforc

In [31]:
# Function to query the modern RAG system
def query_rag_modern(question):
    print(f"Question: {question}")
    print("-" * 50)

    # Using create_retrieval_chain approach
    result = rag_chain.invoke({"input": question})
    print(f"Answer: {result['answer']}")
    print("\nRetrieved Context:")
    for i, doc in enumerate(result['context']):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content[:200] + "...")
    return result

# Test queries
test_questions = [
    "What are the three types of machine learning?",
    "What is deep learning and how does it relate to neural networks?",
    "What are CNNs best used for?"
]
for question in test_questions:
    result = query_rag_modern(question)
    print("\n" + "="*80 + "\n")

Question: What are the three types of machine learning?
--------------------------------------------------
Answer: The three types of machine learning are supervised learning, unsupervised learning, and reinforcement learning. Supervised learning uses labeled data for training, unsupervised learning finds patterns in unlabeled data, and reinforcement learning learns through rewards and penalties.

Retrieved Context:

--- Source 1 ---
Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing...

--- Source 2 ---
Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. There are t...

--- Source 3 ---
Deep Learning and Neural Networks

    Deep learning is a subset of machine learning based on artificial neural networ

In [32]:
### Create RAG Chain Alternative - Using LCEL (LangChain Expression Language)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

In [33]:
# Create a custom prompt
custom_prompt = ChatPromptTemplate.from_template("""Use the following context to answer the question.
     If you don't know the answer based on the context, say you don't know.
    Provide specific details from the context to support your answer.
    Context:
    {context}

    Question: {question}

    Answer:"""
)
custom_prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following context to answer the question.\n     If you don't know the answer based on the context, say you don't know.\n    Provide specific details from the context to support your answer.\n    Context:\n    {context}\n\n    Question: {question}\n\n    Answer:"), additional_kwargs={})])

In [34]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [36]:
## Build the chain ussing LCEL
rag_chain_lcel=(
    {
        "context":retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | custom_prompt
    | llm
    | StrOutputParser()
)


In [37]:
response=rag_chain_lcel.invoke("What is Deep Learning")
response

'Deep learning is a subset of machine learning based on artificial neural networks. These networks are inspired by the human brain and consist of layers of interconnected nodes. Deep learning has revolutionized fields like computer vision, natural language processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers are used for tasks like text generation and translation.'

In [38]:
retriever.get_relevant_documents("What is Deep Learning")

  retriever.get_relevant_documents("What is Deep Learning")


[Document(metadata={'source': 'data\\text_files\\doc_1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks.\n    These networks are inspired by the human brain and consist of layers of interconnected\n    nodes. Deep learning has revolutionized fields like computer vision, natural language\n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly\n    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers'),
 Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Lear

In [39]:
# Query using the LCEL approach - Fixed version
def query_rag_lcel(question):
    print(f"Question: {question}")
    print("-" * 50)

    # Method 1: Pass string directly (when using RunnablePassthrough)
    answer = rag_chain_lcel.invoke(question)
    print(f"Answer: {answer}")
    # Get source documents separately if needed
    docs = retriever.get_relevant_documents(question)
    print("\nSource Documents:")
    for i, doc in enumerate(docs):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content[:200] + "...")

In [40]:
# Test LCEL chain
print("Testing LCEL Chain:")
query_rag_lcel("What are the key concepts in reinforcement learning?")

Testing LCEL Chain:
Question: What are the key concepts in reinforcement learning?
--------------------------------------------------
Answer: The key concepts in reinforcement learning are learning through interaction with an environment and using rewards and penalties. This means that the system learns through trial and error, receiving rewards for correct actions and penalties for incorrect actions.

Source Documents:

--- Source 1 ---
learning learns through
    interaction with an environment using rewards and penalties....

--- Source 2 ---
through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems...

--- Source 3 ---
Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing...

--- Source 4 ---
Machine Learning Fundamentals
    Machine learning is a subset of artificial intelligenc

In [41]:
### Add New Documents To Existing Vector Store

new_document = """
    Reinforcement Learning in Detail

    Reinforcement learning (RL) is a type of machine learning where an agent learns to make
    decisions by interacting with an environment. The agent receives rewards or penalties
    based on its actions and learns to maximize cumulative reward over time. Key concepts
    in RL include: states, actions, rewards, policies, and value functions. Popular RL
    algorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and
    Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo),
    robotics, and autonomous systems.
    """

In [42]:
new_doc=Document(
    page_content=new_document,
    metadata={"source": "manual_addition", "topic": "reinforcement_learning"}
)

In [43]:
## split the documents
new_chunks=text_splitter.split_documents([new_doc])
new_chunks

[Document(metadata={'source': 'manual_addition', 'topic': 'reinforcement_learning'}, page_content='Reinforcement Learning in Detail\n\n    Reinforcement learning (RL) is a type of machine learning where an agent learns to make\n    decisions by interacting with an environment. The agent receives rewards or penalties\n    based on its actions and learns to maximize cumulative reward over time. Key concepts\n    in RL include: states, actions, rewards, policies, and value functions. Popular RL\n    algorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and'),
 Document(metadata={'source': 'manual_addition', 'topic': 'reinforcement_learning'}, page_content='(DQN), Policy Gradient methods, and\n    Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo),\n    robotics, and autonomous systems.')]

In [44]:
### Add new documents to vectorstore
vectorstore.add_documents(new_chunks)

['2df44598-51a5-4e3d-a41c-7acaa54e408a',
 'f7cb4feb-4e6c-48a4-bb50-95a074c45eb5']

In [45]:
print(f"Added {len(new_chunks)} new chunks to the vector store")
print(f"Total vectors now: {vectorstore._collection.count()}")

Added 2 new chunks to the vector store
Total vectors now: 10


In [None]:
## query with the updated vector
new_question="What are the keys concepts in reinforcement learning"
result=query_rag_lcel(new_question)
result

Question: What are the keys concepts in reinforcement learning
--------------------------------------------------
Answer: The key concepts in reinforcement learning are states, actions, rewards, policies, and value functions.

Source Documents:

--- Source 1 ---
Reinforcement Learning in Detail

    Reinforcement learning (RL) is a type of machine learning where an agent learns to make
    decisions by interacting with an environment. The agent receives rewar...

--- Source 2 ---
(DQN), Policy Gradient methods, and
    Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo),
    robotics, and autonomous systems....

--- Source 3 ---
through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems...

--- Source 4 ---
learning learns through
    interaction with an environment using rewards and penalties....


In [48]:
# create_history_aware_retriever: Makes the retriever understand conversation context
# MessagesPlaceholder: Placeholder for chat history in prompts
# HumanMessage/AIMessage: Structured message types for conversation history
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

In [50]:
## create a prompt that includes the chat history
contextualize_q_system_prompt = """Given a chat history and the latest user question
    which might reference context in the chat history, formulate a standalone question
    which can be understood without the chat history. Do NOT answer the question,
    just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [51]:
## create history aware retriever
history_aware_retriever = create_history_aware_retriever(
     llm, retriever, contextualize_q_prompt)
history_aware_retriever

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000197681EF260>, search_kwargs={}))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIM

In [52]:
# Create a new document chain with history
qa_system_prompt = """You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.

    Context: {context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Create conversational RAG chain
conversational_rag_chain = create_retrieval_chain(
    history_aware_retriever,
    question_answer_chain
)


In [53]:
# First question
chat_history=[]
result1 = conversational_rag_chain.invoke({
    "chat_history": chat_history,
    "input": "What is machine learning?"
    })
print(f"Q: What is machine learning?")
print(f"A: {result1['answer']}")

Q: What is machine learning?
A: Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.


In [56]:
chat_history.extend([
    HumanMessage(content="What is machine learning?"),
    AIMessage(content=result1['answer'])
    ])

In [58]:
result2 = conversational_rag_chain.invoke({
    "chat_history": chat_history,
    "input": "What are its main types?"  # Refers to ML from previous question
})
result2

{'chat_history': [HumanMessage(content='What is machine learning?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.', additional_kwargs={}, response_metadata={})],
 'input': 'What are its main types?',
 'context': [Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Re