In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

### vectore stores

from langchain.vectorstores import Chroma
import numpy as np
from typing import List

### 1. Sample Data

In [3]:
### create sample documents
## create sample documents
sample_docs = [
    """
    Machine Learning Fundamentals
    
    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine learning: supervised learning, unsupervised learning, and reinforcement 
    learning. Supervised learning uses labeled data to train models, while unsupervised 
    learning finds patterns in unlabeled data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties.
    """,
    
    """
    Deep Learning and Neural Networks
    
    Deep learning is a subset of machine learning based on artificial neural networks. 
    These networks are inspired by the human brain and consist of layers of interconnected 
    nodes. Deep learning has revolutionized fields like computer vision, natural language 
    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly 
    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers 
    excel at sequential data processing.
    """,
    
    """
    Natural Language Processing (NLP)
    
    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, 
    machine translation, and question answering. Modern NLP heavily relies on transformer 
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand 
    context and relationships between words in text.
    """
]

sample_docs


['\n    Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through \n    interaction with an environment using rewards and penalties.\n    ',
 '\n    Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly \n    effective f

In [4]:
### Save sample documents to the temp files

import tempfile

temp_dir = tempfile.mkdtemp()

for i, doc in enumerate(sample_docs):
    with open(f"{temp_dir}/doc_{i}.text", "w") as f:
        f.write(doc)
print(f"Sample document create in:{temp_dir}")

Sample document create in:C:\Users\HP\AppData\Local\Temp\tmp69z0mpks


In [5]:
### Save sample documents to the temp files
#os.makedirs('E:/workspace/ultimate_rag_bootcamp/2-Vectore Stores/data')

for i, doc in enumerate(sample_docs):
    with open(f"E:/workspace/ultimate_rag_bootcamp/2-Vectore Stores/data/doc{i}.txt", "w") as f:
        f.write(doc)

print(f"Sample documents created!")

Sample documents created!


### 2. Document Loading

In [6]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Load documents from directory
loader = DirectoryLoader(
    "data", 
    glob="*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"\nFirst document preview:")
print(documents[0].page_content[:200] + "...")


Loaded 3 documents

First document preview:

    Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. Ther...


In [7]:
print(documents[0])

page_content='
    Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine learning: supervised learning, unsupervised learning, and reinforcement 
    learning. Supervised learning uses labeled data to train models, while unsupervised 
    learning finds patterns in unlabeled data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties.
    ' metadata={'source': 'data\\doc0.txt'}


### Document Splitting

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
## Initialize text splitter 

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, 
    chunk_overlap = 50,
    length_function = len, 
    separators= [" "]
)

chunks = text_splitter.split_documents(documents)

print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"\nChunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 5 chunks from 3 documents

Chunk example:
Content: Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experie...
Metadata: {'source': 'data\\doc0.txt'}


In [9]:
chunks

[Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through'),
 Document(metadata={'source': 'data\\doc0.txt'}, page_content='data. Reinforcement learning learns through \n    interaction with an environment using rewards and penalties.'),
 Document(metadata={'source': 'data\\doc1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of inter

### Embedding model

In [10]:
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [11]:
sample_text = "Machine Learning is fascinating"
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001485A31DA90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001485A31E3C0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [12]:
vector = embeddings.embed_query(sample_text)
print(vector)

[-0.02172444760799408, 0.016208980232477188, 0.010213345289230347, -0.022516079246997833, -0.0037213172763586044, 0.01783117651939392, 4.82096329506021e-05, 0.01027174387127161, -0.015547124668955803, -0.04134652763605118, 0.007929293438792229, 0.03628527745604515, -0.019128933548927307, -0.008234266191720963, -0.0013058676850050688, 0.00581719446927309, 0.03880292549729347, 0.008811768144369125, -0.0005584409227594733, -0.008591149002313614, -0.031224025413393974, 0.022048886865377426, -0.005914526060223579, -0.03441650792956352, -0.014898247085511684, 0.0023018959909677505, 0.003834871109575033, -0.03885483369231224, -0.012523352168500423, -0.002739888848736882, 0.027590306475758553, -0.004736811853945255, -0.0170655008405447, -0.03981517627835274, -0.008513283915817738, -0.012211889959871769, -0.004152821376919746, 0.0028583090752363205, -0.01670212857425213, -0.00013068816042505205, 0.020076295360922813, 0.02541007660329342, -0.008435418829321861, -0.019661013036966324, -0.01436616

### Initialize the ChromaDB vectore Store and Stores the chunks in vector representation 

In [13]:
### Create a Chromadb vectore store
persistent_directory = "./chroma_db"

##Initialize Chromadb with OpenAI embeddings

vectorestore = Chroma.from_documents(
    documents=chunks, 
    embedding=OpenAIEmbeddings(), 
    persist_directory=persistent_directory, 
    collection_name="rag_collection"
)

print(f"Vectore store created with {vectorestore._collection.count()} vectors")
print(f"Persisted to: {persistent_directory}")

Vectore store created with 17 vectors
Persisted to: ./chroma_db


### Test Similarity Search

In [14]:
query = "What are the types of machine learning?"

similar_docs = vectorestore.similarity_search(query, k=3)
similar_docs

[Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through'),
 Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, whi

In [15]:
query = "What is NLP?"

similar_docs = vectorestore.similarity_search(query, k=3)
similar_docs


[Document(metadata={'source': 'data\\doc2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand \n    context and relationships between words in text.'),
 Document(metadata={'source': 'data\\doc2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mecha

In [16]:
print(f"Query: {query}")
print(f"\nTop {len(similar_docs)} similar chunks:")
for i, doc in enumerate(similar_docs):
    print(f"\n--- Chunk {i+1} ---")
    print(doc.page_content[:200] + "...")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")

Query: What is NLP?

Top 3 similar chunks:

--- Chunk 1 ---
Natural Language Processing (NLP)

    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recogn...
Source: data\doc2.txt

--- Chunk 2 ---
Natural Language Processing (NLP)

    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recogn...
Source: data\doc2.txt

--- Chunk 3 ---
Natural Language Processing (NLP)

    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recogn...
Source: data\doc2.txt


### Advanced Similary Search with scores

In [17]:
result_scores = vectorestore.similarity_search_with_score(query, k=3)
result_scores

[(Document(metadata={'source': 'data\\doc2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand \n    context and relationships between words in text.'),
  0.20716717839241028),
 (Document(metadata={'source': 'data\\doc2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These 

#### Understanding Similarity Scores
The similarity score represents how closely related a document chunk is to your query. The scoring depends on the distance metric used:

ChromaDB default: Uses L2 distance (Euclidean distance)

- Lower scores = MORE similar (closer in vector space)
- Score of 0 = identical vectors
- Typical range: 0 to 2 (but can be higher)


Cosine similarity (if configured):

- Higher scores = MORE similar
- Range: -1 to 1 (1 being identical)

### Initialize LLM, RAG Chain, Prompt Template,Query the RAG system

In [18]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model = "gpt-4o-mini", 
    temperature=0
)

In [19]:
test_response = llm.invoke("What is llm")
test_response

AIMessage(content="LLM can refer to different things depending on the context. Here are a few common meanings:\n\n1. **Large Language Model**: In the context of artificial intelligence and natural language processing, LLM refers to a type of AI model that is trained on vast amounts of text data to understand and generate human-like text. Examples include models like OpenAI's GPT-3 and GPT-4, Google's BERT, and others.\n\n2. **Master of Laws (Legum Magister)**: In legal education, LLM is an advanced, postgraduate academic degree in law. It is typically pursued by individuals who already hold a first degree in law and wish to specialize in a particular area of law or enhance their legal knowledge.\n\n3. **Low-Level Memory**: In computing, LLM can sometimes refer to low-level memory management or operations, though this usage is less common.\n\nIf you have a specific context in mind, please provide more details!", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'co

In [20]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("openai:gpt-4o-mini")
llm.invoke("what is openai")

AIMessage(content="OpenAI is an artificial intelligence research organization that aims to develop and promote friendly AI for the benefit of humanity. Founded in December 2015, OpenAI conducts research in various areas of artificial intelligence, including machine learning, natural language processing, robotics, and more.\n\nOne of the organization's most notable projects is the development of language models, including the GPT (Generative Pre-trained Transformer) series, which can generate human-like text based on the input they receive. OpenAI also engages in discussions about the ethical implications of AI, seeking to ensure that advanced AI technologies are used responsibly and safely.\n\nIn addition to research, OpenAI has also released products like ChatGPT, DALL-E (an image generation model), and Codex (a model that assists with programming tasks) for public use. Overall, OpenAI's mission is centered around advancing AI while ensuring its beneficial impact on society.", additio

### Modern RAG Chain

In [21]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [22]:
### Convert vectore Store to retriever 

retriever = vectorestore.as_retriever(
    search_kwargs={"k":3} ###retrieve top 3 relavant chunks
)

retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001485A31E900>, search_kwargs={'k': 3})

In [23]:
### Create a prompt template 
from langchain_core.prompts import ChatPromptTemplate

system_prompt = """You are a assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that ypu don't know.
Use three sentences maximum and keep the answer concise.

Context: {context}"""

prompt = ChatPromptTemplate.from_messages([
    ("system",system_prompt),
    ("human","{input}")
])

In [24]:
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that ypu don't know.\nUse three sentences maximum and keep the answer concise.\n\nContext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [25]:
###Create a document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(llm, prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that ypu don't know.\nUse three sentences maximum and keep the answer concise.\n\nContext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001485D62E490>, async_client=<openai.resourc

In [26]:
#### Create the final RAG chain

from langchain.chains import create_retrieval_chain 
rag_chain = create_retrieval_chain(retriever, document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001485A31E900>, search_kwargs={'k': 3}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you d

In [27]:
chat = rag_chain.invoke({"input":"What is Machine Learning"})
chat

{'input': 'What is Machine Learning',
 'context': [Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through'),
  Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervi

In [28]:
print(chat['answer'])

Machine learning is a subset of artificial intelligence that allows systems to learn and improve from experience without being explicitly programmed. It encompasses three main types: supervised learning, unsupervised learning, and reinforcement learning. Each type utilizes different methods, such as labeled or unlabeled data, to train models and discover patterns.


In [29]:
# Function to query the modern RAG system
def query_rag_modern(question):
    print(f"Question: {question}")
    print("-" * 50)
    
    # Using create_retrieval_chain approach
    result = rag_chain.invoke({"input": question})
    
    print(f"Answer: {result['answer']}")
    print("\nRetrieved Context:")
    for i, doc in enumerate(result['context']):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content[:200] + "...")
    
    return result

# Test queries
test_questions = [
    "What are the three types of machine learning?",
    "What is deep learning and how does it relate to neural networks?",
    "What are CNNs best used for?"
]

for question in test_questions:
    result = query_rag_modern(question)
    print("\n" + "="*80 + "\n")

Question: What are the three types of machine learning?
--------------------------------------------------


Answer: The three main types of machine learning are supervised learning, unsupervised learning, and reinforcement learning. Supervised learning uses labeled data, unsupervised learning identifies patterns in unlabeled data, and reinforcement learning involves learning through interaction and feedback.

Retrieved Context:

--- Source 1 ---
Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are...

--- Source 2 ---
Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are...

--- Source 3 ---
Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are...


Question

### Create RAG Chain Alternative - Use LCEL(Langchain Expression Language)

In [30]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

In [31]:
# Create a custom prompt
custom_prompt = ChatPromptTemplate.from_template("""Use the following context to answer the question. 
If you don't know the answer based on the context, say you don't know.
Provide specific details from the context to support your answer.

Context:
{context}

Question: {question}

Answer:""")
custom_prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following context to answer the question. \nIf you don't know the answer based on the context, say you don't know.\nProvide specific details from the context to support your answer.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), additional_kwargs={})])

In [32]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001485A31E900>, search_kwargs={'k': 3})

In [33]:
###Format the output documents from the prompt
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [34]:
###Build the chain using LCEL

rag_chain_lcel = (
    {
        "context":retriever | format_docs, 
        "question":RunnablePassthrough()
    }
    |custom_prompt
    |llm
    |StrOutputParser()
)

rag_chain_lcel

{
  context: VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001485A31E900>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following context to answer the question. \nIf you don't know the answer based on the context, say you don't know.\nProvide specific details from the context to support your answer.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001485D62E490>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001485D62F4

In [35]:
response = rag_chain_lcel.invoke("what is deep learning")
print(response)

Deep learning is a subset of machine learning that is based on artificial neural networks. These networks are inspired by the human brain and consist of layers of interconnected nodes. Deep learning has significantly transformed various fields, such as computer vision, natural language processing, and speech recognition. Specific types of neural networks used in deep learning include Convolutional Neural Networks (CNNs), which are particularly effective for image processing, and Recurrent Neural Networks (RNNs) and Transformers, which are used for other applications.


In [36]:
retriever.get_relevant_documents("what is deep learning")

  retriever.get_relevant_documents("what is deep learning")


[Document(metadata={'source': 'data\\doc1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly \n    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers'),
 Document(metadata={'source': 'data\\doc1.txt'}, page_content='Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural 

In [37]:
# Query using the LCEL approach - Fixed version
def query_rag_lcel(question):
    print(f"Question: {question}")
    print("-" * 50)
    
    # Method 1: Pass string directly (when using RunnablePassthrough)
    answer = rag_chain_lcel.invoke(question)
    print(f"Answer: {answer}")
    
    # Get source documents separately if needed
    docs = retriever.get_relevant_documents(question)
    print("\nSource Documents:")
    for i, doc in enumerate(docs):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content[:200] + "...")

In [38]:
# Test LCEL chain
print("Testing LCEL Chain:")
query_rag_lcel("What are the key concepts in reinforcement learning?")

Testing LCEL Chain:
Question: What are the key concepts in reinforcement learning?
--------------------------------------------------
Answer: The key concepts in reinforcement learning (RL) include states, actions, rewards, policies, and value functions.

Source Documents:

--- Source 1 ---
Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or p...

--- Source 2 ---
data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties....

--- Source 3 ---
data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties....


### Add New Documents to Existing Vectore Store

In [39]:
vectorestore

<langchain_community.vectorstores.chroma.Chroma at 0x1485a31e900>

In [40]:
# Add new documents to the existing vector store
new_document = """
Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or penalties 
based on its actions and learns to maximize cumulative reward over time. Key concepts 
in RL include: states, actions, rewards, policies, and value functions. Popular RL 
algorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and 
Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo), 
robotics, and autonomous systems.
"""

In [41]:
new_doc = Document(
    page_content=new_document, 
    metadata={"source":"manual_addition", "topic":"reinforcement_learning"}
)
new_doc

Document(metadata={'source': 'manual_addition', 'topic': 'reinforcement_learning'}, page_content='\nReinforcement Learning in Detail\n\nReinforcement learning (RL) is a type of machine learning where an agent learns to make \ndecisions by interacting with an environment. The agent receives rewards or penalties \nbased on its actions and learns to maximize cumulative reward over time. Key concepts \nin RL include: states, actions, rewards, policies, and value functions. Popular RL \nalgorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and \nActor-Critic methods. RL has been successfully applied to game playing (like AlphaGo), \nrobotics, and autonomous systems.\n')

In [42]:
new_chunks = text_splitter.split_documents([new_doc])
new_chunks

[Document(metadata={'source': 'manual_addition', 'topic': 'reinforcement_learning'}, page_content='Reinforcement Learning in Detail\n\nReinforcement learning (RL) is a type of machine learning where an agent learns to make \ndecisions by interacting with an environment. The agent receives rewards or penalties \nbased on its actions and learns to maximize cumulative reward over time. Key concepts \nin RL include: states, actions, rewards, policies, and value functions. Popular RL \nalgorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and \nActor-Critic methods. RL has been'),
 Document(metadata={'source': 'manual_addition', 'topic': 'reinforcement_learning'}, page_content='methods, and \nActor-Critic methods. RL has been successfully applied to game playing (like AlphaGo), \nrobotics, and autonomous systems.')]

In [43]:
### Add new documents to vectorestore
vectorestore.add_documents(new_chunks)

['75f4efbd-75dc-4d68-a972-6dfd8ebcb6d6',
 '1ef66af9-385b-4931-abf5-e0b9e1186025']

In [44]:
print(f"Added {len(new_chunks)} new chunks to the vector store")
print(f"Total vectors now: {vectorestore._collection.count()}")

Added 2 new chunks to the vector store
Total vectors now: 19


In [45]:
## query with the updated vector
new_question="What are the keys concepts in reinforcement learning"
result=query_rag_lcel(new_question)
result

Question: What are the keys concepts in reinforcement learning
--------------------------------------------------
Answer: The key concepts in reinforcement learning (RL) include: states, actions, rewards, policies, and value functions. These components are essential for understanding how an agent learns to make decisions in an environment based on the feedback it receives.

Source Documents:

--- Source 1 ---
Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or p...

--- Source 2 ---
Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or p...

--- Source 3 ---
data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties....


### Advanced Rag Technique - Conversational Memory

In [46]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

In [48]:
## create a prompt that includes the chat history

contextualize_q_system_prompt = """Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is."""

contextualize_q_system_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt), 
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [49]:
### Create history aware retriever

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_system_prompt
)
history_aware_retriever

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001485A31E900>, search_kwargs={'k': 3}))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(ta

In [50]:
# Create a new document chain with history
qa_system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context: {context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

##Create conversational RAG chain
conversational_rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
print("Conversational RAG chain created!")

Conversational RAG chain created!


In [52]:
chat_history = []

#First Question
result1 = conversational_rag_chain.invoke({
    "chat_history":chat_history, 
    "input":"What is machine learning?"
})
print(f"Q: What is machine learning?")
print(f"A: {result1['answer']}")

Q: What is machine learning?
A: Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It encompasses three main types: supervised learning, unsupervised learning, and reinforcement learning. Each type involves different methods of training and analyzing data.


In [None]:
chat_history.extend([
    HumanMessage(content="What is machine learning?"),
    AIMessage(content = result1['answer'])
])

In [56]:
chat_history

[HumanMessage(content='What is machine learning?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It encompasses three main types: supervised learning, unsupervised learning, and reinforcement learning. Each type involves different methods of training and analyzing data.', additional_kwargs={}, response_metadata={})]

In [59]:
### Follow up question

result2 = conversational_rag_chain.invoke({
    "chat_history":chat_history,
    "input":"What are its main types?"
})
result2

{'chat_history': [HumanMessage(content='What is machine learning?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It encompasses three main types: supervised learning, unsupervised learning, and reinforcement learning. Each type involves different methods of training and analyzing data.', additional_kwargs={}, response_metadata={})],
 'input': 'What are its main types?',
 'context': [Document(metadata={'source': 'data\\doc0.txt'}, page_content='Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models,

### Using GROQ LLM's

In [60]:
load_dotenv()
os.getenv("GROQ_API_KEY")

'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'

In [61]:
from langchain_groq import ChatGroq
from langchain.chat_models import init_chat_model

In [62]:
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [64]:
llm = ChatGroq(model="gemma2-9b-it", api_key = os.getenv("GROQ_API_KEY"))
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x0000014864E98A50>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000014864E9A0D0>, model_name='gemma2-9b-it', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [66]:
llm=init_chat_model(model="groq:llama-3.1-8b-instant")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x0000014864EFC8A0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000014864EFCD60>, model_name='llama-3.1-8b-instant', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [68]:
response = llm.invoke("What is the llm?")

In [75]:
print(response.content)

LLM can refer to several things depending on the context:

1. **Large Language Model**: An LLM is a type of artificial intelligence (AI) model designed to process and understand human language. It's a computer program that can generate coherent and context-specific text based on the input it receives. LLMs are typically trained on massive datasets of text and can perform a wide range of natural language processing (NLP) tasks, such as language translation, text summarization, question answering, and more.

Examples of popular LLMs include:

- BERT (Bidirectional Encoder Representations from Transformers)
- RoBERTa (Robustly Optimized BERT Pretraining Approach)
- ALBERT (A Lite BERT)
- T5 (Text-to-Text Transfer Transformer)

2. **Limited Liability Membership** or **Limited Liability Member**: In the context of business or organizational law, LLM can refer to a Limited Liability Membership structure, which is similar to a Limited Liability Company (LLC) but often used in non-profit organ