### This file contains a full RAG implementation

In [None]:
### Sample passages (trees, animals, cars, petrol, fire) that are later written to .txt files
text=['''Trees are the foundation of life on Earth, providing oxygen, shelter, and food for countless species. They absorb carbon dioxide, helping to fight climate change, and regulate temperatures through shade and moisture. Forests act as natural habitats for animals and serve as barriers against soil erosion. Beyond their ecological role, trees also 
      bring peace and beauty to human surroundings, reminding us of the importance of balance in nature.''',

      '''Animals play a vital role in maintaining ecological balance by forming interconnected food chains. From insects that pollinate crops to predators that control populations, each species contributes to the health of ecosystems. They also provide companionship, work assistance, and inspiration to humans. Protecting wildlife is essential,
        not only for biodiversity but also for ensuring the stability of natural environments that sustain all life.''',

        '''Cars revolutionized human mobility by offering independence and speed, connecting people and places like never before. They have become an essential part of modern life, driving economic growth and shaping urban design. Over the years, advancements like electric vehicles and autonomous driving have made cars smarter and cleaner. Yet, the challenge remains
          to balance convenience with sustainability and reduce the environmental impact of transportation.''',

          '''Petrol, derived from crude oil, has long been the main energy source for vehicles and industries. It powers engines efficiently but also contributes to pollution and greenhouse gas emissions. The global demand for petrol has driven innovation in energy exploration and production. However, with rising environmental concerns, 
          the world is gradually shifting toward renewable alternatives like solar and electric energy to ensure a cleaner future.''',

          '''Fire is both a life-giving and destructive force. It provides warmth, light, and a means to cook food, marking one of humanity’s greatest discoveries. However, when uncontrolled, fire can cause immense damage to forests, homes, and lives. In nature, fire also plays a role in regeneration, helping certain plants release seeds and recycle nutrients.
            Understanding and respecting fire’s dual nature is key to using it safely and wisely.'''
]



: 

In [None]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Make sure the output folder exists, then write each passage to its own UTF-8 file
# named doc<index>.txt inside it.
os.makedirs("textfiles", exist_ok=True)

for idx, passage in enumerate(text):
    out_path = f'textfiles/doc{idx}.txt'
    with open(out_path, 'w', encoding="utf-8") as handle:
        handle.write(passage)



In [None]:
### Build a loader that turns every .txt file in "textfiles" into a document

# `loader_cls` selects the per-file loader and `loader_kwargs` are forwarded to it,
# so each file is read by TextLoader with UTF-8 decoding.
# (The previous `load_hidden=TextLoader` only set the boolean "include hidden files"
# flag to a truthy value, so TextLoader was never actually used.)
loader=DirectoryLoader(
    "textfiles",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"}
)



In [None]:
# Load every matching file through the directory loader, then print the documents
# one by one (numbered from 1) with a separator line after each.
docs = loader.load()

for position, document in enumerate(docs, start=1):
    print(f'this is document {position} ')
    print(document)
    print("---------------------------------------------")

In [None]:
### The documents are loaded; configure the splitter that will cut them into chunks

# Lengths are measured with `len`, and a single space is the only separator offered.
# NOTE(review): a 2-character overlap with a space-only separator looks unintentionally
# small — confirm the intended overlap.
splitter = RecursiveCharacterTextSplitter(
    separators=[" "],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=2,
)

In [None]:
# Split the loaded documents with the configured splitter, then echo the
# resulting list as the cell output.
chunks = splitter.split_documents(docs)
chunks



In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used to embed both the chunks and the queries.
# `model_name` is the field name of the constructor; the `model=` alias is only
# accepted by newer langchain_huggingface releases, so the field name is used here.
embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)



In [None]:
### Initialize the vector store

# An earlier version of this cell created the "vec_dir" folder and built the "RAG"
# collection in one shot with Chroma.from_documents(documents=chunks,
# embedding=embeddings, persist_directory="vec_dir", collection_name="RAG").
# That code was kept only as an unused string literal and never ran; the code below
# reopens the collection and adds only the chunks that are not stored yet.
import os, json, hashlib
from langchain_chroma import Chroma
from langchain.schema import Document  # just for type hints

# ---- 1) init / reopen the "RAG" collection stored under "vec_dir" ----
vectorstore = Chroma(
    collection_name="RAG",
    persist_directory="vec_dir",
    embedding_function=embeddings,  # the embeddings object created earlier in the notebook
)

# ---- 2) derive a stable ID for each chunk from its content and key metadata ----
def make_id(doc: Document) -> str:
    """Return a SHA-1 hex digest that identifies a chunk.

    The digest is computed over a JSON object (keys sorted, non-ASCII characters
    kept as-is) holding the stripped page content together with the "source" and
    "page" metadata entries; a metadata key that is absent contributes ``None``.
    """
    meta = doc.metadata
    fingerprint = json.dumps(
        {
            "page": meta.get("page"),
            "source": meta.get("source"),
            "text": doc.page_content.strip(),
        },
        sort_keys=True,
        ensure_ascii=False,
    )
    return hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()

# If you create chunks elsewhere, ensure each has doc.metadata["source"]/["page"] if available.
pairs = [(make_id(d), d) for d in chunks]

# Local de-dup in case the same text appears twice in `chunks` this run:
# the first chunk seen for an ID wins.
unique = {}
for _id, d in pairs:
    unique.setdefault(_id, d)
ids = list(unique.keys())
# NOTE: this rebinds `docs` (previously the loaded documents) to the de-duplicated chunks.
docs = list(unique.values())

# ---- 3) add only missing IDs ----
# Try an efficient filtered get first; fall back to fetching every stored ID if the
# filtered call fails (e.g. on older versions).
try:
    existing = set(vectorstore.get(ids=ids, include=[]).get("ids", []))
except Exception:
    existing = set(vectorstore.get(include=[]).get("ids", []))

to_add_ids  = [i for i in ids if i not in existing]
to_add_docs = [d for i, d in zip(ids, docs) if i not in existing]

if to_add_docs:
    # No explicit persist() call: langchain_chroma's Chroma has no such method
    # (calling it raises AttributeError); a store opened with persist_directory
    # writes its data to disk automatically.
    vectorstore.add_documents(to_add_docs, ids=to_add_ids)

# ---- query as usual ----
# results = vectorstore.similarity_search("how houses are made", k=5)


In [None]:
# Sanity-check retrieval: ask the vector store for the 3 chunks most similar to a
# sample query and show them as the cell output.
query = "how houses are made"
output = vectorstore.similarity_search(query, k=3)
output

In [None]:
from dotenv import load_dotenv
# Read environment variables from a .env file, if one is present,
# so that later os.getenv(...) lookups can see them.
load_dotenv()

In [None]:

# Re-export the Hugging Face token only when one is configured. Assigning the result
# of os.getenv() directly raises TypeError when the variable is unset, because
# os.environ accepts only string values.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if hf_token is not None:
    os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token



In [None]:
import sys, subprocess

def pip_install(args):
    """Run ``pip install`` with the given arguments using the current interpreter.

    args: iterable of extra command-line arguments (package names and/or options).
    Raises subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(args)
    subprocess.check_call(command)

# Ensure dependencies are available: try each import first and, if it fails for any
# reason, install the package(s) into the running interpreter and retry the import.
# NOTE(review): the installs are unpinned, so the installed versions may differ
# between runs — consider pinning.
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
except Exception:
    pip_install(["transformers", "accelerate"])  # transformers plus the accelerate helper library
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

try:
    import torch
except Exception:
    # Install PyTorch from the CPU-only wheel index
    pip_install(["torch", "--index-url", "https://download.pytorch.org/whl/cpu"])
    import torch

from langchain_huggingface import HuggingFacePipeline

# Use a small, fully open-source chat model that runs on CPU
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # CPU-friendly
)

# Greedy decoding (do_sample=False). `temperature` is a sampling-only setting: with
# do_sample=False it is ignored and only triggers a warning, so it is not passed here.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=False,
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

# Quick smoke test of the wrapped model
print(llm.invoke("color of trees"))

In [None]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain


In [None]:
# First convert the vector store into a retriever that returns the top 3 chunks per query
retriver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# Prompt template that instructs the LLM: a system message carrying the instructions
# and the `{context}` placeholder, followed by a human message holding `{input}`.

system_prompt = """Act as a assistant for question and answering
use the following context for answering the questions.
if you dont know the answer say that you dont know.
use these sentances and make the answers consise.

context : {context}
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)




In [None]:
### Create the document chain, which combines the retrieved chunks and passes them to the LLM

document_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
# Chain the retriever and the document chain into the complete RAG pipeline
rag_chain = create_retrieval_chain(retriver, document_chain)

In [None]:
# Display the assembled chain object as the cell output
rag_chain

In [None]:
# Run the full RAG chain on a question. The result is a dict; its "answer" entry
# holds the generated text, which is shown as the cell output (previously the
# result was computed but never displayed).
response=rag_chain.invoke({"input":"question for llm"})
response["answer"]