In [45]:
import os
from langchain_community.document_loaders import WebBaseLoader

# Set user agent (important for govt sites)
os.environ["USER_AGENT"] = "Mozilla/5.0 (compatible; NyayaSathi-Chatbot/1.0)"

urls = [
    # DOJ detailed pages (KEEP THESE)
    "https://doj.gov.in/about-department",
    "https://doj.gov.in/acts",
    "https://doj.gov.in/schemes",
    "https://doj.gov.in/notifications",
    "https://doj.gov.in/divisions",
    "https://doj.gov.in/tele-law",
    "https://doj.gov.in/fast-track-special-courts",
    "https://doj.gov.in/",
    "https://www.mha.gov.in/",
    "https://www.cybercrime.gov.in/"
]

loader = WebBaseLoader(urls)
docs = loader.load()
docs[0].page_content
print("Total documents:", len(docs))
print("Characters in first document:", len(docs[0].page_content))


Total documents: 10
Characters in first document: 7759


In [46]:
from langchain_core.documents import Document


tele_law_text = """
Tele-Law Scheme provides free legal advice through
Common Service Centres (CSCs) to people in rural and
remote areas.

Beneficiaries include:
- Women
- Senior Citizens
- Scheduled Castes and Scheduled Tribes
- Persons with Disabilities
- Economically Weaker Sections
- Victims of trafficking and violence

Legal advice is provided by Panel Lawyers via phone or video conferencing.
"""

docs.append(
    Document(
        page_content=tele_law_text,
        metadata={"source": "tele-law-manual"}
    )
)


In [47]:
print("Total documents:", len(docs))
print("Characters in first document:", len(docs[0].page_content))

Total documents: 11
Characters in first document: 7759


In [48]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=550
)

chunks = splitter.split_documents(docs)

print("Total chunks:", len(chunks))


Total chunks: 21


In [50]:
print("Total documents:", len(docs))
for i, d in enumerate(docs):
    print(f"\n--- DOC {i} ---")
    print(len(d.page_content))
    print(d.page_content[:300])


Total documents: 11

--- DOC 0 ---
7759








About Department | Department of Justice | India



































 





























Search
Search



























Accessibility Tools






                    Accessibility Tools                


Color Contrast                    







          

--- DOC 1 ---
8960








Acts and Rules | Department of Justice | India




































 





























Search
Search



























Accessibility Tools






                    Accessibility Tools                


Color Contrast                    







           

--- DOC 2 ---
6716








Schemes - Consolidated Guidelines | Department of Justice | India



































 





























Search
Search



























Accessibility Tools






                    Accessibility Tools                


Color Contrast                    


--- DOC 3 ---
60

In [51]:
chunks

[Document(metadata={'source': 'https://doj.gov.in/about-department', 'title': 'About Department | Department of Justice | India', 'description': 'Last updated: 19-04-2024 As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law & Justice, Government of India. It is one of the oldest Ministries of the Government of India. Till 31.12.2009, Department of Justice was part of Ministry of Home Affairs and Union Home Secretary had […]', 'language': 'en-US'}, page_content='About Department | Department of Justice | India\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAccessibility Tools\n\n\n\n\n\n\n                    Accessibility Tools                \n\n\nColor Contrast                    \n\n\n\n\n\n\n\n                                    High Contrast                                \n\n\n\n\n\

In [52]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")

In [53]:
#embedding
from langchain_community.embeddings import HuggingFaceEmbeddings


embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [54]:
test_vector = embeddings.embed_query("How to file FIR in India?")
print(test_vector)


[0.028185252100229263, 0.01421379204839468, -0.1087973564863205, -0.02211626060307026, 0.05251743271946907, 0.04121776297688484, -0.0016690349439159036, 0.04440738633275032, -0.0685327798128128, 0.025728542357683182, -0.038103241473436356, -0.009669344872236252, -0.04625983163714409, 0.004555045627057552, -0.019555458799004555, -0.02870132587850094, -0.0873713418841362, 0.028609639033675194, 0.0015641107456758618, -0.01840139552950859, 0.06755425035953522, 0.02817469649016857, -0.03577043116092682, -0.09433072805404663, 0.06374569982290268, -0.009954841807484627, 0.03545618802309036, -0.06253333389759064, 0.005689638201147318, -0.0016872066771611571, 0.036294277757406235, 0.05550382658839226, 0.03586658462882042, 0.04755360633134842, 0.010297764092683792, -0.006142353173345327, 0.026045462116599083, 0.0211983360350132, 0.05489516630768776, -0.050692956894636154, 0.04263262823224068, -0.0004723531601484865, 0.005868059583008289, -0.03177216649055481, -0.029734322801232338, -0.0328589603

In [55]:
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(chunks, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 8})
db.save_local("faiss_index")


In [56]:
docs = retriever.invoke("Who can benefit from the Tele-Law scheme?")
for d in docs:
    print("----")
    print(d.page_content)


----
Tele-Law Scheme provides free legal advice through
Common Service Centres (CSCs) to people in rural and
remote areas.

Beneficiaries include:
- Women
- Senior Citizens
- Scheduled Castes and Scheduled Tribes
- Persons with Disabilities
- Economically Weaker Sections
- Victims of trafficking and violence

Legal advice is provided by Panel Lawyers via phone or video conferencing.
----
Tele-Law | Department of Justice | India



































 





























Search
Search



























Accessibility Tools






                    Accessibility Tools                


Color Contrast                    







                                    High Contrast                                









                                    Normal Contrast                                








                                    Highlight Links                                







Invert







Saturation                                






In [16]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

docs = [
    Document(page_content="To file an FIR in India, visit the nearest police station."),
    Document(page_content="You can also file an online FIR on the official state police website."),
    Document(page_content="FIR stands for First Information Report under Indian law.")
]

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

db = FAISS.from_documents(docs, embeddings)
db.save_local("faiss_index")

print("✅ FAISS index rebuilt successfully (LangChain 1.x)")



✅ FAISS index rebuilt successfully (LangChain 1.x)


In [12]:
from langchain_community.vectorstores import FAISS

db= FAISS.from_documents(chunks,embeddings)

In [13]:
retriever=db.as_retriever()

In [14]:
db.save_local("faiss_index")

In [13]:
def get_response(question):
    # Retriever से query पूछो और पहला जवाब ले लो
    docs = retriever.get_relevant_documents(question)
    if docs:
        return docs[0].page_content  # पहला relevant document का content return करो
    else:
        return "No relevant information found."


In [15]:
from fastapi import FastAPI

app = FastAPI()

@app.post("/query")
async def query_rag(question: str):
    response = get_response(question)
    return {"answer": response}

