### Implement a legal document question-answering and summarization system that retrieves relevant case law content using FAISS and generate multi-level summaries or answers using a language model like T5 or LLaMA

In [1]:
#prepareing the data

    # import requests
    # from bs4 import BeautifulSoup
import trafilatura 
urls = [
    "https://www.scconline.com/",
    "https://nludelhi.ac.in/library/e-databases/",
    "https://www.aironline.in/",
    "https://indiankanoon.org/search/?formInput=document+of+Supreme+Court+of+India"
]
all_text = []
for url in urls:
    download = trafilatura.fetch_url(url)
    if download:
        text = trafilatura.extract(download)
        if text:
            all_text.append(text)
    # try:
    #     html = requests.get(url).text
    #     soup = BeautifulSoup(html, 'html.parser')
    #     # Remove script and style elements
    #     for script in soup(["script", "style"]):
    #         script.decompose()

    #     text = soup.get_text()
    #     all_text.append(text)
    # except Exception as e:
    #     print(f"Could not retrieve text from {url}: {e}")

print(all_text)
print(len(all_text))

['The Surest Way to Legal Research!™\nUniting the authentic and reliable content from India’s leading law publisher with cutting-edge technology to create a powerful legal research resource.\nNow available at your desk or on the move, spend less time researching, and have more time to focus on crafting your arguments.', "-\nAIROnline 2025 SC 905\nSupreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran, Atul S. Chandurkar , JJ\nSanjay D. Jain v. State of MaharashtraCRIMINAL APPEAL - 4292 of 2025 , (ARISING OUT OF SPECIAL LEAVE PETITION (CRL.) NO.12584 OF 2024), decided on 26/09/2025\n-\nAIROnline 2025 SC 899\nSupreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran , J\nRaghav Prashad v. State of U.P.CRIMINAL APPEAL - 596 of 2014 , decided on 26/09/2025\n-\nAIROnline 2025 SC 904\nSupreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran , J\nState of Telangana v. Jerusalem MathaiSpecial Leave Petition (Crl.) - 5248 of 2016 , With Special Leave Petition (Crl.) No.9333 of 2016, , decid

In [2]:
#chunking the data
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1000,  #number of charecters in each chunk
    chunk_overlap = 200,
    length_function = len,
)
all_chunks = []
for text in all_text:
    chunks = splitter.split_text(text)
    all_chunks.extend(chunks)

print("number of chunks:", len(all_chunks))
for i, chunk in enumerate(all_chunks[:6]):
    print(f"--- Chunk {i+1} ---")
    print(chunk)
print("chunk size;")

number of chunks: 6
--- Chunk 1 ---
The Surest Way to Legal Research!™
Uniting the authentic and reliable content from India’s leading law publisher with cutting-edge technology to create a powerful legal research resource.
Now available at your desk or on the move, spend less time researching, and have more time to focus on crafting your arguments.
--- Chunk 2 ---
-
AIROnline 2025 SC 905
Supreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran, Atul S. Chandurkar , JJ
Sanjay D. Jain v. State of MaharashtraCRIMINAL APPEAL - 4292 of 2025 , (ARISING OUT OF SPECIAL LEAVE PETITION (CRL.) NO.12584 OF 2024), decided on 26/09/2025
-
AIROnline 2025 SC 899
Supreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran , J
Raghav Prashad v. State of U.P.CRIMINAL APPEAL - 596 of 2014 , decided on 26/09/2025
-
AIROnline 2025 SC 904
Supreme Court Of IndiaHon'ble Judge(s): K. Vinod Chandran , J
State of Telangana v. Jerusalem MathaiSpecial Leave Petition (Crl.) - 5248 of 2016 , With Special Leave Petiti

In [3]:
#conversion from chunks to vectors (embedding the chunks)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(all_chunks, embedding=embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#loading the model (t5 model)
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipeline)

#create a retriver 
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})

#connect the retriver with the model
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever = retriever)

query = input("Enter your query: ")
response = qa_chain.run(query)
print("Response:", response)   

Device set to use mps:0
  llm = HuggingFacePipeline(pipeline=pipeline)
  response = qa_chain.run(query)


Response: The Surest Way to Legal Research!TM Uniting the authentic and reliable content from India’s leading law publisher with cutting-edge technology to create a powerful legal research resource.


In [34]:
#alternative method 
#conversion of text to vector using sentence transformer and faiss
from sentence_transformers import SentenceTransformer
import faiss
from langchain.vectorstores import FAISS

embedding_model = SentanceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(all_chunks)
vector_store = FAISS.from_texts(all_chunks, embeddings)

#set up retriver for rag pipeline
retriver = vector_store.as_retriever(search_type="similarity", search_keyword="content", search_kwargs={"k":3})

NameError: name 'LRScheduler' is not defined