# 1. Mounting Google Drive for PDF access

In [None]:
# Mount Google Drive at /content/drive; later cells read the PDFs from
# /content/drive/MyDrive/LegalAssistant.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# AI LEGAL WORD DOCUMENT ASSISTANT USING GOOGLE GEN AI

## Extracting and Preprocessing Text

In [None]:
# Extract and read PDF contents
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5


In [None]:
# read PDF
import fitz
# Use for environment and path related works
import os

# List of paths of pdf files to read.
# The listing is sorted because os.listdir returns entries in arbitrary order
# and later cells merge per-file results, so the order affects the outcome.
# The extension check is case-insensitive so "FILE.PDF" is picked up as well.
pdf_folder_path = "/content/drive/MyDrive/LegalAssistant"
pdf_files = sorted(
    os.path.join(pdf_folder_path, file)
    for file in os.listdir(pdf_folder_path)
    if file.lower().endswith('.pdf')
)
print(f"Found {len(pdf_files)} PDF files.")

def extract_text_as_dict(pdf_file):
    """Split a PDF into a ``{title: body_text}`` dictionary.

    Every block returned by ``page.get_text("blocks")`` is inspected: a block
    whose text is exactly one line is treated as a title, and every other
    block is added to the body of the most recent title.

    Args:
        pdf_file: Path of the PDF, passed to ``fitz.open``.

    Returns:
        dict mapping each stripped title to the body text collected after it.
        Body blocks are joined with a single space. Text that appears before
        the first title is discarded, and when a title occurs more than once
        only the body following its last occurrence is kept.
    """
    data = {}
    current_title = None
    body_parts = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            for block in page.get_text("blocks"):
                # Element 4 of the block tuple is used as the block's text.
                lines = block[4].splitlines()
                if len(lines) == 1:
                    # A new title starts: store what was collected so far.
                    if current_title:
                        data[current_title] = " ".join(body_parts)
                    current_title = lines[0].strip()
                    body_parts = []
                else:
                    joined = " ".join(lines)
                    # Skip empty blocks so a title without body text still
                    # maps to "" (later cells test the text for emptiness).
                    if joined:
                        body_parts.append(joined)

    # Store the last title and text
    if current_title:
        data[current_title] = " ".join(body_parts)

    return data

Found 1 PDF files.


In [None]:
# Accumulate the {title: text} mapping of every PDF into a single dictionary
Text = {}
for pdf_file in pdf_files:
    extracted_sections = extract_text_as_dict(pdf_file)
    Text.update(extracted_sections)

import pandas as pd
# Turn the mapping into a two-column table (Title, Text) for easier inspection
series = pd.Series(Text)
df = series.rename_axis('Title').reset_index(name='Text')

print('Maximum Length Of a Paragraph = ', df['Text'].map(len).max())
print(df)

Maximum Length Of a Paragraph =  4822
                                                 Title  \
0                            THE CONSTITUTION OF INDIA   
1                                             PREAMBLE   
2             JUSTICE, social, economic and political;   
3    LIBERTY of thought, expression, belief, faith ...   
4               EQUALITY of status and of opportunity;   
..                                                 ...   
777  1.Published with the Ministry of Law and Justi...   
778                                                371   
779                                       APPENDIX III   
780  1DECLRATION UNDER ARTICLE 370(3) OF THE CONSTI...   
781                                           C.O. 273   

                                                  Text  
0                                                       
1      WE, THE PEOPLE OF INDIA, having solemnly res...  
2                                                       
3                                    

In [None]:
def merge_title_only_rows(frame):
    """Fold rows that have a title but no text into the next row that has text.

    Titles of consecutive empty-text rows are concatenated (space separated)
    and prefixed to the title of the first following row whose text is
    non-empty, so that row's own title is kept as well. Title-only rows at
    the very end, with no text row after them, are dropped.

    Args:
        frame: DataFrame with 'Title' and 'Text' string columns.

    Returns:
        A new DataFrame with 'Title' and 'Text' columns.
    """
    merged_rows = []
    pending_titles = []
    for title, text in zip(frame['Title'], frame['Text']):
        pending_titles.append(title.strip())
        if not text:
            # Title-only row: keep collecting until a row with text shows up.
            continue
        merged_rows.append({
            'Title': ' '.join(part for part in pending_titles if part),
            'Text': text,
        })
        pending_titles = []
    return pd.DataFrame(merged_rows, columns=['Title', 'Text'])


# Create a new DataFrame with merged rows
df = merge_title_only_rows(df)

# Display the result
print(df)

                                                 Title  \
0                            THE CONSTITUTION OF INDIA   
1    JUSTICE, social, economic and political; LIBER...   
2                    (Part I.—Union and its territory)   
3                               (Part II.—Citizenship)   
4    (3) In this article, unless the context otherw...   
..                                                 ...   
511                                                369   
512                                        ORDER, 2019   
513  To article 367, there shall be added the follo...   
514  1.Published with the Ministry of Law and Justi...   
515  371 APPENDIX III 1DECLRATION UNDER ARTICLE 370...   

                                                  Text  
0      WE, THE PEOPLE OF INDIA, having solemnly res...  
1    (a) the territories of the States;  2[(b) the ...  
2    President and unless, where the proposal conta...  
3    Provided that nothing in this article shall ap...  
4    (a) “law” inc

## Loading LLM Model for Embedding creation and Query Searching

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.78.1-py3-none-any.whl.metadata (25 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.78.1-py3-none-any.whl (680 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m680.9/680.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (351 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m351.8/351.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jiter, openai
Successfully installed jiter-0.9.0 openai-1.78.1


In [None]:
import openai

# OpenAI API key: taken from the OPENAI_API_KEY environment variable when it
# holds a non-blank value, otherwise requested interactively, so the secret is
# never stored in the notebook source.
import os
from getpass import getpass

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip() or getpass("OpenAI API key: ")
# Publish the key to the environment so later cells can pick it up.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Configure OpenAI
client = openai.OpenAI(api_key=OPENAI_API_KEY)

## Embeddings



In [None]:
import numpy as np
import pandas as pd

# Same client construction as in the previous cell; relies on OPENAI_API_KEY defined there.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def embed_fn(title, text):
    """Return the OpenAI embedding vector for ``text``.

    ``title`` is accepted so the function can be applied to (Title, Text)
    rows, but it is not sent to the API.
    """
    embedding_request = {"model": "text-embedding-ada-002", "input": text}
    api_result = client.embeddings.create(**embedding_request)
    first_entry = api_result.data[0]
    return first_entry.embedding

# Compute one embedding per row and attach the vectors as a new column.
# Rows are embedded directly instead of going through np.array_split:
# splitting a DataFrame that way is deprecated (it appears to be the source
# of the warning this cell printed), and assigning a column on the resulting
# pieces can trigger SettingWithCopyWarning. The result is the same frame,
# in the same order and with the same index.
df = df.assign(
    Embeddings=pd.Series(
        [embed_fn(title, text) for title, text in zip(df['Title'], df['Text'])],
        index=df.index,
        dtype=object,
    )
)

  return bound(*args, **kwds)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import ast

# Save the DataFrame (with its embeddings) to Drive so it can be reloaded from CSV later
df.to_csv('/content/drive/MyDrive/LegalAssistant/data.csv', index=False)

# Convert string embeddings to lists (needed when the frame was reloaded from CSV).
# .iloc[0] looks at the first row by position, so the check does not depend on
# a row labelled 0 existing in the index.
if len(df) > 0 and isinstance(df['Embeddings'].iloc[0], str):
    df['Embeddings'] = df['Embeddings'].apply(ast.literal_eval)

# Extract embeddings as a NumPy array
embeddings = np.array(df['Embeddings'].to_list())

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)

## VectorStore Creation

In [None]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-linux_x86_64.whl size=2389212 sha256=f3f0ffabff34a4f8ec808a30ddf68a4e1af39b60ca2706543b0688c1d65fff9b
  Stored in directory: /root/.cache/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [None]:
import hnswlib
import numpy as np

# Build a cosine-similarity HNSW index over the embedding matrix created in the
# PCA cell and persist it to Drive.

# Rows = number of vectors, columns = vector size (embeddings must be 2-D)
num_elements, embedding_dim = embeddings.shape

# Initialize HNSW index with correct dimensions
index = hnswlib.Index(space='cosine', dim=embedding_dim)

# Create index with capacity for exactly the vectors we have
index.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Add data points as float32.
# NOTE(review): no explicit ids are passed; the lookup code later indexes the
# DataFrame by position with the returned labels, so labels are presumably
# 0..n-1 in insertion order — confirm against the hnswlib documentation.
index.add_items(embeddings.astype(np.float32))

# Save the index for future use
index.save_index('/content/drive/MyDrive/LegalAssistant/hnsw_index.bin')

print("HNSW index created and saved successfully!")


HNSW index created and saved successfully!


In [None]:
import hnswlib
import pandas as pd
import numpy as np

# Load DataFrame
# (a stray string literal that had swallowed this comment line was removed)
df = pd.read_csv('/content/drive/MyDrive/LegalAssistant/data.csv')

# Get the embedding dimension from the data
embedding_dim = 1536  # Ensure this matches your actual embedding size

# Initialize HNSW index
index = hnswlib.Index(space='cosine', dim=embedding_dim)

# Load the pre-built index
index.load_index('/content/drive/MyDrive/LegalAssistant/hnsw_index.bin')

print("HNSW index loaded successfully!")


HNSW index loaded successfully!


## Preparing Model for Q & A

In [None]:
import openai
import textwrap
import numpy as np

# Set OpenAI API key from the OPENAI_API_KEY environment variable, prompting
# once if it is missing or blank, instead of hard-coding it in the notebook.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

def embed_fn(text):
    """Return the OpenAI embedding vector for ``text``.

    Uses the module-level ``openai`` client with the
    "text-embedding-ada-002" model and returns the embedding of the first
    (only) item in the response.
    """
    embedding_request = {"model": "text-embedding-ada-002", "input": text}
    api_result = openai.embeddings.create(**embedding_request)
    first_entry = api_result.data[0]
    return first_entry.embedding

import numpy as np

def relatedTexts(query, dataframe, vectordb, k=5):
    """Return the ``Text`` of the rows nearest to ``query`` in the vector index.

    The query is embedded with ``embed_fn`` and looked up with the index's
    approximate k-nearest-neighbour search.

    Args:
        query: Question to embed.
        dataframe: Frame with a ``Text`` column. The labels returned by the
            index are used as row positions (``iloc``), so the frame is
            assumed to be in the order the vectors were added — TODO confirm.
        vectordb: Index object exposing ``knn_query(data, k=...)``.
        k: Number of neighbours to retrieve. Defaults to 5, the value that
            used to be hard-coded. When the index reports its size through
            ``get_current_count`` the value is capped to that size so small
            corpora do not make the query fail.

    Returns:
        The ``Text`` entries of the matched rows, in the order returned by
        the index.
    """
    # embed_fn returns a plain list; wrap it into a (1, dim) array for the query.
    query_vector = np.array([embed_fn(query)])

    count_fn = getattr(vectordb, "get_current_count", None)
    if callable(count_fn):
        k = max(1, min(k, count_fn()))

    neighbors, _distances = vectordb.knn_query(query_vector, k=k)

    # neighbors has one row per query; there is a single query here.
    return dataframe.iloc[neighbors[0]]['Text']


def make_prompt(query, relevant_passage):
    """Build the question-answering prompt sent to the chat model.

    Args:
        query: The user's question.
        relevant_passage: Reference text. Either a single string or an
            iterable of strings (for example the pandas Series returned by
            ``relatedTexts``). Iterables are joined with spaces; previously a
            Series was formatted through its printed representation, so the
            model received truncated text plus index/dtype noise.

    Returns:
        The prompt string: instructions, then the QUESTION and PASSAGE lines
        and a trailing ``ANSWER:`` marker. Quotes and newlines are removed
        from the passage so it cannot break out of its quoted slot.
    """
    if isinstance(relevant_passage, str):
        passage_text = relevant_passage
    else:
        passage_text = " ".join(str(piece) for piece in relevant_passage)

    escaped = passage_text.replace("'", "").replace('"', "").replace("\n", " ")

    # Written as adjacent literals so that no source indentation leaks into
    # the prompt (the earlier dedent call could not remove it).
    instructions = (
        "You are a helpful and informative Legal Advisor bot for Indian citizens that answers questions using text "
        "from the reference passage included below. Be sure to respond in a complete sentence, being comprehensive. "
        "However, you are talking to a non-technical audience, so be sure to break down complicated concepts and provide "
        "all relevant background information. Strike a friendly and conversational tone. "
        "Also, include references to Indian laws, Code of Criminal Procedure, and Code of Civil Procedure in India. "
        "If the passage is irrelevant to the answer, you may ignore it."
    )
    prompt = f"{instructions}\n\nQUESTION: '{query}'\nPASSAGE: '{escaped}'\n\nANSWER:\n"
    return prompt


### Testing model

In [None]:
import openai

# Set OpenAI API key. Prefer the OPENAI_API_KEY environment variable and
# otherwise keep whatever key is already configured on the module, rather
# than overwriting a working key with a blank placeholder.
import os
openai.api_key = os.environ.get("OPENAI_API_KEY", "").strip() or openai.api_key

# Define query
query = "what can you answer?"

# Retrieve relevant texts
Info = relatedTexts(query, df, index)

# Generate prompt
prompt = make_prompt(query, Info)

# Generate answer using OpenAI GPT-4
response = openai.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "system", "content": "You are a helpful and informative Legal Advisor bot for Indian citizens."},
              {"role": "user", "content": prompt}]
)

# Print the response, one sentence per line (splitting on ". ")
print("\n".join(response.choices[0].message.content.split(". ")))



As a Legal Advisor bot, I can provide information about various aspects of Indian law, including the Code of Criminal Procedure and the Code of Civil Procedure
My expertise covers a wide range of topics from legal procedures, practices to explaining complex laws in a simple, understandable language
However, the provided passage doesn't seem relevant to this context
Please feel free to ask anything specific related to Indian laws and procedures, and I'd be more than happy to help!


# Web App Deployment

In [None]:
!pip install anvil-uplink

Collecting anvil-uplink
  Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting argparse (from anvil-uplink)
  Downloading argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting future (from anvil-uplink)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting ws4py-sslupdate (from anvil-uplink)
  Downloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading anvil_uplink-0.5.2-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Downloading future-1.0.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.3/491.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ws4py_sslupdate-0.5.1b0-py2.py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m2.3 M

In [None]:
import os
from getpass import getpass

import anvil.server

# The Anvil uplink key is a credential, so it is read from the
# ANVIL_UPLINK_KEY environment variable (or prompted for) instead of being
# written into the notebook.
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and rotated in the Anvil app settings.
anvil.server.connect(
    os.environ.get("ANVIL_UPLINK_KEY", "").strip() or getpass("Anvil uplink key: ")
)

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


In [None]:
@anvil.server.callable
def answer_generation(query):
    """Answer ``query`` for the Anvil web app using retrieval plus GPT-4.

    The nearest passages are fetched with ``relatedTexts`` from the global
    ``df`` / ``index``, turned into a prompt with ``make_prompt`` and sent to
    the OpenAI chat API, mirroring the "Testing model" cell. The earlier
    version called ``genai``, which this notebook never imports, so every
    request would have failed with NameError.

    Args:
        query: The user's question.

    Returns:
        The text of the model's reply.
    """
    Info = relatedTexts(query, df, index)
    prompt = make_prompt(query, Info)
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "You are a helpful and informative Legal Advisor bot for Indian citizens."},
                  {"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [None]:
# anvil.server.wait_forever()

# AI Legal Word Document Assistant Using llama2 13 billion parameters

## LANGCHAIN SETUP for API access to the LLM

In [None]:
import openai

# Initialize OpenAI client from the OPENAI_API_KEY variable defined in an earlier cell
client = openai.OpenAI(api_key=OPENAI_API_KEY)  # Replace with your OpenAI API key

def generate_response(prompt):
    """Send ``prompt`` to GPT-4 as a legal-only assistant and return the reply text.

    Args:
        prompt: The user message.

    Returns:
        The content of the first choice in the chat completion.
    """
    response = client.chat.completions.create(
        model="gpt-4",  # You can use "gpt-3.5-turbo" for a cheaper alternative
        # The system prompt used to start with "ou are" (missing its first letter).
        messages=[{"role": "system", "content": "You are a legal assistant. You should only answer legal-related questions. If the user asks something else, politely say you only answer legal queries."},
                  {"role": "user", "content": prompt}],
        temperature=0.75,
        max_tokens=1000,
        top_p=1
    )
    return response.choices[0].message.content

## Extracting and tokenizing Texts

In [None]:
!pip install pdf2image
!pip install pdfminer
!pip install pdfminer.six
!pip install unstructured
!pip install pypdf
!pip install tiktoken
!pip install textract
!pip install transformers

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfm

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.59-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.42-py3-none-any.whl.metadata (15 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.58->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting packaging<25,>=23.2 (from langchain-core<1.0.0,>=0.3.58->langchain)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.4,>=0.1.17->langchain)


In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading langchain_community-0.3.24-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading pydantic_settings-2.9.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_dotenv-1.1.0-py3

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
def get_pdf_splits(pdf_file, chunk_size=128, chunk_overlap=12):
    """Load a PDF and return its text as a flat list of chunks.

    The file is loaded with ``PyPDFLoader(...).load_and_split()`` and the
    ``page_content`` of every returned item is split again with a
    ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_file: Path of the PDF to load.
        chunk_size: Maximum chunk length in characters (default 128, the
            previously hard-coded value).
        chunk_overlap: Characters shared between consecutive chunks
            (default 12, the previously hard-coded value).

    Returns:
        list of chunk strings, in document order.
    """
    pages = PyPDFLoader(pdf_file).load_and_split()

    textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                               chunk_overlap=chunk_overlap,
                                               length_function=len)

    # Flatten the per-page chunk lists into one list.
    return [chunk for pg in pages for chunk in textSplit.split_text(pg.page_content)]

def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` into a FAISS store and persist it at ``index_store``.

  Args:
    doc_list: Either plain strings or document objects.
    embed_fn: Embedding object passed through to the FAISS constructors.
    index_store: Folder of the on-disk index. If it exists, the new vectors
      are merged into the stored index and the result is saved back;
      otherwise a new store is created there.

  Returns:
    None. Progress is reported with print().
  """
  # Choose the constructor from the element type instead of trying one and
  # catching every exception, which also hid unrelated failures such as
  # embedding errors.
  if doc_list and isinstance(doc_list[0], str):
    faiss_db = FAISS.from_texts(doc_list, embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list, embed_fn)

  if os.path.exists(index_store):
    # The stored index contains pickled data, which load_local refuses to
    # read unless this flag is set. Only load index folders you created.
    local_db = FAISS.load_local(index_store, embed_fn, allow_dangerous_deserialization=True)
    #merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")

## LOAD THE DOCUMENT

In [None]:
pdf_folder_path = "/content/drive/MyDrive/LegalAssistant"
# Sorted, case-insensitive listing: os.listdir returns entries in arbitrary
# order, and a plain '.pdf' suffix check would miss files named "*.PDF".
pdf_files = sorted(
    os.path.join(pdf_folder_path, file)
    for file in os.listdir(pdf_folder_path)
    if file.lower().endswith('.pdf')
)
print(f"Found {len(pdf_files)} PDF files.")
# Collect the text chunks of every PDF into one flat list
all_docs=[]
for pdf_file in pdf_files:
    all_docs.extend(get_pdf_splits(pdf_file))

Found 1 PDF files.


## Create & Store embeddings into vector database


In [None]:
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0


In [None]:
# Create a sentence-transformers embedding object that runs on the CPU.
# NOTE: this rebinds the name `embeddings`, which an earlier cell used for the
# NumPy array of OpenAI vectors; re-running that part of the notebook after
# this cell will see this object instead.
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2" # embedding model
model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` into a FAISS store and persist it at ``index_store``.

  Args:
    doc_list: Either plain strings or document objects.
    embed_fn: Embedding object passed through to the FAISS constructors.
    index_store: Folder of the on-disk index. If it exists, the new vectors
      are merged into the stored index and the result is saved back;
      otherwise a new store is created there.

  Returns:
    None. Progress is reported with print().
  """
  # Pick the constructor from the element type. The earlier try/except around
  # from_documents caught every exception, so an unrelated failure (for
  # example in the embedding call) was retried through from_texts and the
  # original error was lost.
  if doc_list and isinstance(doc_list[0], str):
    faiss_db = FAISS.from_texts(doc_list, embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list, embed_fn)

  if os.path.exists(index_store):
    # Set allow_dangerous_deserialization=True when loading: the stored index
    # contains pickled data, so only load index folders you created yourself.
    local_db = FAISS.load_local(index_store, embed_fn, allow_dangerous_deserialization=True)
    print("Loaded existing index.")
    #merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")

## Load saved vector database for searching

In [None]:
!pip install -U langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.3.16-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.16-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.16


In [None]:
from langchain_openai import OpenAIEmbeddings

In [None]:
import os
from getpass import getpass
from langchain_openai import OpenAIEmbeddings

# Keep an API key that is already present in the environment; only prompt
# when it is missing or blank. The earlier version overwrote it with a blank
# placeholder on every run.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002")


In [None]:
# Load the FAISS vector store saved on Drive, using the OpenAI embedding
# function created in the previous cell for query embedding.
# NOTE(review): the store must have been built with the same embedding model;
# the cell that created this folder is not visible in this notebook — confirm.
vectorstore = FAISS.load_local(
    "/content/drive/MyDrive/LegalAssistant/vectorstore",
    embedding_function,
    allow_dangerous_deserialization=True  # Store contains pickled data: only load folders you trust
)


## Query against given data

In [None]:
import openai
import os
from getpass import getpass

# Read the key from the environment, prompting once if it is missing or
# blank, so the secret is never written into the notebook source.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip() or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY


In [None]:
# Settings shared by both turns of the conversation
search_model = "gpt-4o-search-preview-2025-03-11"
system_message = {"role": "system", "content": "You are an AI Assitant."}

# --- Turn 1: ask the opening question ---
query1 = "What is our Fundamental Duty as per the Constitution of India?"
first_turn = [system_message, {"role": "user", "content": query1}]
response1 = openai.chat.completions.create(model=search_model, messages=first_turn)

answer1 = response1.choices[0].message.content
print(f"Q1: {query1}")
print(f"A1: {answer1} \n")

# --- Turn 2: follow-up question, sent together with the first exchange ---
query2 = "Among them, which is the most fundamental?"
second_turn = first_turn + [
    {"role": "assistant", "content": answer1},
    {"role": "user", "content": query2},
]
response2 = openai.chat.completions.create(model=search_model, messages=second_turn)

answer2 = response2.choices[0].message.content
print("Q2: " + query2)
print("A2:")
print(answer2)

Q1: What is our Fundamental Duty as per the Constitution of India?
A1: The Constitution of India outlines the Fundamental Duties of its citizens in Article 51A, introduced by the 42nd Amendment Act in 1976. These duties serve as a constant reminder to every citizen that while the Constitution specifically confers on them certain fundamental rights, it also requires citizens to observe basic norms of democratic conduct and democratic behaviour. ([drishtiias.com](https://www.drishtiias.com/daily-updates/daily-news-analysis/fundamental-duties-3?utm_source=openai))

Originally, there were ten Fundamental Duties, and an eleventh was added by the 86th Amendment Act in 2002. The Fundamental Duties are as follows:

1. To abide by the Constitution and respect its ideals and institutions, the National Flag, and the National Anthem.
2. To cherish and follow the noble ideals that inspired the national struggle for freedom.
3. To uphold and protect the sovereignty, unity, and integrity of India.
4.

In [None]:
import openai
import os
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain

# Set OpenAI API key: keep a key that is already in the environment and only
# prompt when it is missing or blank. The earlier version overwrote it with a
# blank placeholder, which breaks every client created below.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Initialize OpenAI Embeddings
embedding_function = OpenAIEmbeddings()

# Load vector store (contains pickled data: only load folders you trust)
vectorstore = FAISS.load_local(
    "/content/drive/MyDrive/LegalAssistant/vectorstore",
    embedding_function,
    allow_dangerous_deserialization=True
)

# Initialize OpenAI Chat Model with correct parameters
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Create Conversational Retrieval Chain
chain = ConversationalRetrievalChain.from_llm(
    llm, vectorstore.as_retriever(), return_source_documents=True
)

# Query
chat_history = []
query = "What are the fundamental duties as per the Constitution of India?"

# Run the retrieval-augmented generation (RAG) model
result = chain.invoke({"question": query, "chat_history": chat_history})

# Extract retrieved text
retrieved_text = "\n\n".join([doc.page_content for doc in result["source_documents"]])

# Summarization step using GPT-4o
summary_prompt = f"Summarize the following legal text:\n\n{retrieved_text}\n\nProvide a concise summary."
summary_response = openai.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are an AI Assitant."},
        {"role": "user", "content": summary_prompt}
    ]
)
summary_text = summary_response.choices[0].message.content

# Print results
print("🔹 **Summary:**")
print(summary_text)
print("\n🔹 **Full Retrieved Text:**")
print(retrieved_text)


  embedding_function = OpenAIEmbeddings()
  llm = ChatOpenAI(model_name="gpt-4o", temperature=0)


🔹 **Summary:**
The text outlines some of the fundamental duties of every Indian citizen, as specified in Part IVA of the Constitution of India. These duties include abiding by the Constitution, respecting its ideals and institutions, as well as the National Flag and the National Anthem.

🔹 **Full Retrieved Text:**
25 
1[PART  IVA 
 
FUNDAMENTAL DUTIES 
51A. Fundamental duties.—It shall be the duty of every citizen of 
India—

India— 
(a) to abide by the Constitution and respect its ideals and 
institutions, the National Flag and the National Anthem;

THE CONSTITUTION OF  INDIA 
(Part III.—Fundamental Rights) 
15
(b) to manage its own affairs in matters of religion;

THE CONSTITUTION OF  INDIA 
(Part III.—Fundamental Rights) 
16


## User interaction with Model

In [None]:
import re
import ipywidgets as widgets
from IPython.display import display, clear_output

def format_answer(answer):
    """Turn an answer into ``<br>``-separated "- item" bullet lines.

    The text is cut at enumeration markers such as (1), 1., 1) and their
    ARTICLE-prefixed forms. Each non-blank fragment is stripped and prefixed
    with "- ".
    """
    marker_pattern = re.compile(r'\(\d+\)|\d+\.\s|\d+\)\s|ARTICLE\s+\(\d+\)|ARTICLE\s+\d+\.\s|ARTICLE\s+\d+\)\s')
    bullet_lines = []
    for fragment in marker_pattern.split(answer):
        cleaned = fragment.strip()
        if cleaned:
            bullet_lines.append(f"- {cleaned}")
    return "<br>".join(bullet_lines)

def on_submit(_):
    """Handle a submitted query: run the chain and display the exchange.

    Reads and clears ``input_box``, stops early on "exit", otherwise invokes
    the global ``chain`` with the current ``chat_history``, formats the answer
    with ``format_answer``, records the exchange in ``chat_history`` and
    displays both sides as HTML widgets.

    Args:
        _: Unused argument supplied by the widget callback.
    """
    from html import escape  # local import: only this handler needs it

    query = input_box.value
    input_box.value = ""
    if query.lower() == 'exit':
        print("Thank you for using AI law assistant!")
        return

    result = chain.invoke({"question": query, "chat_history": chat_history})

    # Clean up the result by removing unnecessary text
    answer = result['answer']
    if answer.lower().startswith("i don't have specific information on"):
        # Drop the leading disclaimer sentence and keep the rest
        answer = answer.split(". ", 1)[-1]

    # Apply the formatting function to the result['answer']
    formatted_answer = format_answer(answer)

    chat_history.append((query, formatted_answer))
    # The query is free text from the user, so it is escaped before being put
    # into HTML; otherwise characters such as "<" or "&" break the rendering.
    display(widgets.HTML(f'<b>User:</b> {escape(query)}'))
    display(widgets.HTML(f'<b><font color="cornflowerblue">Legal Assistant:</font></b> {formatted_answer}'))


def clear_history(_):
    """Reset the current conversation and redraw the input controls.

    Rebinds the global ``chat_history`` to an empty list, clears the cell
    output, prints the welcome line and shows the input box and clear button
    again. The argument supplied by the button callback is ignored.
    """
    global chat_history
    chat_history = list()
    clear_output()
    print("Welcome to AI Law Assistent! Type 'exit' to stop.")
    for control in (input_box, clear_button):
        display(control)

# Archive of finished conversations; start_new_conversation appends a copy of
# chat_history here before resetting it.
all_chat_histories = []
# (query, formatted_answer) tuples of the conversation in progress; passed to
# the chain on every query.
chat_history = []

def start_new_conversation(_):
    """Archive the current conversation and start an empty one.

    A copy of ``chat_history`` is appended to ``all_chat_histories``, the
    global ``chat_history`` is rebound to an empty list and the controls are
    displayed again. The argument supplied by the button callback is ignored.
    """
    global chat_history
    all_chat_histories.append(chat_history[:])
    chat_history = []
    print("Starting a new conversation. Welcome to AI Law Assistant! Type 'exit' to stop.")
    display(input_box)
    display(clear_button)
    display(new_conversation_button)
    # view_conversations_button is not created in this cell; show it only if
    # some other cell has defined it, instead of failing with NameError when
    # the button is clicked.
    view_button = globals().get("view_conversations_button")
    if view_button is not None:
        display(view_button)

# Build the chat controls, attach their callbacks and show them.
print("Welcome to AI Law Assistent! Type 'exit' to stop.")

# Text box: on_submit runs when the user submits the box.
# NOTE(review): Text.on_submit is reported as deprecated in recent ipywidgets
# releases — confirm against the installed version.
input_box = widgets.Text(placeholder='Please enter your query here:')
input_box.on_submit(on_submit)

# Button that wipes the current conversation
clear_button = widgets.Button(description="Clear History")
clear_button.on_click(clear_history)

# Button that archives the current conversation and starts a fresh one
new_conversation_button = widgets.Button(description="Start New Conversation")
new_conversation_button.on_click(start_new_conversation)

display(input_box)
display(clear_button)
display(new_conversation_button)

Welcome to AI Law Assistent! Type 'exit' to stop.


Text(value='', placeholder='Please enter your query here:')

Button(description='Clear History', style=ButtonStyle())

Button(description='Start New Conversation', style=ButtonStyle())

In [None]:
import re
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Stylesheet for the chat UI below. Compared with the earlier revision:
#   * dropped `font-style: bold` (not a valid font-style value; the bold look
#     already comes from `font-weight: bold`),
#   * dropped `font-family: 20px !important` (a length is not a font family),
#   * merged the two `border` declarations on `button` into the one that
#     actually took effect (the `!important` one),
#   * `.chat-history` no longer sets white text on a white background.
css_style = '''
<style>
    .widget-box {
        border: 2px solid #6c757d;
        padding: 20px;
        background: linear-gradient(to right, #3b3b4f, #50506d);
        color: white;
        border-radius: 12px;
        font-size: 16px;
        width: 100%;
        height: 100%;
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
    }
    .header-title {
        text-align: center;
        color: #A7C7E7;
        font-weight: bold;
        font-size: 40px;
        font-family: 'Book Antiqua';
        margin-bottom: 4px;
    }
    .header-subtitle {
        text-align: center;
        color: #C19A6B;
        font-weight: 600;
        font-size: 19px;
        font-family: 'Book Antiqua';
        font-style: italic;
        margin-top: 10px;
    }
    h4 {
        font-size: 20px;
        font-family: 'Book Antiqua';
    }
    .chat-history {
        border: 1px solid #dee2e6;
        height: 300px;
        overflow-y: auto;
        padding: 15px;
        border-radius: 10px;
        font-family: 'Book Antiqua';
        font-size: 14px;
        background-color: #ffffff;
        color: #212529;
    }
    .user-message {
        color: #00ffff;
        font-weight: bold;
        margin-top: 10px;
    }
    .bot-message {
        color: #ffa500;
        font-weight: bold;
        margin-bottom: 10px;
    }
    button {
        background-color: #4e73df;
        color: white !important;
        width: 75%;
        border-radius: 8px;
        border: 2px solid #375aab !important;
        font-weight: bold;
        font-family: 'verdana';
        font-size: 16px !important;
        padding-top: 20px;
        padding-bottom: 30px !important;
        transition: background-color 0.3s ease;
    }
    button:hover {
        background-color: #375aab !important;
        cursor: pointer;
    }
</style>
'''

# Display the CSS styles
display(HTML(css_style))

# Format answer with bullet points
def format_answer(answer):
    """Turn an answer into HTML bullet lines.

    The text is cut at enumeration markers such as ``(1)``, ``1. ``, ``1) ``
    and their ``ARTICLE``-prefixed forms; the markers themselves are dropped.
    Each non-blank piece is trimmed, prefixed with ``- `` and the pieces are
    joined with ``<br>``. An answer with no usable text yields ``""``.
    """
    marker_pattern = r'\(\d+\)|\d+\.\s|\d+\)\s|ARTICLE\s+\(\d+\)|ARTICLE\s+\d+\.\s|ARTICLE\s+\d+\)\s'

    bullet_lines = []
    for fragment in re.split(marker_pattern, answer):
        text = fragment.strip()
        if text:
            # Pieces that are empty or whitespace-only are skipped.
            bullet_lines.append(f"- {text}")

    return "<br>".join(bullet_lines)

# Function to handle user query
def on_submit(_):
    """Send the text in ``input_box`` to the chain and append the exchange to the chat panel.

    Used both as the submit handler of ``input_box`` and as the click handler
    of ``submit_button``; the widget that ipywidgets passes in is ignored.

    Blank input is dropped without calling the chain, and ``exit`` (any case)
    only prints a goodbye message.
    """
    import html  # local import keeps this handler self-contained

    query = input_box.value.strip()
    input_box.value = ""
    if not query:
        # Clicking "Ask" on an empty box must not trigger a model call.
        return
    if query.lower() == 'exit':
        print("Thank you for using AI law assistant!")
        return

    # invoke() is the supported entry point; calling the chain object directly
    # emits a deprecation warning.
    result = chain.invoke({"question": query, "chat_history": chat_history})

    # Clean up the result by removing unnecessary text
    answer = result['answer']
    if answer.lower().startswith("i don't have specific information on"):
        # Drop the leading disclaimer sentence and keep what follows it.
        answer = answer.split(". ", 1)[-1]

    # The history is handed back to the chain on the next query, so it keeps
    # the plain answer rather than the HTML-formatted one.
    chat_history.append((query, answer))

    # Escape before building markup so '<', '>' and '&' in the question or the
    # answer are shown literally instead of being parsed as HTML.
    safe_query = html.escape(query, quote=False)
    formatted_answer = format_answer(html.escape(answer, quote=False))

    chat_display.value += f'<div class="user-message">User:</div> {safe_query}<div class="bot-message">Legal Assistant:</div> {formatted_answer}<hr>'

# Function to clear chat history
def clear_history(_):
    """Forget the running conversation and blank the chat panel.

    Click handler of ``clear_button``; the button argument is unused.
    """
    global chat_history
    # Rebind to a fresh list, then empty the HTML shown in the panel.
    chat_history = list()
    chat_display.value = ""

# Conversation state shared by on_submit and clear_history.
chat_history = []

# Widgets
input_box = widgets.Text(placeholder='Ask a legal question...', layout=widgets.Layout(width="75%"))
input_box.on_submit(on_submit)

submit_button = widgets.Button(description="Ask", button_style="primary")
submit_button.on_click(on_submit)

clear_button = widgets.Button(description="Clear History", button_style="primary")
clear_button.on_click(clear_history)

# The panel has a fixed height, so overflow="auto" lets a long conversation
# scroll inside it instead of spilling over the widgets below.
chat_display = widgets.HTML(value="", layout=widgets.Layout(height="300px", width="100%", overflow="auto"))

# Container for widgets.
# Layout has no `border_radius` or `background_color` traits, so the values
# previously passed under those names were unrecognised arguments and never
# reached the page. They are removed here; rounded corners and backgrounds
# have to come from the stylesheet.
container = widgets.VBox([
    widgets.HTML("""
        <div class='header-title'>AI Legal Assistant</div>
        <div class='header-subtitle'>Your AI-Driven Legal Document Assistant</div>
    """),
    widgets.HBox([input_box, submit_button]),
    widgets.HTML("<h4>Chat History:</h4>"),
    widgets.Box([chat_display], layout=widgets.Layout(padding="10px", border="1px solid #ccc")),
    widgets.HBox([clear_button], layout=widgets.Layout(justify_content='flex-end', padding='10px'))
], layout=widgets.Layout(padding="20px", width="100%", border=""))

# Display the container
display(container)


VBox(children=(HTML(value="\n        <div class='header-title'>AI Legal Assistant</div>\n        <div class='h…

  result = chain({"question": query, "chat_history": chat_history})
