In [None]:
#!pip install langchain_community langchain_openai chromadb PyPDF2
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.4.0


In [None]:
import os, torch
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [None]:
# Define the path to the document and the directory for storing the vector database
file_path = "family_law_manual.pdf"  # Source document (PDF or plain text)
persistent_directory = os.path.join("db", "chroma_db")  # Directory to persist the vector store

print("Initializing vector store...")

# Fail early with a clear message if the source document is missing
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist. Please check the path.")

# Pick the loader from the file extension. The comparison is case-insensitive so
# that a name such as "manual.PDF" is still parsed as a PDF rather than read as text.
if file_path.lower().endswith(".pdf"):
    loader = PyPDFLoader(file_path)
else:
    loader = TextLoader(file_path)

# `documents` is the list of loaded Document objects consumed by the splitting step
documents = loader.load()

Initializing vector store...


In [None]:
# Split the document into smaller chunks for processing
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # Chunk size and overlap, in characters
document_chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(document_chunks)}")

# Load the pre-trained embedding model. This step only loads the model; the chunk
# vectors themselves are produced when the chunks are written to the store.
print("\n--- Generating embeddings ---")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
print("\n--- Embeddings generated successfully ---")

# Open the persisted vector store once (it is created if it does not exist yet).
db = Chroma(persist_directory=persistent_directory,
            embedding_function=embeddings)

# Only index the chunks when the store is empty. Indexing unconditionally would
# append a duplicate copy of every chunk each time this cell is re-run, because
# the collection on disk survives between runs.
# To rebuild the index after changing the source document, delete `persistent_directory`.
if db.get(limit=1)["ids"]:
    print("\n--- Vector store already exists; reusing persisted store ---")
else:
    print("\n--- Creating vector store ---")
    db.add_documents(document_chunks)
    print("\n--- Vector store created and persisted successfully ---")

# Kept for backward compatibility: both names refer to the same store.
vector_store = db

Total number of chunks: 384

--- Generating embeddings ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Embeddings generated successfully ---

--- Creating vector store ---

--- Vector store created and persisted successfully ---


In [None]:
# Load the Hugging Face model and tokenizer for text generation
model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# bfloat16 weights; device_map="auto" lets the library place them on the available device(s)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

# Create a Hugging Face pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Budget for the *generated* tokens only. `max_length` also counts the prompt,
    # so a long retrieved context would leave little or no room for the answer.
    max_new_tokens=512,
    # Explicit, so temperature/top_p are honoured whatever the model's default
    # generation config says (they are ignored under greedy decoding).
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)

# Wrap the pipeline in a LangChain HuggingFacePipeline for integration
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Question to ask about the indexed document
query = "What are woman's rights in pakistan family law?"  # or: input("Enter the question: ")

# Fetch the two chunks most similar to the question from the vector store
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})
relevant_docs = retriever.invoke(query)

# Assemble the user message: question, then the retrieved passages, then instructions
retrieved_text = "\n\n".join(doc.page_content for doc in relevant_docs)
combined_input = "".join([
    "Here are some documents that might help answer the question: ",
    query,
    "\n\nRelevant Documents:\n",
    retrieved_text,
    "\n\nPlease base your response on the documents provided and keep it as simple as possible.. If the answer cannot be found within them, use your own knowledge.",
])

# Chat-style conversation: a system role followed by the user's message
system_message = {"role": "system", "content": "You are a helpful pakistan legal assistant."}
user_message = {"role": "user", "content": combined_input}
messages = [system_message, user_message]

# Render the conversation to a plain string with the tokenizer's chat template;
# add_generation_prompt=True appends the marker that cues the assistant's turn
formatted_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [None]:
# Invoke the model with the formatted prompt and generate a response
result = llm.invoke(formatted_prompt)

# The wrapped pipeline returns the prompt followed by the completion. Keep only
# the completion, so the retrieved documents and instructions are not echoed back.
# The guard leaves `result` untouched if it does not start with the prompt.
if result.startswith(formatted_prompt):
    answer = result[len(formatted_prompt):]
else:
    answer = result
answer = answer.strip()

# Print the user's query and the model's full response
print("You:")
print(query)
print("Assistant:")
print(answer)

You:
What are woman's rights in pakistan family law?
Assistant:
Woman's Rights in Pakistan Family Law:
<｜begin▁of▁sentence｜>You are a helpful pakistan legal assistant.

User: Here are some documents that might help answer the question: What are woman's rights in pakistan family law?

Relevant Documents:
. Introduction to Handbook on Family Law 
Given that the law is a living process, continually evolving and changing, what is presented in this handbook can only be a snapshot of the law at the time of writing; legal precedents may change - for the better or for the worse. This handbook cannot therefore claim to be a definitive volume on 
family law in Pakistan. What it does however hope to achieve is the filling 
of a gap in material on the subject by providing lawyers and those 
working for women's legal rights a detailed examination of family law 
from the woman's perspective.|It highlights not (as is usually done) the 
negative aspects of the law vis a vis women but the positive deve

According to these documents, the primary aim of the Handbook on Family Law in Pakistan is to examine family law from a woman's perspective and focus on positive changes that enhance women’s legal rights instead of concentrating solely on negative aspects. Customary practices, Islamic jurisprudence, statutory law, and women's aspirations present multiple contradictions in the country's family law system. For instance, critics argue that the 1961 Muslim Family Laws Ordinance disregards Islamic principles, whereas proponents state that the ordinance falls short in protecting women's interests adequately. Additionally, despite protective provisions like the nikahnamah clause 18, which allows couples to delegate divorce right, customary practices often undermine the effectiveness of such stipulations.
