# Controlling Document Metadata and Filtering Queries by Metadata

In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
import chromadb
import openai
import os
import utils
from dotenv import load_dotenv


# Read environment variables from a local .env file (the cell output shows the call returned True).
load_dotenv()  # take environment variables from .env.

True

In [2]:
# Read everything under ./documents_4_v0 into a list of Document objects.
# The outputs further down show PDF files with a 'page_label' per entry.
pdf_reader = SimpleDirectoryReader("./documents_4_v0")
documents = pdf_reader.load_data()

In [5]:
# Print the first loaded document's string form to eyeball its ID and the start of its text.
print(documents[0])

Doc ID: ce88fb5c-f181-4a3a-8701-6a3fabb898c8
Text: Ashoka Kumar Thakur vs Union Of India And Ors on 10 April, 2008
Author: Arijit Pasayat Bench: Arijit Pasayat, C.K. Thakker
CASE NO.: Writ Petition (civil)  265 of 2006 PETITIONER: Ashoka Kumar
Thakur RESPONDENT: Union of India and Ors DATE OF JUDGMENT: 10/04/2008
BENCH: Dr. ARIJIT PASAYAT & C.K. THAKKER JUDGMENT: J U D G M E N T
WRIT ...


In [4]:
# Metadata dict of the first loaded document, as produced by the directory reader.
documents[0].metadata

{'page_label': '1',
 'file_name': 'Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_on_10_April_2008.PDF',
 'file_path': '/home/darth/Documents/code/legal_AI/legal_ai_v1/legal_ai/documents_4_v0/Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_on_10_April_2008.PDF',
 'file_type': 'application/pdf',
 'file_size': 524049,
 'creation_date': '2024-03-11',
 'last_modified_date': '2024-02-27'}

In [6]:
# 'file_name' is the key that is copied into 'pdf_name' when the metadata is rebuilt later on.
documents[0].metadata["file_name"]

'Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_on_10_April_2008.PDF'

In [7]:
# Number of Document objects returned by the reader.
len(documents)

1320

### Reconfiguring Metadata

In [8]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter

# documents = [
#     Document(text="text", metadata={"author": "LlamaIndex"}),
#     Document(text="text", metadata={"author": "John Doe"}),
# ]
# Rebuild every loaded document as a fresh Document that carries only two
# metadata keys: "page" (taken from "page_label") and "pdf_name" (taken from
# "file_name"). All other reader-supplied metadata is left behind.
documents_with_metadata = []

for document in documents:
    slim_metadata = {
        "page": document.metadata["page_label"],
        "pdf_name": document.metadata["file_name"],
    }
    documents_with_metadata.append(
        Document(text=document.text, metadata=slim_metadata)
    )

In [36]:
# Collect the metadata of every rebuilt document into a table
# (the table is written to CSV by a later cell, not here).
import pandas as pd

# One list per column, all in documents_with_metadata order
pdf_names = [doc.metadata["pdf_name"] for doc in documents_with_metadata]
pages = [doc.metadata["page"] for doc in documents_with_metadata]
doc_ids = [doc.id_ for doc in documents_with_metadata]

# Assemble the three columns into a DataFrame
df = pd.DataFrame({'pdf_name': pdf_names, 'page': pages, 'id': doc_ids})

df.head()

Unnamed: 0,pdf_name,page,id
0,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,1,f9698246-3e79-489c-a7f9-d1fbd85f5bc3
1,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,2,951eb96d-6348-4532-9812-033910bdf5de
2,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,3,7a1d2071-01f0-483b-afa0-f5c568adf031
3,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,4,68856886-2e5e-49f3-97f7-13141f09db90
4,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,5,0bbc9e3b-dd12-4b37-919b-4be69acec31d


In [39]:
# Add a human-readable 'legible_name' column: 'pdf_name' with every '_' replaced by a space.
#
# This works on the DataFrame built in the previous cell instead of re-reading
# 'metadata_doc4_gpt003s_0.csv'. That file is only written by a later cell, so on a
# fresh kernel the read either fails (file missing) or silently loads rows left
# over from an earlier run.
#
# regex=False keeps the replacement literal regardless of the pandas version.
# assign() returns a new frame, so re-running this cell gives the same result.
df = df.assign(legible_name=df['pdf_name'].str.replace('_', ' ', regex=False))

df.head()

Unnamed: 0,pdf_name,page,id,legible_name
0,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,1,f9698246-3e79-489c-a7f9-d1fbd85f5bc3,Ashoka Kumar Thakur vs Union Of India And Ors ...
1,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,2,951eb96d-6348-4532-9812-033910bdf5de,Ashoka Kumar Thakur vs Union Of India And Ors ...
2,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,3,7a1d2071-01f0-483b-afa0-f5c568adf031,Ashoka Kumar Thakur vs Union Of India And Ors ...
3,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,4,68856886-2e5e-49f3-97f7-13141f09db90,Ashoka Kumar Thakur vs Union Of India And Ors ...
4,Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_...,5,0bbc9e3b-dd12-4b37-919b-4be69acec31d,Ashoka Kumar Thakur vs Union Of India And Ors ...


In [40]:
# Write the metadata table to a CSV file; index=False leaves the DataFrame's row index out of the file.
csv_file_path = 'metadata_doc4_gpt003s_0.csv'
df.to_csv(csv_file_path, index=False)

In [9]:
# Sanity check: one rebuilt document per loaded document, so this should equal len(documents).
len(documents_with_metadata)

1320

In [35]:
# ID of the first rebuilt document; this is the value recorded in the 'id' column of the metadata table.
documents_with_metadata[0].id_

'f9698246-3e79-489c-a7f9-d1fbd85f5bc3'

In [14]:
# Confirm that only the 'page' and 'pdf_name' keys remain on the rebuilt document.
documents_with_metadata[0].metadata

{'page': '1',
 'pdf_name': 'Ashoka_Kumar_Thakur_vs_Union_Of_India_And_Ors_on_10_April_2008.PDF'}

### Chroma Database with metadata

In [12]:
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from IPython.display import Markdown, display
import chromadb

In [16]:
# One-time index build, kept commented out. When enabled it builds a VectorStoreIndex
# from documents_with_metadata, backed by the persistent Chroma collection "quickstart"
# under ./chroma_db_doc4_gpt003s_metadata_0.
# NOTE(review): presumably disabled so that re-running the notebook does not rebuild
# the index each time — confirm, and enable it for the first run on a machine where
# that directory does not exist yet.
# # Create to disk
# db = chromadb.PersistentClient(path="./chroma_db_doc4_gpt003s_metadata_0")
# chroma_collection = db.get_or_create_collection("quickstart")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex.from_documents(
#     documents_with_metadata, storage_context=storage_context
# )

In [17]:
# Load the persisted Chroma collection from disk and expose it as a VectorStoreIndex.
#
# get_or_create_collection() silently creates an EMPTY collection when the on-disk
# database (or the collection) does not exist yet, which would make every later
# query come back with no source nodes. To keep the notebook runnable from a fresh
# checkout, build the index from documents_with_metadata when the collection holds
# no entries; otherwise reuse the stored vectors unchanged.
db = chromadb.PersistentClient(path="./chroma_db_doc4_gpt003s_metadata_0")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

if chroma_collection.count() == 0:
    # Nothing persisted yet: index the documents and store them in the collection.
    # NOTE(review): this indexes every document in documents_with_metadata, which
    # may be slow/costly depending on the configured embedding model.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents_with_metadata, storage_context=storage_context
    )
else:
    # Normal path: the collection is already populated, so just attach to it.
    index = VectorStoreIndex.from_vector_store(
        vector_store,
    )

### Query without filtering

In [18]:
# Build a query engine over the whole index with no metadata filters; this is the unfiltered baseline.
query_engine = index.as_query_engine()

In [22]:
# Ask an open-ended question so the engine has to pick the relevant case on its own.
# Alternative query kept for reference:
# response = query_engine.query("Who was the judge in Ashoka Kumar Thakur case?")
response = query_engine.query("Which case is about women empowerment?")

In [24]:
# Answer text carried by the response object.
response.response

'The case about women empowerment is Shayara Bano vs Union Of India And Ors Ministry Of Women on 22 August 2017.'

In [25]:
# Metadata of the first source node behind the answer: shows which PDF and page it came from.
response.source_nodes[0].metadata

{'page': '101',
 'pdf_name': 'Shayara_Bano_vs_Union_Of_India_And_Ors_Ministry_Of_Women_on_22_August_2017.PDF'}

### Validating metadata-based query filtering

In [26]:
from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter

# Restrict retrieval to nodes whose 'pdf_name' metadata equals this exact file name,
# then ask the same question as in the unfiltered run.
pdf_name_filter = ExactMatchFilter(
    key="pdf_name",
    value="Shayara_Bano_vs_Union_Of_India_And_Ors_Ministry_Of_Women_on_22_August_2017.PDF",
)
filters = MetadataFilters(filters=[pdf_name_filter])
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("Which case is about women empowerment?")

In [28]:
# Source nodes retrieved under the pdf_name filter; each is expected to carry the filtered pdf_name in its metadata.
response.source_nodes

[NodeWithScore(node=TextNode(id_='eee037d9-7fdc-4902-bdc8-7f752a4c4c2d', embedding=None, metadata={'page': '101', 'pdf_name': 'Shayara_Bano_vs_Union_Of_India_And_Ors_Ministry_Of_Women_on_22_August_2017.PDF'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3c6cd441-457e-47d9-a85e-b018d2b95897', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page': '101', 'pdf_name': 'Shayara_Bano_vs_Union_Of_India_And_Ors_Ministry_Of_Women_on_22_August_2017.PDF'}, hash='bfe77d5f10c6ab63f27d3adf25d2cf89ffe0b3580ad19ea9ed240a229bc4ee4d'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='4a5af030-ab34-4dd6-8939-3ecaef4d5098', node_type=<ObjectType.TEXT: '1'>, metadata={'page': '100', 'pdf_name': 'Shayara_Bano_vs_Union_Of_India_And_Ors_Ministry_Of_Women_on_22_August_2017.PDF'}, hash='c6e45f98e1a653a7282f5243375e65d71c50a9606c64723e7122da692524af41'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='63c

In [29]:
from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter

# Same question, but filtered on a 'pdf_name' value that contains the fragment
# 'An21d' and therefore differs from the file name seen in the earlier outputs.
# NOTE(review): presumably a deliberate negative test (no node should match) — confirm.
pdf_name_filter = ExactMatchFilter(
    key="pdf_name",
    value="Shayara_Bano_vs_Union_Of_India_An21d_Ors_Ministry_Of_Women_on_22_August_2017.PDF",
)
filters = MetadataFilters(filters=[pdf_name_filter])
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("Which case is about women empowerment?")

In [30]:
# Show the whole response object for the query whose filter used the altered pdf_name value.
response

Response(response='Empty Response', source_nodes=[], metadata=None)