# Re-Klasifikasi KKR/KKPRL

Pada metadata awal karena KKPR dan KKPRL dianggap ijin yang sama, sehingga `permit_type` nya menjadi 1 yaitu `KKPR/KKPRL`. Di dalam notebook ini, kategori tersebut akan di re-klasifikasi sehingga menjadi 2 kategori baru yaitu `KKPR` dan `KKPRL`. Berikut adalah statistik jumlah halaman untuk jenis kategori `KKPR/KKPRL`. 

```text
┌───────────┬───────────┬───────┬─────────────────┐
│ min_pages │ max_pages │ count │   permit_type   │
│   int32   │   int32   │ int64 │     varchar     │
├───────────┼───────────┼───────┼─────────────────┤
│         2 │        20 │    98 │ KKPR/KKPRL      │
│        12 │       310 │    58 │ Ijin Lingkungan │
│         1 │       636 │    82 │ PLO             │
└───────────┴───────────┴───────┴─────────────────┘
```

Karena jumlah halaman maksimal adalah 20, maka re-klasifikasi bisa dilakukan dengan memasukkan 10-15 halaman dokumen.

## Reclassification

In [1]:
import os
import io
import base64

from pydantic import BaseModel, Field
from typing import List, Any
from pdf2image import convert_from_path

from openai import AzureOpenAI

from dotenv import load_dotenv

load_dotenv()

api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

client = AzureOpenAI(
    base_url=f"{api_base}/openai/deployments/{deployment_name}",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

reclassification_prompt = """
You are an expert in extracting metadata from permit images. Given an image of a permit, extract the following metadata fields:
- Permit Type: Permit type is categorized as KKPR or KKPRL
  - KKPR (Kesesuaian Kegiatan Pemanfaatan Ruang): is a location permit for land
  - KKPRL (Kesesuaian Kegiatan Pemanfaatan Ruang Laut): is a location permit for water

Return the extracted metadata in the JSON format. Do not include any additional text or explanations.
"""

class PermitType(BaseModel):
    permit_type: str = Field(..., description="The type/category of the permit is it KKPR or KKPRL.")

def base64_encoded_image(image, format: str = 'JPEG') -> str:
    """
    Generate a base64 encoded string from image object.
    
    Args:
        image: Image object (e.g., PIL.Image).
        
    Returns:
        str: Base64 encoded string of the image.
    """

    buffer = io.BytesIO()
    image.save(buffer, format=format)
    buffer.seek(0)
    encoded_str = base64.b64encode(buffer.read()).decode('utf-8')
    buffer.close()

    image_data = f"data:image/{format.lower()};base64,{encoded_str}"

    return image_data

def create_message(system_prompt: str, image_content: List[Any]) -> dict:
    """
    Create a message payload for the chat API.

    Args:
        system_prompt (str): The system prompt to include in the message.
        image_content (List[Any]): The content to include in the message.

    Returns:
        dict: The message payload.
    """
    message = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [
            {
                "type" : "text",
                "text": "Extract the metadata from the permit image."
            }
            ]
        }
    ]

    for content in image_content:
        message[1]['content'].append({"type": "image_url", "image_url": {"url" : base64_encoded_image(content)}})

    return message

In [2]:
import instructor

instructor_client = instructor.from_provider("azure_openai/gpt-4o")

FILE_PATH = "data/folder/PGN_Permits/2. KKPR-L_PGN/KKPRL/KKPRL_Pembangunan Terminal Khusus_Teluk Bima_Bima.pdf"

images = convert_from_path(FILE_PATH)
message_payload = create_message(reclassification_prompt, images)

user, completion = instructor_client.chat.completions.create_with_completion(
    messages=message_payload,
    response_model=PermitType
)

In [3]:
user

PermitType(permit_type='KKPRL')

In [25]:
import logging
import asyncio

from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log

logging.basicConfig(level=logging.INFO)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    before_sleep=before_sleep_log(logging.getLogger(), logging.WARNING)
)
async def reclassify_permit_metadata_async(file_path: str):
    loop = asyncio.get_event_loop()
    images = await loop.run_in_executor(
        None,
        lambda: convert_from_path(file_path, last_page=10) 
    )
    message_payload = create_message(reclassification_prompt, images)
    logging.info(f"Processing file: {file_path} with {len(images)} pages.")

    user, completion = await loop.run_in_executor(
        None,
        instructor_client.chat.completions.create_with_completion,
        message_payload,
        PermitType
    )

    return {
        "file_path": file_path,
        "file_name": file_path.split("/")[-1],
        "permit_type": user.permit_type,
        "usage": completion.usage
    }

In [9]:
import duckdb

query="""
    SELECT filepath, permit_type FROM document_details
    WHERE permit_type='KKPR/KKPRL';
"""

con = duckdb.connect('data/metadata_document.db')
kkpr_file = con.sql(query).df()

kkpr_file

Unnamed: 0,filepath,permit_type
0,data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TER...,KKPR/KKPRL
1,data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TER...,KKPR/KKPRL
2,data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Non...,KKPR/KKPRL
3,data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Non...,KKPR/KKPRL
4,data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Non...,KKPR/KKPRL
...,...,...
93,data/folder/PPN_Permits/KKPR/KKPR Stranas/17. ...,KKPR/KKPRL
94,data/folder/PPN_Permits/KKPR/KKPR Stranas/2. R...,KKPR/KKPRL
95,data/folder/PPN_Permits/KKPR/KKPR Stranas/8. R...,KKPR/KKPRL
96,data/folder/PPN_Permits/KKPR/KKPR Stranas/10. ...,KKPR/KKPRL


In [26]:
kkpr_reclassification = await reclassify_permit_metadata_async(kkpr_file.iloc[0]['filepath'])

INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKPR Jargas Kota Bandar Lampung.pdf with 10 pages.
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deploy

In [27]:
kkpr_reclassification

{'file_path': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKPR Jargas Kota Bandar Lampung.pdf',
 'file_name': 'KKPR Jargas Kota Bandar Lampung.pdf',
 'permit_type': 'KKPR',
 'usage': CompletionUsage(completion_tokens=8, prompt_tokens=9281, total_tokens=9289, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))}

In [28]:
tasks = [reclassify_permit_metadata_async(file_path) for file_path in kkpr_file['filepath'].tolist()]
reclassified_results = await asyncio.gather(*tasks)

INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Cluster PMO/KKPR Cluster Kota Bekasi.pdf with 3 pages.
INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Non Cluster PMO/KKPR Wijayakusuma.pdf with 4 pages.
INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR Non Cluster PMO/KKPR_Hotel Ratna_Sukabumi_Probolinggo.pdf with 4 pages.
INFO:root:Processing file: data/folder/KPI_Permits/KKPR_KKPRL/RU III Plaju/KKPR Kilang RU III Area Banyuasi 1.pdf with 5 pages.
INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPRL/KKPRL_Pembangunan Terminal Khusus_Laut Flores_Manggarai Barat.pdf with 4 pages.
INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPRL/KKPRL_Pembangunan Terminal Khusus_Laut Flores_Sumbawa.pdf with 4 pages.
INFO:root:Processing file: data/folder/PGN_Permits/2. KKPR-L_PGN/KKPRL/KKPRL_Pembangunan Terminal Khusus Selat Lombok_Lombok Barat.pdf with 4 pages.
INFO:root:Processing file: data/folder/P

In [29]:
reclassified_results

[{'file_path': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKPR Jargas Kota Bandar Lampung.pdf',
  'file_name': 'KKPR Jargas Kota Bandar Lampung.pdf',
  'permit_type': 'KKPR',
  'usage': CompletionUsage(completion_tokens=8, prompt_tokens=9281, total_tokens=9289, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))},
 {'file_path': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKKPR Jargas Kota Dumai.pdf',
  'file_name': 'KKKPR Jargas Kota Dumai.pdf',
  'permit_type': 'KKPR',
  'usage': CompletionUsage(completion_tokens=8, prompt_tokens=8373, total_tokens=8381, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_toke

In [30]:
import json

# Convert usage objects to dictionaries for JSON serialization
def serialize_metadata(metadata_list):
    serialized = []
    for item in metadata_list:
        serialized_item = item.copy()
        if 'usage' in serialized_item and serialized_item['usage'] is not None:
            serialized_item['usage'] = serialized_item['usage'].model_dump()
        serialized.append(serialized_item)
    return serialized

In [32]:
# Write metadata_result to JSON file
output_file = "data/kkpr_reclassification.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(serialize_metadata(reclassified_results), f, indent=2, ensure_ascii=False)

print(f"Metadata saved to {output_file}")

Metadata saved to data/kkpr_reclassification.json


## Metadata Update in CosmosDB

In [1]:
import json

with open("data/kkpr_reclassification.json", "r", encoding="utf-8") as f:
    reclassified_results = json.load(f)

reclassified_results

[{'file_path': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKPR Jargas Kota Bandar Lampung.pdf',
  'file_name': 'KKPR Jargas Kota Bandar Lampung.pdf',
  'permit_type': 'KKPR',
  'usage': {'completion_tokens': 8,
   'prompt_tokens': 9281,
   'total_tokens': 9289,
   'completion_tokens_details': {'accepted_prediction_tokens': None,
    'audio_tokens': 0,
    'reasoning_tokens': 0,
    'rejected_prediction_tokens': None},
   'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}},
 {'file_path': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKKPR Jargas Kota Dumai.pdf',
  'file_name': 'KKKPR Jargas Kota Dumai.pdf',
  'permit_type': 'KKPR',
  'usage': {'completion_tokens': 8,
   'prompt_tokens': 8373,
   'total_tokens': 8381,
   'completion_tokens_details': {'accepted_prediction_tokens': None,
    'audio_tokens': 0,
    'reasoning_tokens': 0,
    'rejected_prediction_tokens': None},
   'prompt_tokens_details': {'audio_tokens': 

In [2]:
import os

from dotenv import load_dotenv

from azure.cosmos import CosmosClient

load_dotenv()

cosmos_client = CosmosClient(
    url=os.getenv("COSMOS_DB_URI"),
    credential=os.getenv("COSMOS_DB_KEY")
)

database_id = "permitMetadataDB"
container_id = "permitMetadataContainer"

database = cosmos_client.get_database_client(database_id)
container = database.get_container_client(container_id)

In [3]:
file_name = reclassified_results[0]['file_name']
print(f"{file_name}")

query = """
        SELECT c.id, c.documentTitle, c.permitType, c.organization
        FROM c
        WHERE c.documentTitle = @documentTitle
        """

results = container.query_items(
    query=query,
    parameters=[
        {"name": "@documentTitle", "value": file_name}
    ],
    enable_cross_partition_query=True
)

list(results)

KKPR Jargas Kota Bandar Lampung.pdf


[{'id': '73f70259869d4940b0f18f1470b730db',
  'documentTitle': 'KKPR Jargas Kota Bandar Lampung.pdf',
  'permitType': 'KKPR',
  'organization': 'PGN'}]

In [27]:
existing_item = container.read_item('73f70259869d4940b0f18f1470b730db', "KKPR/KKPRL")
existing_item['permitType'] = reclassified_results[0]['permit_type']
container.delete_item(
    item='73f70259869d4940b0f18f1470b730db',
    partition_key="KKPR/KKPRL"
)

new_item = container.create_item(body=existing_item)

print(new_item)

{'id': '73f70259869d4940b0f18f1470b730db', 'filepath': 'data/folder/PGN_Permits/2. KKPR-L_PGN/KKPR TERBIT JARGAS MANDIRI- CGP/KKPR Jargas Kota Bandar Lampung.pdf', 'documentTitle': 'KKPR Jargas Kota Bandar Lampung.pdf', 'organization': 'PGN', 'keywords': ['2. KKPR-L_PGN', 'KKPR TERBIT JARGAS MANDIRI- CGP', 'KKPR Jargas Kota Bandar Lampung.pdf'], 'permitType': 'KKPR', 'permits': [{'issueDate': '2024-08-06', 'expirationDate': '2027-08-06', 'permitSummary': 'The permit approves the utilization of spatial planning for business activities involving the development of gas pipelines. Located in Bandar Lampung, it outlines conditions such as adhering to infrastructure regulations for residential areas and public facilities and ensuring the protection of urban planning standards. Compliance with local zoning plans and technical considerations from the city are mandatory. The permit remains valid for 3 years from the issuance date, subject to renewal contingent on land area control. Any misuse o

In [4]:
for result in reclassified_results:
    file_name = result['file_name']
    new_permit_type = result['permit_type']
    
    # Query to find the item
    query = """
        SELECT c.id, c.permitType
        FROM c
        WHERE c.documentTitle = @documentTitle
    """
    
    items = list(container.query_items(
        query=query,
        parameters=[{"name": "@documentTitle", "value": file_name}],
        enable_cross_partition_query=True
    ))
    
    if items and len(items) == 1:
        item = items[0]
        item_id = item['id']
        old_permit_type = item['permitType']
        
        # Read the full item
        existing_item = container.read_item(item_id, old_permit_type)
        
        # Update permitType
        existing_item['permitType'] = new_permit_type
        
        # Delete old and create new (since partition key is changing)
        container.delete_item(item_id, old_permit_type)
        container.create_item(body=existing_item)
        
        print(f"Updated {file_name}: {old_permit_type} -> {new_permit_type}")

Updated KKPR Jargas Kota Bandar Lampung.pdf: KKPR -> KKPR
Updated KKKPR Jargas Kota Dumai.pdf: KKPR/KKPRL -> KKPR
Updated KKPR_Hotel Ratna_Sukabumi_Probolinggo.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Wijayakusuma.pdf: KKPR/KKPRL -> KKPR
Updated KKPR_Danau Indah_Bekasi Barat.pdf: KKPR/KKPRL -> KKPR
Updated KKPRL_Pembangunan Terminal Khusus_Teluk Bima_Bima.pdf: KKPR/KKPRL -> KKPRL
Updated KKPRL_Pembangunan Terminal Khusus_Laut Flores_Sumbawa.pdf: KKPR/KKPRL -> KKPRL
Updated KKPRL_Pembangunan Terminal Khusus_Laut Flores_Manggarai Barat.pdf: KKPR/KKPRL -> KKPRL
Updated KKPRL_Pembangunan Terminal Khusus Selat Lombok_Lombok Barat.pdf: KKPR/KKPRL -> KKPRL
Updated KKPR Cluster Kota Bekasi.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Cluster Kota Jakarta Barat.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Kilang RU III Area Banyuasi 1.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Industri Kimia (KBLI 20117) Area Plaju.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Kilang RU III Area Plaju.pdf: KKPR/KKPRL -> KKPR
Updated KKPR Industr

In [6]:
from collections import Counter

query = """
    SELECT c.permitType
    FROM c
"""
results = container.query_items(
    query=query,
    enable_cross_partition_query=True,
)

permit_types = [item['permitType'] for item in results]

type_counts = Counter(permit_types)
print(type_counts)

Counter({'PLO': 76, 'KKPR': 58, 'Ijin Lingkungan': 56, 'KKPRL': 40, 'KKPR/KKPRL': 2})


In [13]:
query = """
        SELECT c.documentTitle, c.permitType, c.organization, c.filepath,
               p.issueDate, p.expirationDate, p.permitSummary, p.permitNumber, p.installation
        FROM c
        JOIN p in c.permits
        WHERE c.permitType = @permitType
        """

results = container.query_items(
    query=query,
    parameters=[
        {"name": "@permitType", "value": "KKPR/KKPRL"}
    ],
    enable_cross_partition_query=True
)

list_results = list(results)

list_results

[{'documentTitle': 'RU V Balikpapan_SK 1019 2021_Lampiran I.pdf',
  'permitType': 'KKPR/KKPRL',
  'organization': 'KPI',
  'filepath': 'data/folder/KPI_Permits/PERLING/RU V Balikpapan/RU V Balikpapan_SK 1019 2021_Lampiran I.pdf',
  'issueDate': '2021-11-11',
  'expirationDate': '2024-04-04',
  'permitSummary': 'The document outlines the environmental feasibility of operational activities and development of a refinery unit in Balikpapan, managed by PT Kilang Pertamina Internasional. It includes pre-construction and construction phases considerations such as noise and air pollution, ensuring compliance with regulations. The focus is on mitigating environmental impacts like habitat disruption, improving air/water quality, and managing waste properly, while coordinating with local authorities and implementing proper handling measures.',
  'permitNumber': 'SK.1019/MENLHK/SETJEN/PLA.4/11/2021'},
 {'documentTitle': 'SKKL - SK.319_MENLH_SETJEN_PLA.4_4_2023.pdf',
  'permitType': 'KKPR/KKPRL',
 