In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
%pip install -q pillow opencv-python
%pip install -q paddleocr paddlepaddle
%pip install -q gliner

Collecting paddleocr
  Downloading paddleocr-2.9.1-py3-none-any.whl.metadata (8.5 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecti

In [None]:
import json
from paddleocr import PaddleOCR
import cv2
import re  # Import regex for ID validation
from gliner import GLiNER

def extract_text_with_paddleocr(image_path, ocr=None):
    """
    Extract text from an image using PaddleOCR.

    Args:
        image_path: Path of the image file to read.
        ocr: Optional, already-initialised engine exposing
            ``ocr(path, cls=True)``. When omitted, a new English PaddleOCR
            engine with angle classification is created for this call, so
            existing single-argument callers behave as before. Passing an
            engine avoids re-loading the models for every image.

    Returns:
        The recognised text lines joined with newlines, or an empty string
        when the engine reports no text for the image.

    Raises:
        FileNotFoundError: If OpenCV cannot load the image at ``image_path``.
    """
    # Fail fast on an unreadable path before paying for model initialisation.
    if cv2.imread(image_path) is None:
        raise FileNotFoundError(f"Image at path '{image_path}' could not be loaded.")

    if ocr is None:
        # Enable angle classification for rotated text
        ocr = PaddleOCR(use_angle_cls=True, lang="en")

    # The path (not the decoded array) is handed to the engine so that image
    # decoding stays exactly as PaddleOCR performs it.
    results = ocr.ocr(image_path, cls=True)

    # The first page entry may be None/empty when nothing was detected;
    # treat that as "no text" instead of failing while iterating over it.
    page = results[0] if results else None
    if not page:
        return ""

    # Each detection is [box, (text, confidence)]; keep only the text.
    return "\n".join(line[1][0] for line in page)

def is_valid_id(id_text):
    """
    Validate if the ID is in the correct format: 13 digits with two dashes (xxxxx-xxxxxxx-x).

    Args:
        id_text: Candidate ID string.

    Returns:
        True only when the whole string is five, seven and one ASCII digits
        separated by dashes; False otherwise, including for non-string input.
    """
    if not isinstance(id_text, str):
        return False
    # fullmatch (instead of match + "$") rejects a trailing newline, and
    # [0-9] (instead of \d) rejects non-ASCII digit characters.
    return re.fullmatch(r"[0-9]{5}-[0-9]{7}-[0-9]", id_text) is not None

def save_to_json(data, file_path):
    """
    Save the extracted data to a JSON file.

    Args:
        data: JSON-serialisable object to write.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        None. Success or failure is reported on stdout; errors are not raised.
    """
    try:
        # Serialise before opening the file: opening in "w" mode truncates it,
        # so a serialisation error must not be able to leave a partial file
        # (or wipe a previously saved one).
        payload = json.dumps(data, indent=4)
        with open(file_path, "w", encoding="utf-8") as json_file:
            json_file.write(payload)
        print(f"Data saved to {file_path}")
    except Exception as e:
        # Best-effort by design: report the problem instead of raising.
        print(f"Error saving data to JSON: {e}")

def _collect_cnic_fields(entities):
    """Turn the predicted entity dicts into the flat record saved for one card."""
    # Labels for which only the first detection is kept, mapped to output keys.
    first_match_keys = {
        "name": "name",
        "father's name": "father's name",
        "nationality": "nationality",
        "M or F": "gender",
        # Requested from the model below, so keep its result instead of
        # silently dropping it.
        "country of stay": "country_of_stay",
    }

    record = {}
    dates = []
    for entity in entities:
        label, text = entity["label"], entity["text"]
        if label in first_match_keys:
            record.setdefault(first_match_keys[label], text)
        elif label == "id":
            # Only well-formed IDs are stored; a later valid ID replaces an
            # earlier one.
            if is_valid_id(text):
                record["id"] = text
        elif label == "date":
            dates.append(text)

    # Dates are assigned purely by detection order: birth, issuance, expiry.
    if len(dates) == 3:
        record["date_of_birth"] = dates[0]
        record["date_of_issuance"] = dates[1]
        record["date_of_expiry"] = dates[2]
    else:
        record["dates_detected"] = f"Not exactly three dates detected. Found: {len(dates)} dates."
    return record


def main(image_path="/content/test_card.png", output_path="extracted_info.json"):
    """
    OCR a CNIC card image, tag its fields with GLiNER and save them as JSON.

    Args:
        image_path: Path to the card image. Defaults to the notebook's sample
            location; replace with the path to your image file.
        output_path: Where the extracted fields are written as JSON.

    Returns:
        None. Any error is reported on stdout rather than raised.
    """
    try:
        # Extract text from the image
        extracted_text = extract_text_with_paddleocr(image_path)
        print("Extracted Text:")
        print(extracted_text)

        if extracted_text and extracted_text.strip():
            # Initialize GLiNER model
            model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
            model.eval()

            # Define labels for entity extraction
            labels = ["name", "father's name", "id", "nationality", "date", "M or F", "country of stay"]

            # Predict entities
            entities = model.predict_entities(extracted_text, labels, threshold=0.3)
        else:
            # Nothing to tag, so skip loading the model altogether.
            entities = []

        extracted_info = _collect_cnic_fields(entities)

        # Save the extracted information to a JSON file
        save_to_json(extracted_info, output_path)

    except Exception as e:
        print(f"An error occurred: {e}")

# Run the pipeline when this cell/file is executed as the top-level script.
if __name__ == "__main__":
    main()


[2024/12/16 12:43:51] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Data saved to extracted_info.json
