In [1]:
# Install dependencies with %pip so they land in the environment of the running
# kernel (a bare !pip may target a different interpreter).
# paddleocr is pinned to 2.9.1, the version this notebook was last executed with:
# the code below uses the 2.x call style (use_angle_cls=..., ocr(..., cls=True)).
# NOTE(review): newer major releases may not accept these arguments; confirm
# before relaxing the pin.
%pip install -q paddleocr==2.9.1
%pip install -q paddlepaddle
%pip install -q spacy openpyxl
# The small English spaCy model is loaded by extract_personal_details().
!python -m spacy download en_core_web_sm
%pip install -q PyMuPDF
%pip install -q gliner-spacy

Collecting paddleocr
  Downloading paddleocr-2.9.1-py3-none-any.whl.metadata (8.5 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting albumentations==1.4.10 (from paddleocr)
  Downloading albumentations-1.4.10-py3-none-any.whl.metadata (38 

In [2]:
from paddleocr import PaddleOCR
import spacy
from gliner_spacy.pipeline import GlinerSpacy
import openpyxl
from openpyxl.styles import Font

def extract_text_from_image(image_path, ocr=None):
    """
    Extracts text from an image using PaddleOCR.

    Parameters:
        image_path (str): Path to the image file.
        ocr (optional): An already-initialised OCR engine exposing
            ``ocr(image_path, cls=True)``. When omitted, a new English
            PaddleOCR engine with angle classification is created, exactly as
            before. Passing one in avoids rebuilding the engine on every call.

    Returns:
        str: Recognised text lines joined with newlines, or an empty string
        when nothing was recognised.
    """
    # Build the default engine only when the caller did not supply one
    if ocr is None:
        ocr = PaddleOCR(use_angle_cls=True, lang='en')

    # Perform OCR on the image; the first element holds the lines of the image
    results = ocr.ocr(image_path, cls=True)

    # An image without any recognised text can come back as [None] (or an empty
    # result); iterating that unconditionally would raise, so report "no text".
    if not results or not results[0]:
        return ""

    # Each line is (box, (text, confidence)); keep only the recognised text
    return "\n".join(line[1][0] for line in results[0])

def _clean_age(raw_age):
    """Reduce an "old" (age) entity span to at most two characters, preferring its digits."""
    compact = "".join(raw_age.split())  # Remove all whitespace from the span
    digits = "".join(ch for ch in compact if "0" <= ch <= "9")
    # Prefer the digits so spans such as "7 yrs" or "Age: 31" give "7" / "31"
    # rather than "7y" / "Ag"; a span without digits is truncated as before.
    return (digits or compact)[:2]


def extract_personal_details(text, nlp=None):
    """
    Extracts personal details using GlinerSpacy.

    Parameters:
        text (str): Input text to process.
        nlp (optional): A ready-to-use pipeline: any callable that takes the
            text and returns an object whose ``ents`` have ``label_`` and
            ``text``. When omitted, "en_core_web_sm" is loaded and a
            "gliner_spacy" pipe with the default labels is added, exactly as
            before. Passing one in avoids reloading the models on every call.

    Returns:
        dict: Extracted personal details keyed by label. Only the first
        occurrence of each label is kept. Values are stripped of surrounding
        whitespace, except "old" (age), which is reduced to at most two
        characters (its digits when it has any).
    """
    if nlp is None:
        # Load the spaCy model and add the GlinerSpacy pipe for the target labels
        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe("gliner_spacy", config={"labels": [
            "name", "phone no.", "old", "email", "school education",
            "role", "college", "year graduated", "birthdate"
        ]})

    # Process the input text with the pipeline
    doc = nlp(text)

    personal_details = {}
    for ent in doc.ents:
        # Only take the first occurrence of each label
        if ent.label_ in personal_details:
            continue
        if ent.label_ == "old":  # Special handling for "old" (age)
            personal_details[ent.label_] = _clean_age(ent.text)
        else:
            personal_details[ent.label_] = ent.text.strip()

    return personal_details

def save_to_excel_rowwise(details, output_file):
    """
    Writes the extracted details to a new Excel workbook as two rows.

    Row 1 holds the labels (dictionary keys) in bold, row 2 holds the matching
    values. The workbook is saved to ``output_file`` and a confirmation line is
    printed.

    Parameters:
        details (dict): Extracted details categorized by label.
        output_file (str): Path to the Excel file to save data.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row: one column per label, rendered in bold
    ws.append(list(details))
    header_font = Font(bold=True)
    for header_cell in ws[1]:
        header_cell.font = header_font

    # Data row: the values, in the same column order as the labels
    ws.append(list(details.values()))

    wb.save(output_file)
    print(f"Details saved to Excel file: {output_file}")

if __name__ == "__main__":
    # Input image (replace with the path to your uploaded image) and the name
    # of the Excel file that will receive the results
    image_path = "/content/test_form.png"
    output_excel_file = "extracted_details_rowwise.xlsx"

    # Step 1: run OCR on the image and show the raw text
    extracted_text = extract_text_from_image(image_path)
    print("Extracted Text from Image:", extracted_text, sep="\n")

    # Step 2: pull the personal details out of the OCR text and list them,
    # one "label: value" pair per line
    print("\nExtracted Personal Details:")
    personal_details = extract_personal_details(extracted_text)
    for label, entity in personal_details.items():
        print(label, entity, sep=": ")

    # Step 3: write the details to Excel (labels in a bold first row)
    save_to_excel_rowwise(personal_details, output_excel_file)


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:17<00:00, 226.27it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:20<00:00, 477.97it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:16<00:00, 127.77it/s]


[2025/01/20 14:50:47] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/792M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


name: Marion R
birthdate: 02/19/2055
email: marionlyons@email.com
phone no.: 222 555 7777
role: Junior Writer
school education: High School
college: N.Armatrong College
year graduated: 2075
Details saved to Excel file: extracted_details_rowwise.xlsx
