In [3]:
import easyocr

reader = easyocr.Reader(['en'])  # English only
results = reader.readtext('D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg')

# Combine extracted text
extracted_text = " ".join([res[1] for res in results])
print("Extracted Text:", extracted_text)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Text: ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT


In [5]:
import easyocr
import spacy
import re
import json
import cv2

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = "D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
result = reader.readtext(image_path, detail=0)  # Extract only text lines

# Combine all text into one string
text = " ".join(result)
print("Extracted Text:\n", text)

# Patterns for Aadhaar info
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract info
aadhaar_number = re.search(aadhaar_pattern, text)
phone_number = re.search(phone_pattern, text)
dob = re.search(dob_pattern, text)
gender = re.search(gender_pattern, text)

# Use spaCy for Name (Proper Noun)
doc = nlp(text)
names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

# Create structured data
form_data = {
    "Name": names[0] if names else None,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob.group() if dob else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": text  # Later refine with address extraction
}

# Output as JSON
print(json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Text:
 ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT
{
    "Name": null,
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT"
}


In [8]:
import easyocr
import re
import json

# -------------------------
# Initialize EasyOCR
# -------------------------
reader = easyocr.Reader(['en'])

# -------------------------
# Load Aadhaar image
# -------------------------
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # detailed output: (bbox, text, prob)

# -------------------------
# Regex patterns
# -------------------------
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\b\d{2}/\d{2}/\d{4}\b"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|M|F)\b"

# -------------------------
# Extract Aadhaar Number, DOB, Gender, Phone
# -------------------------
text_lines = [text for bbox, text, prob in results]
full_text = " ".join(text_lines)
print(text_lines)
aadhaar_number = re.search(aadhaar_pattern, full_text)
dob_match = re.search(dob_pattern, full_text)
gender_match = re.search(gender_pattern, full_text)
phone_match = re.search(phone_pattern, full_text)

# -------------------------
# Find Name (text above DOB)
# -------------------------
dob_y = None
for bbox, text, prob in results:
    if re.search(dob_pattern, text):
        dob_y = bbox[0][1]  # top-left y coordinate of DOB
        break

name_candidates = []
if dob_y:
    for bbox, text, prob in results:
        if bbox[0][1] < dob_y:  # lines above DOB
            name_candidates.append(text)

# Heuristic: usually the longest line above DOB is the name
name = max(name_candidates, key=len) if name_candidates else None

# -------------------------
# Extract Address (lines after Aadhaar Number)
# -------------------------
aadhaar_y = None
for bbox, text, prob in results:
    if re.search(aadhaar_pattern, text):
        aadhaar_y = bbox[0][1]  # top-left y coordinate of Aadhaar Number
        break

address_lines = []
if aadhaar_y:
    for bbox, text, prob in results:
        if bbox[0][1] > aadhaar_y:
            address_lines.append(text)

address = ", ".join(address_lines) if address_lines else None

# -------------------------
# Build JSON output
# -------------------------
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender_match.group() if gender_match else None,
    "Phone": phone_match.group() if phone_match else None,
    "Address": address
}

# -------------------------
# Output result
# -------------------------
print("Extracted Text:\n", full_text)
print("\nStructured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


['ATTT', 'TET', 'GOVERNMENT OF INDIA', 'AADAAAR', 'Elon', 'Musk', 'Male', '28/06/1971', '789, Space', 'Colony', '4567', '8901', '2345', 'AT 31renr; A 48TT']
Extracted Text:
 ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT

Structured Data:
 {
    "Name": "GOVERNMENT OF INDIA",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": null
}


In [9]:
import easyocr
import spacy
import re
import json

# Initialize EasyOCR and spaCy
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # Get bbox, text, prob

# Extract text with positions
lines = []
for bbox, text, prob in results:
    # bbox[0][1] is top-left y coordinate
    lines.append({'text': text, 'y': bbox[0][1]})

# Combine all text for regex extraction
all_text = " ".join([line['text'] for line in lines])

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract Aadhaar, DOB, Gender, Phone
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
dob_y = None
if dob_match:
    # Find y-coordinate of DOB line
    for line in lines:
        if dob_match.group() in line['text']:
            dob_y = line['y']
            break

phone_number = re.search(phone_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Extract Name using spaCy + layout
headers = ["GOVERNMENT OF INDIA", "AADHAAR", "UNIQUE IDENTIFICATION AUTHORITY OF INDIA"]

name_candidates = []
for line in lines:
    text_upper = line['text'].upper()
    if dob_y and line['y'] < dob_y and text_upper not in headers:
        # Check if it contains a PERSON entity
        doc = nlp(line['text'])
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                name_candidates.append((ent.text, line['y']))

# Pick the candidate closest to DOB
name = None
if name_candidates:
    # Closest to DOB from above
    name = max(name_candidates, key=lambda x: x[1])[0]

# Address: everything below name & DOB as a fallback
address_candidates = []
for line in lines:
    if name and line['y'] > [y for t, y in name_candidates if t == name][0]:
        address_candidates.append(line['text'])
address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

# Output JSON
print("Structured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Structured Data:
 {
    "Name": null,
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": null
}


In [10]:
import easyocr
import spacy
import re
import json

# Initialize EasyOCR and spaCy
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # Get bbox, text, prob

# Extract text lines
lines = [text for _, text, _ in results]

# Combine all text for regex extraction
all_text = " ".join(lines)

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract Aadhaar, DOB, Gender, Phone
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Find name based on DOB position
name = None
if dob_match:
    dob_text = dob_match.group()
    if dob_text in lines:
        dob_index = lines.index(dob_text)
    else:
        # Search line that contains DOB text
        dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    
    if dob_index is not None and dob_index >= 2:
        name_candidate = lines[dob_index - 2]
        # Optional: clean name (remove unwanted words like "GOVERNMENT")
        if "GOVERNMENT" not in name_candidate.upper() and "INDIA" not in name_candidate.upper():
            name = name_candidate
        else:
            name = lines[dob_index - 1]  # fallback to previous line

# Address: take everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

# Output JSON
print("Structured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Structured Data:
 {
    "Name": "Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


In [18]:
import easyocr
import spacy
import re
import json

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # Get bbox, text, prob

# Extract text lines
lines = [text.strip() for _, text, _ in results]

# Combine all text for regex
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: two lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)

    if dob_index is not None:
        # Take two lines above DOB if available
        candidate_lines = []
        if dob_index - 2 :
            candidate_lines.append(lines[dob_index - 2])
        if dob_index - 1:
            candidate_lines.append(lines[dob_index - 1])

        # Merge and clean
        merged_text = " ".join(candidate_lines)
        # Remove headers like "GOVERNMENT OF INDIA" or "AADHAAR"
        for header in ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION"]:
            merged_text = re.sub(header, "", merged_text, flags=re.IGNORECASE).strip()

        # Use spaCy to get PERSON entity from merged text
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

        if person_names:
            name = " ".join(person_names)  # Take full name if multiple detected
        else:
            name = merged_text  # Fallback to merged text

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

# Output JSON
print("Structured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Structured Data:
 {
    "Name": "Musk Male",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


In [19]:
import easyocr
import spacy
import re
import json

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # Get bbox, text, prob

# Extract text lines
lines = [text.strip() for _, text, _ in results]

# Combine all text for regex
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: two lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)

    if dob_index is not None:
        # Take two lines above DOB if available
        candidate_lines = []
        if dob_index - 2 >= 0:
            candidate_lines.append(lines[dob_index - 2])
        if dob_index - 1 >= 0:
            candidate_lines.append(lines[dob_index - 1])

        # Merge candidate lines
        merged_text = " ".join(candidate_lines)

        # Remove gender words and headers
        merged_text = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", merged_text)
        for header in ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION"]:
            merged_text = re.sub(header, "", merged_text, flags=re.IGNORECASE).strip()

        # Use spaCy to detect PERSON names
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

        if len(person_names) > 1:
            name = " ".join(person_names)  # Combine first + last name
        elif len(person_names) == 1:
            name = person_names[0]
        else:
            # Fallback: take tokens before gender word
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])  # Assume first two tokens are name

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    # Remove Aadhaar number from address
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

# Output JSON
print("Structured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Structured Data:
 {
    "Name": "Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


In [20]:
import easyocr
import spacy
import re
import json

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(image_path)  # Get bbox, text, prob

# Extract text lines
lines = [text.strip() for _, text, _ in results]

# Combine all text for regex
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: take ALL lines above DOB, clean them, detect PERSON
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)

    if dob_index is not None:
        # Take all lines above DOB
        candidate_lines = lines[:dob_index]

        # Remove headers and gender words
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            # Remove gender
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            # Remove header keywords
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())

        # Merge into single string
        merged_text = " ".join(cleaned_lines)

        # Use spaCy to detect PERSON names
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

        if person_names:
            name = " ".join(person_names)
        else:
            # Fallback: take first two cleaned tokens
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    # Remove Aadhaar number from address
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

# Output JSON
print("Structured Data:\n", json.dumps(form_data, indent=4))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Structured Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


In [45]:
import easyocr
import spacy
import re
import json
import cv2
from PIL import ImageFont, ImageDraw, Image
import numpy as np

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Aadhaar image path
aadhar_image = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

# Read Aadhaar image text
results = reader.readtext(aadhar_image)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Extract Name (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)

    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())

        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("Extracted Data:\n", json.dumps(form_data, indent=4))

# ===================== FORM FILLING PART =====================

# Load form image
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path)
draw = ImageDraw.Draw(form_img)

# Font settings
font = ImageFont.truetype("arial.ttf", 12)  # Adjust font size

# Coordinates for fields (you need to find these by inspecting your form)
coords = {
    "Name": (138, 135),
    "Aadhaar Number": (300, 350),
    "Date of Birth": (300, 450),
    "Gender": (300, 550),
    "Phone": (300, 650),
    "Address": (300, 750)
}

# Write text on form
for key, value in form_data.items():
    if value:
        draw.text(coords[key], value, font=font, fill=(0, 0, 0))

# Save filled form
import os

# Ensure output directory exists
output_dir = r"D:\Form_automation\Output"
os.makedirs(output_dir, exist_ok=True)

# Save filled form
output_path = os.path.join(output_dir, "filled_form.jpg")
form_img.save(output_path)

print(f"✅ Form filled and saved at: {output_path}")



Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}
✅ Form filled and saved at: D:\Form_automation\Output\filled_form.jpg


In [46]:
!pip install layoutparser


Collecting layoutparser
  Downloading layoutparser-0.3.4-py3-none-any.whl.metadata (7.7 kB)
Collecting pandas (from layoutparser)
  Downloading pandas-2.3.2-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting iopath (from layoutparser)
  Downloading iopath-0.1.10.tar.gz (42 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pdfplumber (from layoutparser)
  Using cached pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdf2image (from layoutparser)
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting portalocker (from iopath->layoutparser)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting pytz>=2020.1 (from pandas->layoutparser)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->layoutparser)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting pdfminer.six==20250506 (from pdf

  DEPRECATION: Building 'iopath' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'iopath'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [54]:
import easyocr
import spacy
import re
import json
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import layoutparser as lp
from layoutparser.models.detection import Detectron2LayoutModel
# ------------------------------
# Step 1: Extract data from Aadhaar
# ------------------------------
# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Load Aadhaar image
aadhaar_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_path)

# Extract text lines
lines = [text.strip() for _, text, _ in results]

# Combine all text for regex
all_text = " ".join(lines)

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: take ALL lines above DOB, clean them, detect PERSON
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

# ------------------------------
# Step 2: Load form and detect text fields
# ------------------------------
form_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_path).convert("RGB")
image_np = np.array(form_img)

# Load PubLayNet model to detect text boxes
model = Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text"}
)

layout = model.detect(image_np)

# ------------------------------
# Step 3: Map extracted data to form fields
# ------------------------------
# You can manually map fields to boxes if form is fixed
# Sort boxes top-to-bottom
text_blocks = sorted([b for b in layout if b.type=='Text'], key=lambda x: x.block.y_1)

# Prepare draw
draw = ImageDraw.Draw(form_img)
font = ImageFont.load_default()

# Fill each field
for block in text_blocks:
    # Example: map based on position
    y_center = (block.block.y_1 + block.block.y_2) / 2
    if y_center < form_img.height * 0.2:
        # Top area -> Name
        draw.text((block.block.x_1, block.block.y_1), name or "", fill="black", font=font)
    elif y_center < form_img.height * 0.35:
        # Aadhaar Number area
        draw.text((block.block.x_1, block.block.y_1), aadhaar_number.group() if aadhaar_number else "", fill="black", font=font)
    elif y_center < form_img.height * 0.5:
        # DOB
        draw.text((block.block.x_1, block.block.y_1), dob_match.group() if dob_match else "", fill="black", font=font)
    elif y_center < form_img.height * 0.65:
        # Gender
        draw.text((block.block.x_1, block.block.y_1), gender.group() if gender else "", fill="black", font=font)
    else:
        # Address
        draw.text((block.block.x_1, block.block.y_1), address or "", fill="black", font=font)

# ------------------------------
# Step 4: Save filled form
# ------------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")

# ------------------------------
# Step 5: Optional - print JSON
# ------------------------------
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}
print(json.dumps(form_data, indent=4))


ModuleNotFoundError: No module named 'layoutparser.models.detection'

In [52]:
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.1/index.html


Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.1/index.html


ERROR: Could not find a version that satisfies the requirement detectron2 (from versions: none)
ERROR: No matching distribution found for detectron2


In [55]:
import easyocr
import spacy
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
from layoutparser.models.detection import Detectron2LayoutModel

# -----------------------------
# Step 1: Extract data from Aadhaar
# -----------------------------
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Aadhaar image path
aadhaar_img_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_img_path)

# Extract text lines
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Extract Name (lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: lines after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

# Structured data
form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# -----------------------------
# Step 2: Fill data into form using Layout Parser
# -----------------------------
# Form image path
form_img_path = r"D:\Form_automation\Form_pic\form.jpg"
form_img = Image.open(form_img_path).convert("RGB")
image_np = np.array(form_img)

# Load Detectron2 model for text detection
model = Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text"}
)

layout = model.detect(image_np)

# Sort boxes top-to-bottom
layout = sorted(layout, key=lambda b: b.coordinates[1])

# Prepare to draw text
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", size=24)  # adjust size

# Map fields manually (for fixed form)
fields = ["Name", "Date of Birth", "Aadhaar Number", "Gender", "Phone", "Address"]
field_values = [form_data.get(f) for f in fields]

for box, value in zip(layout, field_values):
    if value:
        x1, y1, x2, y2 = map(int, box.coordinates)
        draw.text((x1+5, y1+5), str(value), fill="black", font=font)

# Save filled form
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


ModuleNotFoundError: No module named 'layoutparser.models.detection'

In [58]:
import easyocr
import spacy
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
from layoutparser.models import Detectron2LayoutModel

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_image_path)

lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: all lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

# Load PubLayNet model
model = Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text"}
)

layout = model.detect(image_np)

# Sort detected boxes top-to-bottom
text_blocks = [b for b in layout if b.type=="Text"]
text_blocks = sorted(text_blocks, key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Fill Form
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

# Manual mapping: map fields in order detected
# Adjust these indices based on your form
field_order = ["Name", "Aadhaar Number", "Date of Birth", "Gender", "Phone", "Address"]

for i, field in enumerate(field_order):
    if i >= len(text_blocks):
        break
    box = text_blocks[i].block
    x, y = int(box.x_1), int(box.y_1)
    draw.text((x, y), str(form_data[field]), fill="black", font=font)

# Save filled form
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


ImportError: 
Detectron2LayoutModel requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
that match your environment. Typically the following would work for MacOS or Linux CPU machines:
    pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2' 


In [56]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.15/index.html
!pip install "layoutparser[detectron2]"


Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl.metadata (7.4 kB)
Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------------------------------- 2.5/2.5 MB 72.1 MB/s eta 0:00:00
Installing collected packages: torchaudio
Successfully installed torchaudio-2.8.0+cpu
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.15/index.html


ERROR: Could not find a version that satisfies the requirement detectron2 (from versions: none)
ERROR: No matching distribution found for detectron2






In [64]:
import easyocr
import spacy
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_image_path)

lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: all lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields (Tesseract)
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)
from layoutparser.ocr import TesseractAgent
# Tesseract-based text detection
model = TesseractAgent(languages='eng')
layout = model.detect(image_np)

# Sort detected boxes top-to-bottom
text_blocks = sorted([b for b in layout if b.type=="Text"], key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Fill Form
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

field_order = ["Name", "Aadhaar Number", "Date of Birth", "Gender", "Phone", "Address"]

for i, field in enumerate(field_order):
    if i >= len(text_blocks):
        break
    box = text_blocks[i].block
    x, y = int(box.x_1), int(box.y_1)
    draw.text((x, y), str(form_data[field]), fill="black", font=font)

# Save filled form
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


ImportError: 
TesseractAgent requires the PyTesseract library but it was not found in your environment. You can install it with pip:
`pip install pytesseract`


In [None]:
import easyocr
import spacy
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
from layoutparser.ocr import TesseractAgent

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_image_path)

lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: all lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

form_data = {
    "name": name,
    "aadhaar number": aadhaar_number.group() if aadhaar_number else None,
    "date of birth": dob_match.group() if dob_match else None,
    "gender": gender.group() if gender else None,
    "phone": phone_number.group() if phone_number else None,
    "address": address
}

print("Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields (Tesseract)
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

# Tesseract-based text detection
model = TesseractAgent(languages='eng')
layout = model.detect(image_np)

# ----------------------------
# Step 3: Map Form Fields Dynamically
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

for block in layout:
    text = block.text.lower().strip()
    x1, y1, x2, y2 = block.block.x_1, block.block.y_1, block.block.x_2, block.block.y_2

    for field, value in form_data.items():
        if field in text and value:
            # Write after the label (x2 + offset)
            draw.text((x2 + 20, y1), str(value), fill="black", font=font)

# Save filled form
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


In [72]:
import easyocr
import spacy
import re
import json
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
import pytesseract
import pytesseract

# Path to Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR"


# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"
results = reader.readtext(aadhaar_image_path)

lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract fields
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction: all lines above DOB
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields using LayoutParser + Tesseract
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

# LayoutParser TesseractAgent for structured OCR
from layoutparser.ocr import TesseractAgent
ocr_agent = TesseractAgent(languages='eng')
layout = ocr_agent.detect(image_np)

# Sort detected boxes (top-to-bottom)
text_blocks = sorted([b for b in layout if b.type == "Text"], key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Fill Form Dynamically
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

field_order = ["Name", "Aadhaar Number", "Date of Birth", "Gender", "Phone", "Address"]

for i, field in enumerate(field_order):
    if i >= len(text_blocks):
        break
    box = text_blocks[i].block
    x, y = int(box.x_1), int(box.y_1)
    value = form_data[field] if form_data[field] else ""
    draw.text((x + 150, y), value, fill="black", font=font)  # Offset to write next to label

# Save filled form
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


ImportError: 
TesseractAgent requires the PyTesseract library but it was not found in your environment. You can install it with pip:
`pip install pytesseract`


In [None]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
import pytesseract

# ----------------------------
# Step 0: Setup Tesseract
# ----------------------------
# Update this path to your Tesseract installation
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

# OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
phone_pattern = r"\b[6-9]\d{9}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1:]
    address_candidates = [t for t in address_candidates if not re.match(aadhaar_pattern, t)]
    address = ", ".join(address_candidates) if address_candidates else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Phone": phone_number.group() if phone_number else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields using LayoutParser + Tesseract
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

# LayoutParser OCR agent
from layoutparser.ocr import TesseractAgent
ocr_agent = TesseractAgent(languages='eng')
layout = ocr_agent.detect(image_np)

# Sort detected text boxes top-to-bottom
text_blocks = sorted([b for b in layout if b.type == "Text"], key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Fill Form Dynamically
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)  # Adjust font size

# Map data fields to form labels (order based on form design)
field_order = ["Name", "Aadhaar Number", "Date of Birth", "Gender", "Phone", "Address"]

for i, field in enumerate(field_order):
    if i >= len(text_blocks):
        break
    box = text_blocks[i].block
    x, y = int(box.x_1), int(box.y_1)
    value = form_data[field] if form_data[field] else ""
    draw.text((x + 150, y), value, fill="black", font=font)  # Offset to align next to label

# ----------------------------
# Step 4: Save Filled Form
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


✅ Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Phone": null,
    "Address": "789, Space, Colony, 4567, 8901, 2345, AT 31renr; A 48TT"
}


AttributeError: 'str' object has no attribute 'type'

In [None]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
import pytesseract
from layoutparser.ocr import TesseractAgent

# ----------------------------
# Step 0: Setup Tesseract Path
# ----------------------------
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)
print(all_text)
# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
phone_number = re.search(phone_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: everything after DOB
address = None
if dob_index is not None:
    # Take only the next 2 lines after DOB
    address_candidates = lines[dob_index + 1 : dob_index + 3]
    cleaned_address_lines = []
    for line in address_candidates:
        # Remove only Aadhaar number (12-digit format)
        line = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line)
        if line.strip():
            cleaned_address_lines.append(line.strip())
    # Join lines to form final address
    address = " ".join(cleaned_address_lines) if cleaned_address_lines else None

# Update form_data
form_data["Address"] = address



form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields using LayoutParser + Tesseract
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

ocr_agent = TesseractAgent(languages='eng')

# Detect layout
layout = ocr_agent.detect(image_np, return_response=False)
text_blocks = [b for b in layout if hasattr(b, "type") and b.type == "Text"]
text_blocks = sorted(text_blocks, key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Match Labels to Data
# ----------------------------
# Lowercase mapping for robust matching
label_map = {
    "name": "Name",
    "aadhaar": "Aadhaar Number",
    "aadhar": "Aadhaar Number",
    "date of birth": "Date of Birth",
    "dob": "Date of Birth",
    "gender": "Gender",
    "address": "Address"
}

draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

for block in text_blocks:
    text = block.text.strip().lower()
    matched_key = None
    for label in label_map:
        if label in text:
            matched_key = label_map[label]
            break
    if matched_key and matched_key in form_data:
        value = form_data[matched_key] if form_data[matched_key] else ""
        if value:
            x, y = int(block.block.x_1), int(block.block.y_1)
            draw.text((x + 200, y), value, fill="black", font=font)  # Offset for writing next to label

# ----------------------------
# Step 4: Save Filled Form
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT
✅ Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Address": "789, Space Colony"
}
✅ Form filled and saved at: D:\Form_automation\Output\filled_form.jpg


In [28]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from layoutparser.ocr import TesseractAgent

# ----------------------------
# Step 0: Setup Tesseract Path
# ----------------------------
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: only next 2 lines after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1 : dob_index + 3]
    cleaned_address_lines = []
    for line in address_candidates:
        line = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line)
        if line.strip():
            cleaned_address_lines.append(line.strip())
    address = " ".join(cleaned_address_lines) if cleaned_address_lines else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:\n", json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields dynamically using LayoutParser
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

ocr_agent = TesseractAgent(languages='eng')
layout = ocr_agent.detect(image_np, return_response=False)

# Filter text blocks
text_blocks = [b for b in layout if hasattr(b, "type") and b.type == "Text"]

# Sort top-to-bottom
text_blocks = sorted(text_blocks, key=lambda x: x.block.y_1)

# ----------------------------
# Step 3: Fill Form dynamically
# ----------------------------
draw = ImageDraw.Draw(form_img)
font = ImageFont.truetype("arial.ttf", 24)

# Lowercase mapping for labels
label_map = {
    "name": "Name",
    "aadhaar": "Aadhaar Number",
    "aadhar": "Aadhaar Number",
    "date of birth": "Date of Birth",
    "dob": "Date of Birth",
    "gender": "Gender",
    "address": "Address"
}

# Fill values near detected labels
for block in text_blocks:
    detected_text = block.text.strip().lower()
    for label in label_map:
        if label in detected_text:
            field_name = label_map[label]
            value = form_data.get(field_name, "")
            if value:
                # Use coordinates of the label block + offset
                x = int(block.block.x_2) + 20  # start writing a bit to the right
                y = int(block.block.y_1)
                draw.text((x, y), value, fill="black", font=font)
            break  # Stop checking other labels for this block

# ----------------------------
# Step 4: Save Filled Form
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


✅ Extracted Aadhaar Data:
 {
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Address": "789, Space Colony"
}
✅ Form filled and saved at: D:\Form_automation\Output\filled_form.jpg


In [None]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pytesseract
import cv2

# ----------------------------
# Step 0: Setup Tesseract Path
# ----------------------------
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data (Your existing code - working fine)
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

print(f"📄 Extracted text from Aadhaar: {all_text}")

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: only next 2 lines after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1 : dob_index + 3]
    cleaned_address_lines = []
    for line in address_candidates:
        line = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line)
        if line.strip():
            cleaned_address_lines.append(line.strip())
    address = " ".join(cleaned_address_lines) if cleaned_address_lines else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:")
print(json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: FIXED Form Field Detection using Tesseract directly
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"

def detect_form_fields_improved(image_path):
    """Improved form field detection using multiple methods"""
    
    # Load image
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image: {image_path}")
    
    # Convert to RGB for PIL
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(img_rgb)
    
    # Method 1: Use Tesseract to get text with bounding boxes
    print("🔍 Detecting form fields using Tesseract...")
    
    # Get detailed OCR data
    ocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
    
    text_blocks = []
    for i in range(len(ocr_data['text'])):
        text = ocr_data['text'][i].strip()
        conf = ocr_data['conf'][i]
        
        if text and conf > 30:  # Only consider confident detections
            x, y, w, h = ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i]
            text_blocks.append({
                'text': text,
                'confidence': conf,
                'bbox': (x, y, x + w, y + h),
                'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h
            })
    
    print(f"📋 Found {len(text_blocks)} text blocks in form")
    
    # Debug: Print all detected text
    print("🔤 Detected text blocks:")
    for i, block in enumerate(text_blocks):
        print(f"  {i}: '{block['text']}' at ({block['x1']}, {block['y1']}) conf: {block['confidence']}")
    
    return text_blocks, pil_img

def find_field_fill_position(text_blocks, field_keywords):
    """Find the best position to fill a field based on label detection"""
    
    for block in text_blocks:
        text_lower = block['text'].lower().strip()
        
        # Check if this block contains any of our field keywords
        for keyword in field_keywords:
            if keyword.lower() in text_lower:
                print(f"  🎯 Found label '{block['text']}' for keyword '{keyword}'")
                
                # Strategy 1: Look for colon and fill after it
                if ':' in block['text']:
                    # Fill right after the colon in the same block
                    return block['x2'] + 10, block['y1']
                
                # Strategy 2: Look for underscore or line after the label
                # Find text blocks that are on the same line (similar y coordinate)
                same_line_blocks = []
                for other_block in text_blocks:
                    if abs(other_block['y1'] - block['y1']) < 15:  # Same line tolerance
                        same_line_blocks.append(other_block)
                
                # Sort by x coordinate
                same_line_blocks.sort(key=lambda x: x['x1'])
                
                # Find our label block in the sorted list
                label_index = -1
                for i, same_block in enumerate(same_line_blocks):
                    if same_block['text'] == block['text']:
                        label_index = i
                        break
                
                # Look for space after the label
                if label_index >= 0 and label_index < len(same_line_blocks) - 1:
                    next_block = same_line_blocks[label_index + 1]
                    # If there's a significant gap, fill in that gap
                    if next_block['x1'] - block['x2'] > 50:
                        return block['x2'] + 20, block['y1']
                    # If next block might be an underscore or line, fill over it
                    elif '_' in next_block['text'] or '___' in next_block['text']:
                        return next_block['x1'] + 5, next_block['y1']
                
                # Strategy 3: Default - fill to the right of the label
                return block['x2'] + 20, block['y1']
    
    return None

# Detect form fields
text_blocks, form_img = detect_form_fields_improved(form_image_path)

# ----------------------------
# Step 3: FIXED Form Filling Logic
# ----------------------------
draw = ImageDraw.Draw(form_img)

# Try to load font
try:
    font = ImageFont.truetype("arial.ttf", 20)
    print("✅ Loaded Arial font")
except:
    try:
        font = ImageFont.truetype("calibri.ttf", 20)
        print("✅ Loaded Calibri font")
    except:
        font = ImageFont.load_default()
        print("⚠️ Using default font")

# Enhanced label mapping with multiple variations
field_keywords = {
    "Name": ["name", "full name", "applicant name", "person name", "naam"],
    "Aadhaar Number": ["aadhaar", "aadhar", "aadhaar number", "aadhar number", "uid", "unique id"],
    "Date of Birth": ["date of birth", "dob", "birth date", "date birth", "birth"],
    "Gender": ["gender", "sex", "male/female", "m/f"],
    "Address": ["address", "residence", "location", "home address", "present address"]
}

print("\n📝 Filling form fields...")

filled_count = 0

# Try to fill each field
for field_name, field_value in form_data.items():
    if field_value:  # Only fill if we have data
        print(f"\n🔍 Looking for field: {field_name} (value: {field_value})")
        
        # Get keywords for this field
        keywords = field_keywords.get(field_name, [field_name.lower()])
        
        # Find position to fill
        position = find_field_fill_position(text_blocks, keywords)
        
        if position:
            x, y = position
            print(f"  ✅ Filling '{field_name}' at position ({x}, {y})")
            
            # Draw the value
            draw.text((x, y), str(field_value), fill="blue", font=font)
            filled_count += 1
        else:
            print(f"  ❌ Could not find position for '{field_name}'")

print(f"\n🎉 Successfully filled {filled_count} out of {len([v for v in form_data.values() if v])} fields")

# ----------------------------
# Step 4: Save Filled Form with Debug Info
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"

# Optional: Draw debug rectangles around detected text blocks
# debug_mode = True  # Set to False to disable debug rectangles

# if debug_mode:
#     print("\n🐛 Adding debug rectangles...")
#     debug_draw = ImageDraw.Draw(form_img)
    
#     for i, block in enumerate(text_blocks):
#         x1, y1, x2, y2 = block['bbox']
#         # Draw red rectangle around detected text
#         debug_draw.rectangle([x1-1, y1-1, x2+1, y2+1], outline="red", width=1)
#         # Add small text with block number
#         debug_draw.text((x1, y1-15), str(i), fill="red", font=font)

# form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")

# ----------------------------
# Step 5: Additional Debugging Information
# ----------------------------
print("\n📊 Summary:")
print(f"  • Extracted {len([v for v in form_data.values() if v])} fields from Aadhaar")
print(f"  • Detected {len(text_blocks)} text blocks in form")
print(f"  • Successfully filled {filled_count} fields")

if filled_count == 0:
    print("\n🔧 Troubleshooting tips:")
    print("  1. Check if the form image is clear and readable")
    print("  2. Verify that field labels match the expected keywords")
    print("  3. Try adjusting the confidence threshold (currently 30)")
    print("  4. Check if debug rectangles appear around text in the output image")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


📄 Extracted text from Aadhaar: ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT
✅ Extracted Aadhaar Data:
{
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Address": "789, Space Colony"
}
🔍 Detecting form fields using Tesseract...
📋 Found 217 text blocks in form
🔤 Detected text blocks:
  0: 'FORM' at (294, 30) conf: 96
  1: '1:' at (336, 18) conf: 82
  2: 'Aadhaar' at (353, 30) conf: 92
  3: 'Enrolment' at (407, 30) conf: 95
  4: 'and' at (473, 18) conf: 95
  5: 'Update' at (499, 30) conf: 96
  6: 'For' at (97, 48) conf: 96
  7: '(a)' at (121, 48) conf: 94
  8: 'Resident' at (140, 48) conf: 95
  9: 'Indian,' at (193, 48) conf: 96
  10: 'or' at (238, 51) conf: 92
  11: '(b)' at (254, 48) conf: 92
  12: 'Non-Resident' at (274, 48) conf: 95
  13: 'Indian' at (355, 48) conf: 95
  14: 'having' at (397, 48) conf: 96
  15: 'Proof' at (439, 48) conf: 9

In [None]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pytesseract
import cv2

# ----------------------------
# Step 0: Setup Tesseract Path
# ----------------------------
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data (Your existing code - working fine)
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)

print(f"📄 Extracted text from Aadhaar: {all_text}")

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (all lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: only next 2 lines after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1 : dob_index + 3]
    cleaned_address_lines = []
    for line in address_candidates:
        line = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line)
        if line.strip():
            cleaned_address_lines.append(line.strip())
    address = " ".join(cleaned_address_lines) if cleaned_address_lines else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:")
print(json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: FIXED Form Field Detection using Tesseract directly
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"

def detect_form_fields_improved(image_path):
    """Improved form field detection using multiple methods"""
    
    # Load image
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image: {image_path}")
    
    # Convert to RGB for PIL
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(img_rgb)
    
    # Method 1: Use Tesseract to get text with bounding boxes
    print("🔍 Detecting form fields using Tesseract...")
    
    # Get detailed OCR data
    ocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
    
    text_blocks = []
    for i in range(len(ocr_data['text'])):
        text = ocr_data['text'][i].strip()
        conf = ocr_data['conf'][i]
        
        if text and conf > 30:  # Only consider confident detections
            x, y, w, h = ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i]
            text_blocks.append({
                'text': text,
                'confidence': conf,
                'bbox': (x, y, x + w, y + h),
                'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h
            })
    
    print(f"📋 Found {len(text_blocks)} text blocks in form")
    
    # Debug: Print all detected text
    print("🔤 Detected text blocks:")
    for i, block in enumerate(text_blocks):
        print(f"  {i}: '{block['text']}' at ({block['x1']}, {block['y1']}) conf: {block['confidence']}")
    
    return text_blocks, pil_img

def find_field_fill_position(text_blocks, field_keywords):
    """Find the best position to fill a field based on label detection"""
    
    for block in text_blocks:
        text_lower = block['text'].lower().strip()
        
        # Check if this block contains any of our field keywords
        for keyword in field_keywords:
            if keyword.lower() in text_lower:
                print(f"  🎯 Found label '{block['text']}' for keyword '{keyword}'")
                
                # Strategy 1: Look for colon and fill after it
                if ':' in block['text']:
                    # Fill right after the colon in the same block
                    return block['x2'] + 10, block['y1']
                
                # Strategy 2: Look for underscore or line after the label
                # Find text blocks that are on the same line (similar y coordinate)
                same_line_blocks = []
                for other_block in text_blocks:
                    if abs(other_block['y1'] - block['y1']) < 15:  # Same line tolerance
                        same_line_blocks.append(other_block)
                
                # Sort by x coordinate
                same_line_blocks.sort(key=lambda x: x['x1'])
                
                # Find our label block in the sorted list
                label_index = -1
                for i, same_block in enumerate(same_line_blocks):
                    if same_block['text'] == block['text']:
                        label_index = i
                        break
                
                # Look for space after the label
                if label_index >= 0 and label_index < len(same_line_blocks) - 1:
                    next_block = same_line_blocks[label_index + 1]
                    # If there's a significant gap, fill in that gap
                    if next_block['x1'] - block['x2'] > 50:
                        return block['x2'] + 20, block['y1']
                    # If next block might be an underscore or line, fill over it
                    elif '_' in next_block['text'] or '___' in next_block['text']:
                        return next_block['x1'] + 5, next_block['y1']
                
                # Strategy 3: Default - fill to the right of the label
                return block['x2'] + 20, block['y1']
    
    return None

# Detect form fields
text_blocks, form_img = detect_form_fields_improved(form_image_path)

# ----------------------------
# Step 3: FIXED Form Filling Logic
# ----------------------------
draw = ImageDraw.Draw(form_img)

# Try to load font
try:
    font = ImageFont.truetype("arial.ttf", 20)
    print("✅ Loaded Arial font")
except:
    try:
        font = ImageFont.truetype("calibri.ttf", 20)
        print("✅ Loaded Calibri font")
    except:
        font = ImageFont.load_default()
        print("⚠️ Using default font")

# Enhanced label mapping with multiple variations
field_keywords = {
    "Name": ["name", "full name", "applicant name", "person name", "naam"],
    "Aadhaar Number": ["aadhaar", "aadhar", "aadhaar number", "aadhar number", "uid", "unique id"],
    "Date of Birth": ["date of birth", "dob", "birth date", "date birth", "birth"],
    "Gender": ["gender", "sex", "male/female", "m/f"],
    "Address": ["address", "residence", "location", "home address", "present address"]
}

print("\n📝 Filling form fields...")

filled_count = 0

# Try to fill each field
for field_name, field_value in form_data.items():
    if field_value:  # Only fill if we have data
        print(f"\n🔍 Looking for field: {field_name} (value: {field_value})")
        
        # Get keywords for this field
        keywords = field_keywords.get(field_name, [field_name.lower()])
        
        # Find position to fill
        position = find_field_fill_position(text_blocks, keywords)
        
        if position:
            x, y = position
            print(f"  ✅ Filling '{field_name}' at position ({x}, {y})")
            
            # Draw the value
            draw.text((x, y), str(field_value), fill="blue", font=font)
            filled_count += 1
        else:
            print(f"  ❌ Could not find position for '{field_name}'")

print(f"\n🎉 Successfully filled {filled_count} out of {len([v for v in form_data.values() if v])} fields")

# ----------------------------
# Step 4: Save Filled Form with Debug Info
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"

# Optional: Draw debug rectangles around detected text blocks
debug_mode = True  # Set to False to disable debug rectangles

if debug_mode:
    print("\n🐛 Adding debug rectangles...")
    debug_draw = ImageDraw.Draw(form_img)
    
    for i, block in enumerate(text_blocks):
        x1, y1, x2, y2 = block['bbox']
        # Draw red rectangle around detected text
        debug_draw.rectangle([x1-1, y1-1, x2+1, y2+1], outline="red", width=1)
        # Add small text with block number
        debug_draw.text((x1, y1-15), str(i), fill="red", font=font)

# form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")

# ----------------------------
# Step 5: Additional Debugging Information
# ----------------------------
print("\n📊 Summary:")
print(f"  • Extracted {len([v for v in form_data.values() if v])} fields from Aadhaar")
print(f"  • Detected {len(text_blocks)} text blocks in form")
print(f"  • Successfully filled {filled_count} fields")

if filled_count == 0:
    print("\n🔧 Troubleshooting tips:")
    print("  1. Check if the form image is clear and readable")
    print("  2. Verify that field labels match the expected keywords")
    print("  3. Try adjusting the confidence threshold (currently 30)")
    print("  4. Check if debug rectangles appear around text in the output image")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


📄 Extracted text from Aadhaar: ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT
✅ Extracted Aadhaar Data:
{
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Address": "789, Space Colony"
}
🔍 Detecting form fields using Tesseract...
📋 Found 217 text blocks in form
🔤 Detected text blocks:
  0: 'FORM' at (294, 30) conf: 96
  1: '1:' at (336, 18) conf: 82
  2: 'Aadhaar' at (353, 30) conf: 92
  3: 'Enrolment' at (407, 30) conf: 95
  4: 'and' at (473, 18) conf: 95
  5: 'Update' at (499, 30) conf: 96
  6: 'For' at (97, 48) conf: 96
  7: '(a)' at (121, 48) conf: 94
  8: 'Resident' at (140, 48) conf: 95
  9: 'Indian,' at (193, 48) conf: 96
  10: 'or' at (238, 51) conf: 92
  11: '(b)' at (254, 48) conf: 92
  12: 'Non-Resident' at (274, 48) conf: 95
  13: 'Indian' at (355, 48) conf: 95
  14: 'having' at (397, 48) conf: 96
  15: 'Proof' at (439, 48) conf: 9

In [31]:
import easyocr
import spacy
import re
import json
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pytesseract
import cv2

# ----------------------------
# Step 0: Setup Tesseract Path
# ----------------------------
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\T077\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# ----------------------------
# Step 1: Extract Aadhaar Data
# ----------------------------
aadhaar_image_path = r"D:\Form_automation\Aadhar_pic\WhatsApp-Image-2025-04-05-at-1.57.04-PM.jpeg"

reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

results = reader.readtext(aadhaar_image_path)
lines = [text.strip() for _, text, _ in results]
all_text = " ".join(lines)
print(f"📄 Extracted text from Aadhaar: {all_text}")

# Regex patterns
aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
dob_pattern = r"\d{2}/\d{2}/\d{4}"
gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"

# Extract data
aadhaar_number = re.search(aadhaar_pattern, all_text)
dob_match = re.search(dob_pattern, all_text)
gender = re.search(gender_pattern, all_text)

# Name extraction (lines above DOB)
name = None
dob_index = None
if dob_match:
    dob_text = dob_match.group()
    dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
    if dob_index is not None:
        candidate_lines = lines[:dob_index]
        headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
        cleaned_lines = []
        for line in candidate_lines:
            line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
            for header in headers:
                line = re.sub(header, "", line, flags=re.IGNORECASE)
            if line.strip():
                cleaned_lines.append(line.strip())
        merged_text = " ".join(cleaned_lines)
        doc = nlp(merged_text)
        person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        if person_names:
            name = " ".join(person_names)
        else:
            tokens = merged_text.split()
            if tokens:
                name = " ".join(tokens[:2])

# Address: only next 2 lines after DOB
address = None
if dob_index is not None:
    address_candidates = lines[dob_index + 1 : dob_index + 3]
    cleaned_address_lines = []
    for line in address_candidates:
        line = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line)
        if line.strip():
            cleaned_address_lines.append(line.strip())
    address = " ".join(cleaned_address_lines) if cleaned_address_lines else None

form_data = {
    "Name": name,
    "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
    "Date of Birth": dob_match.group() if dob_match else None,
    "Gender": gender.group() if gender else None,
    "Address": address
}

print("✅ Extracted Aadhaar Data:")
print(json.dumps(form_data, indent=4))

# ----------------------------
# Step 2: Detect Form Fields using Tesseract
# ----------------------------
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"

def detect_form_fields_improved(image_path):
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image: {image_path}")
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(img_rgb)

    ocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
    text_blocks = []
    for i in range(len(ocr_data['text'])):
        text = ocr_data['text'][i].strip()
        conf = int(ocr_data['conf'][i])
        if text and conf > 30:
            x, y, w, h = ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i]
            text_blocks.append({
                'text': text,
                'confidence': conf,
                'bbox': (x, y, x + w, y + h),
                'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h
            })
    return text_blocks, pil_img

def find_field_fill_position(text_blocks, field_keywords):
    for block in text_blocks:
        text_lower = block['text'].lower().strip()
        for keyword in field_keywords:
            if keyword.lower() in text_lower:
                # Strategy 1: fill after colon
                if ':' in block['text']:
                    return block['x2'] + 10, block['y1']
                # Strategy 2: check line after label
                same_line_blocks = [b for b in text_blocks if abs(b['y1'] - block['y1']) < 15]
                same_line_blocks.sort(key=lambda x: x['x1'])
                label_index = next((i for i, b in enumerate(same_line_blocks) if b['text'] == block['text']), -1)
                if label_index >= 0 and label_index < len(same_line_blocks) - 1:
                    next_block = same_line_blocks[label_index + 1]
                    if next_block['x1'] - block['x2'] > 50 or '_' in next_block['text']:
                        return block['x2'] + 20, block['y1']
                return block['x2'] + 20, block['y1']
    return None

# Detect form fields
text_blocks, form_img = detect_form_fields_improved(form_image_path)

# ----------------------------
# Step 3: Fill Form Fields
# ----------------------------
draw = ImageDraw.Draw(form_img)
try:
    font = ImageFont.truetype("arial.ttf", 20)
except:
    try:
        font = ImageFont.truetype("calibri.ttf", 20)
    except:
        font = ImageFont.load_default()

field_keywords = {
    "Name": ["name", "full name", "applicant name", "person name", "naam"],
    "Aadhaar Number": ["aadhaar", "aadhar", "aadhaar number", "aadhar number", "uid", "unique id"],
    "Date of Birth": ["date of birth", "dob", "birth date", "date birth", "birth"],
    "Gender": ["gender", "sex", "male/female", "m/f"],
    "Address": ["address", "residence", "location", "home address", "present address"]
}

filled_count = 0
for field_name, field_value in form_data.items():
    if field_value:
        keywords = field_keywords.get(field_name, [field_name.lower()])
        position = find_field_fill_position(text_blocks, keywords)
        if position:
            x, y = position
            draw.text((x, y), str(field_value), fill="blue", font=font)
            filled_count += 1

# ----------------------------
# Step 4: Save Filled Form (NO RED DEBUG BOXES)
# ----------------------------
output_path = r"D:\Form_automation\Output\filled_form.jpg"
form_img.save(output_path)
print(f"✅ Form filled and saved at: {output_path}")
print(f"🎉 Successfully filled {filled_count} fields out of {len([v for v in form_data.values() if v])}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


📄 Extracted text from Aadhaar: ATTT TET GOVERNMENT OF INDIA AADAAAR Elon Musk Male 28/06/1971 789, Space Colony 4567 8901 2345 AT 31renr; A 48TT
✅ Extracted Aadhaar Data:
{
    "Name": "Elon Musk",
    "Aadhaar Number": "4567 8901 2345",
    "Date of Birth": "28/06/1971",
    "Gender": "Male",
    "Address": "789, Space Colony"
}
✅ Form filled and saved at: D:\Form_automation\Output\filled_form.jpg
🎉 Successfully filled 5 fields out of 5


In [None]:
from flask import Flask, request, jsonify, render_template
import easyocr
import spacy
import re
import os

app = Flask(__name__)

# Initialize OCR and NLP
reader = easyocr.Reader(['en'])
nlp = spacy.load("en_core_web_sm")

# Create upload folder
UPLOAD_FOLDER = 'uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

@app.route('/')
def index():
    return render_template('aadhaar_form.html')  # Render HTML page

@app.route('/extract_aadhaar', methods=['POST'])
def extract_aadhaar():
    if 'aadhaar_image' not in request.files:
        return jsonify({"error": "No image uploaded"}), 400
    
    image = request.files['aadhaar_image']
    image_path = os.path.join(UPLOAD_FOLDER, image.filename)
    image.save(image_path)
    
    # OCR extraction using EasyOCR
    results = reader.readtext(image_path)
    lines = [text.strip() for _, text, _ in results]
    all_text = " ".join(lines)
    print(all_text)
    # Regex patterns
    aadhaar_pattern = r"\b\d{4}\s\d{4}\s\d{4}\b"
    dob_pattern = r"\d{2}/\d{2}/\d{4}"
    gender_pattern = r"\b(MALE|FEMALE|Male|Female|F|M)\b"
    
    aadhaar_number = re.search(aadhaar_pattern, all_text)
    dob_match = re.search(dob_pattern, all_text)
    gender = re.search(gender_pattern, all_text)
    
    # Extract Name
    name = None
    if dob_match:
        dob_text = dob_match.group()
        dob_index = next((i for i, line in enumerate(lines) if dob_text in line), None)
        if dob_index is not None:
            candidate_lines = lines[:dob_index]
            headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
            cleaned_lines = []
            for line in candidate_lines:
                line = re.sub(r"\b(MALE|FEMALE|Male|Female|F|M)\b", "", line)
                for header in headers:
                    line = re.sub(header, "", line, flags=re.IGNORECASE)
                if line.strip():
                    cleaned_lines.append(line.strip())
            merged_text = " ".join(cleaned_lines)
            doc = nlp(merged_text)
            person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
            name = " ".join(person_names) if person_names else " ".join(merged_text.split()[:2])
    
    # Extract Address (lines after DOB)
    address = None
    if dob_match:
        dob_index = next((i for i, line in enumerate(lines) if dob_match.group() in line), None)
        if dob_index is not None:
            address_candidates = lines[dob_index + 1: dob_index + 3]
            cleaned_address = [re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '', line) for line in address_candidates if line.strip()]
            address = " ".join(cleaned_address)
    
    # Prepare response
    data = {
        "Name": name,
        "Aadhaar Number": aadhaar_number.group() if aadhaar_number else None,
        "Date of Birth": dob_match.group() if dob_match else None,
        "Gender": gender.group() if gender else None,
        "Address": address
    }
    
    return jsonify(data)

if __name__ == '__main__':
    app.run(debug=True)


In [34]:
!pip install python-dotenv


Collecting python-dotenv
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Using cached python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [4]:
from groq import Groq
import os

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# List all available models
models = client.models.list()
for m in models:
    print(m)

('data', [Model(id='meta-llama/llama-guard-4-12b', created=1746743847, object='model', owned_by='Meta', active=True, context_window=131072, public_apps=None, max_completion_tokens=1024), Model(id='llama3-70b-8192', created=1693721698, object='model', owned_by='Meta', active=True, context_window=8192, public_apps=None, max_completion_tokens=8192), Model(id='openai/gpt-oss-120b', created=1754408224, object='model', owned_by='OpenAI', active=True, context_window=131072, public_apps=None, max_completion_tokens=65536), Model(id='distil-whisper-large-v3-en', created=1693721698, object='model', owned_by='Hugging Face', active=True, context_window=448, public_apps=None, max_completion_tokens=448), Model(id='whisper-large-v3-turbo', created=1728413088, object='model', owned_by='OpenAI', active=True, context_window=448, public_apps=None, max_completion_tokens=448), Model(id='deepseek-r1-distill-llama-70b', created=1737924940, object='model', owned_by='DeepSeek / Meta', active=True, context_windo

In [2]:
!pip install flask easyocr groq


Collecting groq
  Using cached groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Using cached groq-0.31.0-py3-none-any.whl (131 kB)
Installing collected packages: groq
Successfully installed groq-0.31.0


In [5]:
!pip install mysql-connector-python


Collecting mysql-connector-python
  Downloading mysql_connector_python-9.4.0-cp312-cp312-win_amd64.whl.metadata (7.7 kB)
Downloading mysql_connector_python-9.4.0-cp312-cp312-win_amd64.whl (16.4 MB)
   ---------------------------------------- 0.0/16.4 MB ? eta -:--:--
   ------------------------ --------------- 10.2/16.4 MB 53.0 MB/s eta 0:00:01
   ---------------------------------------- 16.4/16.4 MB 49.0 MB/s eta 0:00:00
Installing collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.4.0


In [None]:
from flask import Flask, request, jsonify, render_template
import easyocr
import os
import json
import re
from groq import Groq
from dotenv import load_dotenv
from werkzeug.utils import secure_filename
import mysql.connector  # ✅ Added for DB

app = Flask(__name__)

# Initialize OCR
reader = easyocr.Reader(['en'])

UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Configure upload settings
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Load environment variables
load_dotenv()

# ✅ Database connection
db = mysql.connector.connect(
    host="localhost",
    user="root",
    password="your_password",  # Change this
    database="aadhaar_db"
)
cursor = db.cursor()

# Initialize Groq client
try:
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    print("✅ Groq API client initialized")
except Exception as e:
    print(f"❌ Error initializing Groq client: {e}")
    client = None

def allowed_file(filename):
    """Check if file extension is allowed"""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def extract_aadhaar_data_locally(text):
    """Fallback function to extract Aadhaar data using regex (if Groq fails)"""
    print("🔧 Using local extraction as fallback...")
    
    # Clean the text
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    
    # Extract patterns
    patterns = {
        'aadhaar_number': r'\b\d{4}\s?\d{4}\s?\d{4}\b',
        'date_of_birth': r'\b\d{2}/\d{2}/\d{4}\b',
        'gender': r'\b(MALE|FEMALE|Male|Female|M|F)\b'
    }
    
    extracted = {}
    
    # Extract using patterns
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            extracted[field] = match.group()
    
    # Simple name extraction (improve as needed)
    words = text.split()
    # Remove common Aadhaar card headers
    headers = ["GOVERNMENT", "INDIA", "AADHAAR", "UNIQUE", "IDENTIFICATION", "AUTHORITY", "OF"]
    clean_words = [word for word in words if word.upper() not in headers and word.isalpha() and len(word) > 2]
    
    # Take first 2-3 words as name
    name = " ".join(clean_words[:3]) if clean_words else ""
    
    # Simple address extraction (last few meaningful words)
    address_words = [word for word in words[-10:] if not re.match(r'\d{4}\s?\d{4}\s?\d{4}', word)]
    address = " ".join(address_words) if address_words else ""
    
    return {
        "name": name,
        "aadhaar_number": extracted.get('aadhaar_number', ''),
        "date_of_birth": extracted.get('date_of_birth', ''),
        "gender": extracted.get('gender', ''),
        "address": address
    }

@app.route('/')
def index():
    """Serve the main form page"""
    try:
        return render_template('aadhaar_form.html')
    except Exception as e:
        return f"Error loading template: {e}", 500

@app.route('/extract_aadhaar', methods=['POST'])
def extract_aadhaar():
    """Extract Aadhaar details from uploaded images"""
    try:
        print("📤 Received request to extract Aadhaar data")
        
        # Check if files were uploaded
        if 'aadhaar_images' not in request.files:
            return jsonify({"error": "No images uploaded"}), 400
        
        files = request.files.getlist('aadhaar_images')
        if not files or all(file.filename == '' for file in files):
            return jsonify({"error": "No images selected"}), 400
        
        combined_text = ""
        processed_files = []
        
        # Process each uploaded image
        for image in files:
            if image and allowed_file(image.filename):
                # Secure filename
                filename = secure_filename(image.filename)
                image_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
                
                # Save image
                image.save(image_path)
                processed_files.append(image_path)
                
                print(f"📸 Processing image: {filename}")
                
                # Extract text using EasyOCR
                try:
                    results = reader.readtext(image_path)
                    text = " ".join([detection[1] for detection in results])
                    combined_text += " " + text
                    print(f"📄 Extracted text: {text[:100]}...")
                except Exception as ocr_error:
                    print(f"❌ OCR Error for {filename}: {ocr_error}")
                    continue
             
        if not combined_text.strip():
            return jsonify({"error": "No text could be extracted from images"}), 400
        
        print(f"🔤 Combined text length: {len(combined_text)} characters")
        
        # Try to use Groq LLM for extraction
        extracted_data = None
        
        if client:
            try:
                print("🤖 Using Groq LLM for data extraction...")
                
                prompt = f"""
You are an expert at extracting information from Indian Aadhaar cards. 

IMPORTANT CONTEXT about Aadhaar cards:
- The cardholder's name appears prominently at the top
- Father's/Husband's name appears below with prefixes like "S/O" (Son Of), "D/O" (Daughter Of), "W/O" (Wife Of), "Father:", "Husband:"
- The cardholder's name is usually in larger font and appears first
- Father's/Husband's name is secondary information

Text from Aadhaar card: {combined_text}

Extract the following information and return ONLY valid JSON:

Required fields:
- name: The CARDHOLDER's name (NOT father's/husband's name). This is the primary name, usually appears first and largest. Ignore any name that comes after S/O, D/O, W/O, Father:, or Husband:
- aadhaar_number: 12-digit number (format: XXXX XXXX XXXX)
- date_of_birth: Date in DD/MM/YYYY format
- gender: Male/Female/Other
- address: Complete address excluding Aadhaar number

EXTRACTION RULES:
1. For NAME: Take the name that appears BEFORE any of these indicators: "S/O", "D/O", "W/O", "Father", "Husband", "Son of", "Daughter of", "Wife of"
2. Skip any text that contains government headers like "GOVERNMENT OF INDIA", "AADHAAR", "UNIQUE IDENTIFICATION"
3. The correct name is usually the first meaningful name that appears after removing headers
4. If you see multiple names, the cardholder's name comes first, parent/spouse name comes after relationship indicators

Return only valid JSON:
{{
    "name": "actual_cardholder_name_here",
    "aadhaar_number": "XXXX XXXX XXXX",
    "date_of_birth": "DD/MM/YYYY",
    "gender": "Male/Female/Other",
    "address": "complete_address_here"
}}
"""
                
                response = client.chat.completions.create(
                    model="llama-3.3-70b-versatile",
                    messages=[
                        {
                            "role": "system", 
                            "content": "You are an expert at extracting structured data from Indian Aadhaar cards. Always return valid JSON only."
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.1,
                    max_tokens=500
                )
                
                # Get the response content
                llm_response = response.choices[0].message.content.strip()
                print(f"🤖 LLM Response: {llm_response}")
                
                # Try to parse JSON from LLM response
                try:
                    # Clean the response (remove any markdown formatting)
                    if '```json' in llm_response:
                        llm_response = llm_response.split('```json')[1].split('```')[0].strip()
                    elif '```' in llm_response:
                        llm_response = llm_response.split('```')[1].strip()
                    
                    extracted_data = json.loads(llm_response)
                    print("✅ Successfully parsed LLM response")
                    
                except json.JSONDecodeError as json_error:
                    print(f"❌ JSON parsing error: {json_error}")
                    print(f"Raw response: {llm_response}")
                    extracted_data = None
                    
            except Exception as groq_error:
                print(f"❌ Groq API Error: {groq_error}")
                extracted_data = None
        
        # Fallback to local extraction if Groq fails
        if not extracted_data:
            print("🔄 Falling back to local regex extraction...")
            extracted_data = extract_aadhaar_data_locally(combined_text)
        
        # ✅ Insert into Database
        try:
            sql = """INSERT INTO aadhaar_details (name, aadhaar_number, date_of_birth, gender, address) 
                     VALUES (%s, %s, %s, %s, %s)"""
            values = (
                extracted_data.get("name", ""),
                extracted_data.get("aadhaar_number", ""),
                extracted_data.get("date_of_birth", ""),
                extracted_data.get("gender", ""),
                extracted_data.get("address", "")
            )
            cursor.execute(sql, values)
            db.commit()
            print("✅ Data saved in database")
        except Exception as db_error:
            print(f"❌ Database Error: {db_error}")
        
        # Clean up uploaded files (optional)
        for file_path in processed_files:
            try:
                os.remove(file_path)
            except:
                pass
        
        print(f"✅ Final extracted data: {extracted_data}")
        return jsonify(extracted_data)
        
    except Exception as e:
        print(f"❌ General Error: {e}")
        return jsonify({"error": f"Server error: {str(e)}"}), 500

@app.route('/health')
def health_check():
    """Health check endpoint"""
    return jsonify({
        "status": "healthy",
        "groq_available": client is not None,
        "upload_folder": os.path.exists(UPLOAD_FOLDER)
    })

if __name__ == '__main__':
    print("🚀 Starting Aadhaar Form Automation Server...")
    print(f"📁 Upload folder: {UPLOAD_FOLDER}")
    print(f"🤖 Groq API: {'Available' if client else 'Not available'}")
    
    # Create templates folder if it doesn't exist
    templates_dir = os.path.join(app.root_path, 'templates')
    if not os.path.exists(templates_dir):
        os.makedirs(templates_dir)
        print(f"📂 Created templates directory: {templates_dir}")
    
    app.run(debug=True, host='0.0.0.0', port=5000)


In [63]:
!pip install pytesseract



In [61]:
!pip install --upgrade layoutparser[ocr]


Collecting google-cloud-vision==1 (from layoutparser[ocr])
  Downloading google_cloud_vision-1.0.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting pytesseract (from layoutparser[ocr])
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting google-api-core<2.0.0dev,>=1.14.0 (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-vision==1->layoutparser[ocr])
  Downloading google_api_core-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting googleapis-common-protos<2.0dev,>=1.56.2 (from google-api-core<2.0.0dev,>=1.14.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-vision==1->layoutparser[ocr])
  Using cached googleapis_common_protos-1.70.0-py3-none-any.whl.metadata (9.3 kB)
Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<4.0.0dev,>=3.19.5 (from google-api-core<2.0.0dev,>=1.14.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-vision==1->layoutparser[ocr])
  Downloading protobuf-3.20.3-cp310-cp310-win

In [None]:
from PIL import Image, ImageDraw, ImageFont
import layoutparser as lp
from layoutparser.ocr import TesseractAgent

# Load form image
form_image_path = r"D:\Form_automation\Aadhar_pic\Aadhaar-Form-1-1.jpg.webp"
form_img = Image.open(form_image_path).convert("RGB")
image_np = np.array(form_img)

# Initialize Tesseract OCR
ocr_agent = TesseractAgent(languages='eng')

# Detect all text blocks
layout = ocr_agent.detect(image_np)
text_blocks = [b for b in layout if hasattr(b, "type") and b.type == "Text"]

# Find the "Name" label
name_block = None
for block in text_blocks:
    if "name" in block.text.strip().lower():
        name_block = block
        break

if name_block:
    # Get coordinates of the blank space near the label
    # Usually we can assume writing starts a bit to the right of the label
    x_start = int(name_block.block.x_2 + 10)  # right of label
    y_start = int(name_block.block.y_1)
    print("✅ Name field coordinates:", x_start, y_start)

    # Example: write the extracted name
    draw = ImageDraw.Draw(form_img)
    font = ImageFont.truetype("arial.ttf", 24)
    extracted_name = "Elon Musk"
    draw.text((x_start, y_start), extracted_name, fill="black", font=font)

# Save filled image
form_img.save(r"D:\Form_automation\Output\filled_form_test.jpg")


In [7]:
!conda create -n form_auto python=3.10 -y
!conda activate form_auto


Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\T077\anaconda3\envs\form_auto

  added / updated specs:
    - python=3.10


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.7.15  |       haa95532_0         127 KB
    expat-2.7.1                |       h8ddb27b_0         259 KB
    openssl-3.0.17             |       h35632f6_0         7.8 MB
    pip-25.1                   |     pyhc872135_2         1.3 MB
    python-3.10.18             |       h981015d_0        16.2 MB
    setuptools-78.1.1          |  py310haa95532_0         1.7 MB
    sqlite-3.50.2              |       hda9a48d_1        1017 KB
    tk-8.6.15                  |       hf199647_0         3.5 MB
    tzdata-2025b               |       h04d1e81_0         116 K

In [1]:
!pip install numpy==1.26.4
!pip install easyocr==1.7.2 spacy==3.8.7 opencv-python scipy scikit-image
!python -m spacy download en_core_web_sm


Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
   ---------------------------------------- 0.0/15.8 MB ? eta -:--:--
   ----------------------------- ---------- 11.5/15.8 MB 55.5 MB/s eta 0:00:01
   ---------------------------------------- 15.8/15.8 MB 47.4 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-1.26.4
Collecting easyocr==1.7.2
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting spacy==3.8.7
  Downloading spacy-3.8.7-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting opencv-python
  Using cached opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting scipy
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting scikit-image
  Downloading scikit_image-0.25.2-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting torch (from easyocr==1.7.2)
  Downloading torch-2.8.0-cp3

In [10]:
!pip install numpy==1.26.4
!pip install scipy==1.15.2
!ip install opencv-python==4.7.0.72
!pip install opencv-python-headless==4.7.0.72
!pip install easyocr==1.7.2
!pip install spacy==3.8.7
!python -m spacy download en_core_web_sm


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
Successfully installed numpy-1.26.4


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.


Collecting scipy==1.15.2
  Downloading scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading scipy-1.15.2-cp312-cp312-win_amd64.whl (40.9 MB)
   ---------------------------------------- 0.0/40.9 MB ? eta -:--:--
   ----------- ---------------------------- 12.1/40.9 MB 58.1 MB/s eta 0:00:01
   -------------------- ------------------- 21.2/40.9 MB 53.8 MB/s eta 0:00:01
   ------------------------- -------------- 26.2/40.9 MB 43.7 MB/s eta 0:00:01
   ------------------------------- -------- 32.2/40.9 MB 39.4 MB/s eta 0:00:01
   -------------------------------------- - 39.8/40.9 MB 39.6 MB/s eta 0:00:01
   ---------------------------------------- 40.9/40.9 MB 37.2 MB/s eta 0:00:00
Installing collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
      Successfully uninstalled scipy-1.13.1
Successfully installed scipy-1.15.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.2 which is incompatible.
'ip' is not recognized as an internal or external command,
operable program or batch file.


Collecting opencv-python-headless==4.7.0.72
  Downloading opencv_python_headless-4.7.0.72-cp37-abi3-win_amd64.whl.metadata (18 kB)
Downloading opencv_python_headless-4.7.0.72-cp37-abi3-win_amd64.whl (38.1 MB)
   ---------------------------------------- 0.0/38.1 MB ? eta -:--:--
   ------------ --------------------------- 12.3/38.1 MB 59.4 MB/s eta 0:00:01
   -------------------- ------------------- 19.9/38.1 MB 48.4 MB/s eta 0:00:01
   -------------------------- ------------- 25.7/38.1 MB 41.8 MB/s eta 0:00:01
   -------------------------------- ------- 30.9/38.1 MB 37.8 MB/s eta 0:00:01
   -------------------------------------- - 36.2/38.1 MB 35.4 MB/s eta 0:00:01
   ---------------------------------------- 38.1/38.1 MB 32.7 MB/s eta 0:00:00
Installing collected packages: opencv-python-headless
  Attempting uninstall: opencv-python-headless
    Found existing installation: opencv-python-headless 4.12.0.88
    Uninstalling opencv-python-headless-4.12.0.88:
      Successfully uninstalle

  You can safely remove it manually.


Collecting numpy>=1.19.0 (from spacy==3.8.7)
  Downloading numpy-2.3.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading numpy-2.3.2-cp312-cp312-win_amd64.whl (12.8 MB)
   ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
   ---------------------------------------  12.6/12.8 MB 56.5 MB/s eta 0:00:01
   ---------------------------------------- 12.8/12.8 MB 53.5 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-2.3.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.3.2 which is incompatible.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.2 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.2 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.2 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.2 which is incompatible.
Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "c:\Users\T077\anaconda3\Lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "c:\Users\T077\anaconda3\Lib\site-