In [1]:
!pip install pdfplumber pytesseract pillow pandas



In [None]:
import pdfplumber
import pytesseract
from PIL import Image, ImageEnhance, ImageOps
import pandas as pd
import re

# Function to sanitize and standardize extracted text
def normalize_text(input_text):
    """
    Collapse extracted text into a single clean ASCII line.

    The substitutions run in a fixed order:
      1. every run of newlines becomes one space,
      2. runs of non-ASCII characters are dropped,
      3. any character other than a word character, whitespace,
         or one of . % : $ - is dropped,
      4. every run of whitespace becomes one space.
    Leading and trailing whitespace is removed from the result.
    """
    substitutions = (
        (r'\n+', ' '),
        (r'[^\x00-\x7F]+', ''),
        (r'[^\w\s\.\%\:\$\-]', ''),
        (r'\s+', ' '),
    )
    cleaned = input_text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Function to enhance image quality for OCR processing
def enhance_image_for_ocr(input_image, contrast_factor=2.5, scale_factor=2, invert=True):
    """
    Preprocess an image for OCR: grayscale, contrast boost, optional inversion, upscaling.

    Args:
        input_image: Image object exposing ``convert``, ``width``, ``height`` and
            ``resize`` (a PIL image in this file).
        contrast_factor: Factor passed to ``ImageEnhance.Contrast(...).enhance``.
            Defaults to 2.5.
        scale_factor: Multiplier applied to both width and height before
            resampling with the LANCZOS filter. Defaults to 2.
        invert: When True (the default), the grayscale image is inverted
            with ``ImageOps.invert``.
            NOTE(review): Tesseract's own guidance prefers dark text on a light
            background; consider ``invert=False`` for ordinary
            black-on-white pages - confirm against real input.

    Returns:
        The processed image.
    """
    processed = input_image.convert("L")  # Grayscale
    processed = ImageEnhance.Contrast(processed).enhance(contrast_factor)
    if invert:
        processed = ImageOps.invert(processed)

    # Newer Pillow releases expose the filters on Image.Resampling; older ones
    # only provide them directly on the Image module, so fall back to that.
    resampling = getattr(Image, "Resampling", Image)
    target_size = (
        int(processed.width * scale_factor),
        int(processed.height * scale_factor),
    )
    return processed.resize(target_size, resampling.LANCZOS)

# Function to perform OCR with fallback modes
def perform_ocr(pdf_file, page_idx):
    """
    Run OCR on a single PDF page and return the normalized text.

    The page is rendered at 300 DPI, preprocessed with
    ``enhance_image_for_ocr`` and passed to Tesseract with ``--psm 6``.
    If that yields only whitespace, the same image is retried with
    ``--psm 4``.

    Args:
        pdf_file: Path of the PDF, passed to ``pdfplumber.open``.
        page_idx: Zero-based index of the page to process.

    Returns:
        The OCR text after ``normalize_text``. On any failure the function
        does not raise; it returns a message of the form
        ``"Error processing OCR on Page N: <error>"``. That message is also
        printed, because callers store the return value as page text and
        the failure would otherwise go unnoticed.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            current_page = pdf.pages[page_idx]
            page_image = current_page.to_image(resolution=300).original
            processed_image = enhance_image_for_ocr(page_image)

            # PSM 6 first; fall back to PSM 4 when nothing was recognized.
            extracted_text = pytesseract.image_to_string(processed_image, config="--psm 6")
            if not extracted_text.strip():
                extracted_text = pytesseract.image_to_string(processed_image, config="--psm 4")

            print(f"OCR completed on Page {page_idx + 1}")
            return normalize_text(extracted_text)
    except Exception as error:
        # Best-effort by design: report the problem and hand back the same
        # error string callers have always received.
        error_message = f"Error processing OCR on Page {page_idx + 1}: {error}"
        print(error_message)
        return error_message

# Function to structure OCR output into readable lines
def structure_ocr_output(text):
    """
    Print one "<degree label>: <value>" line per degree label found in text.

    Degree labels and numeric values are collected independently over the
    whole text and then paired by order of appearance: the first label gets
    the first value, the second label the second value, and so on. A label
    with no value left to pair with is printed with "N/A".

    A value is an optional "$", digits, an optional decimal part and an
    optional "%" (e.g. "$1909", "1.6", "2.2%", "5%"). The decimal part is
    matched as part of the same token so that "1.6" without a percent sign
    is not split into "1" and "6", which would shift every later pairing.

    Args:
        text: Text to scan. Labels are matched without apostrophes
            (e.g. "Masters degree").

    Returns:
        None. The formatted lines are written to standard output.
    """
    degree_regex = r'(Doctoral|Professional|Masters|Bachelors|Associates|Some college|High school|Less than high school)\s(degree|diploma)'
    value_regex = r'\$?\d+(?:\.\d+)?%?'

    degree_matches = re.findall(degree_regex, text)
    value_matches = re.findall(value_regex, text)

    for position, degree_parts in enumerate(degree_matches):
        degree_title = " ".join(degree_parts)
        if position < len(value_matches):
            associated_value = value_matches[position]
        else:
            associated_value = "N/A"
        print(f"{degree_title}: {associated_value}")

# Function to extract text from specified pages of a PDF
def extract_text_from_pages(pdf_file, page_indices):
    """
    Collect text for the requested pages, keyed as "Page N" (N is 1-based).

    For each zero-based index in page_indices, the page's embedded text is
    normalized and stored. When the page has no text (or only whitespace),
    a notice is printed and the result of perform_ocr is stored instead.
    An index that raises IndexError is recorded as "Invalid page index.".

    Returns a dict mapping "Page N" to the text (or message) for that page.
    """
    results = {}
    with pdfplumber.open(pdf_file) as document:
        for idx in page_indices:
            try:
                raw_text = document.pages[idx].extract_text()
                page_key = f"Page {idx + 1}"

                if raw_text and raw_text.strip():
                    results[page_key] = normalize_text(raw_text)
                    continue

                # No usable embedded text: fall back to OCR for this page.
                print(f"No text found on {page_key}, attempting OCR...")
                results[page_key] = perform_ocr(pdf_file, idx)
            except IndexError:
                results[f"Page {idx + 1}"] = "Invalid page index."
    return results

# Function to extract tables from a PDF page
def extract_table_from_page(pdf_file, page_index):
    """
    Extract the first table on a PDF page as a pandas DataFrame.

    Only the first table returned by ``extract_tables()`` is used. Its first
    row becomes the column headers and the remaining rows become the data.
    Each cell is stripped and has internal whitespace runs collapsed to a
    single space; empty cells (``None``) become empty strings rather than
    the literal text "None".

    Args:
        pdf_file: Path of the PDF, passed to ``pdfplumber.open``.
        page_index: Zero-based index of the page to read.

    Returns:
        The table as a DataFrame. An empty DataFrame is returned when the
        page has no table (or the first table has no rows) and when any
        error occurs; in both cases a message is printed.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf_doc:
            target_page = pdf_doc.pages[page_index]
            tables = target_page.extract_tables()

            # Guard tables[0] as well: a table without rows has no header row.
            if not tables or not tables[0]:
                print(f"No tables detected on Page {page_index + 1}.")
                return pd.DataFrame()

            clean_table = [
                [
                    "" if cell is None else re.sub(r'\s+', ' ', str(cell).strip())
                    for cell in row
                ]
                for row in tables[0]
            ]
            dataframe = pd.DataFrame(clean_table[1:], columns=clean_table[0])
            print(f"Table extracted successfully from Page {page_index + 1}")
            return dataframe
    except Exception as error:
        # Best-effort by design: report the problem and return an empty frame.
        print(f"Error extracting table from Page {page_index + 1}: {error}")
        return pd.DataFrame()

# Location of the input PDF.
# NOTE(review): this is a raw string, so each "\\" stays as two backslash
# characters in the path - confirm that is intended.
pdf_file_path = r"C:\\1244\\sample.pdf.pdf"

# Page 2: pull the text (OCR fallback included) and print the degree/value lines.
print("----- Page 2: Degree Unemployment Information -----")
page_indices = [1, 5]  # Zero-based indices for pages 2 and 6
extracted_pages = extract_text_from_pages(pdf_file_path, page_indices)
if "Page 2" in extracted_pages:
    page_2_text = extracted_pages["Page 2"]
else:
    page_2_text = "No data available."
structure_ocr_output(page_2_text)

# Page 6: pull the first table and show it, or say that nothing was found.
print("\n----- Page 6: Tabular Data -----")
page_6_dataframe = extract_table_from_page(pdf_file_path, 5)
if page_6_dataframe.empty:
    print("No tabular data found on Page 6.")
else:
    print("Extracted Table Data:")
    print(page_6_dataframe)


----- Page 2: Degree Unemployment Information -----
No text found on Page 2, attempting OCR...
