In [None]:
!pip install pytesseract
!pip install pymupdf langdetect
!apt-get -q install -y tesseract-ocr


Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.4
Reading package lists...
Building dependency tree...
Reading state information...
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
# Install missing packages
!pip install pymupdf langdetect

import os
import re
import csv
import cv2
import fitz  # PyMuPDF
import numpy as np
import pytesseract
from langdetect import detect

# Install tesseract OCR if not already
!apt-get -q install -y tesseract-ocr pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


Reading package lists...
E: Unsupported file /usr/bin/tesseract given on commandline


In [None]:
# -----------------------
# Install dependencies (only once in Colab)
# -----------------------
!pip install pymupdf opencv-python-headless pytesseract pandas openpyxl

import os
import re
import csv
import fitz  # PyMuPDF
import cv2
import numpy as np
import pytesseract
import pandas as pd

# -----------------------
# Quality Check Functions
# -----------------------

def check_legibility(page_img, doc_id, page_num):
    """Check if text is faint/blurred (low contrast/sharpness)."""
    gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
    if laplacian_var < 50:
        return {
            "document_id": doc_id,
            "check_name": "Legibility",
            "severity": "High",
            "evidence": f"Page {page_num}",
            "confidence": 0.9
        }
    return None


def check_skew(page_img, doc_id, page_num):
    """Detect skew/rotation using Tesseract OSD."""
    try:
        osd = pytesseract.image_to_osd(page_img)
        rotation = int(re.search(r"Rotate: (\d+)", osd).group(1))
        if rotation != 0:
            return {
                "document_id": doc_id,
                "check_name": "Skew",
                "severity": "Medium",
                "evidence": f"Page {page_num} rotated {rotation}°",
                "confidence": 0.85
            }
    except Exception:
        return {
            "document_id": doc_id,
            "check_name": "Skew",
            "severity": "Low",
            "evidence": f"Page {page_num} check failed",
            "confidence": 0.5
        }
    return None


def check_missing_content(page, doc_id, page_num):
    """Check for missing/very little text."""
    text = page.get_text()
    if len(text.strip()) < 20:
        snippet = text.strip()[:50]
        return {
            "document_id": doc_id,
            "check_name": "Missing Content",
            "severity": "High",
            "evidence": f"Page {page_num}, snippet: '{snippet}'",
            "confidence": 0.8
        }
    return None

# -----------------------
# Main Processing Function
# -----------------------

def process_pdfs(input_dir, output_csv="qc_results.csv", output_excel="qc_results.xlsx"):
    """Run QC checks on all PDFs in input_dir and save results to CSV & Excel."""
    results = []

    for file in os.listdir(input_dir):
        if not file.lower().endswith(".pdf"):
            continue

        doc_id = file
        filepath = os.path.join(input_dir, file)
        doc = fitz.open(filepath)

        for page_num, page in enumerate(doc, start=1):
            # Convert page to image (numpy)
            pix = page.get_pixmap(dpi=150)
            page_img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n == 1:  # grayscale → BGR
                page_img = cv2.cvtColor(page_img, cv2.COLOR_GRAY2BGR)

            # Run checks
            for check_func in [check_legibility, check_skew]:
                issue = check_func(page_img, doc_id, page_num)
                if issue:
                    results.append(issue)

            issue = check_missing_content(page, doc_id, page_num)
            if issue:
                results.append(issue)

        doc.close()

    # Save as CSV + Excel
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    df.to_excel(output_excel, index=False)
    print(f"QC completed. Results saved to {output_csv} and {output_excel}")

    return df




In [None]:
# Place your PDFs in /content
df = process_pdfs("/content")

# Preview first rows
df.head()


QC completed. Results saved to qc_results.csv and qc_results.xlsx


Unnamed: 0,document_id,check_name,severity,evidence,confidence
0,blank_page.pdf,Legibility,High,Page 1,0.9
1,blank_page.pdf,Skew,Low,Page 1 check failed,0.5
2,blank_page.pdf,Missing Content,High,"Page 1, snippet: ''",0.8


In [None]:
# =============================
# PDF QC Prototype in Colab
# =============================

!apt-get install -y poppler-utils tesseract-ocr > /dev/null
!pip install pdf2image pytesseract pdfplumber langdetect opencv-python-headless pandas PyPDF2 > /dev/null

import os
import re
import cv2
import pytesseract
import pdfplumber
import pandas as pd
from pdf2image import convert_from_path
from langdetect import detect



In [17]:
# -----------------------------
# Helper functions
# -----------------------------

def variance_of_laplacian(image):   #  This function calculates the variance of the Laplacian of an image. It is a focus measure:
                                      # High value → Sharp image
                                      # Low value → Blurry image
    """Check image blur: lower values = blurrier."""
    return cv2.Laplacian(image, cv2.CV_64F).var()

def check_legibility(page_img, thr=150):          #This function checks if a scanned page/image is legible or blurry.
    gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
    score = variance_of_laplacian(gray)
    return score < thr, score

def check_blank_page(page_img, thr=0.99):     #This function detects blank pages by checking if more than ~99% of the pixels are white.
    gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
    ratio_white = (gray > 245).mean()
    return ratio_white > thr, ratio_white

def check_orientation(page_img):      #This function uses Tesseract’s OSD to detect if a page image is rotated (90°, 180°, or 270°).
    try:
        osd = pytesseract.image_to_osd(page_img)
        rot = int(re.search("Rotate: (\d+)", osd).group(1))
        return rot != 0, rot
    except:
        return False, 0

def check_signatures(text):        # To check if a contract, form, or document contains signature evidence.
    keywords = ["signature", "signed", "authorized", "seal"]
    found_kw = [kw for kw in keywords if kw.lower() in text.lower()]
    return bool(found_kw), found_kw

def check_draft_vs_final(text, filename):    # Flags inconsistencies between document content and filename

    draft = "draft" in text.lower()
    final_name = "final" in filename.lower()
    if draft and final_name:
        return True
    return False

def detect_language(text):  # Checks if the document is in English or some other language.
    try:
        lang = detect(text)
        return lang != "en", lang
    except:
        return False, "unknown"




  rot = int(re.search("Rotate: (\d+)", osd).group(1))


In [18]:
# -----------------------------
# QC main function
# -----------------------------

def process_pdfs(input_dir, output_csv="qc_results.csv"):
    results = []

    for file in os.listdir(input_dir):
        if not file.endswith(".pdf"):
            continue

        doc_id = file
        filepath = os.path.join(input_dir, file)
        print(f"Processing {file}...")

        try:
            pages = convert_from_path(filepath, dpi=150)
            with pdfplumber.open(filepath) as pdf:
                for i, (page_img, page) in enumerate(zip(pages, pdf.pages), start=1):
                    # Convert PIL to cv2
                    img = cv2.cvtColor(np.array(page_img), cv2.COLOR_RGB2BGR)
                    text = page.extract_text() or ""

                    # Check 1: Legibility
                    bad_legibility, score = check_legibility(img)
                    if bad_legibility:
                        results.append([doc_id, "Legibility (blurry)", "High", f"Page {i}", round(score, 2)])

                    # Check 2: Blank page
                    is_blank, ratio = check_blank_page(img)
                    if is_blank:
                        results.append([doc_id, "Blank Page", "Medium", f"Page {i}", round(ratio, 2)])

                    # Check 3: Orientation
                    rotated, rot = check_orientation(img)
                    if rotated:
                        results.append([doc_id, "Orientation Issue", "Medium", f"Page {i} rotated {rot}°", rot])

                    # Check 4: Signatures
                    has_sig, kws = check_signatures(text)
                    if "signature" in text.lower() and not has_sig:
                        results.append([doc_id, "Missing Signature", "High", f"Page {i}", 0.7])

                    # Check 5: Draft vs Final
                    if check_draft_vs_final(text, file):
                        results.append([doc_id, "Draft vs Final Conflict", "High", f"Page {i}", 0.9])

                    # Check 6: Language
                    non_eng, lang = detect_language(text)
                    if non_eng:
                        results.append([doc_id, "Non-English Language", "High", f"Page {i}", lang])

        except Exception as e:
            results.append([doc_id, "Processing Error", "High", str(e), 0.0])

    df = pd.DataFrame(results, columns=["document_id", "check_name", "severity", "evidence", "confidence"])
    df.to_csv(output_csv, index=False)
    print(f"QC results saved to {output_csv}")
    return df



In [21]:
# -----------------------------
# Run on uploaded PDFs
# -----------------------------
import numpy as np
from google.colab import files

print("Upload PDFs for QC...")
uploaded = files.upload()

os.makedirs("pdfs", exist_ok=True)
for fn in uploaded.keys():
    os.rename(fn, f"pdfs/{fn}")

df = process_pdfs("pdfs", "qc_results.csv")

import pandas as pd
df.head()

Upload PDFs for QC...


Saving blank_page - Copy.pdf to blank_page - Copy.pdf
Saving draft_document - Copy.pdf to draft_document - Copy.pdf
Saving final_contract_signed - Copy.pdf to final_contract_signed - Copy.pdf
Saving frontend....pdf to frontend....pdf
Saving missing_signature - Copy.pdf to missing_signature - Copy.pdf
Saving spanish_letter - Copy.pdf to spanish_letter - Copy.pdf
Saving Usha_s_Resume_C.pdf to Usha_s_Resume_C.pdf
Saving USHASREE AADHAAR.pdf to USHASREE AADHAAR (1).pdf
Processing frontend....pdf...
Processing USHASREE AADHAAR (1).pdf...
Processing spanish_letter.pdf...
Processing usha PAN.pdf...
Processing final_contract_signed - Copy.pdf...
Processing blank_page - Copy.pdf...
Processing USHASREE AADHAAR.pdf...
Processing final_contract_signed.pdf...
Processing draft_document.pdf...
Processing missing_signature.pdf...
Processing missing_signature - Copy.pdf...
Processing draft_document - Copy.pdf...
Processing Usha_s_Resume_C.pdf...
Processing spanish_letter - Copy.pdf...
QC results saved 

Unnamed: 0,document_id,check_name,severity,evidence,confidence
0,USHASREE AADHAAR (1).pdf,Processing Error,High,Unable to get page count.\nCommand Line Error:...,0.0
1,spanish_letter.pdf,Blank Page,Medium,Page 1,1.0
2,spanish_letter.pdf,Non-English Language,High,Page 1,es
3,usha PAN.pdf,Processing Error,High,Unable to get page count.\nCommand Line Error:...,0.0
4,final_contract_signed - Copy.pdf,Blank Page,Medium,Page 1,1.0


In [22]:
from google.colab import files
files.download("qc_results.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## README (short)

PDF Document QC Prototype

This prototype checks PDFs for basic document quality issues.




### Implemented QC checks:

Blurry/low-resolution pages (OpenCV Laplacian variance).

Blank/cut-off pages (OCR + pixel density).

Orientation (Tesseract OSD).

Signature presence (keyword + ink detection).

Draft vs Final conflict (text vs filename).

(Optional) Dates near signatures.


### Libraries used:

pdfplumber, PyPDF2, pdf2image, pytesseract, opencv-python, langdetect, pandas