## 1. doc 

In [5]:
import re
import os
import sys 
from docx import Document
import pdfplumber

# Word COM automation exists only on Windows. `win32` defaults to None and is
# replaced by the pywin32 client module only when it can actually be imported.
win32 = None
if sys.platform == 'win32':
    try:
        import win32com.client as win32
    except ImportError:
        # A failed import leaves the None default in place.
        print("Warning: The 'pywin32' library is not installed. .doc files cannot be processed.")
        print("To enable .doc support on Windows, run: pip install pywin32")
    
def extract_paragraphs_from_doc(doc_path):
    """
    Extract the non-empty paragraphs of a .doc file using MS Word automation.

    Requires Windows with 'pywin32' installed, i.e. the module-level ``win32``
    must not be None. Otherwise a notice is printed and nothing is extracted.

    Args:
        doc_path: Path to the .doc file. It is converted to an absolute path
            before being passed to Word.

    Returns:
        A list of stripped, non-empty paragraph strings, or an empty list when
        pywin32 is unavailable or Word fails to process the file.
    """
    if win32 is None:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        print("Please manually save it as .docx or .pdf to analyze.")
        return []

    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        # Get the full absolute path, which COM objects often require
        abs_path = os.path.abspath(doc_path)
        doc = word.Documents.Open(abs_path)
        # Read each paragraph's text once instead of twice; every access goes
        # through the Word COM object.
        stripped_texts = (p.Range.Text.strip() for p in doc.Paragraphs)
        return [text for text in stripped_texts if text]
    except Exception as e:
        print(f"Error processing .doc file with MS Word: {e}")
        return []
    finally:
        # Nested try/finally: Word must be told to quit even if closing the
        # document raises, otherwise the hidden Word instance keeps running.
        try:
            if doc is not None:
                doc.Close(False)  # Close the document without saving changes
        finally:
            if word is not None:
                word.Quit()  # Quit the Word application

# Demo: run the .doc extractor on a sample file given by a relative path.
# The bare call on the last line lets the notebook display the returned list.
doc_file = "7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc"
extract_paragraphs_from_doc(doc_file)

Skipping .doc file '7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc' as 'pywin32' is not available on this system.
Please manually save it as .docx or .pdf to analyze.


[]

## 2. docx

In [6]:
def extract_paragraphs_from_docx(doc_path):
    """
    Collect the non-blank text blocks of a .docx file.

    Body paragraphs are gathered first, followed by the text of every
    non-blank table cell (tables, rows and cells in iteration order).

    Args:
        doc_path: Path to the .docx file.

    Returns:
        A list of text strings, or an empty list if anything goes wrong while
        reading (the error is printed).
    """
    try:
        document = Document(doc_path)
        collected = []
        for paragraph in document.paragraphs:
            if paragraph.text.strip():
                collected.append(paragraph.text)
        # Table content is appended after all body paragraphs.
        for table in document.tables:
            for row in table.rows:
                collected.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )
        return collected
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []





# Demo: run the .docx extractor on a sample file given by a relative path.
# The bare call on the last line lets the notebook display the returned list.
docx_file2 = "s5NpVVwHUlH3SB7MbcSmcgkfuHK796YdA4F94ZQd.docx"
extract_paragraphs_from_docx(docx_file2)


Error reading DOCX file s5NpVVwHUlH3SB7MbcSmcgkfuHK796YdA4F94ZQd.docx: Package not found at 's5NpVVwHUlH3SB7MbcSmcgkfuHK796YdA4F94ZQd.docx'


[]

## 3. pdf

#### 3.1 without reconstruction

In [7]:
def extract_paragraphs_from_pdf(pdf_path):
    """
    Extract paragraphs from a PDF using word coordinates, page by page.

    On each page, words sharing the same top coordinate (rounded to two
    decimals) form a line, and lines are ordered top to bottom. A new
    paragraph starts whenever the vertical distance to the previous line
    exceeds 1.5 times the page's average line distance. Paragraphs are not
    joined across pages.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list of paragraph strings. An empty list is returned when no text
        was found or the file could not be parsed, matching the other
        extractors in this file. The printed messages mention OCR, but no
        OCR fallback is implemented in this function.
    """
    all_paragraphs = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                words = page.extract_words(keep_blank_chars=False)
                if not words:
                    continue

                # Bucket words into lines keyed by their rounded top coordinate.
                lines = {}
                for word in words:
                    lines.setdefault(round(word['top'], 2), []).append(word)

                # Lines top to bottom; words within a line left to right.
                tops = sorted(lines)
                texts = [
                    " ".join(w['text'] for w in sorted(lines[top], key=lambda w: w['x0']))
                    for top in tops
                ]

                # Vertical distance between each pair of consecutive lines.
                gaps = [below - above for above, below in zip(tops, tops[1:])]
                # A single-line page has no gaps; 12 is the fallback height.
                avg_line_height = sum(gaps) / len(gaps) if gaps else 12
                paragraph_break_threshold = avg_line_height * 1.5

                current_lines = [texts[0]]
                for gap, text in zip(gaps, texts[1:]):
                    if gap > paragraph_break_threshold:
                        all_paragraphs.append(" ".join(current_lines))
                        current_lines = [text]
                    else:
                        current_lines.append(text)
                all_paragraphs.append(" ".join(current_lines))
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}. Attempting OCR...")
        return []

    if all_paragraphs:
        print("Successfully extracted paragraphs using coordinate-based method.")
        return all_paragraphs
    print("Coordinate-based method failed. Attempting OCR...")
    return []

# Demo: run the per-page PDF extractor on a sample file given by a relative path.
pdf_file = "0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf" 
extract_paragraphs_from_pdf(pdf_file)

Coordinate-based parsing failed: [Errno 2] No such file or directory: '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf'. Attempting OCR...


#### 3.2 with reconstruction

In [8]:
def _reconstruct_paragraphs_from_page(page):
    """Helper function to reconstruct paragraphs on a single page using coordinates."""
    words = page.extract_words(keep_blank_chars=False, x_tolerance=2)
    if not words: return []
    lines = {};
    for word in words:
        line_top = round(word['top'], 2)
        if line_top not in lines: lines[line_top] = []
        lines[line_top].append(word)
    for line_top in lines: lines[line_top].sort(key=lambda w: w['x0'])
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    reconstructed_lines = []; line_heights = []; last_top = None
    for top, words_in_line in sorted_lines:
        text = " ".join(w['text'] for w in words_in_line)
        reconstructed_lines.append({'top': top, 'text': text})
        if last_top is not None: line_heights.append(top - last_top)
        last_top = top
    if not reconstructed_lines: return []
    avg_line_height = sum(line_heights) / len(line_heights) if line_heights else 12
    paragraph_break_threshold = avg_line_height * 1.5
    page_paragraphs = []; current_paragraph = reconstructed_lines[0]['text']
    for i in range(1, len(reconstructed_lines)):
        prev_line, curr_line = reconstructed_lines[i-1], reconstructed_lines[i]
        if (curr_line['top'] - prev_line['top']) > paragraph_break_threshold:
            page_paragraphs.append(current_paragraph)
            current_paragraph = curr_line['text']
        else:
            current_paragraph += " " + curr_line['text']
    page_paragraphs.append(current_paragraph)
    return page_paragraphs

def extract_paragraphs_from_pdf(pdf_path):
    """
    Extract paragraphs from a PDF, stitching paragraphs split across pages.

    Each page is split into paragraphs by ``_reconstruct_paragraphs_from_page``.
    If the last paragraph on a page does not end with one of a fixed set of
    closing punctuation characters, it is treated as unfinished. It is then
    prepended, separated by a space, to the first paragraph of the next page
    that yields text. This is a heuristic: a final paragraph without such
    punctuation is always merged into the following page.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list of paragraph strings. An empty list is returned when no text
        was found or the file could not be parsed, matching the other
        extractors in this file. The printed messages mention OCR, but no
        OCR fallback is implemented in this function.
    """
    # Endings that mark the last paragraph of a page as complete.
    closing_marks = ('.', '?', '!', '"', "'", ')', ':', ';')

    all_paragraphs = []
    carry_over_paragraph = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                paragraphs_on_page = _reconstruct_paragraphs_from_page(page)
                if not paragraphs_on_page:
                    # Empty page: keep any carry-over for the next page with text.
                    continue

                # Stitch the unfinished paragraph from the previous page onto
                # the first paragraph of this one.
                if carry_over_paragraph:
                    paragraphs_on_page[0] = carry_over_paragraph + " " + paragraphs_on_page[0]
                    carry_over_paragraph = ""

                # Hold back the last paragraph if it looks unfinished.
                if not paragraphs_on_page[-1].strip().endswith(closing_marks):
                    carry_over_paragraph = paragraphs_on_page.pop()

                all_paragraphs.extend(paragraphs_on_page)
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}. Attempting OCR...")
        return []

    # Add any final carry-over from the very last page
    if carry_over_paragraph:
        all_paragraphs.append(carry_over_paragraph)

    if all_paragraphs:
        print("Successfully extracted paragraphs using coordinate-based method.")
        return all_paragraphs
    print("Coordinate-based method failed. Attempting OCR as last resort...")
    return []


# Demo: run the cross-page-stitching PDF extractor on the same sample file.
pdf_file = "0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf" 
extract_paragraphs_from_pdf(pdf_file)

Coordinate-based parsing failed: [Errno 2] No such file or directory: '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf'. Attempting OCR...
