In [None]:
import json
import pandas as pd

# Update these paths as needed
json_input_path = "zer/analyzeDocResponse.json"
excel_output_path = "zer/Extracted_Tables.xlsx"

# Load the Textract response that was saved as JSON
with open(json_input_path, "r", encoding="utf-8") as f:
    textract_data = json.load(f)

# Index every block by Id so relationship ids can be resolved with one lookup
blocks = textract_data["Blocks"]
block_map = {block["Id"]: block for block in blocks}


def _child_blocks(parent, block_type):
    """Yield the blocks listed in *parent*'s CHILD relationships whose BlockType is *block_type*."""
    # "Relationships" may be absent (or null) on blocks without children.
    for rel in parent.get("Relationships") or []:
        if rel["Type"] != "CHILD":
            continue
        for child_id in rel["Ids"]:
            child = block_map[child_id]
            if child["BlockType"] == block_type:
                yield child


# One entry per TABLE block; each entry is a list of
# {"row": RowIndex, "col": ColumnIndex, "text": joined WORD texts} dicts.
tables = []
for block in blocks:
    if block["BlockType"] != "TABLE":
        continue
    table_cells = [
        {
            "row": cell["RowIndex"],
            "col": cell["ColumnIndex"],
            "text": " ".join(
                word["Text"] for word in _child_blocks(cell, "WORD")
            ).strip(),
        }
        for cell in _child_blocks(block, "CELL")
    ]
    tables.append(table_cells)

# Write tables to Excel. The context manager saves and closes the workbook
# even if building one of the sheets raises.
with pd.ExcelWriter(excel_output_path, engine="xlsxwriter") as writer:
    for idx, cells in enumerate(tables):
        if not cells:
            # Keep the original sheet numbering: empty tables are skipped
            # but still consume their index.
            continue

        max_row = max(cell["row"] for cell in cells)
        max_col = max(cell["col"] for cell in cells)

        # Start from an empty grid; RowIndex/ColumnIndex are 1-based.
        data = [["" for _ in range(max_col)] for _ in range(max_row)]
        for cell in cells:
            data[cell["row"] - 1][cell["col"] - 1] = cell["text"]

        # Use first row as header if possible
        if max_row > 1:
            df = pd.DataFrame(data[1:], columns=data[0])
        else:
            df = pd.DataFrame(data)

        df.to_excel(writer, sheet_name=f"Table_{idx + 1}", index=False)

print(f"✅ Tables successfully written to: {excel_output_path}")


: 

In [None]:
# ===== File: processing_engine.py =====

import os
import shutil
import json
import re
import logging
import io
from PIL import Image, ImageEnhance
from PyPDF2 import PdfReader
from utils.loader import load_textract_json
from utils.config import REGEX, SEGMENT_NAMES, FUZZY_REGEX, FUZZY_HEADERS
from textracts.segment_extraction import extract_segment
from preprocessing.extract_trade_data import extract_table_data
from postprocessing.report_generator import generate_csv_reports
from textracts.merge_pages import merge_textract_pages

# Configure the root logger at import time with INFO as the minimum level;
# the class below logs through the module-level `logging` functions.
logging.basicConfig(level=logging.INFO)

class TextractProcessor:
    """Drive a PDF through image extraction, Textract analysis and reporting.

    :meth:`main` runs the steps in order: reset the output folder, save the
    PDF's embedded images as contrast-enhanced PNGs, send each PNG to
    Textract, group the detected tables by segment, and hand the result to
    ``generate_csv_reports``.
    """

    def __init__(self, pdf_file, region_name='us-east-1'):
        """Store the paths and settings used by the other methods.

        Args:
            pdf_file: PDF path, joined onto the current working directory.
            region_name: AWS region used when creating the Textract client.
        """
        self.pdf_file_name = pdf_file
        self.home_folder = os.getcwd()
        self.pdf_path = os.path.join(self.home_folder, self.pdf_file_name)
        self.output_folder = os.path.join(self.home_folder, "pdf_pages")
        self.region_name = region_name

    def initialize_folder(self):
        """Recreate ``self.output_folder`` as an empty directory.

        Raises:
            Exception: whatever the filesystem call raised, after logging it.
        """
        try:
            if os.path.exists(self.output_folder):
                shutil.rmtree(self.output_folder)
            os.makedirs(self.output_folder)
            logging.info("Output folder initialized: %s", self.output_folder)
        except Exception as e:
            logging.error("Failed to initialize output folder: %s", e)
            raise

    def enhance_image(self, image):
        """Return *image* with its contrast scaled by a factor of 1.35.

        Raises:
            Exception: whatever the enhancement raised, after logging it.
        """
        try:
            return ImageEnhance.Contrast(image).enhance(1.35)
        except Exception as e:
            logging.error("Failed to enhance image: %s", e)
            raise

    def _image_output_path(self, page_number, image_index):
        """Return the PNG path for the *image_index*-th image (1-based) of a page.

        The first image keeps the ``page_<n>.png`` name; further images on
        the same page get a ``_<index>`` suffix so they do not overwrite it.
        """
        suffix = "" if image_index == 1 else f"_{image_index}"
        return os.path.join(self.output_folder, f"page_{page_number}{suffix}.png")

    @staticmethod
    def _decode_pdf_image(image_obj):
        """Decode one image XObject, or return ``None`` if its filter is unsupported."""
        # Membership test first: indexing a missing "/Filter" would raise
        # KeyError and abort the extraction of every other image.
        if "/Filter" not in image_obj:
            return None
        filter_type = image_obj["/Filter"]

        if filter_type in ("/DCTDecode", "/JPXDecode"):
            # The stream is a complete image file that PIL can open directly.
            return Image.open(io.BytesIO(image_obj.get_data()))
        if filter_type == "/FlateDecode":
            size = (image_obj["/Width"], image_obj["/Height"])
            # NOTE(review): the decoded samples are interpreted as RGB; other
            # colour spaces would need a different mode - confirm with real inputs.
            return Image.frombytes("RGB", size, image_obj.get_data())
        return None

    def extract_images_from_pdf(self):
        """Save every supported embedded image of the PDF as an enhanced PNG.

        Returns:
            list[str]: Paths of the written PNG files, in page order. Each
            path appears once and refers to a distinct file.

        Raises:
            Exception: whatever reading, decoding or saving raised, after
            logging it.
        """
        try:
            os.makedirs(self.output_folder, exist_ok=True)
            reader = PdfReader(self.pdf_path)
            enhanced_image_paths = []

            for page_number, page in enumerate(reader.pages, start=1):
                if "/XObject" not in page.get("/Resources", {}):
                    continue

                x_objects = page["/Resources"]["/XObject"].get_object()
                images_on_page = 0
                for name in x_objects:
                    candidate = x_objects[name]
                    if candidate["/Subtype"] != "/Image":
                        continue

                    image = self._decode_pdf_image(candidate)
                    if image is None:
                        continue

                    images_on_page += 1
                    image_path = self._image_output_path(page_number, images_on_page)
                    self.enhance_image(image).save(image_path)
                    enhanced_image_paths.append(image_path)

            return enhanced_image_paths

        except Exception as e:
            logging.error("Failed to extract images: %s", e)
            raise

    def analyze_images_with_textract(self, jpg_files):
        """Analyze each image file with Textract and merge the returned blocks.

        Args:
            jpg_files: Iterable of image file paths; each file is read as
                bytes and sent in its own ``analyze_document`` call.

        Returns:
            dict: ``{"Blocks": [...]}`` holding the blocks of every response
            in call order. The same dict is also dumped to
            ``textract_output.json`` in the current working directory.
        """
        # Imported here so the module can be loaded without boto3 installed.
        import boto3
        textract = boto3.client('textract', region_name=self.region_name)

        merged = {"Blocks": []}
        for jpg_file in jpg_files:
            with open(jpg_file, 'rb') as img:
                response = textract.analyze_document(
                    Document={'Bytes': img.read()},
                    FeatureTypes=['TABLES', 'FORMS', 'LAYOUT']
                )
            merged["Blocks"].extend(response.get("Blocks", []))

        with open("textract_output.json", "w") as f:
            json.dump(merged, f, indent=4)

        return merged

    @staticmethod
    def _table_with_descendants(table_block, block_map):
        """Return *table_block* followed by every block reachable from it.

        Relationship ids are resolved through *block_map*; ids missing from
        the map are ignored and each block is included at most once. The
        result is breadth-first: the table, its direct children in
        relationship order, then their children.
        """
        collected = [table_block]
        seen = {table_block.get("Id")}
        cursor = 0
        while cursor < len(collected):
            parent = collected[cursor]
            cursor += 1
            for rel in parent.get("Relationships") or []:
                for child_id in rel.get("Ids", []):
                    child = block_map.get(child_id)
                    if child is None or child_id in seen:
                        continue
                    seen.add(child_id)
                    collected.append(child)
        return collected

    def process_textract_data(self, textract_data):
        """Group the tables of a Textract result by the segment preceding them.

        Args:
            textract_data: Dict with a ``"Blocks"`` list, as returned by
                :meth:`analyze_images_with_textract`.

        Returns:
            list: ``[]`` when any LINE mentions "holdings" or "positions"
            (case-insensitive); otherwise a one-element list holding a dict
            with the keys ``demat``, ``date_range``, ``segments``,
            ``metadata`` (trade count per segment), ``missing_segments`` and
            ``empty_segments``.
        """
        blocks = textract_data.get("Blocks", [])
        lines = [block['Text'] for block in blocks if block['BlockType'] == 'LINE']

        # First line that matches the demat pattern wins; one search per line.
        demat = "NA"
        for line in lines:
            demat_match = re.search(REGEX['demat_number'], line)
            if demat_match:
                demat = demat_match.group(0)
                break

        date_range_match = re.search(REGEX['date_range'], " ".join(lines))
        date_range = date_range_match.group(1) if date_range_match else "NA"

        segment_info = {
            "demat": demat,
            "date_range": date_range,
            "segments": {},
            "metadata": {},
            "missing_segments": [],
            "empty_segments": []
        }

        if any(re.search(r"holdings|positions", line, re.IGNORECASE) for line in lines):
            logging.info("Page skipped due to Holdings/Positions presence.")
            return []

        block_map = {block["Id"]: block for block in blocks if "Id" in block}
        current_segment = None

        for block in blocks:
            block_type = block['BlockType']

            if block_type in ("WORD", "LINE"):
                # The most recent segment name seen applies to later tables.
                text = block.get("Text", "").lower()
                for seg_name, pattern in SEGMENT_NAMES.items():
                    if pattern.search(text):
                        current_segment = seg_name

            elif block_type == 'TABLE' and current_segment:
                # A TABLE block only carries the ids of its cells and words,
                # so the referenced blocks are passed along with it.
                # NOTE(review): extract_table_data is defined elsewhere; this
                # assumes it resolves those ids within the blocks it is
                # given - confirm against its implementation.
                partial_textract = {
                    "Blocks": self._table_with_descendants(block, block_map)
                }
                trades = extract_table_data(partial_textract, current_segment)

                segment = segment_info['segments'].setdefault(
                    current_segment, {"is_empty": False, "trades": []}
                )
                segment['trades'].extend(trades)

        found_segments = list(segment_info['segments'])
        segment_info['missing_segments'] = [
            s for s in SEGMENT_NAMES if s not in found_segments
        ]

        for seg in found_segments:
            trades = segment_info['segments'][seg]['trades']
            if not trades:
                segment_info['segments'][seg]['is_empty'] = True
                segment_info['empty_segments'].append(seg)
            segment_info['metadata'][seg] = len(trades)

        return [segment_info]

    def main(self):
        """Run the full pipeline for ``self.pdf_path``.

        Failures are logged with their traceback and not re-raised, so the
        method always returns ``None``.
        """
        try:
            logging.info("Initializing output folder...")
            self.initialize_folder()

            logging.info("Extracting images from PDF...")
            jpg_files = self.extract_images_from_pdf()
            if not jpg_files:
                raise ValueError("No images generated from PDF.")

            logging.info("Analyzing images with Textract...")
            textract_responses = self.analyze_images_with_textract(jpg_files)

            logging.info("Processing Textract data...")
            all_segment_info = self.process_textract_data(textract_responses)

            if not all_segment_info:
                logging.warning("PDF skipped, no valid segments processed.")
                return

            logging.info("Generating final reports...")
            generate_csv_reports(all_segment_info)

            logging.info("✅ Processing completed successfully.")

        except Exception as e:
            # logging.exception keeps the traceback that a message-only
            # error log would drop; the exception is still not propagated.
            logging.exception("Processing failed: %s", e)
