# In [1]:
import os
import re
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
from tkinter import ttk

import docx
import numpy as np
import pandas as pd
import PyPDF2
import pytesseract
from PIL import Image

# ----------------- Helper Functions -----------------
def extract_text_from_pdf(path):
    """Return the text of the PDF at *path*, page by page.

    Each page's extracted text is followed by a newline; a page for which
    the reader yields nothing contributes just that newline.
    """
    with open(path, "rb") as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction happens while the file is still open.
        chunks = [(pdf_page.extract_text() or "") + "\n" for pdf_page in pdf_pages]
    return "".join(chunks)

def extract_text_from_excel(path):
    """Return a plain-text dump of every sheet in the workbook at *path*.

    Each sheet is rendered as a ``--- Sheet: <name> ---`` header line followed
    by the sheet's table (no index column, missing cells shown as empty
    strings) and a blank line.
    """
    # sheet_name=None loads all sheets as a {name: DataFrame} mapping;
    # dtype=str keeps every cell as text. pandas picks the reader engine
    # itself (openpyxl is often needed for .xlsx).
    workbook = pd.read_excel(path, sheet_name=None, dtype=str)
    sections = []
    for name, frame in workbook.items():
        header = f"--- Sheet: {name} ---\n"
        table = frame.fillna("").to_string(index=False)
        sections.append(header + table + "\n\n")
    return "".join(sections)

def extract_text_from_docx(path):
    """Return the paragraph text of the Word document at *path*.

    Every paragraph is emitted in document order and terminated by a newline.
    Only ``Document.paragraphs`` is read.
    """
    paragraphs = docx.Document(path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

def extract_text_from_image(path):
    """Return the text recognised by Tesseract OCR in the image at *path*.

    The image is opened with Pillow and handed to ``pytesseract``.  The image
    is used as a context manager so its underlying file handle is released
    as soon as OCR finishes instead of lingering until garbage collection.
    """
    with Image.open(path) as img:
        return pytesseract.image_to_string(img)

def extract_text(path):
    """Extract text from the file at *path*, dispatching on its extension.

    The extension check is case-insensitive.  Supported types:
    ``.pdf``; ``.xlsx`` / ``.xls``; ``.docx``; and the image types
    ``.png``, ``.jpg``, ``.jpeg``, ``.tiff``, ``.bmp`` (via OCR).

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    path_lower = path.lower()
    if path_lower.endswith(".pdf"):
        return extract_text_from_pdf(path)
    if path_lower.endswith((".xlsx", ".xls")):
        return extract_text_from_excel(path)
    if path_lower.endswith(".docx"):
        return extract_text_from_docx(path)
    if path_lower.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return extract_text_from_image(path)
    # The message lists every accepted image extension, including .tiff/.bmp.
    raise ValueError(
        "Unsupported file type. Supported: PDF, Excel (.xls/.xlsx), "
        "Word (.docx), Image (.png/.jpg/.jpeg/.tiff/.bmp)"
    )

# ----------------- GUI App -----------------
class TextExtractorGUI:
    """Tkinter window that extracts text from a file and highlights search hits.

    The window contains a file picker, a search bar, save buttons, a scrolled
    text area showing the extracted text, and a status bar.  Search hits are
    displayed by replacing each match with an embedded red-bordered label.
    Saved output goes to fixed file names in the current working directory;
    the selected input file is never written to.
    """

    # Markers wrapped around each match in the saved highlighted copy.
    _MARK_OPEN = "<<HIGHLIGHT>>"
    _MARK_CLOSE = "<</HIGHLIGHT>>"

    def __init__(self, root):
        """Build all widgets inside *root* (the ``tk.Tk`` main window)."""
        self.root = root
        root.title("File → Text Extractor & Highlighter")
        root.geometry("900x650")

        # Top frame: file selection
        top = ttk.Frame(root, padding=8)
        top.pack(fill="x")

        self.file_path_var = tk.StringVar()
        ttk.Label(top, text="Selected file:").pack(side="left")
        self.file_entry = ttk.Entry(top, textvariable=self.file_path_var, width=60)
        self.file_entry.pack(side="left", padx=6)
        ttk.Button(top, text="Browse...", command=self.browse_file).pack(side="left", padx=6)
        ttk.Button(top, text="Extract", command=self.extract_and_display).pack(side="left")

        # Middle: search controls
        mid = ttk.Frame(root, padding=8)
        mid.pack(fill="x")

        ttk.Label(mid, text="Search string:").pack(side="left")
        self.search_var = tk.StringVar()
        self.search_entry = ttk.Entry(mid, textvariable=self.search_var, width=40)
        self.search_entry.pack(side="left", padx=6)

        self.case_insensitive = tk.BooleanVar(value=True)
        ttk.Checkbutton(mid, text="Case-insensitive", variable=self.case_insensitive).pack(side="left", padx=6)

        ttk.Button(mid, text="Find & Highlight", command=self.find_and_highlight).pack(side="left", padx=6)
        ttk.Button(mid, text="Clear Highlights", command=self.clear_highlights).pack(side="left", padx=6)

        # Buttons for saving (ensures original isn't modified)
        save_frame = ttk.Frame(root, padding=8)
        save_frame.pack(fill="x")
        # save_extracted_text previously had no button, so it was unreachable.
        ttk.Button(save_frame, text="Save extracted text (output_text.txt)", command=self.save_extracted_text).pack(side="left", padx=6)
        ttk.Button(save_frame, text="Save highlighted copy (output_highlighted.txt)", command=self.save_highlighted_copy).pack(side="left", padx=6)

        # Large text area to show extracted text
        text_frame = ttk.Frame(root, padding=8)
        text_frame.pack(fill="both", expand=True)

        self.text_widget = scrolledtext.ScrolledText(text_frame, wrap="word", font=("Helvetica", 11))
        self.text_widget.pack(fill="both", expand=True)

        # Status bar
        self.status_var = tk.StringVar(value="Ready")
        status = ttk.Label(root, textvariable=self.status_var, relief="sunken", anchor="w")
        status.pack(fill="x", side="bottom")

        # Internal storage
        self.extracted_text = ""   # raw text of the last successful extraction
        self.current_file = None   # path of the last browsed/extracted file

    # ----------------- Pure helpers (no GUI access) -----------------
    @staticmethod
    def _find_match_spans(text, query, ignore_case):
        """Return ``(start, end)`` offsets of non-overlapping matches of *query*.

        The query is matched literally (regex metacharacters are escaped).
        Offsets always refer to *text* itself; searching a lower-cased copy
        would not guarantee that, because lower-casing can change a string's
        length.  An empty query yields no matches.
        """
        if not query:
            return []
        flags = re.IGNORECASE if ignore_case else 0
        pattern = re.compile(re.escape(query), flags)
        return [match.span() for match in pattern.finditer(text)]

    @classmethod
    def _mark_matches(cls, text, query, ignore_case):
        """Return *text* with every match of *query* wrapped in highlight markers.

        The matched text keeps its original case inside the markers.
        """
        pieces = []
        cursor = 0
        for start, end in cls._find_match_spans(text, query, ignore_case):
            pieces.append(text[cursor:start])
            pieces.append(f"{cls._MARK_OPEN}{text[start:end]}{cls._MARK_CLOSE}")
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)

    # ----------------- GUI callbacks -----------------
    def browse_file(self):
        """Ask the user for an input file and remember the chosen path."""
        filetypes = [
            ("All supported", ("*.pdf", "*.xlsx", "*.xls", "*.docx", "*.png", "*.jpg", "*.jpeg", "*.tiff", "*.bmp")),
            ("PDF files", "*.pdf"),
            ("Excel files", ("*.xlsx", "*.xls")),
            ("Word (.docx)", "*.docx"),
            ("Images", ("*.png", "*.jpg", "*.jpeg", "*.tiff", "*.bmp")),
            ("All files", "*.*"),
        ]
        path = filedialog.askopenfilename(title="Select input file", filetypes=filetypes)
        if path:
            self.file_path_var.set(path)
            self.current_file = path
            self.status_var.set(f"Selected: {os.path.basename(path)}")

    def extract_and_display(self):
        """Extract text from the path in the entry field and show it.

        Any extraction failure is reported in an error dialog; the text area
        is left untouched in that case.
        """
        path = self.file_path_var.get().strip()
        if not path:
            messagebox.showwarning("No file", "Please select a file first.")
            return
        try:
            self.status_var.set("Extracting text...")
            # Let the status bar repaint before the (possibly slow) extraction.
            self.root.update_idletasks()
            text = extract_text(path)
            if not text:
                text = "(No text could be extracted from this file.)"
            self.extracted_text = text
            # The path may have been typed rather than browsed.
            self.current_file = path
            # Restore and destroy highlight boxes before wiping the text so
            # their Python-side widget objects are released as well.
            self.clear_highlights()
            self.text_widget.delete("1.0", tk.END)
            self.text_widget.insert(tk.END, text)
            self.status_var.set(f"Extracted text from {os.path.basename(path)} — {len(text)} characters")
        except Exception as e:
            # Top-level GUI boundary: the extractors can raise many different
            # library-specific errors, all of which are shown to the user.
            messagebox.showerror("Extraction error", f"Failed to extract text:\n{e}")
            self.status_var.set("Error during extraction")

    def clear_highlights(self):
        """Remove every highlight box and put the original text back.

        Embedded windows are enumerated directly instead of through a text
        tag: tag ranges of adjacent matches merge into a single range, which
        would leave all but the first box of such a run in place.
        """
        for name in self.text_widget.window_names():
            widget = self.root.nametowidget(name)
            # Highlight boxes carry the text they replaced.
            original = getattr(widget, "original_text", None)
            if original is None:
                continue
            # A window's path name is itself a valid text index.
            position = self.text_widget.index(name)
            # The embedded window occupies exactly one index position.
            self.text_widget.delete(position)
            # Also drop the Python-side widget objects (frame and its label).
            widget.destroy()
            self.text_widget.insert(position, original)

        self.status_var.set("Highlights cleared")

    def _make_highlight_box(self, match_str, bg_color, font):
        """Create a red-bordered box showing *match_str* for embedding in the text."""
        # Outer frame: its red background is what shows through as the border.
        box = tk.Frame(self.text_widget, background="red", borderwidth=0)
        # Inner label: same background and font as the text area, so the box
        # looks like unfilled text with an outline.
        label = tk.Label(box, text=match_str,
                         background=bg_color,
                         foreground="black",
                         font=font,
                         borderwidth=0,
                         padx=0, pady=0)
        # 1px of padding reveals 1px of the red frame on every side.
        label.pack(padx=1, pady=1)
        # Remembered so clear_highlights can restore the plain text.
        box.original_text = match_str
        return box

    def find_and_highlight(self):
        """Replace every match of the search string with a red-bordered box.

        Existing highlights are cleared first.  Matching honours the
        "Case-insensitive" checkbox and treats the query as literal text.
        """
        self.clear_highlights()
        query = self.search_var.get()
        if not query:
            messagebox.showinfo("No query", "Enter a search string first.")
            return

        full_text = self.text_widget.get("1.0", tk.END)
        # Tkinter usually adds a newline at the end; check length
        if len(full_text) <= 1:
            messagebox.showinfo("No text", "No extracted text to search. Extract a file first.")
            return

        spans = self._find_match_spans(full_text, query, self.case_insensitive.get())
        if not spans:
            self.status_var.set(f"No matches found for '{query}'")
            messagebox.showinfo("No matches", "No occurrences found.")
            return

        self.status_var.set(f"Found {len(spans)} matches for '{query}'")

        # Reuse the text area's look so the boxes blend in.
        bg_color = self.text_widget.cget("background") or "white"
        font = self.text_widget.cget("font")

        # Work from the end of the text towards the start: each replacement
        # changes the character count, which would shift later offsets.
        for start, end in reversed(spans):
            start_pos = f"1.0 + {start} chars"
            end_pos = f"1.0 + {end} chars"
            # Slice the original text so the box keeps the match's own case.
            box = self._make_highlight_box(full_text[start:end], bg_color, font)
            self.text_widget.delete(start_pos, end_pos)
            self.text_widget.window_create(start_pos, window=box)

    def _write_output(self, outpath, content):
        """Write *content* to *outpath* as UTF-8; return True on success.

        A write failure is shown in an error dialog instead of escaping into
        the Tk callback machinery, where the user would get no feedback.
        """
        try:
            with open(outpath, "w", encoding="utf-8") as f:
                f.write(content)
        except OSError as e:
            self.status_var.set("Error while saving")
            messagebox.showerror("Save error", f"Could not write {outpath}:\n{e}")
            return False
        return True

    def save_extracted_text(self):
        """Save the raw extracted text to ``output_text.txt``."""
        if not self.extracted_text:
            messagebox.showinfo("No text", "No extracted text to save. Extract a file first.")
            return
        outpath = "output_text.txt"
        if not self._write_output(outpath, self.extracted_text):
            return
        self.status_var.set(f"Saved extracted text → {outpath}")
        messagebox.showinfo("Saved", f"Extracted text saved to {outpath}\n(Original file was NOT modified.)")

    def save_highlighted_copy(self):
        """
        Save a separate text file showing the matches marked inline.
        This does NOT modify the original input file.
        """
        if not self.extracted_text:
            messagebox.showinfo("No text", "No extracted text to save. Extract a file first.")
            return

        query = self.search_var.get()
        if not query:
            messagebox.showinfo("No query", "Enter a search string first before saving highlighted copy.")
            return

        # Mark the raw extracted string, not the text widget's content
        # (which may contain embedded highlight boxes).
        marked = self._mark_matches(self.extracted_text, query, self.case_insensitive.get())

        outpath = "output_highlighted.txt"
        if not self._write_output(outpath, marked):
            return

        self.status_var.set(f"Saved highlighted copy → {outpath}")
        messagebox.showinfo("Saved", f"Highlighted copy saved to {outpath}\n(Original file was NOT modified.)")

def main():
    """Create the main window, build the extractor GUI in it, and run the event loop."""
    window = tk.Tk()
    # The widgets keep the GUI object alive through their bound callbacks.
    TextExtractorGUI(window)
    window.mainloop()

# Start the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()