In [4]:
# Python packages used by this notebook, installed once each.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install pytesseract pdf2image PyPDF2 stanza spacy ttkthemes
# The spaCy model loaded by RentExtractor is a separate download:
# !python -m spacy download en_core_web_sm
# System packages (uncomment on Debian/Ubuntu if Tesseract or Poppler is missing):
# !apt update
# !apt install tesseract-ocr
# !apt install poppler-utils



In [5]:
import threading
import re
import spacy
from spacy.matcher import Matcher
from tkinter import filedialog, messagebox
import tkinter as tk
from tkinter.ttk import Progressbar, Style
from pdf2image import convert_from_path
import pytesseract
from ttkthemes import ThemedTk
import atexit
import os

In [6]:
# Path of the Tesseract executable used by pytesseract. Set the TESSERACT_CMD
# environment variable to override it on machines where the binary is not at
# /usr/local/bin/tesseract; without the variable the original path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get('TESSERACT_CMD', '/usr/local/bin/tesseract')

In [None]:
class TextExtractor:
    """Top-level application object.

    Configures the root window, creates the shared ``RentExtractor`` and the
    first page (``ChooseFilePage``), and handles the window-close request.

    Args:
        root: The Tk root window the application lives in.
        force_exit: When true, ``on_closing`` ends the whole Python process
            with ``os._exit(0)`` after destroying the window. This skips
            cleanup handlers and, inside a notebook, takes the kernel down
            with it, so it is off by default.
    """

    def __init__(self, root, force_exit=False):
        self.root = root
        self._force_exit = force_exit
        self.root.title('Choose File')
        self.root.geometry('400x200')
        self.rent_extractor = RentExtractor()  # One extractor shared by all pages
        self.choose_file_page = ChooseFilePage(self.root, self.rent_extractor)
        # The page registers on_closing as the handler for the window-close request.
        self.choose_file_page.setup_close_handler(self)

    def on_closing(self):
        """Tear down the UI when the user closes the main window."""
        # A result page is only cleaned up explicitly when the first page exposes
        # one AND it actually offers destroy(); otherwise this step is skipped
        # instead of raising and leaving the window impossible to close.
        first_page = getattr(self, "choose_file_page", None)
        result_page = getattr(first_page, "display_text_page", None)
        destroy_result_page = getattr(result_page, "destroy", None)
        if callable(destroy_result_page):
            destroy_result_page()

        self.root.destroy()  # Close the main application window

        if self._force_exit:
            # Immediate process exit without running cleanup handlers.
            os._exit(0)
        
        
class RentExtractor:
    """Finds a currency amount that appears near a rent-related keyword.

    Three spaCy ``Matcher`` objects are built once per instance:

    * ``keywords_match`` - the keywords themselves;
    * ``match_after``    - keyword, any tokens, then currency symbol + amount;
    * ``match_before``   - currency symbol + amount, any tokens, then keyword.
    """

    # Keywords that anchor the search; each may contain any number of words.
    KEYWORDS = ("initial rent", "annual rent", "rent")
    CURRENCY_SYMBOLS = ("£", "$", "€")
    # Token-level pattern for the amount that follows a currency symbol.
    AMOUNT_TOKEN_REGEX = r"[\d,.]+"
    # Text-level pattern used to pull the amount out of the matched span.
    AMOUNT_TEXT_REGEX = r'[£$€][\d,.]+'
    # Size (in tokens) of the window searched before / after a keyword.
    TOKENS_BEFORE = 50
    TOKENS_AFTER = 100

    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.keywords_match = Matcher(self.nlp.vocab)
        self.match_after = Matcher(self.nlp.vocab)
        self.match_before = Matcher(self.nlp.vocab)
        self.initialize_matchers()

    @staticmethod
    def _keyword_tokens(keyword):
        """Return one case-insensitive token pattern per word of ``keyword``."""
        return [{"LOWER": word} for word in keyword.lower().split()]

    def _amount_tokens(self):
        """Return the two-token pattern "currency symbol, amount"."""
        return [
            {"LOWER": {"IN": list(self.CURRENCY_SYMBOLS)}},
            {"TEXT": {"REGEX": self.AMOUNT_TOKEN_REGEX}},
        ]

    def initialize_matchers(self):
        """Register the keyword / after / before patterns for every keyword.

        Each pattern list is built from fresh dicts so no pattern object is
        shared between matchers.
        """
        for keyword in self.KEYWORDS:
            self.keywords_match.add(keyword, [self._keyword_tokens(keyword)])
            self.match_after.add(keyword + "_after", [
                self._keyword_tokens(keyword) + [{"OP": "*"}] + self._amount_tokens()
            ])
            self.match_before.add(keyword + "_before", [
                self._amount_tokens() + [{"OP": "*"}] + self._keyword_tokens(keyword)
            ])

    def extract_annual_rent(self, text):
        """Return the first rent amount found in ``text``.

        For every keyword occurrence, the tokens following it are searched
        first; if nothing is found there, the tokens preceding it are searched.

        Args:
            text: The text to analyse. ``None`` or blank text yields ``None``.

        Returns:
            The amount as it appears in the text (symbol included, e.g.
            ``"£1,000"``), or ``None`` when no amount is found.
        """
        if not text or not text.strip():
            return None

        doc = self.nlp(text)
        for _match_id, start, end in self.keywords_match(doc):
            search_start = max(0, start - self.TOKENS_BEFORE)
            search_end = min(end + self.TOKENS_AFTER, len(doc))

            # `or` evaluates the "after" search once and only falls back to
            # the "before" search when it produced nothing.
            rent = (self.second_match(self.match_after, doc, start, search_end)
                    or self.second_match(self.match_before, doc, search_start, end))
            if rent:
                return rent

        return None

    def second_match(self, match_func, doc, start, end):
        """Look for a currency amount inside ``doc[start:end]``.

        Args:
            match_func: Callable applied to the span; a truthy result means
                the span contains a keyword/amount combination.
            doc: The tokenised document.
            start: Index of the first token of the span.
            end: Index one past the last token of the span.

        Returns:
            The first ``<symbol><amount>`` substring of the span's text, or
            ``None`` if ``match_func`` finds nothing or the text contains no
            such substring. Note that this is the first amount in the span's
            text, which is not necessarily the tokens ``match_func`` matched.
        """
        span = doc[start: end]
        if not match_func(span):
            return None

        # The token patterns can match where this text regex does not
        # (presumably e.g. when a space separates symbol and digits), so the
        # search result must be checked before calling .group().
        amount = re.search(self.AMOUNT_TEXT_REGEX, span.text)
        return amount.group() if amount else None

class ChooseFilePage(tk.Frame):
    """First screen: lets the user pick a file and start the rent extraction.

    OCR runs in a background thread so the window stays responsive, but the
    worker never touches a widget: it only writes plain attributes, and a
    timer callback scheduled with ``after`` reads them and updates the UI from
    the thread that runs the Tk event loop.
    """

    # How often (milliseconds) the UI checks on the background worker.
    POLL_INTERVAL_MS = 100

    def __init__(self, root, rent_extractor):
        super().__init__(root)
        self.style = Style()
        # 'arc' has to be available to ttk already; the __main__ block creates
        # the root as ThemedTk(theme="arc").
        self.style.theme_use('arc')
        self.root = root
        self.rent_extractor = rent_extractor  # Store the RentExtractor instance
        self.pack(fill=tk.BOTH, expand=True)
        self.pack_propagate(False)

        self.file_label = tk.Label(self, text="File path: ", font=("Arial", 16))
        self.file_label.pack()

        self.choose_file = tk.Button(self, text="Choose File", command=self.open_file_dialog)
        self.choose_file.pack(pady=50, side=tk.LEFT, padx=10)

        # Created here, but only packed once a file has been chosen.
        self.extract_btn = tk.Button(self, text="Extract", command=self.show_extracted_text)
        self.file_path = None

        # State of the current extraction job.
        self._worker = None            # background thread, or None
        self._job_path = None          # file the running job works on
        self._progress_window = None   # Toplevel holding the progress widgets
        self._progress_percent = 0.0   # written by the worker, read by the UI
        self._extracted_rent = None    # worker result
        self._worker_error = None      # exception raised in the worker, if any

    def open_file_dialog(self):
        """Ask the user for a file and reveal the "Extract" button."""
        selected = filedialog.askopenfilename()
        if not selected:
            # Keep any previously chosen file instead of overwriting it with
            # the empty value a cancelled dialog returns.
            messagebox.showinfo("No file selected", "No file selected!")
            return

        self.file_path = selected
        self.file_label.config(text=f"File path: {self.file_path}", wraplength=self.winfo_width())
        messagebox.showinfo("Import file successfully!", f"{self.file_path} imported successfully!")
        self.extract_btn.pack(pady=50, side=tk.LEFT, padx=10)

    def show_extracted_text(self):
        """Command of the "Extract" button: start the extraction."""
        self.extract_text_display()

    def extract_text_display(self):
        """Open the progress window and start OCR + rent extraction.

        Returns immediately; the result page is shown by ``_poll_worker`` once
        the background thread has finished.
        """
        if not self.file_path:
            messagebox.showinfo("No file selected", "No file selected!")
            return
        if self._worker is not None and self._worker.is_alive():
            return  # a job is already running

        # No second job and no file change while this one runs.
        self.extract_btn.config(state=tk.DISABLED)
        self.choose_file.config(state=tk.DISABLED)

        self._job_path = self.file_path
        self._progress_percent = 0.0
        self._extracted_rent = None
        self._worker_error = None
        self._open_progress_window()

        # daemon=True: an unfinished OCR job must not keep the process alive
        # after the window has been closed.
        self._worker = threading.Thread(
            target=self._run_extraction, args=(self._job_path,), daemon=True
        )
        self._worker.start()
        self.after(self.POLL_INTERVAL_MS, self._poll_worker)

    def _open_progress_window(self):
        """Create the progress window with its bar and percentage label."""
        progress_window = tk.Toplevel(self.root)
        progress_window.title("Text Extraction Progress")
        progress_window.geometry("300x100")
        progress_window.resizable(False, False)
        # Closing it mid-job would destroy widgets the poller still updates.
        progress_window.protocol("WM_DELETE_WINDOW", lambda: None)

        progress_label = tk.Label(progress_window, text="Extracting text...", font=("Arial", 12))
        progress_label.pack(pady=10)

        self.progress_bar = Progressbar(progress_window, orient=tk.HORIZONTAL, length=200, mode='determinate')
        self.progress_bar.pack(pady=5)

        self.percentage_label = tk.Label(progress_window, text="0% extracted")
        self.percentage_label.pack(pady=5)

        self._progress_window = progress_window

    def _run_extraction(self, pdf_path):
        """Worker-thread body: OCR the file and extract the rent (no Tk calls)."""
        try:
            extracted_text = self.extract_text_from_pdf(pdf_path, progress_callback=self._record_progress)
            self._extracted_rent = self.rent_extractor.extract_annual_rent(extracted_text)
        except Exception as exc:
            # Not swallowed: _poll_worker shows it to the user on the UI thread.
            self._worker_error = exc

    def _record_progress(self, percentage):
        """Store the latest progress value for the UI poller to pick up."""
        self._progress_percent = percentage

    def _poll_worker(self):
        """UI-thread timer: refresh progress, and finish up when the worker is done."""
        if self._progress_window is None:
            return
        self._show_progress(self._progress_percent)

        if self._worker.is_alive():
            self.after(self.POLL_INTERVAL_MS, self._poll_worker)
            return

        self._progress_window.destroy()
        self._progress_window = None

        if self._worker_error is not None:
            # Let the user pick another file or retry.
            self.extract_btn.config(state=tk.NORMAL)
            self.choose_file.config(state=tk.NORMAL)
            messagebox.showerror(
                "Extraction failed",
                f"Could not extract text from {self._job_path}:\n{self._worker_error}",
            )
            return

        # Now, only pass the extracted rent to the DisplayTextPage
        DisplayTextPage(self.root, self._job_path, self._extracted_rent)

        # Destroy the current page (ChooseFilePage)
        self.destroy()

    def _show_progress(self, percentage):
        """Write ``percentage`` to the progress bar and its label."""
        self.update_progress_bar(percentage)
        self.percentage_label.config(text=f"{int(percentage)}% extracted")
        self.progress_bar.update_idletasks()

    def extract_text_from_pdf(self, pdf_path, progress_callback=None):
        """OCR every page of ``pdf_path`` and return the concatenated text.

        Args:
            pdf_path: Path of the PDF to read.
            progress_callback: Called with the percentage (0-100) of pages
                processed after each page. When omitted, the progress widgets
                are updated directly, so the call must then be made from the
                UI thread after the progress window exists.

        Returns:
            The recognised text of all pages, in page order.
        """
        report = progress_callback if progress_callback is not None else self._show_progress

        images = convert_from_path(pdf_path)
        total_images = len(images)
        page_texts = []

        for idx, image in enumerate(images, 1):
            page_texts.append(pytesseract.image_to_string(image.convert("RGB"), lang='eng'))
            report((idx / total_images) * 100)

        return "".join(page_texts)

    def update_progress_bar(self, percentage):
        """Set the progress bar to ``percentage`` and redraw pending changes."""
        self.progress_bar["value"] = percentage
        self.root.update_idletasks()

    def setup_close_handler(self, text_extractor):
        """Route the root window's close request to ``text_extractor.on_closing``."""
        self.root.protocol("WM_DELETE_WINDOW", lambda: text_extractor.on_closing())
        


class DisplayTextPage:
    """Result screen: shows the source file path and the extracted rent.

    The widgets are packed directly into ``root``.

    Args:
        root: Window to draw into; its title and size are changed.
        file_path: Path of the analysed file, shown at the top.
        extracted_rent: The rent string to show, or ``None`` / ``""`` when no
            rent was found.
    """

    # Shown in place of the amount when nothing was extracted.
    NO_RENT_MESSAGE = "No rent amount could be found in this document."

    def __init__(self, root, file_path, extracted_rent):
        self.root = root
        self.root.title('Extracted Annual Rent')
        self.root.geometry('600x400')  # Adjust the window size to accommodate the file name

        # Show the file name at the top of the window
        file_label = tk.Label(self.root, text=f"File path: {file_path}", font=("Arial", 14), wraplength=580)
        file_label.pack(pady=10, padx=10, anchor=tk.W, fill=tk.X, expand=True)

        extracted_rent_frame = tk.Frame(self.root)
        extracted_rent_frame.pack(fill=tk.BOTH, expand=True)

        # The widget starts out editable so the result can be inserted.
        self.text_display = tk.Text(extracted_rent_frame, wrap=tk.WORD, font=("Arial", 12), state=tk.NORMAL)
        self.text_display.pack(fill=tk.BOTH, expand=True)

        self.text_display.insert(tk.END, "Extracted Annual Rent:\n")
        self.text_display.insert(tk.END, self._format_rent(extracted_rent))
        self.text_display.config(state=tk.DISABLED)  # From here on the text is read-only

    @classmethod
    def _format_rent(cls, extracted_rent):
        """Return the text to display for ``extracted_rent``.

        A missing value (``None`` or an empty string) is replaced by
        ``NO_RENT_MESSAGE`` so the widget never receives ``None``.
        """
        if extracted_rent is None or extracted_rent == "":
            return cls.NO_RENT_MESSAGE
        return str(extracted_rent)

if __name__ == "__main__":
    # The root window is a ThemedTk (from ttkthemes) created with the "arc"
    # theme, the same theme name ChooseFilePage selects via Style.theme_use.
    root = ThemedTk(theme="arc")
    # Builds the UI inside `root`; the instance is not bound to a name here.
    TextExtractor(root)
    # Run the Tk event loop.
    root.mainloop()


2023-08-15 11:31:01.001 python[35881:6562512] +[CATransaction synchronize] called within transaction
