In [1]:
import os
import sys
import threading
import re
import spacy
from spacy.matcher import Matcher
from PyQt5 import QtWidgets, QtGui, QtCore
from pdf2image import convert_from_path
import pytesseract
import atexit
import stanza
import dateparser
import csv

# Tell pytesseract which Tesseract executable to run for OCR.
# NOTE(review): this absolute path is hard-coded; it must be adjusted on
# machines where the tesseract binary is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'

class TextExtractor:
    """Application shell: owns the QApplication, the main window and the first page.

    Attributes:
        app: The process-wide ``QApplication``.
        root: The ``QMainWindow`` every page is shown in.
        extractor: The shared ``Extractor`` used to analyse OCR text.
        choose_file_page: The initial page that lets the user pick a PDF.
    """

    def __init__(self):
        # Qt supports exactly one QApplication per process, so reuse the
        # existing instance when the caller has already created one.
        self.app = QtWidgets.QApplication.instance() or QtWidgets.QApplication(sys.argv)
        self.app.setStyle("Fusion")  # Set the Fusion style
        self.root = QtWidgets.QMainWindow()
        self.root.setWindowTitle('Choose File')
        self.root.setGeometry(100, 100, 400, 200)
        self.extractor = Extractor()  # Create Extractor instance
        self.choose_file_page = ChooseFilePage(self.root, self.extractor)
        # A plain child of a QMainWindow is not laid out or resized with the
        # window; it has to be installed as the central widget.
        self.root.setCentralWidget(self.choose_file_page)
        # Pass the TextExtractor instance so the page can route window closure here.
        self.choose_file_page.setup_close_handler(self)

    def run(self):
        """Show the main window and run the Qt event loop.

        Raises:
            SystemExit: Always, carrying the event loop's exit code.
        """
        self.root.show()
        sys.exit(self.app.exec_())

    def on_closing(self):
        """Release the results page, if one is attached, and close the main window."""
        try:
            page = getattr(self, "choose_file_page", None)
            display_page = getattr(page, "display_text_page", None)
            delete_later = getattr(display_page, "deleteLater", None)
            if callable(delete_later):
                delete_later()
        except RuntimeError:
            # PyQt raises RuntimeError when the wrapped C++ object has already
            # been deleted; there is nothing left to clean up in that case.
            pass
        self.root.close()  # Close the main application window

class Extractor:
    """Pull the rent, the term length and the commencement date out of lease text.

    spaCy ``Matcher`` rules locate anchor phrases ("rent", "term",
    "term commencement date"); a window of tokens around each anchor is then
    searched for the actual value.

    Attributes:
        nlp: spaCy pipeline used for tokenisation.
        stanza_nlp: stanza pipeline (tokenize + NER) used to find DATE entities.
        matcher: Finds the anchor phrases.
        match_after: Finds "<rent phrase> ... <currency> <amount>".
        match_before: Finds "<currency> <amount> ... <rent phrase>".
        match_term: Finds "<number> [(<number>)] year(s)/month(s)".
        info: Result of the most recent ``extract_info`` call.
    """

    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.stanza_nlp = stanza.Pipeline(processors='tokenize,ner', lang='en')
        self.matcher = Matcher(self.nlp.vocab)
        self.match_after = Matcher(self.nlp.vocab)
        self.match_before = Matcher(self.nlp.vocab)
        self.match_term = Matcher(self.nlp.vocab)
        self.info = {}
        self.initialize_matchers()

    def initialize_matchers(self):
        """Register every token pattern on the four matchers."""
        # Anchor phrases searched for in the whole document.
        rent_value_pattern = [
            {"LOWER": {"IN": ["annual", "initial"]}, "OP": "?"},
            {"LOWER": "rent"},
        ]
        term_pattern = [{"LOWER": "term"}]
        commencement_date_pattern = [
            {"LOWER": "term"},
            {"LOWER": "commencement"},
            {"LOWER": "date"},
        ]

        self.matcher.add("RENT_VALUE", [rent_value_pattern])
        self.matcher.add("TERM", [term_pattern])
        self.matcher.add("COMMENCEMENT_DATE", [commencement_date_pattern])

        currency_amount = [
            {"LOWER": {"IN": ["£", "$", "€"]}},
            {"TEXT": {"REGEX": r"[\d,.]+"}},
        ]

        # Rent phrase followed, after any number of tokens, by an amount.
        rent_after_pattern = rent_value_pattern + [{"OP": "*"}] + currency_amount
        # Amount followed, after any number of tokens, by the rent phrase.
        rent_before_pattern = currency_amount + [{"OP": "*"}] + rent_value_pattern

        self.match_after.add("rent_after", [rent_after_pattern])
        # Previously registered under the copy-pasted key "rent_after".
        self.match_before.add("rent_before", [rent_before_pattern])

        # works for x years, x (x) years... with spaces.
        term_length_pattern = [
            {"LIKE_NUM": True, "OP": "+"},
            {"IS_SPACE": True, "OP": "*"},
            {"IS_PUNCT": True, "OP": "?"},
            {"LIKE_NUM": True, "OP": "?"},
            {"IS_PUNCT": True, "OP": "?"},
            {"IS_SPACE": True, "OP": "*"},
            {"LOWER": {"IN": ["year", "years", "month", "months"]}},
        ]

        self.match_term.add("TERM_LENGTH", [term_length_pattern])

    def extract_info(self, text):
        """Analyse ``text`` and return the extracted lease facts.

        Args:
            text: Plain text of the document.

        Returns:
            ``self.info``, a dict with the keys ``'Annual Rent'`` (str or
            None), ``'Term'`` (str or None) and ``'Term Commencement Date'``
            (list of str, or None when no anchor phrase matched).
        """
        doc = self.nlp(text)
        rent = None
        term_length = None
        commencement_date = None
        amount_regex = r'[£$€][\d,.]+'

        for match_id, start, end in self.matcher(doc):
            label = self.nlp.vocab.strings[match_id]

            if label == "RENT_VALUE":
                if rent:
                    continue
                # Look up to 50 tokens after the anchor first, then up to
                # 15 tokens before it.
                rent = self.second_match(
                    self.match_after, doc, start, min(end + 50, len(doc)), amount_regex
                )
                if not rent:
                    rent = self.second_match(
                        self.match_before, doc, max(start - 15, 0), end, amount_regex
                    )

            elif label == "TERM":
                if term_length:
                    continue
                span = doc[start:min(end + 20, len(doc))]
                # Every reported match overwrites the previous one, so the
                # last match returned by the matcher is the one kept.
                for _term_id, start_t, end_t in self.match_term(span):
                    term_length = span[start_t:end_t].text

            elif label == "COMMENCEMENT_DATE":
                if commencement_date:
                    continue
                window = doc[start:min(end + 25, len(doc))]
                commencement_date = self.extract_dates_with_sutime_and_dateparser(window.text)

        self.info['Annual Rent'] = rent
        self.info['Term'] = term_length
        self.info['Term Commencement Date'] = commencement_date
        return self.info

    def second_match(self, match_func, doc, start, end, reg_pattern):
        """Return the first ``reg_pattern`` hit in ``doc[start:end]`` if ``match_func`` matches.

        Args:
            match_func: Callable (a spaCy ``Matcher``) applied to the token span.
            doc: The document to slice.
            start: First token index of the window.
            end: Token index one past the end of the window.
            reg_pattern: Regular expression searched for in the window's text.

        Returns:
            The matched text, or None when either the token matcher or the
            regular expression finds nothing.
        """
        span = doc[start:end]
        if not match_func(span):
            return None
        # The token matcher and the regex can disagree (e.g. "£ 1,000" with a
        # space inserted by OCR), so the regex result must be checked too.
        found = re.search(reg_pattern, span.text)
        return found.group() if found else None

    def extract_dates_with_sutime_and_dateparser(self, text):
        """Return the first date found in ``text`` as a one-element list.

        stanza's NER is tried first; when it reports no DATE entity the whole
        text is handed to ``dateparser`` instead.

        Args:
            text: Text expected to contain a date.

        Returns:
            ``[date_text]`` from stanza (newlines replaced by spaces),
            ``['YYYY-MM-DD']`` from dateparser, or ``[]`` when neither
            recognises a date.
        """
        doc = self.stanza_nlp(text)

        dates = [
            entity.text
            for sentence in doc.sentences
            for entity in sentence.ents
            if entity.type == 'DATE'
        ]

        # If stanza found any dates, return the first one.
        if dates:
            return [dates[0].replace('\n', ' ')]

        # dateparser.parse returns a single datetime (or None), not a list.
        parsed_date = dateparser.parse(text, settings={'STRICT_PARSING': False})
        if parsed_date is not None:
            return [parsed_date.strftime('%Y-%m-%d')]

        return []

from PyQt5 import QtWidgets, QtGui, QtCore, QtWidgets

class ChooseFilePage(QtWidgets.QFrame):
    """First page of the application: pick a PDF, OCR it and show the results.

    OCR runs on a worker thread. Qt widgets may only be created and modified
    on the GUI thread, so the worker never touches a widget; it reports back
    through the signals declared below.

    Attributes:
        root: The ``QMainWindow`` hosting this page.
        extractor: The ``Extractor`` used to analyse the OCR text.
        file_path: Path of the selected PDF, or None before any selection.
    """

    # Emitted by the worker thread; handled on the GUI thread.
    progress_changed = QtCore.pyqtSignal(float)
    extraction_finished = QtCore.pyqtSignal(object, str)
    extraction_failed = QtCore.pyqtSignal(str)

    def __init__(self, root, extractor):
        super().__init__(root)
        self.root = root
        self.extractor = extractor  # Store the Extractor instance
        self.file_path = None

        # Progress widgets only exist while an extraction is running.
        self.progress_window = None
        self.progress_bar = None
        self.percentage_label = None

        self.setLayout(QtWidgets.QVBoxLayout())
        self.file_label = QtWidgets.QLabel(self)
        self.file_label.setFont(QtGui.QFont("Arial", 16))
        self.layout().addWidget(self.file_label)

        self.choose_file = QtWidgets.QPushButton("Choose File", self)
        self.choose_file.clicked.connect(self.open_file_dialog)
        self.choose_file.setSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)
        self.choose_file.setFixedSize(120, 30)  # Set a fixed size for the "Choose File" button
        self.layout().addWidget(self.choose_file, alignment=QtCore.Qt.AlignLeft)

        self.extract_btn = QtWidgets.QPushButton("Extract", self)
        self.extract_btn.clicked.connect(self.show_extracted_text)
        self.extract_btn.setSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)
        self.extract_btn.setFixedSize(120, 30)  # Set a fixed size for the "Extract" button
        self.extract_btn.hide()
        self.layout().addWidget(self.extract_btn, alignment=QtCore.Qt.AlignLeft)

        self.progress_changed.connect(self._apply_progress)
        self.extraction_finished.connect(self._show_results)
        self.extraction_failed.connect(self._show_failure)

    def open_file_dialog(self):
        """Ask the user for a PDF and remember the chosen path.

        Cancelling the dialog keeps any previously selected file.
        """
        chosen_path, _ = QtWidgets.QFileDialog.getOpenFileName(
            self, "Choose PDF File", "", "PDF Files (*.pdf)"
        )
        if not chosen_path:
            QtWidgets.QMessageBox.information(self, "No file selected", "No file selected!")
            return

        self.file_path = chosen_path
        self.file_label.setText(f"File path: {self.file_path}")
        QtWidgets.QMessageBox.information(
            self, "Import file successfully!", f"{self.file_path} imported successfully!"
        )
        self.extract_btn.show()  # Show the extraction button

    def show_extracted_text(self):
        """Open the progress window and start the extraction on a worker thread."""
        if not self.file_path:
            QtWidgets.QMessageBox.information(self, "No file selected", "No file selected!")
            return

        self._open_progress_window()
        # Prevent a second extraction from being started while one is running.
        self._set_buttons_enabled(False)
        threading.Thread(target=self.extract_text_display).start()

    def extract_text_display(self):
        """Worker-thread entry point: OCR the PDF, extract the info, report back.

        Must not touch any widget; the outcome is delivered through
        ``extraction_finished`` or ``extraction_failed``.
        """
        try:
            extracted_text = self.extract_text_from_pdf(self.file_path)
            info = self.extractor.extract_info(extracted_text)
        except Exception as exc:  # thread boundary: report instead of dying silently
            self.extraction_failed.emit(str(exc))
            return
        # Hand over a copy so later extractions cannot mutate what is displayed.
        self.extraction_finished.emit(dict(info), extracted_text)

    def extract_text_from_pdf(self, pdf_path):
        """OCR every page of ``pdf_path`` and return the concatenated text.

        Args:
            pdf_path: Path of the PDF to convert to images and OCR.

        Returns:
            The OCR text of all pages, in page order.
        """
        images = convert_from_path(pdf_path)
        total_images = len(images)
        page_texts = []

        for idx, image in enumerate(images, 1):
            page_texts.append(pytesseract.image_to_string(image.convert("RGB"), lang='eng'))
            # Report the percentage of pages processed so far.
            self.update_progress_bar((idx / total_images) * 100)

        return "".join(page_texts)

    def update_progress_bar(self, percentage):
        """Publish a new progress value; safe to call from the worker thread.

        Args:
            percentage: Progress in the range 0-100.
        """
        self.progress_changed.emit(float(percentage))

    def setup_close_handler(self, text_extractor):
        """Route closing of the main window through ``text_extractor.on_closing``.

        Args:
            text_extractor: Object providing an ``on_closing()`` method.
        """
        def handle_close(event):
            text_extractor.on_closing()
            event.accept()

        self.root.closeEvent = handle_close

    def _open_progress_window(self):
        """Create and show the modal progress dialog (GUI thread only)."""
        window = QtWidgets.QDialog(self.root)
        window.setWindowTitle("Text Extraction Progress")
        window.setWindowModality(QtCore.Qt.WindowModal)
        window.resize(300, 100)

        layout = QtWidgets.QVBoxLayout(window)
        layout.addWidget(QtWidgets.QLabel("Extracting text...", window))

        self.progress_bar = QtWidgets.QProgressBar(window)
        self.progress_bar.setRange(0, 100)
        self.progress_bar.setValue(0)
        layout.addWidget(self.progress_bar)

        self.percentage_label = QtWidgets.QLabel("0% extracted", window)
        layout.addWidget(self.percentage_label)

        self.progress_window = window
        window.show()

    def _close_progress_window(self):
        """Dispose of the progress dialog if it is open."""
        if self.progress_window is not None:
            self.progress_window.close()
            self.progress_window.deleteLater()
        self.progress_window = None
        self.progress_bar = None
        self.percentage_label = None

    def _set_buttons_enabled(self, enabled):
        """Enable or disable both page buttons together."""
        self.choose_file.setEnabled(enabled)
        self.extract_btn.setEnabled(enabled)

    def _apply_progress(self, percentage):
        """Slot: show ``percentage`` in the progress dialog (GUI thread)."""
        if self.progress_bar is None:
            return
        whole_percent = int(percentage)
        self.progress_bar.setValue(whole_percent)
        self.percentage_label.setText(f"{whole_percent}% extracted")

    def _show_results(self, info, extracted_text):
        """Slot: replace this page with the results page (GUI thread)."""
        self._close_progress_window()
        self._set_buttons_enabled(True)

        results_page = DisplayTextPage(self.root, self.file_path, info, extracted_text)
        self._results_page = results_page  # keep the Python object alive
        self.root.setCentralWidget(results_page)
        self.hide()

    def _show_failure(self, message):
        """Slot: tell the user that the extraction failed (GUI thread)."""
        self._close_progress_window()
        self._set_buttons_enabled(True)
        QtWidgets.QMessageBox.critical(self, "Extraction failed", message)

class DisplayTextPage(QtWidgets.QWidget):
    """Results page: shows the extracted key/value pairs and offers exports.

    Attributes:
        root: The main window this page is shown in.
        file_path: Path of the PDF the information was extracted from.
        treeview: Two-column ``QTreeWidget`` listing the extracted information.
    """

    def __init__(self, root, file_path, extracted_info, original_text):
        """Build the page.

        Args:
            root: Main window; its title and geometry are updated.
            file_path: Path of the analysed PDF.
            extracted_info: Mapping of field name to extracted value.
            original_text: Full OCR text, written out by "Export TXT".
        """
        super().__init__(root)
        self.root = root
        self.root.setWindowTitle('Extracted Annual Rent')
        self.root.setGeometry(100, 100, 600, 400)
        self.file_path = file_path

        layout = QtWidgets.QVBoxLayout(self)

        file_label = QtWidgets.QLabel(f"File path: {file_path}", self)
        file_label.setFont(QtGui.QFont("Arial", 14))
        file_label.setWordWrap(True)
        layout.addWidget(file_label)

        self.treeview = QtWidgets.QTreeWidget(self)
        self.treeview.setColumnCount(2)
        self.treeview.setHeaderLabels(["Key", "Value"])
        self.treeview.setRootIsDecorated(False)  # flat table, no expand arrows
        for key, value in extracted_info.items():
            QtWidgets.QTreeWidgetItem(self.treeview, [str(key), self._format_value(value)])
        layout.addWidget(self.treeview)

        button_row = QtWidgets.QHBoxLayout()

        export_csv_btn = QtWidgets.QPushButton("Export CSV", self)
        export_csv_btn.setFixedSize(100, 30)
        export_csv_btn.clicked.connect(self.export_to_csv)
        button_row.addWidget(export_csv_btn)

        export_txt_btn = QtWidgets.QPushButton("Export TXT", self)
        export_txt_btn.setFixedSize(100, 30)
        export_txt_btn.clicked.connect(lambda: self.export_to_txt(original_text))
        button_row.addWidget(export_txt_btn)

        button_row.addStretch(1)
        layout.addLayout(button_row)

    @staticmethod
    def _format_value(value):
        """Return ``value`` as display text: '' for None, comma-joined for lists."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    def export_to_csv(self):
        """Write the displayed key/value rows to a CSV file in the working directory."""
        filename = f"exportInfo_{os.path.basename(self.file_path)}.csv"
        try:
            # UTF-8 so currency symbols survive regardless of the platform default.
            with open(filename, mode="w", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                writer.writerow(["Key", "Value"])  # Writing the header
                for row in range(self.treeview.topLevelItemCount()):
                    item = self.treeview.topLevelItem(row)
                    writer.writerow([item.text(0), item.text(1)])
        except OSError as exc:
            QtWidgets.QMessageBox.critical(self, "CSV Export", f"Could not write {filename}: {exc}")
            return

        QtWidgets.QMessageBox.information(
            self, "CSV Export", f"Information has been exported to {filename}"
        )

    def export_to_txt(self, original_text):
        """Write the full OCR text to a text file in the working directory.

        Args:
            original_text: The text to write.
        """
        filename = f"exportText_{os.path.basename(self.file_path)}.txt"
        try:
            with open(filename, 'w', encoding="utf-8") as txt_file:
                txt_file.write(original_text)
        except OSError as exc:
            QtWidgets.QMessageBox.critical(self, "TXT Export", f"Could not write {filename}: {exc}")
            return

        QtWidgets.QMessageBox.information(
            self, "TXT Export", f"Original text has been exported to {filename}"
        )

    def setup_close_handler(self, text_extractor):
        """Route closing of the main window through ``text_extractor.on_closing``.

        Args:
            text_extractor: Object providing an ``on_closing()`` method.
        """
        def handle_close(event):
            text_extractor.on_closing()
            event.accept()

        self.root.closeEvent = handle_close

if __name__ == "__main__":
    # TextExtractor builds the QApplication and the main window itself, and
    # run() ends the process via sys.exit() once the event loop returns, so
    # nothing else needs to be created or executed here.
    text_extractor = TextExtractor()
    text_extractor.run()

2023-08-20 17:06:19 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-08-20 17:06:20 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-08-20 17:06:20 INFO: Using device: cpu
2023-08-20 17:06:20 INFO: Loading: tokenize
2023-08-20 17:06:20 INFO: Loading: ner
2023-08-20 17:06:20 INFO: Done loading processors!


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
