In [20]:
""" Use Apple's Vision Framework via PyObjC to detect text in images """

import pathlib

import Quartz
import Vision
from Cocoa import NSURL
from Foundation import NSDictionary
# needed to capture system-level stderr
from wurlitzer import pipes


def image_to_text(img_path, lang="eng"):
    input_url = NSURL.fileURLWithPath_(img_path)

    with pipes() as (out, err):
    # capture stdout and stderr from system calls
    # otherwise, Quartz.CIImage.imageWithContentsOfURL_
    # prints to stderr something like:
    # 2020-09-20 20:55:25.538 python[73042:5650492] Creating client/daemon connection: B8FE995E-3F27-47F4-9FA8-559C615FD774
    # 2020-09-20 20:55:25.652 python[73042:5650492] Got the query meta data reply for: com.apple.MobileAsset.RawCamera.Camera, response: 0
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)

    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )
    results = []
    handler = make_request_handler(results)
    vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)
    error = vision_handler.performRequests_error_([vision_request], None)

    return results

def make_request_handler(results):
    """ results: list to store results """
    if not isinstance(results, list):
        raise ValueError("results must be a list")

    def handler(request, error):
        if error:
            print(f"Error! {error}")
        else:
            observations = request.results()
            for text_observation in observations:
                recognized_text = text_observation.topCandidates_(1)[0]
                results.append([recognized_text.string(), recognized_text.confidence()])
    return handler


def main():
    import sys
    import pathlib

    img_path = "image.png"
#     if not img_path.is_file():
#         sys.exit("Invalid image path")
#     img_path = str(img_path.resolve())
    results = image_to_text(img_path)
    print(results)


if __name__ == "__main__":
    main()


[['OLSWANG', 0.30000001192092896], ['9 January 2015 a', 0.5], ['COUNTERPART LEASE', 1.0], ['RELATING TO UNIT 21 PRINCESS PARADE, MILL GATE', 1.0], ['SHOPPING CENTRE, BURY', 1.0], ['(1)', 0.5], ['INFRARED UK TIGER NOMINEE 1 LIMITED AND INFRARED UK', 0.5], ['TIGER NOMINEE 2 LIMITED', 0.5], ['(2)', 0.5], ['THAI NGOC TRAN', 0.30000001192092896], ['90 High Holborn', 0.5], ['London WCIV 6XX', 0.30000001192092896], ['www.olswang.com', 0.30000001192092896], ['Ref. SNBI23367-628\\18846475-1', 0.5], ['T +44 (0) 20 7067 3000', 0.5], ['F +44 (0) 20 7067 3999', 0.5], ['DX 37972 Kingsway', 0.30000001192092896], ['Olswang LLP is authorised and regulated by the Solicitors Regulation Authority', 0.5]]


In [25]:
from pdf2image import convert_from_path
import io
import os
pdf_path = "Bury_Ten_Docs_21_Princess_Parade_Lease_09012015 (1).pdf"
images = convert_from_path(pdf_path)
text = ""



for image in images:

    image.save('image.png')
    text_list = image_to_text('image.png')
    formatted_text = "\n".join([item[0] for item in text_list])
    text+= formatted_text
    os.remove('image.png')
print(text)

OLSWANG
9 January 2015 a
COUNTERPART LEASE
RELATING TO UNIT 21 PRINCESS PARADE, MILL GATE
SHOPPING CENTRE, BURY
(1)
INFRARED UK TIGER NOMINEE 1 LIMITED AND INFRARED UK
TIGER NOMINEE 2 LIMITED
(2)
THAI NGOC TRAN
90 High Holborn
London WCIV 6XX
www.olswang.com
Ref. SNBI23367-628\18846475-1
T +44 (0) 20 7067 3000
F +44 (0) 20 7067 3999
DX 37972 Kingsway
Olswang LLP is authorised and regulated by the Solicitors Regulation AuthorityCONTENTS
Clause
1. DEFINITIONS AND INTERPRETATION
2
DEMISE
3. RENTS
3.1
YEARLY RENT
3.2
SERVICE CHARGE RENT
3.3
INSURANCE RENT AND ADDITIONAL RENT
TENANT'S COVENANTS
Page
5
4 1
4.2
PAYMENT OF RENTS
INTEREST IN DEFAULT
6
4.3
4.4
4.5
4.6
4.7
OUTGOINGS
VAT
REPAIR AND MAINTENANCE
DECORATION
CONTRIBUTION TO COMMON EXPENSES
6
4.8
4.9
LANDLORD'S RIGHTS OF ENTRY
LANDLORD'S RIGHT TO REMEDY BREACHES OF COVENANT
4 10 DEFECTS IN THE PREMISES
4.11 DISPOSALS
4.12 ALTERATIONS
4.13
CONSTRUCTION (DESIGN AND MANAGEMENT) REGULATIONS 2007
4 14
TOWN PLANNING
4.15
USE
4.16 DELIVERIES


In [26]:
import threading
import re
import spacy
from spacy.matcher import Matcher
import tkinter as tk
from tkinter import filedialog, messagebox, ttk, PhotoImage
from pdf2image import convert_from_path
from ttkthemes import ThemedTk, ThemedStyle
from tkinter.ttk import Progressbar, Style
from pdf2image import convert_from_path
import pytesseract
import atexit
import os
import stanza
import dateparser
import csv
from PIL import Image, ImageTk

In [46]:
    def extract_dates_with_sutime_and_dateparser( text):
#         nlp = stanza.Pipeline(processors='tokenize,ner', lang='en')
        stanza_nlp = stanza.Pipeline(processors='tokenize,ner', lang='en')
        processed_text = re.sub(r'(\d+)"', r'\1', text)
        doc = stanza_nlp(processed_text)
        print(doc.text)

        dates = []
        for sentence in doc.sentences:
            for entity in sentence.ents:
                if entity.type == 'DATE':
                    if not any(indicator in entity.text.lower().split() for indicator in ["days", "date","day","year","years","month","months", "daily","bruntwood"]):
                        dates.append(entity.text)

        # If SUTime extracted any dates, return the first one
#         if dates:
#             return dates[0].replace('\n', ' ')
        if dates:
            dates.sort()  # Sort the dates to get the earliest one
            earliest_date = filter_complete_dates(dates)
            if earliest_date:
                return earliest_date.replace('\n', ' ')

        # If no dates were extracted by SUTime, try using dateparser
        parsed_dates = dateparser.parse(text, settings={'STRICT_PARSING': False})
        if parsed_dates:
            return parsed_dates[0].strftime('%Y-%m-%d')

#         parsed_dates = dateparser.parse(text, settings={'STRICT_PARSING': False})
#         if parsed_dates:
#             # Filter out dates that are likely to be just month and year
#             complete_dates = [date for date in parsed_dates if date.day is not None]
#             if complete_dates:
#                 return complete_dates[0].strftime('%Y-%m-%d')

        return None


    def filter_complete_dates(dates):
        for date in dates:
            parsed_date = dateparser.parse(date, settings={'STRICT_PARSING': False})
            if parsed_date and parsed_date.day is not None and parsed_date.year is not None:
                return date
        return None

In [47]:
text = "Term Commencement Date \n 8 June 2022 \n Term \nCommencement \n Date "
extract_dates_with_sutime_and_dateparser(text)

2023-08-26 11:05:35 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-08-26 11:05:35 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-08-26 11:05:35 INFO: Using device: cpu
2023-08-26 11:05:35 INFO: Loading: tokenize
2023-08-26 11:05:35 INFO: Loading: ner
2023-08-26 11:05:36 INFO: Done loading processors!


Term Commencement Date 
 8 June 2022 
 Term 
Commencement 
 Date 


'June 2022'

In [54]:
import re
import datefinder
import stanza

class DateExtractor:
    def __init__(self):
        self.stanza_nlp = stanza.Pipeline(processors='tokenize,ner', lang='en')

    def extract_dates_with_datefinder(self, text):
        processed_text = re.sub(r'(\d+)"', r'\1', text)
        doc = self.stanza_nlp(processed_text)

        # Extract dates using SUTime
        dates = []
        for entity in doc.ents:
            if entity.type == 'DATE':
                if not any(indicator in entity.text.lower().split() for indicator in ["days", "date", "day", "year", "years", "month", "months", "daily", "bruntwood"]):
                    dates.append(entity.text)

        # If SUTime extracted any dates, return the earliest one
        if dates:
            dates.sort()  # Sort the dates to get the earliest one
            earliest_date = self.filter_dates_with_datefinder(dates)
            if earliest_date:
                return earliest_date

        # If no dates were extracted by SUTime, try using datefinder
        datefinder_dates = list(datefinder.find_dates(text))
        earliest_date = self.filter_dates_with_datefinder(datefinder_dates)
        if earliest_date:
            return earliest_date.strftime('%Y-%m-%d')

        return None

    def filter_dates_with_datefinder(self, dates):
        for date_string in dates:
            parsed_dates = list(datefinder.find_dates(date_string))
            if parsed_dates:
                return min(parsed_dates)
        return None

# Create an instance of DateExtractor
date_extractor = DateExtractor()

text = "Term Commencement Date \n 8 June 2022 \n Term \nCommencement \n Date "
extracted_date = date_extractor.extract_dates_with_datefinder(text)
if extracted_date is not None:
    extracted_date = extracted_date.strftime('%Y-%m-%d')
print(extracted_date)



2023-08-26 11:11:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-08-26 11:11:11 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-08-26 11:11:11 INFO: Using device: cpu
2023-08-26 11:11:11 INFO: Loading: tokenize
2023-08-26 11:11:11 INFO: Loading: ner
2023-08-26 11:11:11 INFO: Done loading processors!


2022-06-26
