The objective of this script is to locate and extract the conditions listed in lease-extension documents.

In [None]:
import cv2
import pytesseract
from pytesseract import Output
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Set the locations of the external tools this script depends on.
## Poppler binaries folder passed to convert_from_path below; change it to match the local install.
poppler_path = r'C:\Users\<user>\Anaconda3\Library\lib\poppler-22.04.0\Library\bin' 
## Path of the Tesseract executable used by pytesseract; change it to match the local install.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
def rotation(angle, width_center, height_center, width, height, invert):
    """Rotate an image about a pivot point and return the warped result.

    Args:
        angle: Rotation angle in degrees, forwarded to cv2.getRotationMatrix2D.
        width_center: x coordinate of the rotation pivot.
        height_center: y coordinate of the rotation pivot.
        width: Image width supplied by the caller.
        height: Image height supplied by the caller.
        invert: Image array to rotate.

    Returns:
        The warped image; pixels outside the source are filled by
        replicating the border.
    """
    pivot = (width_center, height_center)
    transform = cv2.getRotationMatrix2D(pivot, angle, 1)
    # The output size is handed over in (height, width) order.
    # NOTE(review): OpenCV documents dsize as (width, height), so this
    # transposes the canvas -- confirm that is also intended for 180 degrees.
    output_size = (height, width)
    return cv2.warpAffine(invert, transform, output_size,
                          borderMode=cv2.BORDER_REPLICATE)

def rotation_determination(invert):
    """Detect the page orientation with Tesseract OSD and rotate if needed.

    Args:
        invert: Pre-processed page image (array) to inspect and, when the
            detected orientation is 90, 180 or 270, rotate.

    Returns:
        A tuple ``(image, blank_page_indicator)``. When OSD succeeds the
        (possibly rotated) image is returned with indicator ``"N"``. When
        Tesseract fails on the page, ``("Blank Page", "Y")`` is returned.
    """
    # Only the OSD call is guarded: a Tesseract failure here is treated as a
    # blank page, while unrelated errors (missing binary, rotation problems)
    # are no longer silently reported as "Blank Page".
    try:
        rotation_results = pytesseract.image_to_osd(
            invert, output_type=pytesseract.Output.DICT)
    except pytesseract.TesseractError:
        print("Blank Page")
        return "Blank Page", "Y"

    orientation = rotation_results["orientation"]
    # Use the dimensions of the image actually passed in rather than relying
    # on module-level ``width``/``height`` variables set by the caller.
    height, width = invert.shape[:2]

    if orientation == 270:
        # NOTE(review): the 270 case pivots at one third of the width; the
        # reason is not evident from this file -- confirm.
        invert = rotation(angle=orientation,
                          width_center=width / 3, height_center=height / 2,
                          width=width, height=height, invert=invert)
    elif orientation in (90, 180):
        invert = rotation(angle=orientation,
                          width_center=width / 2, height_center=height / 2,
                          width=width, height=height, invert=invert)

    return invert, "N"

# Phrases that signal where a relevant clause may start. The order of this
# list is the order in which the phrases are searched further below.
opening_phrases = [
    'substituted therefor',
    'substituted with the following',
    'replaced by the',
    'replaced with the',
    'substituting the following',
    'substituted',
]

# Tokens that signal where a clause ends (a closing curly quotation mark).
closing_phrases = ['”']

# Characters treated as OCR noise during text clean-up.
unwanted_characters = [
    '\n', '-', '—', ';', ':', '{', '}',
    '~', '?', '_', '‘', '@', '%', '|',
]

# Convert the PDF into one PIL image per page, then OCR each page.
record_path = "./Data/A2006043.pdf"
pages_images = convert_from_path(record_path, poppler_path = poppler_path)
pages_tesseract = []

for i, page_image in enumerate(pages_images):
    # The first page is never OCR'd (presumably a cover page -- TODO confirm).
    if i == 0:
        continue

    blank_page_indicator = "N"
    page_array = np.asarray(page_image)
    # ``height`` and ``width`` stay module-level names because the rotation
    # code elsewhere in this file reads them.
    height, width = page_array.shape[:2]

    # Arrays built from PIL images are in RGB channel order, so the RGB
    # conversion code is used (BGR2GRAY would swap the red/blue weights).
    gray = cv2.cvtColor(page_array, cv2.COLOR_RGB2GRAY)
    denoise = cv2.fastNlMeansDenoising(gray, h = 3, templateWindowSize = 7, searchWindowSize = 21)
    thresh = cv2.threshold(denoise, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morphological opening to remove noise, then invert the image again.
    # NOTE(review): a 1x1 kernel likely leaves the image unchanged -- confirm.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations = 1)
    invert = 255 - opening

    # Determine whether rotation is required; blank pages are skipped.
    invert, blank_page_indicator = rotation_determination(invert = invert)
    if blank_page_indicator == "Y":
        continue

    # Read the entire page and convert the result dictionary to a DataFrame.
    d = pytesseract.image_to_data(invert, output_type = Output.DICT)
    d_df = pd.DataFrame.from_dict(d)
    # Keep only rows whose confidence is above -1.
    d_df = d_df.loc[d_df['conf'] > -1].copy()
    d_df['page'] = i + 1  # 1-based page number
    pages_tesseract.append(d_df)
    print(f"Page {i+1} converted.")

pages_tesseract_df = pd.concat(pages_tesseract)

# Remove unwanted characters and surrounding white space.
# ``Series.replace`` only swaps cells whose ENTIRE value equals the character,
# so characters embedded in a token (e.g. a trailing ';') were never removed.
# ``Series.str.replace`` with ``regex=False`` replaces every occurrence and
# treats characters such as '?', '|' and '{' literally.
for uc in unwanted_characters:
    pages_tesseract_df['text'] = pages_tesseract_df['text'].str.replace(uc, ' ', regex = False)

pages_tesseract_df['text'] = pages_tesseract_df['text'].str.strip()
# Drop tokens that are empty after cleaning.
pages_tesseract_df = pages_tesseract_df.loc[pages_tesseract_df['text'] != ''].copy()

# Reset the index so rows are numbered consecutively from 0.
pages_tesseract_df.reset_index(inplace = True, drop = True)

# Create a window of trailing text: each row's token joined with the next
# three tokens, so multi-word phrases can be matched on a single row.
pages_tesseract_df['text2'] = pages_tesseract_df['text'].shift(-1)
pages_tesseract_df['text3'] = pages_tesseract_df['text'].shift(-2)
pages_tesseract_df['text4'] = pages_tesseract_df['text'].shift(-3)
# The shifted columns end in NaN for the last rows; replace those with ''.
pages_tesseract_df.fillna('', inplace = True)
pages_tesseract_df['trailing_text'] = " ".join(["{}"] * 0) + (
    pages_tesseract_df['text'] + " " + pages_tesseract_df['text2'] + " "
    + pages_tesseract_df['text3'] + " " + pages_tesseract_df['text4']
)

# Use the trailing-text window to detect the opening phrases and their indices.
# The rows at those indices later provide the bounding-box coordinates.

## Opening: search every opening phrase in the four-token window.
trailing_windows = pages_tesseract_df['trailing_text']
opening_indices = []
for phrase in opening_phrases:
    phrase_mask = trailing_windows.str.contains(phrase)
    opening_indices.extend(pages_tesseract_df.index[phrase_mask].tolist())

opening_entries = pages_tesseract_df.filter(items = opening_indices, axis = 0)

## Closing: search every closing phrase in the single-token text.
single_tokens = pages_tesseract_df['text']
closing_indices = []
for phrase in closing_phrases:
    phrase_mask = single_tokens.str.contains(phrase)
    closing_indices.extend(pages_tesseract_df.index[phrase_mask].tolist())

closing_entries = pages_tesseract_df.filter(items = closing_indices, axis = 0)

# Based on the opening and closing entries, work out which bounding boxes to select.
## First, determine the pages involved from the first opening entry and the
## last closing entry.
# A module-level ``return`` is a SyntaxError, so the script is stopped with
# SystemExit instead when no clause boundary was found. The same check covers
# a missing closing entry, which would otherwise fail with an IndexError.
if opening_entries.empty or closing_entries.empty:
    raise SystemExit("DP DC Clause cannot be detected.")

starting_page = opening_entries.head(n = 1)['page'].tolist()[0]
ending_page = closing_entries.tail(n = 1)['page'].tolist()[0]

## Second (a): if the starting page differs from the ending page, the clause spans several pages.
## Second (b): crop the starting page from the opening entry down to the bottom of the page.
## Second (c): append every complete intermediate page, then the closing page
##             cropped from its top down to the closing entry.

### Bounding-box coordinates are (x1, y1), (x2, y2): top-left and bottom-right corners.
### x1 is always 0 and x2 is always the full width of the (oriented) page image.
### When the clause spills across pages, the starting page's y2 is the bottom of the page.


def _orient_page(page_image, orientation):
    """Return ``page_image`` rotated according to the detected orientation.

    Orientation 270 pivots at one third of the width, any other orientation
    strictly between 0 and 270 pivots at the centre, and everything else is
    returned unchanged. The page's own dimensions are used instead of the
    ``width``/``height`` left over from the OCR loop.
    """
    page_height, page_width = page_image.shape[:2]
    if orientation == 270:
        # NOTE(review): reason for the width/3 pivot is not evident here -- confirm.
        pivot_x = page_width / 3
    elif 0 < orientation < 270:
        pivot_x = page_width / 2
    else:
        return page_image
    return rotation(orientation,
                    width_center = pivot_x, height_center = page_height / 2,
                    width = page_width, height = page_height, invert = page_image)


x1 = 0

# The orientation is detected once, on the starting page, and applied to
# every page that belongs to the clause.
starting_page_image = np.asarray(pages_images[starting_page - 1])
rotation_results = pytesseract.image_to_osd(starting_page_image, output_type = pytesseract.Output.DICT)
page_orientation = rotation_results["orientation"]
starting_page_image = _orient_page(starting_page_image, page_orientation)

# Taken after orienting so the crop spans the full width of the image that is
# actually sliced (identical to the raw page width when no rotation happens).
x2 = starting_page_image.shape[1]

if starting_page != ending_page:

    #### Starting page: from the top of the opening entry to the bottom of the page.
    starting_page_y1 = opening_entries.head(n = 1)['top'].tolist()[0]
    starting_page_y2 = starting_page_image.shape[0]
    starting_page_crop = starting_page_image[starting_page_y1:starting_page_y2, x1:x2]

    #### Intermediate pages: every page strictly between the starting and ending page.
    # ``starting_page`` is 1-based, so it is also the 0-based index of the page
    # that follows the starting page. One loop handles any page count, which
    # fixes the former >4-page branch that duplicated one page and dropped the last.
    n_pages = ending_page - starting_page + 1
    intermediate_pages = [
        _orient_page(np.asarray(pages_images[page_index]), page_orientation)
        for page_index in range(starting_page, ending_page - 1)
    ]

    #### Closing page: from the top of the page to the bottom of the closing entry.
    closing_page_y1 = 0
    closing_page_y2 = closing_entries.tail(n = 1)['top'].tolist()[0] + closing_entries.tail(n = 1)['height'].tolist()[0]
    closing_page = _orient_page(np.asarray(pages_images[ending_page - 1]), page_orientation)
    closing_page_crop = closing_page[closing_page_y1:closing_page_y2, x1:x2]

    # Stack the pieces vertically in reading order.
    final_image = cv2.vconcat([starting_page_crop, *intermediate_pages, closing_page_crop])

else:
    # Single page: crop from the top of the opening entry to the bottom of
    # the closing entry.
    y1 = opening_entries.head(n = 1)['top'].tolist()[0]
    y2 = closing_entries.tail(n = 1)['top'].tolist()[0] + closing_entries.tail(n = 1)['height'].tolist()[0]

    final_image = starting_page_image[y1:y2, x1:x2]

## Third: get the text between the first opening entry and the last closing entry.
starting_index = opening_entries.index[0]
closing_index = closing_entries.index[-1]
# Look the 'text' column up by label instead of the hard-coded position 11,
# so the selection does not depend on the column order of the OCR output.
text_column = pages_tesseract_df.columns.get_loc('text')
condition = " ".join(pages_tesseract_df.iloc[starting_index:(closing_index + 1), text_column].tolist())

## Fourth: print the extracted text and show the cropped image.
print(condition)
plt.subplot(111)
plt.imshow(final_image)