# In [None]:
from PIL import Image, ImageFilter, ImageEnhance
import pandas as pd
import pytesseract
from io import StringIO    
from skimage import io
from skimage import transform as tf
from skimage.feature import canny
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.pyplot import imshow
from IPython.display import display
import numpy as np
import spacy
import re
import os

class Image_preprocessing:
    """Extract a table of entries from a scanned two-column page image.

    Construction runs the whole pipeline: the page is converted to
    greyscale, split into two column images, each column is thresholded
    and OCR'd with pytesseract, indented OCR lines are merged into the
    entry above them, and every entry is parsed by ``data_extract``.

    Attributes set by ``__init__``:
        imageObject: greyscale ('L') copy of the input page.
        num_col: the column count that was passed in.
        half1, half2: cropped, thresholded images of the two columns.
        df_1, df_2: one DataFrame per column with a single
            'original_line' column holding the joined OCR text.
        df: both columns concatenated, with 'original_line',
            'people_name', 'home_address', 'occupation' and
            'work_address'. Rows where all four parsed fields are
            missing are dropped.
    """

    # x-coordinate used to split the page when no vertical rule is found.
    # NOTE(review): presumably tuned for pages ~1790 px wide -- confirm.
    DEFAULT_SPLIT_X = 895.0
    # Pixels added below the first OCR line when trimming the page top.
    TOP_PADDING = 20
    # Pixels kept to the left of the most common line start.
    LEFT_PADDING = 10
    # A line is "indented" when its left edge exceeds mean(left) * factor.
    INDENT_FACTOR = 3

    # Everything except letters, digits, space, comma and period is noise.
    _DISALLOWED_CHARS = re.compile(r"[^A-Za-z0-9 ,.]")
    # "h." used as a marker of its own, not the tail of a word ("Church.").
    _HOME_MARKER = re.compile(r"(?<![A-Za-z0-9])h\.")
    _FIELD_SEPARATOR = re.compile(r"[,.]")

    def __init__(self, imageObject, num_col):
        """Run the full extraction pipeline and store the result in ``self.df``.

        Args:
            imageObject: a PIL image of one page.
            num_col: number of text columns on the page; only 2 is supported.

        Raises:
            ValueError: if ``num_col`` is not 2 (raised by ``image_crop``).
        """
        # The original code wrapped the greyscale image in ImageEnhance
        # objects but only ever read their ``.image`` attribute, which is the
        # unmodified input; ``enhance()`` was never called. The greyscale
        # image is therefore used directly, with identical results.
        self.imageObject = imageObject.convert('L')
        self.num_col = num_col

        # Split at the centre rule, trim the left margin, then threshold.
        self.half1, self.half2 = self.image_crop(self.num_col, self.imageObject)
        self.half1 = self.preprocessImage(self.sideCrop(self.half1))
        self.half2 = self.preprocessImage(self.sideCrop(self.half2))

        self.df_1 = pd.DataFrame({'original_line': self._column_entries(self.half1)})
        self.df_2 = pd.DataFrame({'original_line': self._column_entries(self.half2)})
        self.df = pd.concat([self.df_1, self.df_2], ignore_index=True)

        # 'original_line' keeps the OCR text; the parsed fields go into their
        # own columns (previously the parsed tuple overwrote the text).
        parsed = [self.data_extract(text) for text in self.df['original_line']]
        fields = ('people_name', 'home_address', 'occupation', 'work_address')
        for position, column in enumerate(fields):
            self.df[column] = [
                np.nan if row[position] in ('', None) else str(row[position])
                for row in parsed
            ]
        self.df.dropna(subset=list(fields), how='all', inplace=True)

    def image_crop(self, num_col, imageObject):
        """Split the page into its left and right column images.

        The split position is the median x of near-vertical line segments
        found in the central fifth of the page (Canny edges followed by a
        probabilistic Hough transform). The top of both crops is taken from
        ``topCrop``.

        Args:
            num_col: number of columns; must be 2.
            imageObject: greyscale PIL image of the page.

        Returns:
            (half1, half2): PIL images of the left and right column.

        Raises:
            ValueError: if ``num_col`` is not 2. The original fell through
                and returned None, which failed later when unpacked.
        """
        if num_col != 2:
            raise ValueError(
                'only two-column pages are supported, got num_col={!r}'.format(num_col)
            )

        width, height = imageObject.size
        edge_pixels = np.asanyarray(imageObject.filter(ImageFilter.FIND_EDGES))
        edges = canny(edge_pixels, sigma=0.3, low_threshold=10, high_threshold=80)
        segments = tf.probabilistic_hough_line(
            edges, threshold=10, line_length=80, line_gap=3
        )

        # Keep only segments that are vertical (x differs by < 1 px) and lie
        # in the central band where the column rule is expected.
        band_left, band_right = 2 / 5 * width, 3 / 5 * width
        rule_xs = []
        for (x0, _y0), (x1, _y1) in segments:
            if (band_left < x0 < band_right
                    and band_left < x1 < band_right
                    and abs(x0 - x1) < 1):
                rule_xs.extend((x0, x1))

        if rule_xs:
            split_x = np.median(rule_xs)
        elif band_left < self.DEFAULT_SPLIT_X < band_right:
            split_x = self.DEFAULT_SPLIT_X
        else:
            # The fixed fallback would fall outside the central band for this
            # page size, so split down the middle instead.
            split_x = width / 2

        # The top rule is too faint for edge detection, so OCR locates it.
        top = self.topCrop(imageObject)
        half1 = imageObject.crop((0, top, split_x, height))
        half2 = imageObject.crop((split_x, top, width, height))
        return half1, half2

    def preprocessImage(self, image, thresh=150):
        """Return a pure black-and-white (mode '1') copy of ``image``.

        Pixels brighter than ``thresh`` become white, all others black.
        The ImageEnhance wrappers of the original were no-ops (see
        ``__init__``) and have been removed.

        Args:
            image: PIL image of one column.
            thresh: greyscale cut-off in 0-255; defaults to the original 150.
        """
        return image.convert('L').point(
            lambda pixel: 255 if pixel > thresh else 0, mode='1'
        )

    def topCrop(self, pageImage):
        """Return the y pixel at which the page body starts.

        This is the bottom of the first OCR line plus ``TOP_PADDING``, or 0
        when OCR finds no text at all.
        """
        lineData, _ = self.ocrInfo(pageImage)
        if lineData.empty:
            return 0
        return lineData['bottom'].iloc[0] + self.TOP_PADDING

    def sideCrop(self, pageImage):
        """Trim the left margin of a column image.

        The crop starts ``LEFT_PADDING`` pixels before the most common left
        edge of the OCR lines, clamped at 0 so the crop never extends past
        the image. The right edge is left untouched, as in the original
        (which computed a right bound but never used it). The image is
        returned unchanged when OCR finds no text.
        """
        lineData, _ = self.ocrInfo(pageImage)
        if lineData.empty:
            return pageImage
        width, height = pageImage.size
        left = max(0, lineData['left'].mode()[0] - self.LEFT_PADDING)
        return pageImage.crop((left, 0, width, height))

    def indentList(self, df):
        """Flag the lines whose left edge is far to the right of the mean.

        Args:
            df: line-level frame with 'line' and 'left' columns, as returned
                by ``ocrInfo``.

        Returns:
            (indentList, indentColumn): the 'line' values of the indented
            rows in descending order, and a list with 1 for indented rows
            and 0 otherwise, in row order.
        """
        # The threshold is computed once; the original recomputed the mean
        # for every row. This multiplier may need tuning.
        threshold = df['left'].mean() * self.INDENT_FACTOR
        is_indented = df['left'] > threshold
        indentColumn = is_indented.astype(int).tolist()
        indentList = sorted(df.loc[is_indented, 'line'].tolist(), reverse=True)
        return indentList, indentColumn

    def ocrInfo(self, pageImage):
        """Run OCR and return line-level and word-level position data.

        Returns:
            (lineData, directoryData): ``directoryData`` has one row per
            recognised word with pytesseract's columns plus 'right',
            'bottom' and 'line'; ``lineData`` has one row per line with the
            bounding box 'left', 'top', 'right', 'bottom'.
        """
        directoryData = pytesseract.image_to_data(pageImage, output_type='data.frame')
        # Rows with any missing value are dropped, as in the original.
        directoryData = directoryData.dropna().reset_index(drop=True)
        directoryData['right'] = directoryData['left'] + directoryData['width']
        directoryData['bottom'] = directoryData['top'] + directoryData['height']
        directoryData['line'] = self._label_lines(directoryData['left'])

        grouped = directoryData.groupby('line')
        lineDataLT = grouped[['left', 'top']].min().reset_index()
        lineDataRB = grouped[['right', 'bottom']].max().reset_index()
        lineData = pd.merge(lineDataLT, lineDataRB, on='line')
        return lineData, directoryData

    def data_extract(self, line):
        """Parse one OCR line into name, home address, occupation and work address.

        Heuristics:
          * characters other than letters, digits, space, comma and period
            are removed;
          * everything after the first "h." that is not the tail of a word
            is the home address;
          * the leading run of capitalised words is the name;
          * of the remaining comma/period separated parts, a first part
            starting with a lowercase letter is the occupation and the rest
            is the work address.

        Returns:
            (people_name, home_address, occupation, work_address).
            ``people_name`` is '' when absent; the other three are None
            when absent or empty.
        """
        people_name, home_address, occupation, work_address = '', None, None, None
        line = self._DISALLOWED_CHARS.sub('', str(line))
        # Blank lines are very common in the OCR output.
        if not line.strip():
            return people_name, home_address, occupation, work_address

        marker = self._HOME_MARKER.search(line)
        if marker:
            home_address = line[marker.end():].strip() or None
            line = line[:marker.start()]
        words = line.split()

        # The name is the leading run of words that start with a capital.
        name_words = []
        for word in words:
            if not word[0].isupper():
                break
            name_words.append(word)
        people_name = ' '.join(name_words)

        # Re-join with spaces so multi-word addresses stay readable.
        remainder = ' '.join(words[len(name_words):])
        parts = [part.strip() for part in self._FIELD_SEPARATOR.split(remainder)]
        # An occupation never starts with a digit or a capital letter.
        if parts[0] and parts[0][0].islower():
            occupation = parts[0]
            parts = parts[1:]
        work_address = ' '.join(part for part in parts if part) or None

        return people_name, home_address, occupation, work_address

    def _column_entries(self, column_image):
        """OCR one column image and return its entries as a list of strings."""
        lineData, directoryData = self.ocrInfo(column_image)
        if directoryData.empty:
            return []
        _, indentColumn = self.indentList(lineData)
        # Each word gets a trailing space so a line can be joined directly.
        words = directoryData['text'].astype(str) + " "
        line_texts = words.groupby(directoryData['line']).apply(''.join)
        return self._join_continuations(line_texts.tolist(), indentColumn)

    @staticmethod
    def _label_lines(left):
        """Number OCR words by line: a new line starts when 'left' does not increase."""
        return (left.diff() <= 0).cumsum() + 1

    @staticmethod
    def _join_continuations(texts, indent_flags):
        """Append every indented line to the entry above it.

        An indented line with no entry above it in this column is dropped.
        Unlike the original, consecutive indented lines all extend the same
        entry, and the list is never popped while empty.
        """
        entries = []
        for text, indented in zip(texts, indent_flags):
            if not indented:
                entries.append(text)
            elif entries:
                entries[-1] += text
        return entries



# In [None]:
## Names of the image directories whose pages are laid out in two columns.
## The order matters: the extraction loop assigns consecutive years to the
## directories in the order they are listed here.
two_columns = [
    '4ad95a70-317a-0134-d1af-00505686a51c',
    '4adf9ec0-317a-0134-03ad-00505686a51c',
    '4ae3cb40-317a-0134-489d-00505686a51c',
    '4ae76b60-317a-0134-b849-00505686a51c',
    '4aea8af0-317a-0134-2393-00505686a51c',
    '4aed8a80-317a-0134-28a4-00505686a51c',
    '4af0c6f0-317a-0134-e90c-00505686a51c',
    '4af3b880-317a-0134-bda8-00505686a51c',
    '4af6a690-317a-0134-5947-00505686a51c',
    '4afa0510-317a-0134-cf84-00505686a51c',
    '4afd6280-317a-0134-575a-00505686a51c',
    '4b00bf60-317a-0134-32d0-00505686a51c',
    '4b0419c0-317a-0134-7464-00505686a51c',
    '4b073d20-317a-0134-af68-00505686a51c',
    '4b0aa870-317a-0134-712b-00505686a51c',
    '4b0e13f0-317a-0134-6578-00505686a51c',
    '4b119360-317a-0134-9131-00505686a51c',
    '4b154340-317a-0134-afd3-00505686a51c',
    '4b18f080-317a-0134-fded-00505686a51c',
    '4b336e60-317a-0134-1e9b-00505686a51c',
    '4b36edd0-317a-0134-eedc-00505686a51c',
    '4b3a14d0-317a-0134-011c-00505686a51c',
    '4b3d0590-317a-0134-1631-00505686a51c',
    '4b4009d0-317a-0134-949b-00505686a51c',
    '4b437600-317a-0134-6db3-00505686a51c',
    '4b47b740-317a-0134-ad0b-00505686a51c',
    '4b4b2b90-317a-0134-6800-00505686a51c',
    '4b4e8300-317a-0134-fb8c-00505686a51c',
    '4b51d420-317a-0134-aa50-00505686a51c',
    '4b5532f0-317a-0134-52ca-00505686a51c',
    '4b58d200-317a-0134-d2aa-00505686a51c',
    '4b5c40e0-317a-0134-e9c9-00505686a51c',
    '4b5ff0e0-317a-0134-7e27-00505686a51c',
    '4b63a460-317a-0134-d3bd-00505686a51c',
    '4b66b460-317a-0134-8cb2-00505686a51c',
    '4b69a410-317a-0134-a570-00505686a51c',
    '4b6c95d0-317a-0134-f4e4-00505686a51c',
    '4b6f8210-317a-0134-ff86-00505686a51c',
    '4b728f10-317a-0134-8c07-00505686a51c',
    '4b8e3f70-317a-0134-721a-00505686a51c',
    '4b939190-317a-0134-d1d5-00505686a51c',
]

# In [None]:
# Run the extraction over every page of every two-column directory and
# collect the results in ``df``.
#
# The first directory is labelled start_year + 1 and each following directory
# gets the next year, exactly as before.
start_year, end_year = 1848, 1889
## this is the path to the files in HPC
image_root = '/beegfs/nmw2-share/jpg/'

page_frames = []
for year, directory_name in enumerate(two_columns, start=start_year + 1):
    directory_path = os.path.join(image_root, directory_name)
    try:
        filenames = sorted(os.listdir(directory_path))
    except OSError as error:
        print('Skipping directory {}: {}'.format(directory_name, error))
        continue

    for filename in filenames:
        # Some pages throw an error. The try/except sits around a single
        # page so a bad page is skipped on its own; previously it wrapped the
        # whole directory, so one failure silently dropped every page after it.
        try:
            with Image.open(os.path.join(directory_path, filename)) as imageObject:
                new_df = Image_preprocessing(imageObject, 2).df
        except Exception as error:
            print('Skipping page {}/{}: {!r}'.format(directory_name, filename, error))
            continue

        new_df['year'] = year
        ## the page_number is extracted from the name of the file.
        ## note this is not the page number on the page.
        new_df['page'] = filename.split('.')[0]
        page_frames.append(new_df)

# Concatenate once at the end instead of re-copying ``df`` for every page.
df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# In [None]:
# Write the combined table of extracted entries to disk.
csv_path = 'two_columns.csv'
df.to_csv(csv_path)