# Find rows in a column

Use Tesseract to separate columns into rows.

In [2]:
import cv2
import pandas as pd
import numpy as np
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
from statistics import mean
import math
import statistics
import re
import os
import tempfile

In [3]:
# These OCR image preprocessing steps are based on https://stackoverflow.com/a/43493383
# I don't really understand why this particular combination of filters works, but it does seem to improve OCR results

# Threshold value passed to the first cv2.threshold call in image_smoothening
BINARY_THRESHOLD = 200

def process_image_for_ocr(file_path):
    '''
    Prepare an image for OCR.

    A temporary copy of the image is created by set_image_dpi, then passed through
    remove_noise_and_smooth. The temporary copy is removed before returning.

    Returns whatever remove_noise_and_smooth returns for the temporary copy.
    '''
    # TODO : Implement using opencv
    temp_filename = set_image_dpi(file_path)
    try:
        return remove_noise_and_smooth(temp_filename)
    finally:
        # The temporary copy is only needed while it is being read.
        # Clean-up is best-effort: failing to delete it shouldn't hide the OCR result or the real error.
        try:
            os.remove(temp_filename)
        except OSError:
            pass


def set_image_dpi(file_path):
    '''
    Save a copy of an image to a temporary .jpg file, passing dpi=(300, 300) when saving.

    The image is not resized. The temporary file is created with delete=False, so it is
    not removed automatically -- the caller should delete it when it's no longer needed.

    Returns the path of the temporary file.
    '''
    with Image.open(file_path) as im:
        # Reserve a unique file name, then close the handle straight away so it isn't leaked
        # and the file can be written to by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
            temp_filename = temp_file.name
        im.save(temp_filename, dpi=(300, 300))
    return temp_filename


def image_smoothening(img):
    '''
    Binarise an image in three passes and return the final thresholded image.

    The passes are: a fixed threshold at BINARY_THRESHOLD, an Otsu threshold,
    then a Gaussian blur with a (1, 1) kernel followed by a second Otsu threshold.
    '''
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    # cv2.threshold returns (threshold_used, image) -- only the image is needed here
    _, fixed = cv2.threshold(img, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY)
    _, first_otsu = cv2.threshold(fixed, 0, 255, otsu_binary)
    blurred = cv2.GaussianBlur(first_otsu, (1, 1), 0)
    return cv2.threshold(blurred, 0, 255, otsu_binary)[1]


def remove_noise_and_smooth(file_name):
    '''
    Read an image as greyscale and combine two binarised versions of it.

    One version comes from an adaptive mean threshold followed by morphological
    opening and closing; the other comes from image_smoothening.
    The two are merged with a bitwise OR and the result is returned.
    '''
    grey = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)

    # Version 1: adaptive threshold, then open/close with a 1x1 kernel
    adaptive = cv2.adaptiveThreshold(
        grey.astype(np.uint8),
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        41,
        3,
    )
    unit_kernel = np.ones((1, 1), np.uint8)
    opened = cv2.morphologyEx(adaptive, cv2.MORPH_OPEN, unit_kernel)
    closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, unit_kernel)

    # Version 2: the fixed + Otsu thresholding pipeline
    smoothed = image_smoothening(grey)

    return cv2.bitwise_or(smoothed, closed)

In [8]:
def cluster_points(points, distance):
    '''
    Cluster together nearby points.

    Points are chained: a point joins the current cluster if it is less than
    `distance` beyond the point immediately before it, otherwise it starts a new cluster.
    Points are expected to be supplied in ascending order.

    Parameters:
        points: iterable of numeric positions.
        distance: gap between consecutive points at which a new cluster is started.

    Returns a list of clusters, each a sorted list of points.
    An empty input gives an empty list.
    '''
    clusters = []
    cluster = []
    for point in points:
        # cluster[-1] is always the previous point, as every point ends up last in the current cluster
        if cluster and point < cluster[-1] + distance:
            cluster.append(point)
            continue
        if cluster:
            clusters.append(sorted(cluster))
        cluster = [point]
    # Flush the final cluster the same way as the others: sorted, and only if it has content
    if cluster:
        clusters.append(sorted(cluster))
    return clusters

def find_rows(image_name, image_path, output_dir='data'):
    '''
    Use Tesseract to detect lines of text, then use those lines to crop out individual rows from the column.

    The first cluster of text lines is treated as the column header and is skipped;
    each following cluster becomes one row image, saved as
    '<output_dir>/rows/<image name without extension>-<index>.jpg'.
    If fewer than two clusters of text are found, no row images are written.

    Parameters:
        image_name: file name of the column image, used to build the row file names.
        image_path: path to the column image.
        output_dir: base directory -- rows are saved in a 'rows' subdirectory, created if needed.

    Returns the OCR results as a Pandas dataframe.

    Raises ValueError if the image can't be read.
    '''
    row_dir = os.path.join(output_dir, 'rows')
    os.makedirs(row_dir, exist_ok=True)
    col = cv2.imread(image_path)
    if col is None:
        # cv2.imread signals failure by returning None rather than raising
        raise ValueError('Could not read image: {}'.format(image_path))
    h, w = col.shape[:2]
    temp_img = process_image_for_ocr(image_path)

    # We're interested in the printed labels on the left of each column
    # Here we crop off the right side of the image to remove some of the handwritten text
    # This simplifies the OCR results
    temp_img = temp_img[0:h, 0:int(round(w * 0.6))]

    # Get OCR results as a Pandas dataframe
    results = pytesseract.image_to_data(temp_img, output_type=pytesseract.Output.DATAFRAME)

    # Level 4 of the OCR results are lines of text (level 5 are individual words)
    # Get all the unique 'top' positions of lines, sorted from top to bottom
    lines = sorted(pd.unique(results.loc[results['level'] == 4]['top']))

    # Get an approximate line height from the median of the text height values
    median_height = results.loc[~results['text'].isna()]['height'].median()
    if pd.isna(median_height):
        # No text was recognised, so there's nothing to base the rows on
        return results
    line_height = int(round(median_height))

    # Group nearby lines into clusters based on the line height
    clusters = cluster_points(lines, line_height)

    # The first cluster is the header row -- every cluster after that marks the top of a cell
    cells = [cluster[0] for cluster in clusters[1:]]

    # Strip the extension properly rather than assuming it's three characters long
    name_stem = os.path.splitext(image_name)[0]
    bottom_margin = int(round(line_height / 2))

    for index, cell in enumerate(cells):
        if index + 1 < len(cells):
            # Get next cell top, add a margin to make sure we get all the content
            next_cell = cells[index + 1] + bottom_margin
        else:
            # If we're at the end, just go to the bottom of the column
            next_cell = h

        # Add a margin at the top of the cell, without going past the top of the image
        row_top = max(0, cell - line_height)
        row = col[row_top:next_cell, 0:w]
        cv2.imwrite('{}/{}-{}.jpg'.format(row_dir, name_stem, index), row)
    return results

## Process a single column

In [9]:
# Split one column image into rows, keeping the OCR results for inspection
results = find_rows(
    'N193-113_0014-col-1.jpg',
    '/Users/tim/mycode/stock-exchange/notebooks/processed/1929/columns/N193-113_0014-col-1.jpg',
    'processed/testing',
)

## Process a directory

In [None]:
# Split every .jpg column image in the directory into rows
columns_dir = '/Users/tim/mycode/stock-exchange/notebooks/processed/1929/columns'
for img_name in os.listdir(columns_dir):
    if not img_name.endswith('.jpg'):
        continue
    img_path = os.path.join(columns_dir, img_name)
    find_rows(img_name, img_path, 'processed/1929')