In [1]:
%pylab inline
import pg_toolkit as pgt
pgt.toolkit_config.set_pg_conn_string("dbname='decl'")
import hashlib
import glob
import os
from IPython.lib import backgroundjobs as bg
jobs = bg.BackgroundJobManager()
from IPython.display import Image, HTML, display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while updating a notebook progress bar.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, it is treated as an iterator of unknown length.
    every : int, optional
        Refresh the widgets every ``every`` items. Defaults to 1 for up to
        200 items, otherwise to roughly 0.5% of ``size``. Required when the
        length is unknown.
    size : int, optional
        Total number of items, for iterables that cannot report a length.

    Yields
    ------
    The records of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Floor division keeps `every` an integer. With true division
                # (Python 3 or `from __future__ import division`) a float
                # step makes `index % every == 0` almost never true, so the
                # bar would stop refreshing.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar in "info" style and count in the label.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        # Deliberately broad: any failure (including the consumer abandoning
        # the generator) turns the bar red, then the exception is re-raised.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [3]:
# import json
# with open("feed.json", "r") as fp:
#     data = json.load(fp)
#     for record in data:
#         record['path'] = '/Users/tilarids/dev/decl/data/' + hashlib.sha224(record['declaration'].get('url','').encode('utf-8')).hexdigest() + '.pdf'

In [4]:
# pgt.import_json_into_pg("decls", data, lambda record: record['id'], create_table=True, skip_duplicates=True)

In [5]:
# Register the downloaded PDFs under the name 'decls_pdfs'.
# NOTE(review): presumably this fills a table with one `path` per file matching
# the glob (the query below joins on dp.path) -- confirm against pg_toolkit.
pgt.import_glob_list_into_pg('decls_pdfs', '/Users/tilarids/dev/decl/data/*.pdf')
# `data`: id, declaration url and local path of every `decls` record that has a
# non-empty declaration url and whose path is also present in `decls_pdfs`.
# The later cells iterate over data['path'].
data = pgt.pg_query("""
    SELECT
        t.id as id,
        t.data->'declaration'->'url' as url,
        t.data->'path' as path
    FROM decls t
    INNER JOIN decls_pdfs dp ON (t.data->>'path'=dp.path)
    WHERE t.data->'declaration'->>'url' != ''
""")

In [6]:
import cv2
import sys
import os
import numpy as np

def line_intersection(l1, l2):
    """Intersect two infinite lines given as ``(tag, x1, y1, x2, y2)`` tuples.

    Element 0 of each tuple is ignored (callers store theta there); elements
    1-4 are two points on the line. Returns the ``(x, y)`` intersection point.

    Raises ``Exception('lines do not intersect')`` when the lines are parallel
    (zero determinant).
    """
    def cross(u, v):
        # 2x2 determinant of the column vectors u and v.
        return u[0] * v[1] - u[1] * v[0]

    p_start, p_end = (l1[1], l1[2]), (l1[3], l1[4])
    q_start, q_end = (l2[1], l2[2]), (l2[3], l2[4])

    dx = (p_start[0] - p_end[0], q_start[0] - q_end[0])
    dy = (p_start[1] - p_end[1], q_start[1] - q_end[1])

    denom = cross(dx, dy)
    if denom == 0:
        raise Exception('lines do not intersect')

    coeffs = (cross(p_start, p_end), cross(q_start, q_end))
    return cross(coeffs, dx) / denom, cross(coeffs, dy) / denom

def extract_borders(img):
    """Detect the four border lines of the table/frame on a scanned page.

    Runs Canny edge detection and a standard Hough transform on ``img`` (a BGR
    image as returned by ``cv2.imread``), then picks among the detected lines:

    * near-horizontal lines (theta within pi/10 of pi/2): the topmost one lying
      below the top margin becomes ``top_border``, the lowest one lying above
      the bottom margin becomes ``bottom_border``;
    * near-vertical lines (theta within pi/10 of 0): the leftmost / rightmost
      ones inside the side margins become ``left_border`` / ``right_border``.

    Returns
    -------
    (left_border, top_border, right_border, bottom_border)
        Each entry is ``(theta, x1, y1, x2, y2)`` -- the line clipped to the
        image edges -- or ``None`` when no suitable line was found. All four
        are ``None`` when the Hough transform finds no lines at all.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is None or len(lines) == 0:
        return None, None, None, None
    height, width = img.shape[0], img.shape[1]

    left_border = None
    top_border = None
    right_border = None
    bottom_border = None

    # OpenCV 2.4 returns the lines with shape (1, N, 2) while OpenCV 3+ uses
    # (N, 1, 2). Iterating `lines[0]` only sees every line on 2.4; flattening
    # to (N, 2) handles both layouts.
    for rho, theta in np.asarray(lines).reshape(-1, 2):
        # Convert (rho, theta) into two far-apart integer points on the line.
        a = np.cos(theta)
        b = np.sin(theta)
        x0 = a * rho
        y0 = b * rho
        x1 = int(x0 + 1000 * (-b))
        y1 = int(y0 + 1000 * (a))
        x2 = int(x0 - 1000 * (-b))
        y2 = int(y0 - 1000 * (a))
        candidate = (theta, x1, y1, x2, y2)

        if abs(theta - np.pi / 2) < np.pi / 10:  # horizontal line.
            # Intersections with the left and right image edges.
            lint = line_intersection(candidate, (0, 0, 0, 0, height))
            rint = line_intersection(candidate, (0, width, 0, width, height))
            # NOTE(review): the right-hand check uses 0.01 while every other
            # margin is 0.02 / 0.98 -- kept as in the original, confirm intent.
            if (top_border is None or top_border[2] > lint[1]) and lint[1] > 0.02 * height and rint[1] > 0.01 * height:
                top_border = (theta, lint[0], lint[1], rint[0], rint[1])
            if (bottom_border is None or bottom_border[2] < lint[1]) and lint[1] < 0.98 * height and rint[1] < 0.98 * height:
                bottom_border = (theta, lint[0], lint[1], rint[0], rint[1])
        elif abs(theta) < np.pi / 10:  # vertical line
            # Intersections with the top and bottom image edges.
            tint = line_intersection(candidate, (0, 0, 0, width, 0))
            bint = line_intersection(candidate, (0, 0, height, width, height))
            if (left_border is None or left_border[1] > tint[0]) and tint[0] > 0.02 * width and bint[0] > 0.02 * width:
                left_border = (theta, tint[0], tint[1], bint[0], bint[1])
            if (right_border is None or right_border[1] < tint[0]) and tint[0] < 0.98 * width and bint[0] < 0.98 * width:
                right_border = (theta, tint[0], tint[1], bint[0], bint[1])

    return left_border, top_border, right_border, bottom_border
# show_extracted_borders('/Users/tilarids/dev/decl/img_data/0278d69820395cf130f098f79b46caa62023627a9a7362295e2c5489.pdf.1.png')

In [7]:
def extract_fields():
    """Detect page borders for the page images of every declaration in ``data``.

    For each path in ``data['path']`` at most two matching page images
    (``<img_data>/<basename>.*.png``) are processed. Images that already have
    a record in ``detected_borders`` are skipped. For the rest one record keyed
    by ``img_path`` is stored: the four detected borders plus image and
    detected sizes, or just ``path``/``img_path`` when any border is missing
    (so the image is not retried on the next run).

    Takes no arguments so that it can be handed to ``jobs.new``.
    """
    for path in log_progress(data['path']):
        # NOTE(review): glob order is not guaranteed, so "[:2]" is not
        # necessarily pages 0 and 1 -- confirm whether sorting is wanted.
        pattern = '/Users/tilarids/dev/decl/img_data/' + os.path.basename(path) + ".*.png"
        for img_path in glob.glob(pattern)[:2]:
            # A failed lookup is treated as "not processed yet"; only real
            # errors are swallowed, not KeyboardInterrupt/SystemExit.
            try:
                existing = pgt.pg_query_by_id('detected_borders', img_path)
            except Exception:
                existing = None
            if existing:
                continue

            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None for unreadable files; skip the image
                # instead of crashing the whole job on `img.shape`.
                continue
            height, width = img.shape[0], img.shape[1]

            borders = extract_borders(img)
            if any(border is None for border in borders):
                out = {'path': path, 'img_path': img_path}
            else:
                left_border, top_border, right_border, bottom_border = borders
                out = {
                    # Explicit lists: `map` is lazy on Python 3 and would not
                    # be JSON-serializable.
                    'left_border': [float(v) for v in left_border],
                    'top_border': [float(v) for v in top_border],
                    'right_border': [float(v) for v in right_border],
                    'bottom_border': [float(v) for v in bottom_border],
                    'detected_width': right_border[1] - left_border[1],
                    'detected_height': bottom_border[2] - top_border[2],
                    'height': height,
                    'width': width,
                    'img_path': img_path,
                    'path': path,
                }
            pgt.import_json_into_pg("detected_borders",
                                    [out],
                                    lambda record: record['img_path'],
                                    create_table=False,
                                    skip_duplicates=True)

In [8]:
import subprocess
def apply_tesseract():
    """Run Tesseract OCR (Ukrainian) on the page images of every declaration.

    For each path in ``data['path']`` at most two matching page images
    (``<img_data>/<basename>.*.png``) are processed. Images that already have
    a record in ``tesseract_ocr`` are skipped, as are images on which the
    ``tesseract`` command fails. The raw command output is stored under
    ``'ocr'`` in a record keyed by ``img_path``.

    Takes no arguments so that it can be handed to ``jobs.new``.
    """
    for path in log_progress(data['path']):
        pattern = '/Users/tilarids/dev/decl/img_data/' + os.path.basename(path) + ".*.png"
        for img_path in glob.glob(pattern)[:2]:
            # A failed lookup is treated as "not processed yet"; only real
            # errors are swallowed, not KeyboardInterrupt/SystemExit.
            try:
                existing = pgt.pg_query_by_id('tesseract_ocr', img_path)
            except Exception:
                existing = None
            if existing:
                continue

            try:
                out = subprocess.check_output(["tesseract", img_path, "stdout", "-l", "ukr"])
            except (subprocess.CalledProcessError, OSError):
                # Non-zero exit status or tesseract not runnable: best effort,
                # move on to the next image.
                continue

            record = {'ocr': out,
                      'img_path': img_path,
                      'path': path}
            pgt.import_json_into_pg("tesseract_ocr",
                                    [record],
                                    lambda record: record['img_path'],
                                    create_table=False,
                                    skip_duplicates=True)

In [None]:
# Start border detection and OCR through the IPython BackgroundJobManager
# (`jobs`). Both functions skip images that already have a stored record.
jobs.new(extract_fields)
jobs.new(apply_tesseract)

In [9]:
# OCR text of the page images whose path ends in '.0.png' or '.1.png'.
z = pgt.pg_query("""
    SELECT
        t.data->'ocr' as ocr,
        t.data->'img_path' as img_path
    FROM tesseract_ocr t
    WHERE t.data->>'img_path' LIKE '%.0.png' OR t.data->>'img_path' LIKE '%.1.png'
""")
# Lower-cased copy of the OCR text for case-insensitive substring matching.
z['ocrl'] = z['ocr'].map(lambda x: x.lower())
# Keep pages whose text contains u'клара' or u'кпара' AND u'одаток'.
# NOTE(review): u'кпара' is presumably a common OCR misread of u'клара' -- confirm.
imgs = [s[1]['img_path'] for s in z.iterrows() 
                     if ((u'клара' in s[1]['ocrl'].lower() or u'кпара' in s[1]['ocrl'])
                         and (u'одаток' in s[1]['ocrl']))]
                 
#                          and (s[1]['img_path'].endswith('.0.png') or s[1]['img_path'].endswith('.1.png')) 
#                          and (not u'ларант' in s[1]['ocr'] and not u'парант' in s[1]['ocr'])
#                         )])
# Rewrite each path '<prefix>.<n>.png' to '<prefix>.<n+1>.png': the number
# between the last two dots is incremented, i.e. the page after the matched one.
imgs = [x[:x[:-4].rfind('.')+1]+str(int(x[x[:-4].rfind('.')+1:-4]) + 1)+".png" for x in imgs]
# NOTE(review): presumably stores the paths in table 'decl_imgs', column
# 'img_path' (a later query joins on di.img_path) -- confirm against pg_toolkit.
pgt.import_list_into_pg('decl_imgs', imgs, col_name='img_path')
# imagesList=''.join( ["<img style='height: 200px; margin: 0px; float: left; border: 1px solid black;' src='%s' />" % 
#                      str(s.replace('/Users/tilarids/dev/decl/', '')) for s in imgs]) 
# display(HTML(imagesList))

In [10]:
def order_points(pts):
    """Return the four points of ``pts`` ordered TL, TR, BR, BL.

    ``pts`` is a (4, 2) array of (x, y) points. The result is a (4, 2) float32
    array with rows top-left, top-right, bottom-right, bottom-left.
    """
    # x + y is smallest at the top-left corner and largest at the bottom-right.
    coord_sum = pts.sum(axis=1)
    # y - x is smallest at the top-right corner and largest at the bottom-left.
    coord_diff = np.diff(pts, axis=1)

    corner_rows = (
        np.argmin(coord_sum),   # top-left
        np.argmin(coord_diff),  # top-right
        np.argmax(coord_sum),   # bottom-right
        np.argmax(coord_diff),  # bottom-left
    )

    ordered = np.zeros((4, 2), dtype="float32")
    for slot, row in enumerate(corner_rows):
        ordered[slot] = pts[row]
    return ordered

def four_point_transform(image, pts):
    """Warp the quadrilateral ``pts`` of ``image`` into an upright rectangle.

    Parameters
    ----------
    image : array
        Source image, passed to ``cv2.warpPerspective``.
    pts : array of shape (4, 2)
        The four corner points (x, y) of the region, in any order.

    Returns
    -------
    The warped image. Its width and height are the longer of the two opposite
    edge lengths of the quadrilateral, truncated to int.
    """
    # Obtain a consistent TL, TR, BR, BL order and unpack the ORDERED points.
    # The original unpacked `pts` here, so the output size was computed from
    # whatever order the caller happened to pass in.
    rect = order_points(pts)
    (tl, tr, br, bl) = rect

    # Width of the new image: the longer of the bottom and top edges.
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))

    # Height of the new image: the longer of the right and left edges.
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))

    # Destination corners of the top-down ("birds eye") view, in the same
    # TL, TR, BR, BL order as `rect`.
    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]], dtype="float32")

    # Compute the perspective transform matrix and apply it.
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

    return warped

In [14]:
# Border-detection results for the pages selected into `decl_imgs`.
z = pgt.pg_query("""
    SELECT
        db.data->>'img_path' as img_path,
        db.data->'left_border' as left_border,
        db.data->'top_border' as top_border,
        db.data->'right_border' as right_border,
        db.data->'bottom_border' as bottom_border,
        db.data->'detected_width' as detected_width,
        db.data->'detected_height' as detected_height,
        db.data->'height' as height,
        db.data->'width' as width
    FROM detected_borders db
    INNER JOIN decl_imgs di on (di.img_path=db.data->>'img_path')
""")

# A page is used only if the detected frame covers a plausible share of it.
WIDTH_THRESHOLD_MIN, WIDTH_THRESHOLD_MAX = (0.75, 0.86)
HEIGHT_THRESHOLD_MIN, HEIGHT_THRESHOLD_MAX = (0.8, 0.97)

for s in log_progress(z.iterrows(), every=1, size=len(z)):
    row = s[1]
    img_path = row['img_path']

    # Skip pages that already have a record in `extract_imgs`. The lookup
    # result gets its own name: the original assigned it to `z`, clobbering
    # the DataFrame that is being iterated.
    try:
        already_extracted = pgt.pg_query_by_id('extract_imgs', img_path)
    except Exception:
        already_extracted = None
    if already_extracted:
        continue

    # Records stored without borders mark pages where detection failed.
    if not row['left_border']:
        continue

    height, width = row['height'], row['width']
    detected_height = row['detected_height']
    detected_width = row['detected_width']
    if not (HEIGHT_THRESHOLD_MIN < float(detected_height) / height < HEIGHT_THRESHOLD_MAX and
            WIDTH_THRESHOLD_MIN < float(detected_width) / width < WIDTH_THRESHOLD_MAX):
        continue

    # Read the image only after the cheap checks have passed.
    img = cv2.imread(img_path)
    if img is None:
        # Unreadable or missing file: skip rather than crash inside OpenCV.
        continue

    # Explicit lists (not `map`) so the borders stay indexable on Python 3.
    left_border = [int(v) for v in row['left_border']]
    right_border = [int(v) for v in row['right_border']]
    top_border = [int(v) for v in row['top_border']]
    bottom_border = [int(v) for v in row['bottom_border']]

    # Corners of the detected frame, then a rectified view of it.
    tl = line_intersection(left_border, top_border)
    tr = line_intersection(right_border, top_border)
    bl = line_intersection(left_border, bottom_border)
    br = line_intersection(right_border, bottom_border)
    warp = four_point_transform(img, np.array([tl, tr, br, bl]))
    wh, ww = warp.shape[0], warp.shape[1]

    # Two side-by-side cells, given as fractions of the rectified frame.
    # NOTE(review): presumably the two income value cells of the form -- confirm.
    TL1 = (0.615866388308977 * ww, 0.13972602739726028 * wh)
    TR1 = (0.7891440501043842 * ww, 0.13972602739726028 * wh)
    BL1 = (0.615866388308977 * ww, 0.16712328767123288 * wh)
    BR1 = (0.7891440501043842 * ww, 0.16712328767123288 * wh)

    TL2 = (0.7933194154488518 * ww, 0.13972602739726028 * wh)
    TR2 = (0.9665970772442589 * ww, 0.13972602739726028 * wh)
    BL2 = (0.7933194154488518 * ww, 0.16712328767123288 * wh)
    BR2 = (0.9665970772442589 * ww, 0.16712328767123288 * wh)

    record = {'img_path': img_path}
    # A list of pairs instead of dict.iteritems(): runs on Python 2 and 3 and
    # gives a fixed processing order.
    for k, v in [('first', np.array([TL1, TR1, BR1, BL1])),
                 ('second', np.array([TL2, TR2, BR2, BL2]))]:
        im = four_point_transform(warp, v)
        imh, imw = im.shape[0], im.shape[1]
        new_path = img_path.replace('/img_data/', '/extract_img_data/') + "." + k + ".png"
        cv2.imwrite(new_path, im)
        record[k] = new_path
        record[k + '_width'] = imw
        record[k + '_height'] = imh

        # Fixed-size (95x22) binarized copy of the cell, saved as a .npy file.
        imbin = cv2.adaptiveThreshold(cv2.resize(cv2.cvtColor(im, cv2.COLOR_BGR2GRAY), (95, 22)),
                                      255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
        bin_path = img_path.replace('/img_data/', '/extract_img_data/') + "." + k + ".bin"

        np.save(bin_path, imbin)
        # np.save appends ".npy" to the file name it is given.
        record[k + '_bin'] = bin_path + ".npy"

    pgt.import_json_into_pg("extract_imgs", [record], lambda record: record['img_path'], create_table=False, skip_duplicates=True)

In [15]:
# Join every extracted cell image (extract_imgs) with its border record
# (detected_borders, on img_path) and its declaration (decls, on path), pulling
# the declaration's income->'5' `value` and `family` fields alongside the paths
# of the two binarized cell arrays.
extracts = pgt.pg_query("""
    SELECT
        ei.data->>'second_bin' as second_bin,
        ei.data->>'first_bin' as first_bin,
        ei.data->'first_width' as first_width,
        ei.data->'first_height' as first_height,
        ei.data->'second_width' as second_width,
        ei.data->'second_height' as second_height,
        ei.data->'img_path' as img_path,
        d.data->'income'->'5'->>'value' as income,
        d.data->'income'->'5'->>'family' as family_income,
        d.data->'path' as path
    FROM extract_imgs ei
    INNER JOIN detected_borders db on (db.data->>'img_path'=ei.data->>'img_path')
    INNER JOIN decls d on (db.data->>'path'=d.data->>'path')
""")
# len(extracts)
# Write the two income values and the two .npy paths, one JSON object per row,
# to model_input.json in the current working directory.
extracts[['income','family_income','first_bin','second_bin']].to_json('model_input.json',orient='records')
# extracts['second_width'].value_counts()
# extracts['first_height'].hist(bins=20)

In [None]:
def show_extracted_borders(img_path):
    """Debug helper: draw the borders detected on ``img_path`` and show the image.

    Borders are drawn as 2px lines: left in red, top in green, right in blue,
    bottom in magenta. Borders that were not detected are skipped. Also prints
    the top border tuple and the image shape.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        raise IOError('cannot read image: %s' % img_path)

    borders = extract_borders(img)
    # Colors are BGR, the channel order OpenCV uses for loaded images.
    colors = ((0, 0, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255))
    for border, color in zip(borders, colors):
        if border is None:
            continue
        _, x1, y1, x2, y2 = border
        # cv2.line needs integer pixel coordinates; the intersections can be
        # floats under true division.
        cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)

    top_border = borders[1]
    # Single-argument print calls produce the same output on Python 2 and 3.
    print(top_border)
    print(img.shape)
    plt.figure()
    # matplotlib expects RGB; convert so the colors are not swapped on screen.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

In [None]:
# One-off setup: importing an empty list with create_table=True.
# NOTE(review): presumably this only creates the 'detected_borders' table, which
# the cells above write to with create_table=False -- so it has to be run before
# them on a fresh database; confirm against pg_toolkit.
pgt.import_json_into_pg("detected_borders", [], lambda record: record['img_path'], create_table=True, skip_duplicates=True)

In [None]:
# Ad-hoc inspection: the income -> '5' entry of the `decls` record 'vulyk_7_82'.
pgt.pg_query_by_id('decls','vulyk_7_82')['income']['5']

In [None]:
# Scratch arithmetic: 1% of 841.
# NOTE(review): presumably a page dimension in pixels, checked against the
# margin factors used in extract_borders -- confirm or delete.
841*0.01

In [None]:
# Opens one source PDF through the shell `open` command for manual inspection.
# NOTE(review): machine-specific absolute path; `open` is presumably the macOS launcher.
!open /Users/tilarids/dev/decl/data/fc2dfe309d880193fd04672ff6c98ed664e78268e9dd26ba1608cc33.pdf