[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tulasiram58827/Information-Extraction-From-Documents/blob/main/SROIE_Candidate_Generators.ipynb)

This notebook pre-process the data and create a dataframe which is then used to create the tensorflow dataset object

## Download Data

Homepage: https://rrc.cvc.uab.es/?ch=13&com=downloads. 

In [1]:
!gdown --id 10r9y17wg8Elo-3Zi61xA_8QDaKix8giN -O data.tar.xz

Downloading...
From: https://drive.google.com/uc?id=10r9y17wg8Elo-3Zi61xA_8QDaKix8giN
To: /content/data.tar.xz
247MB [00:03, 80.4MB/s]


In [2]:
!tar -xf data.tar.xz

## Imports

In [3]:
!pip install -q dateparser

import dateparser
import cv2
import glob
import json
import imutils
import numpy as np

# https://stackoverflow.com/a/51855662
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as plt

from tqdm import tqdm
from imutils import paths
from google.colab.patches import cv2_imshow

## Data checking

In [4]:
images = sorted(list(paths.list_images("/content/data/img")))
len(images)

626

In [5]:
csvs = sorted(glob.glob("/content/data/box/*.csv"))
len(csvs)

626

In [6]:
jsons = sorted(glob.glob("/content/data/key/*.json"))
len(jsons)

626

In [133]:
sample_csv = pd.read_csv(csvs[0], names=["x1_1", "y1_1", "x2_1", "y2_1", 
                                         "x3_1", "y3_1", "x4_1", "y4_1", "transcript"])
sample_csv.head()

Unnamed: 0,x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1,transcript
0,72,25,326,25,326,64,72,64,TAN WOON YANN
1,50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
2,205,121,285,121,285,139,205,139,789417-W
3,110,144,383,144,383,163,110,163,NO.53 55
4,192,169,299,169,299,187,192,187,TAMAN DAYA


## Helper Functions

####  Apply Date Candidate Generator

In [135]:
def parse_dates(row):
    return dateparser.parse(row, settings={'STRICT_PARSING': True, 
                                           'REQUIRE_PARTS': ['day', 'month', 'year']})

sample_csv["date_candidate"] = sample_csv["transcript"].apply(parse_dates)
sample_csv["date_candidate"] = pd.to_datetime(sample_csv['date_candidate'], errors = 'coerce').dt.date
sample_csv.head(5)

Unnamed: 0,x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1,transcript,date_candidate
0,72,25,326,25,326,64,72,64,TAN WOON YANN,NaT
1,50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND,NaT
2,205,121,285,121,285,139,205,139,789417-W,NaT
3,110,144,383,144,383,163,110,163,NO.53 55,NaT
4,192,169,299,169,299,187,192,187,TAMAN DAYA,NaT


#### Apply Total Amount Candidate Generator

In [136]:
sample_csv["total_candidate"] = pd.to_numeric(sample_csv["transcript"], errors="coerce")
sample_csv.sample(5)

Unnamed: 0,x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1,transcript,date_candidate,total_candidate
2,205,121,285,121,285,139,205,139,789417-W,NaT,
9,165,372,342,372,342,389,165,389,25/12/2018 8:13:39 PM,2018-12-25,
16,276,506,306,506,306,522,276,522,DISC,NaT,
25,202,597,245,597,245,612,202,612,9.000,NaT,9.0
1,50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND,NaT,


In [137]:
temp = np.array([False]*sample_csv.shape[0])
for idx, row in sample_csv.iterrows():
  text = str(row['transcript']).replace(':', '').lower()
  if text in ['total', 'amount', 'balance']:
    temp[idx-2:idx+10] = True
sample_csv['total_check'] = temp

In [138]:
sample_csv_filtered = sample_csv[(~sample_csv['date_candidate'].isnull()) | (~sample_csv['total_candidate'].isnull() & sample_csv['total_check'])]
sample_csv_filtered

Unnamed: 0,x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1,transcript,date_candidate,total_candidate,total_check
9,165,372,342,372,342,389,165,389,25/12/2018 8:13:39 PM,2018-12-25,,False
21,27,570,137,570,137,583,27,583,9556939040116,NaT,9556939040116.0,True
25,202,597,245,597,245,612,202,612,9.000,NaT,9.0,True
26,275,598,309,598,309,612,275,612,0.00,NaT,0.0,True
27,411,596,443,596,443,613,411,613,9.00,NaT,9.0,True
30,408,669,443,669,443,684,408,684,0.00,NaT,0.0,True
32,401,703,443,703,443,719,401,719,9.00,NaT,9.0,True
34,402,748,441,748,441,763,402,763,10.00,NaT,10.0,True
36,412,772,443,772,443,786,412,786,1.00,NaT,1.0,True


## Generate Neighbors

In [139]:
# Reference : https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea/boxBArea
    # return the intersection over union value
    return iou

In [140]:
def generate_neighbors(orig_csv_df, filt_csv_df, image_shape):
    for idx, row in filt_csv_df.iterrows():
        # co_ord = [0, row['y1_1']-(10*image_shape[0]/100), row['x3_1'], row['y3_1']]
        co_ord = [0, row['ymin']-(10*image_shape[0]/100), row['xmax'], row['ymax']]
        # cv2.rectangle(image, (co_ord[0], int(co_ord[1])), (co_ord[2], co_ord[3]),(0, 0, 255), 2)
        neighbors = list()
        neigh_pos = list()
        for new_idx, new_row in orig_csv_df.iterrows():
            new_co_ord = [new_row['xmin'], new_row['ymin'], new_row['xmax'], new_row['ymax']]
            # cv2.rectangle(image, (new_co_ord[0], int(new_co_ord[1])), (new_co_ord[2], new_co_ord[3]), (0, 0, 255), 2)
            area = bb_intersection_over_union(co_ord, new_co_ord)
            if area >= 0.5:
                # print(row['transcript'], new_row['transcript'])
                neighbors.append(new_row['Object'])
                xx = abs((new_row['xmax']+new_row['xmin']) - (row['xmax']+row['xmin']))/2
                yy = abs((new_row['ymax']+new_row['ymin']) - (row['ymax']+row['ymin']))/2
                neigh_pos.append(f"({'%.3f' % (xx/image_shape[1])},{'%.3f' % (yy/image_shape[0])})")
        neighbors = "  ".join(str(i) for i in neighbors)
        neigh_pos = "  ".join(str(x) for x in neigh_pos)
        filt_csv_df.loc[idx, 'neighbors'] = neighbors
        filt_csv_df.loc[idx, 'neigh_pos'] = neigh_pos
        filt_csv_df.loc[idx, 'xmin'] = '%.3f' % (new_row['xmin']/image_shape[1])
        filt_csv_df.loc[idx, 'xmax'] = '%.3f' % (new_row['xmax']/image_shape[1])
        filt_csv_df.loc[idx, 'ymin'] = '%.3f' % (new_row['ymin']/image_shape[0])
        filt_csv_df.loc[idx, 'ymax'] = '%.3f' % (new_row['ymax']/image_shape[0])
    return filt_csv_df

In [141]:
image_shape = cv2.imread(images[0]).shape
image_shape
image= cv2.imread(images[0])

In [None]:
from google.colab.patches import cv_imshow
import numpy as np
new_df = generate_neighbors(sample_csv, sample_csv_filtered, image_shape)

In [143]:
new_df.head()

Unnamed: 0,x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1,transcript,date_candidate,total_candidate,total_check,neighbors
9,165,372,342,372,342,389,165,389,25/12/2018 8:13:39 PM,2018-12-25,,False,7 8 9
21,27,570,137,570,137,583,27,583,9556939040116,NaT,9556939040116.0,True,14 18 21
25,202,597,245,597,245,612,202,612,9.000,NaT,9.0,True,14 15 18 19 21 23 24 25
26,275,598,309,598,309,612,275,612,0.00,NaT,0.0,True,14 15 16 18 19 21 22 23 24 25 26
27,411,596,443,596,443,613,411,613,9.00,NaT,9.0,True,14 15 16 17 18 19 20 21 22 23 24 25 26 27
