In [1]:
import re
import regex
import numpy as np
import pandas as pd

In [2]:
class RowColRansac(object):
    """Estimate a per-row offset between template rows and target rows.

    Matches are added as ``((row, col), (target_row, target_col), error)``.
    ``findMapping`` finds the densest cluster of row offsets
    (``target_row - row``) and returns, for every row in an extended range
    around the matched rows, the offset of the nearest consistent match.
    Only the row components are used; the column and error values are stored
    but not consulted.
    """

    # Half-width of the tolerance window; offsets within 2*MAX_ERROR of each
    # other count as belonging to the same cluster. Defined on the class so it
    # exists even before the first add() call.
    MAX_ERROR = 1

    def __init__(self):
        self.matches = []

    def add(self, params, targetparams, error):
        """Record a match; entries without ``targetparams`` are ignored."""
        if targetparams:
            self.matches.append((params, targetparams, error))

    def findMapping(self):
        """Return ``{row: row_offset}``, or ``{}`` when nothing was matched."""
        diffs = [
            targetparams[0] - params[0]
            for params, targetparams, _error in self.matches
            if targetparams
        ]
        if not diffs:
            return {}
        mean_diff = self.findMeanDiff(diffs)
        return self.refineRet(mean_diff)

    def refineRet(self, mean_diff):
        """Build the row -> offset mapping from matches close to ``mean_diff``.

        Matches whose offset deviates from ``mean_diff`` by 2*MAX_ERROR or more
        are treated as outliers. Each row in
        ``[min_row - 20, max_row + 40]`` receives the offset of the nearest
        inlier row; ties in distance are broken by the smaller deviation.
        Returns ``{}`` if there are no matches or no inliers.
        """
        if mean_diff is None or not self.matches:
            return {}

        all_rows = []
        inlier_rows = []
        inlier_offsets = []
        inlier_penalties = []
        for params, targetparams, _error in self.matches:
            r = params[0]
            # Outliers still contribute to the output range below.
            all_rows.append(r)
            if not targetparams:
                continue
            offset = targetparams[0] - r
            deviation = abs(offset - mean_diff)
            if deviation < 2 * self.MAX_ERROR:
                inlier_rows.append(r)
                inlier_offsets.append(offset)
                inlier_penalties.append(int(deviation * 10))

        if not inlier_rows:
            return {}

        rows = np.array(inlier_rows)
        penalties = np.array(inlier_penalties)
        ret = {}
        for i in range(min(all_rows) - 20, max(all_rows) + 41):
            # Row distance dominates (x1000); deviation only breaks ties.
            cost = np.abs(rows - i) * 1000 + penalties
            # argmin picks the first minimum, so ties resolve deterministically;
            # offsets come from a plain list so values stay native Python numbers.
            ret[i] = inlier_offsets[int(np.argmin(cost))]
        return ret

    def findMeanDiff(self, a):
        """Return the mean of the largest group of values within 2*MAX_ERROR.

        For every value (in ascending order) the values in
        ``[value, value + 2*MAX_ERROR]`` form a candidate group; the first
        largest group wins. Returns ``None`` for empty input. The input list
        is not modified.
        """
        values = sorted(a)
        best = []
        for i, lowerbound in enumerate(values):
            j = i
            while j < len(values) and values[j] <= lowerbound + 2 * self.MAX_ERROR:
                j += 1
            if j - i > len(best):
                best = values[i:j]
        if not best:
            return None
        return 1.0 * sum(best) / len(best)
    
# Smoke run of RowColRansac on a handful of hand-written matches:
# (template (row, col), target (row, col) or None, error).
demo_matches = [
    ((11, 12), (11, 12), 2),
    ((11, 12), (11, 10), 0),
    ((12, 0), None, 0),
    ((12, 8), (14, 9), 1),
    ((13, 0), None, 0),
    ((14, 0), (16, 0), 0),
]

ransac = RowColRansac()
for demo_match in demo_matches:
    ransac.add(*demo_match)

print(ransac.findMapping())

{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 2, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 2, 20: 2, 21: 2, 22: 2, 23: 2, 24: 2, 25: 2, 26: 2, 27: 2, 28: 2, 29: 2, 30: 2, 31: 2, 32: 2, 33: 2, 34: 2, 35: 2, 36: 2, 37: 2, 38: 2, 39: 2, 40: 2, 41: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 50: 2, 51: 2, 52: 2, 53: 2, 54: 2, -2: 0, -9: 0, -8: 0, -7: 0, -6: 0, -5: 0, -4: 0, -3: 0, -1: 0}


In [3]:
# Template description: each row lists fields as <TYPE pattern>; text outside
# the angle brackets is literal.
# Backslashes in front of `d` are doubled so the string still contains a
# literal "\d" without relying on an invalid escape sequence.
# NOTE(review): the "\n" on the "Rcpt#" row is a real newline escape and
# therefore splits that row in two at runtime — confirm this is intended.
lines = '''
<STORE "FOUR LEAVES">
@ <MALL "Junction 8::8">
GST Reg No: <GST "M2-0040287-1">
<DATE %d%m%y> <OTHER \\d{5} \\d{4}> <TIME %H:%M:%S> <OTHER \\d{4}>
^sameline <ID \\d{6} \\d{5} \\d{4}> <TIME %H:%M:%S> <OTHER \\d{4}>
Rcpt#:<ID ID> \n <DATE1 %d/%m/%Y><TIME1 %H:%M>
'''

# Sample target text to match the template against.
target_lines = '''
################
FOUR LEAFES 123
B"u Junction 8
f\\dsGST No: M2-0040287-1
chau len ba
chau QRTY 123
TOTAL 24.23$
CHANGE $45.00
111119 12345 1234 22:22:22 1234
Rcpt#:A123 11/11/2011 11:11
@BUGIS JUNCTION
111111 12345 22:23
'''
# One stripped entry per non-empty line.
target_lines = target_lines.split('\n')
target_lines = [line.strip() for line in target_lines if len(line) > 0]

# Keyword types whose quoted value is matched as literal text.
rawtypes = ['RAW', 'STORE', 'MALL', 'GST', 'BIZNO', 'LOT', 'TEL', 'ZIPCODE']
# Keyword types whose value is already a regular expression (DATE/TIME values
# are strftime-like and converted separately).
regextypes = ['DATE', 'TIME', 'ID', 'OTHER', 'GSTTAX', 'QTY', 'TOTAL', 'SUBTOTAL', 'CHANGE', 'SVC']
# Named shortcuts that expand to a predefined regular expression.
regex_shortkey = {'PRICE':r'(([1-9]\d*|0)\.\d\d)',
                  'NUMBER':r'[1-9]\d*',
                  'ID':r'[ ]?\w*?[ :\.#]{0,4}.*?([A-Z0-9]{2,25}([-/][A-Z0-9]{1,8}([-/][0-9A-Z]{1,8})?)?)',
                  'NAME':''}

def timestr2reg(timestr):
    """Convert a strftime-like pattern into a regular expression string.

    Supported directives are %d, %m, %y, %Y, %H, %M and %S; each becomes a
    parenthesised alternation. Everything between directives is treated as
    literal text and escaped.

    The pattern is tokenised first and only the literal pieces are escaped,
    so the result does not depend on whether ``re.escape`` escapes ``%``
    (it did before Python 3.7, which corrupted the directives).
    """
    directives = {
        '%d': '([0-2][0-9]|3[01])',
        '%m': '(0[1-9]|1[012])',
        # Year alternatives are restricted to the values listed here.
        '%y': '(18|19|20)',
        '%Y': '(20(18|19|20))',
        '%H': '([01][0-9]|2[0-3]|[0-9])',
        '%M': '([0-5][0-9])',
        '%S': '([0-5][0-9])',
    }
    # With a capturing group re.split keeps the directives in the result:
    # literal pieces sit at even indices, directives at odd indices.
    pieces = re.split(r'(%[dmyYHMS])', timestr)
    converted = []
    for index, piece in enumerate(pieces):
        if index % 2 == 1:
            converted.append(directives[piece])
        else:
            converted.append(re.escape(piece))
    return ''.join(converted)

def type2regex(kw_type, raw_string):
    """Translate a template field value into a regular expression string.

    Resolution order:
      1. a double-quoted value of a raw keyword type -> escaped literal
         (quotes removed);
      2. a value that is a key of ``regex_shortkey`` -> the predefined regex;
      3. DATE / TIME -> conversion through ``timestr2reg``;
      4. any other regex keyword type -> the value unchanged;
      5. everything else -> the value escaped as literal text.
    """
    quoted_literal = (
        kw_type in rawtypes
        and len(raw_string) > 2
        and raw_string[0] == '"'
        and raw_string[-1] == '"'
    )
    if quoted_literal:
        return re.escape(raw_string[1:-1])
    if raw_string in regex_shortkey:
        return regex_shortkey[raw_string]
    if kw_type in ('DATE', 'TIME'):
        return timestr2reg(raw_string)
    if kw_type in regextypes:
        return raw_string
    return re.escape(raw_string)

class TWord(object):
    """A single template token: a keyword type plus the regex that finds it.

    Attributes:
        kw_type: base keyword type, i.e. the leading upper-case letters of the
            given type (``DATE1`` -> ``DATE``).
        kw_name: base type, a dash, and the numeric suffix (may be empty).
        regexp: regular expression used to locate the token.
        exact: text after ``::`` in the pattern that must appear verbatim in
            the matched text, or None when there is no such requirement.
        explen: expected length; defaults to ``len(regexp)``.
        row: template row of the token.
        maxAllowedError: largest fuzzy-match error budget tried by ``check``.
    """

    def __init__(self, kw_type, raw_string, row=None, explen=None):
        m = re.match(r'([A-Z]+)-?([0-9]*)', kw_type)
        if m is None:
            raise ValueError('invalid keyword type: %r' % (kw_type,))
        self.kw_type = m.group(1)
        self.kw_name = m.group(1) + '-' + m.group(2)
        # Resolve the regex from the base type so numbered variants such as
        # DATE1 / TIME1 are handled like DATE / TIME instead of falling
        # through to the "escape as literal" branch.
        self.regexp = type2regex(self.kw_type, raw_string)
        if '::' in self.regexp:
            # NOTE(review): the split runs on the already converted pattern,
            # so for quoted literals `exact` carries any backslashes added by
            # re.escape — confirm this is acceptable for the templates in use.
            a, b = self.regexp.split('::', 1)
            self.regexp = a
            self.exact = b
        else:
            self.exact = None
        if explen is None:
            self.explen = len(self.regexp)
        else:
            self.explen = explen
        self.row = row
        self.maxAllowedError = int(self.explen/5) #TODO: change function shape in future

    def check(self, lines):
        """Search every line for this token with an increasing error budget.

        For each line the pattern is tried with ``{e<=0}``, ``{e<=1}``, ...
        up to ``maxAllowedError``; the search for that line stops at the first
        budget that matches. The match is reported when no exact part is
        required, or when the required exact text occurs in the matched text.

        Returns:
            list of ``(row_index, None, error_budget, match)`` tuples.
        """
        ret = []
        for row, line in enumerate(lines):
            for mae in range(0, self.maxAllowedError + 1):
                searchString = '(' + self.regexp + '){e<=' + str(mae) + '}'
                m = regex.search(searchString, line)
                if m:
                    extracted = m.group(0)
                    # Previously the condition required `exact` to be set, so
                    # tokens without an exact part could never be reported.
                    if self.exact is None or self.exact in extracted:
                        ret.append((row, None, mae, m))
                    break

        return ret
    
# TODO: match RAW+DATETIME. Remember matched string.
# add all match options to RANSAC.=> done.

def tpassage_check(twords, lines):
    """Locate template words in ``lines`` and store the result on each word.

    Words of a raw type or of type DATE/TIME act as anchors: all their
    matches are fed into a RowColRansac to estimate the row offset between
    template and target. Each word then gets ``extracted`` set to
    ``(target_row, matched_text)`` when a match lies within one row of its
    expected row, otherwise ``extracted`` stays None.

    Args:
        twords: TWord objects; ``row`` must be set on each.
        lines: target text lines.

    Returns:
        The same ``twords`` list, with ``extracted`` updated in place.
    """
    ransac = RowColRansac()
    matched_results = {}
    anchor_types = rawtypes + ['DATE', 'TIME']
    for tword in twords:
        tword.extracted = None
        if tword.kw_type in anchor_types:
            hits = tword.check(lines)
            matched_results[tword] = [(x, m) for x, y, e, m in hits]
            for x, y, e, m in hits:
                ransac.add((tword.row, None), (x, y), e)

    # Without any anchor hit there is nothing to estimate a mapping from.
    mapping = ransac.findMapping() if ransac.matches else {}
    print(mapping)
    if len(mapping):
        for tword in twords:
            if tword.row not in mapping:
                # Row lies outside the range covered by the mapping.
                continue
            expected_row = tword.row + mapping[tword.row]
            # Non-anchor words have no entry; treat them as "no fuzzy hits".
            candidates = matched_results.get(tword, [])
            if len(candidates) > 0:
                diff, target_row, extracted = min(
                    (abs(target_row - expected_row), target_row, m.group(0))
                    for target_row, m in candidates
                )
                if diff <= 1:
                    tword.extracted = (target_row, extracted)
            else:
                for lineid in [expected_row, expected_row-1, expected_row+1]:
                    # Valid indices are 0 .. len(lines) - 1.
                    if lineid < 0 or lineid >= len(lines): continue
                    # TODO: narrow down search range within row (remove before/after words)
                    line = lines[lineid]
                    searchString = '(' + tword.regexp + ')'
                    m = re.search(searchString, line)
                    if m:
                        extracted = m.group(0)
                        # Record the line actually searched.
                        tword.extracted = (lineid, extracted)
                        break
#         confident_score = matched/all
#         return [(line_start, line_end), confident_score, {key_value_extracted}],...
    return twords




# Hand-built template words (keyword type, quoted literal, template row)
# for trying the matcher without parsing a template description.
twords = [
    TWord(kw_type=kw, raw_string=raw, row=template_row)
    for kw, raw, template_row in (
        ('STORE', '"FOUR LEAVES"', 0),
        ('RAW', '"@ "', 1),
        ('MALL', '"Junction 8::8"', 1),
        ('RAW', '"GST Reg No: "', 2),
        ('GST', '"M2-0040287-1"', 2),
    )
]
tword1, tword2, tword3, tword4, tword5 = twords

# Disabled experiments with DATE / TIME words:
# tword0 = TWord(kw_type='DATE', raw_string="%d%m%y", row=0, explen=6)
# tword1 = TWord(kw_type='DATE', raw_string="%d/%m/%Y", row=0, explen=9)
# tword2 = TWord(kw_type='TIME', raw_string="%H:%M:%S", row=1, explen=9)
# tword3 = TWord(kw_type='TIME', raw_string="%H:%M", row=1, explen=5)


# Disabled manual checks:
# tpassage_check(twords, target_lines)
# tword3.check(target_lines)
# tword1.regexp

In [5]:
class TPassage(object):
    """A block of template lines parsed into a flat list of TWord objects.

    Attributes:
        twords: all words of all lines, in order; each word's ``row`` is the
            index of the line it came from.
    """

    def __init__(self, lines):
        """Parse ``lines`` (an iterable of template line strings)."""
        self.twords = []
        for i, line in enumerate(lines):
            twords = self._buildTWords(line)
            for tword in twords:
                tword.row = i
            self.twords += twords

    # Declared static: the helper takes only the line, yet is invoked as
    # self._buildTWords(line); without the decorator that call raised
    # TypeError.
    @staticmethod
    def _buildTWords(line):
        """Split one template line into TWords.

        ``<TYPE value>`` tags become words of that type; text between tags
        that is longer than one character becomes a 'RAW' word.
        """
        rWord = r'(<([A-Z]+-?[0-9]*)[ ](.*?)>)'
        rs = re.findall(rWord, line)
        ret = []
        start = 0
        # NOTE(review): every TWord below is created with row='' and explen=0;
        # row is overwritten by __init__, but explen=0 gives a zero error
        # budget — confirm that is intended.
        for all_str, kw_type, word_str in rs:
            a = line.find(all_str, start)
            b = a + len(all_str)
            if a > start + 1:
                ret.append(TWord('RAW', line[start:a], '', 0))
    #             print('RAW === %s'%(line[start:a]))
            ret.append(TWord(kw_type, word_str, '', 0))
    #         print('%s === %s' % (kw_type, word_str))
            start = b
        if len(line) > start + 1:
            ret.append(TWord('RAW', line[start:len(line)], '', 0))
    #         print('RAW === %s'%(line[start:len(line)]))
        return ret

# Disabled draft of a method-level matcher:
#     def tpassage_check(self, lines):
#         ransac = RowColRansac()
#         for tword in self.twords:
#             ret = tword.check(lines)
#             for x,y,e,_ in ret:
#     #             if 
#                 ransac.add((tword.row, None), (x,y), e)
#         mapping = ransac.findMapping()
#         print(mapping)
#         if mapping:
#             for tword in self.twords:
#                 if tword.kw_type in []:
#                     expected_row = tword.row + mapping[tword.row]
#                     ret = tword.check(lines[expected_row-1:expected_row+2])
#                     if len(ret) > 0:
#                         #sort by e
#                         m = ret[0][3]
#         else:
#             print('Not found')
    
# Template description; backslashes in front of `d` are doubled so the string
# holds a literal "\d" without relying on an invalid escape sequence.
# NOTE(review): the "\n" on the "Rcpt#" row is a real newline escape and
# splits that row in two — confirm this is intended.
lines = '''
<STORE "FOUR LEAVES">
@ <MALL "Junction 8::8">
GST Reg No: <GST "M2-0040287-1">
<DATE %d%m%y> <OTHER \\d{5} \\d{4}> <TIME %H:%M:%S> <OTHER \\d{4}>
^sameline <ID \\d{6} \\d{5} \\d{4}> <TIME %H:%M:%S> <OTHER \\d{4}>
Rcpt#:<ID ID> \n <DATE1 %d/%m/%Y><TIME1 %H:%M>
'''
target_lines = '''
FOUR LEAVES 123
Bu Junction 8
111111 12345 1234 22:22:22 1234
Rcpt#:A123 11/11/11 11:11
'''
# TPassage iterates over its argument line by line, so hand it a list of
# non-empty, stripped lines; iterating the raw string would yield single
# characters.
t = TPassage([
    stripped
    for stripped in (line.strip() for line in lines.split('\n'))
    if stripped
])

In [None]:
def readInstruction(line):
    """Classify one (already stripped) line of a template database file.

    Returns a ``(kind, payload)`` tuple:
      * ``('null', None)``  - empty line;
      * ``('break', None)`` - the line is exactly ``...``;
      * ``('code', value)`` - the first word is ``^code`` or starts with
        ``^^``; ``value`` is the second word;
      * ``('line', line)``  - anything else, including lines whose first word
        is a bare ``^`` or another ``^xxx`` marker (e.g. ``^sameline``).

    Raises:
        ValueError: a code instruction has no value after it.
    """
    if len(line) == 0:
        return 'null', None
    if line == '...':
        return 'break', None
    if line[0] == '^':
        # Split on any whitespace so repeated spaces do not yield empty words.
        words = line.split()
        inst = words[0][1:]
        if inst == 'code' or inst.startswith('^'):
            if len(words) < 2:
                raise ValueError('code instruction without a value: %r' % (line,))
            return 'code', words[1]
    # Unknown '^' markers used to fall through and return None, which broke
    # tuple unpacking in the caller; treat them as ordinary template lines.
    return 'line', line
    
def readDatabase(db_file):
    """Parse a template database file into a list of Template objects.

    The file must start with a code instruction. Every following code
    instruction closes the current template and starts a new one; a break
    instruction closes the current passage; other non-empty lines are
    collected into the current passage.

    Args:
        db_file: path of the database text file.

    Returns:
        list of ``Template(code, tpassages)``; empty for an empty file.

    Raises:
        ValueError: the first line is not a code instruction.
    """
    with open(db_file) as db:
        allines = db.readlines()
    if not allines:
        return []

    inst, newcode = readInstruction(allines[0].strip())
    if inst != 'code':
        raise ValueError('database must start with a code instruction: %r'
                         % (allines[0],))

    lines = []
    tpassages = []
    templates = []
    # The first line has been consumed above; start with the second one.
    for rawline in allines[1:]:
        inst, line = readInstruction(rawline.strip())
        if inst == 'code':
            if lines:
                tpassages.append(TPassage(lines))
            templates.append(Template(newcode, tpassages))
            newcode = line
            tpassages = []
            lines = []
        elif inst == 'line':
            lines.append(line)
        elif inst == 'break':
            # Skip empty passages (e.g. consecutive breaks).
            if lines:
                tpassages.append(TPassage(lines))
            lines = []
    if lines:
        tpassages.append(TPassage(lines))
    templates.append(Template(newcode, tpassages))
    return templates

In [None]:
# NOTE(review): `samples_file` is first assigned in a later cell and
# `templates_path` is not assigned anywhere in the visible notebook, so this
# cell fails on a fresh kernel — confirm where these are meant to come from.
gt_data = pd.read_csv(samples_file)

# NOTE(review): this call runs before the `Template` class further down in
# this cell is defined — confirm the intended cell order.
templates = readDatabase(templates_path)
    
class Template(object):
    """A receipt template: an identifying code plus its parsed passages.

    Attributes:
        tpassages: list of TPassage objects belonging to the template.
        code: template code as read from the database file.
    """

    def __init__(self, code, tpassages):
        self.tpassages = tpassages
        self.code = code

    def _parseString(self, desc_string):
        """Placeholder; not implemented."""
        pass

    def tpassage_check(self, lines):
        """Match every passage of the template against ``lines``.

        Each passage's words are run through the module-level
        ``tpassage_check`` function, which stores its findings on the words
        themselves. TPassage has no ``check`` method, so the previous
        ``tp.check(lines)`` call could not work.

        Returns:
            dict: raw key-value result; still empty because combining and
            confirming the per-passage results is not implemented yet.
        """
        for tp in self.tpassages:
            # Inside a method the bare name resolves to the module-level
            # function, not to this method.
            tpassage_check(tp.twords, lines)
        # TODO: combine and confirm
        return {} # Raw key-value result

    @staticmethod
    def extract_fields(raw_dict, lines=None):
        """Build the extracted-data object for one receipt.

        ``lines`` is optional because the visible callers pass only
        ``raw_dict``. Neither argument is used yet.

        NOTE(review): ``ExtractedData`` is not defined in the visible part of
        the notebook — confirm where it comes from.
        """
        # build ExtractedData
        # DATE (1),2, ..., 10h rule.
        # TOTAL, GSTTAX to select ?, ...
        data = ExtractedData()
        # Callers use the return value, so hand the object back.
        return data

In [None]:
import pandas as pd
from os import path
import re


# Inputs for the evaluation loop below: a CSV read into `gt_data` and a
# directory holding one '<ImageName>.jpg.txt' file per CSV row.
# NOTE(review): these are absolute, machine-specific paths (POSIX here, a
# Windows path for the database below) — consider moving them to a config cell.
samples_file = '/home/loitg/workspace/ocrversion2/temp/samples300.csv'
texts_dir = '/home/loitg/workspace/ocrversion2/temp/texts_samples300/'

# Parse the template database into the list of templates.
templates = readDatabase("D:\\uatfull\\prod_jun\\process_texts\\database.txt")

# Load the CSV; the loop below reads its 'ImageName' column.
gt_data = pd.read_csv(samples_file)

def prepocessLine(line):
    """Collapse every run of whitespace to one space and trim both ends."""
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', line).strip()

# Run every template over the text file of each ground-truth row.
for i, row in gt_data.iterrows():
    fn = texts_dir + row['ImageName'] + '.jpg.txt'
    if not path.exists(fn):
        continue
    # Close the file deterministically instead of leaking the handle.
    with open(fn) as text_file:
        lines = [prepocessLine(line) for line in text_file]
    for template in templates:
        raw_dict = template.tpassage_check(lines)
        # extract_fields is declared as (raw_dict, lines).
        data = template.extract_fields(raw_dict, lines)

    # TODO: compare gt_data vs data (the unfinished `gt_data[]` expression was
    # a syntax error that prevented the whole cell from running).


In [None]:
class CLExtractor2(object):
    """Extract receipt data by trying the templates pre-selected by a filter.

    Attributes:
        templates: templates loaded through ``readDatabase``.
        tf: ``location_nn.TemplateFilterer`` instance; every template code is
            registered with it.
    """

    def __init__(self, templates_path):
        """Load templates from ``templates_path`` and register their codes.

        NOTE(review): ``location_nn`` is not imported in the visible part of
        the notebook — confirm where it comes from.
        """
        self.templates = readDatabase(templates_path)
        self.tf = location_nn.TemplateFilterer()
        for template in self.templates:
            # select Store KW, Mall KW
            # TODO: keyword selection is not implemented; empty lists are
            # registered for now.
            store_kws = []
            mall_kws = []
            self.tf.add(template.code, store_kws, mall_kws)

    def extract(self, orilines):
        """Run the filtered templates over ``orilines``.

        Returns:
            The extracted data when exactly one template passes the filter,
            otherwise None.
        """
        # filters
        filtered_codes = self.tf.search(orilines)
        datas = []
        for template in self.templates:
            if template.code in filtered_codes:
                # detect
                # Bound-method call: no explicit self, and the lines to match
                # are this method's `orilines` argument.
                raw_dict = template.tpassage_check(orilines)
                datas.append(template.extract_fields(raw_dict, orilines))

        if len(datas) == 1:
            return datas[0]
        return None

        #
    