In [1]:
import glob
import json
import numpy.random as random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce

## question

In [2]:
question = glob.glob('../../../../ICDAR-2019-SROIE/data/box/*')

In [3]:
txt_list = []
for text in question:
    with open(text) as f:
        txt = f.read()
    txt_list.append(txt)

In [4]:
def split_only_8(string):
    comma_split_string = [elem for elem in string.split(',')] 
    element_8_list = []
    leftover = []
    if comma_split_string != ['']:
        for idx, txt in enumerate(comma_split_string):
            if idx < 8:
                element_8_list.append(int(txt))
            else:
                leftover.append(txt)
        element_8_list.append(','.join(leftover))
    return element_8_list

In [5]:
corpus = [[split_only_8(row) for row in text.split('\n')] for text in txt_list]

In [8]:
df_list = [pd.DataFrame(c, columns=['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4','y4','transcript']).dropna() for c in corpus]

In [9]:
df_long = reduce(lambda a,b: pd.concat([a, b]), df_list)

In [10]:
text_list = list(df_long.transcript)

## answer

In [11]:
answer = glob.glob('../../../../ICDAR-2019-SROIE/data/key/*')

In [12]:
json_list = []
for ans in answer:
    with open(ans) as f:
        json_list.append(json.load(f))

In [13]:
json_list[1]

{'company': 'INDAH GIFT & HOME DECO',
 'date': '19/10/2018',
 'address': '27, JALAN DEDAP 13, TAMAN JOHOR JAYA, 81100 JOHOR BAHRU, JOHOR.',
 'total': '60.30'}

In [14]:
answer_df = pd.DataFrame(json_list)

In [17]:
from collections import Counter
from itertools import zip_longest

In [130]:
def random_combination(zip_list, len_words, join_string):
    random_elements = [random.choice(zip_seg_company[_+1], num_each_combination) for _ in range(len_words)]
    return [join_string.join(rand) for rand in list(zip(*random_elements))] 

In [259]:
def random_combinations_from_list(nested_list, longest_list, num_each_combination, join_string):
    set_list = set(nested_list)
    seg_list = [str(lst).split() for lst in set_list]
    zip_seg_list = list(zip_longest(*seg_list))
    zip_seg_list = list(map(lambda e: list(filter(lambda x: x, e)), zip_seg_list))
    random_comb_list = []
    for _ in range(2, min(longest_list, len(zip_seg_list))):
        random_comb_list.extend([random_combination(zip_seg_list, _, join_string) for __ in range(num_each_combination)])
    return random_comb_list

### company data augmentation

In [264]:
set_company = list(answer_df.company)

In [265]:
set_company.extend(random_combinations_from_list(set_company, 7, 400, ' '))

### address data augmentation

In [293]:
set_address = list(set(answer_df.address))

In [294]:
seg_address = [addr.strip()+','
                    if len(str(address).split(',')) != idx
                    else addr.strip()+random.choice([',','.'])
                    for address in set_address 
                    for idx, addr in enumerate(str(address).split(','))]

In [295]:
set_address.extend(random_combinations_from_list(seg_address, 5, 750, ','))

In [296]:
set_address.extend(seg_address)

### date data augmentation

In [146]:
set_date = list(answer_df.date)

In [147]:
aug_each_date = 200

In [31]:
from numpy.random import randint as rint

In [32]:
months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
def rmonths():
    return random.choice(months)

In [33]:
set_date.extend(['{:02d}/{:02d}/{}'.format(rint(1, 32), rint(1,13), rint(1990, 2020)) for _ in range(aug_each_date)])
set_date.extend(['{:02d}-{:02d}-{}'.format(rint(1, 32), rint(1,13), rint(1990, 2020)) for _ in range(aug_each_date)])
set_date.extend(['{:02d}-{:02d}-{:02d}'.format(rint(1, 32), rint(1,13), rint(0, 99)) for _ in range(aug_each_date)])
set_date.extend(['{:02d} {} {}'.format(rint(1, 32), rmonths(), rint(1990, 2020)) for _ in range(aug_each_date)])

In [34]:
set_total = list(answer_df.total)

### none data augmentation
영수증에 나왔던 단어 중 company, date, address 가 아닌 단어들을 none으로 둠

In [297]:
not_none = ' '.join(list(map(str, answer_df.company + answer_df.address + answer_df.date)))

In [298]:
none = list(filter(lambda e: e not in not_none, text_list))

In [299]:
set_none = random.choice(none, 10000)

In [300]:
len(set_company), len(set_date), len(set_address), len(set_none)

(2626, 626, 3723, 10000)

In [301]:
answer = {0:'company', 1:'address', 2:'None'}
answer_inv = {value:key for key, value in answer.items()}

In [302]:
company_label_list = [(_, answer_inv['company']) for _ in set_company]
# date_label_list = [(_, answer_inv['date']) for _ in set_date]
address_label_list = [(_, answer_inv['address']) for _ in set_address]
none_label_list = [(_, answer_inv['None']) for _ in set_none]

In [303]:
corpus_label = company_label_list + address_label_list + none_label_list

In [304]:
len(corpus_label)

16349

## put in DB

In [43]:
from glob import glob
import sys
sys.path.append('../../../modules/database/')
from db_util import *
from db import *

In [198]:
path = r'C:\Users\hscho\Desktop\long8v/DB/'
name = 'word_classificaton_data'
description = 'data for word classification, no splitted'
db_raw = create_data_db(path=path, name=name, description=description, data_class='RAW')

In [45]:
cache = {}
for index, value in tqdm(enumerate(corpus_label)):
    cache = update_cache(cache, index=index, label=str(value[1]), image=None, text=str(value[0]), ref=None)

14624it [00:00, 337741.12it/s]


In [46]:
write_cache_to_db(db_raw, db_raw.open_db(b'db_data'), cache)

In [47]:
print_env(db_raw)

+-------------+----------------------------------------------------------+
| Key         | Value                                                    |
+-------------+----------------------------------------------------------+
| class       | RAW                                                      |
| created     | 2020-09-09 19:52:03                                      |
| db_data     | (database)                                               |
| description | data for word classification, no splitted                |
| name        | word_classificaton_data                                  |
| note        |                                                          |
| reference   | C:\Users\hscho\Desktop\long8v/DB/word_classificaton_data |
| sep         | 	                                                        |
| updated     | 2020-09-09 19:52:03                                      |
+-------------+----------------------------------------------------------+


In [48]:
register_db(db=db_raw)

<Environment at 0x1da32cf06f0>

## tokenizer

In [161]:
from tensorflow.keras import preprocessing

In [162]:
all_text = list(zip(*corpus_label))[0]
all_text_chr = list(map(lambda e: list(str(e)), all_text))
tokenizer = preprocessing.text.Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(all_text_chr)

In [163]:
max_len = max([len(_) for _ in all_text_chr])

In [164]:
tokenizer_path = r'C:\Users\hscho\Desktop\long8v/DB/'
with open('{}/tokenizer.json'.format(tokenizer_path), 'w') as f:
    json.dump(tokenizer.to_json(), f)

## data preprocessing

In [222]:
from sklearn.model_selection import train_test_split

In [223]:
train, test = train_test_split(corpus_label, shuffle=True)

In [224]:
zip_train = list(zip(*train))
zip_test = list(zip(*test))

In [225]:
train_x = zip_train[0]
train_y = zip_train[1]
test_x = zip_test[0]
test_y = zip_test[1]

In [226]:
n_cat = len(set(train_y))

In [227]:
def preprocess(lst, tokenizer):
    lst_chr = list(map(lambda e : list(str(e)), lst))
    lst_sequences = tokenizer.texts_to_sequences(lst_chr)
    padded_lst_sequences = preprocessing.sequence.pad_sequences(lst_sequences, maxlen=135, padding='post')
    return padded_lst_sequences

In [228]:
padded_train_x = preprocess(train_x, tokenizer)
padded_test_x = preprocess(test_x, tokenizer)

In [229]:
padded_train_x

array([[44, 12,  6, ...,  0,  0,  0],
       [ 7, 11,  7, ...,  0,  0,  0],
       [ 6,  7,  3, ...,  0,  0,  0],
       ...,
       [16, 14, 23, ...,  0,  0,  0],
       [17,  3,  8, ...,  0,  0,  0],
       [10,  7,  5, ...,  0,  0,  0]])

In [230]:
from keras.utils import to_categorical

In [231]:
train_y_ctg = to_categorical(train_y)
test_y_ctg = to_categorical(test_y)

In [305]:
# -*- coding: utf-8 -*-

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Attention
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LeakyReLU

import tensorflow as tf
import tensorflow.keras as keras


In [306]:
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1, 150, trainable=True))

# conv
model.add(Conv1D(128, 3, activation=None))
model.add(LeakyReLU())
model.add(BatchNormalization())
model.add(MaxPool1D(2))
model.add(Conv1D(256, 5, activation=None))
model.add(LeakyReLU())
model.add(BatchNormalization())
model.add(MaxPool1D(2))

# lstm
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))

model.add(Dense(64, activation=None))
model.add(LeakyReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(n_cat, activation='softmax'))



# compile model
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [307]:
padded_train_x.shape, train_y_ctg.shape

((12261, 135), (12261, 3))

In [308]:
for _ in range(10):
    model.fit(x=padded_train_x, y=train_y_ctg, batch_size=20, validation_split=0.1, validation_freq=2)



In [309]:
model.evaluate(x=padded_test_x, y=test_y_ctg)



[0.049779947847127914, 0.9863013625144958]

In [310]:
answer[np.argmax(model.predict(preprocess(['20-19-2020'], tokenizer)))]

'None'

## inference

In [311]:
def get_answer(transcript):
    return answer[np.argmax(model.predict(preprocess([transcript], tokenizer)))]

In [312]:
df = df_list[0]
for idx, row in df.iterrows():
    ans = get_answer(row.transcript)
    if ans != 'None':
        print(ans, row.transcript)

company BOOK TA .K(TAMAN DAYA) SDN BND
address NO.53 55,57 & 59, JALAN SAGU 18,
address TAMAN DAYA,
address 81100 JOHOR BAHRU,
company KF MODELLING CLAY KIDDY FISH


In [313]:
answer_df.loc[0]['address']

'NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR.'

In [314]:
from collections import defaultdict

In [333]:
predicted_list = []
for df in tqdm(df_list):
    dedict = defaultdict(str)
    for idx, row in df.iterrows():
        ans = get_answer(row.transcript)
        if ans != 'None':
            dedict[ans] += row.transcript
    predicted_list.append(dedict)

100%|█████████████████████████████████████████████████| 626/626 [21:17<00:00,  2.04s/it]


In [334]:
get_answer('OLD TOWN KOPTIAM SDN BHD')

'company'

In [335]:
get_answer('OLDTOWN WHITE COFFEE')

'None'

In [336]:
get_answer('F&P PHARMACY')

'company'

In [337]:
get_answer('LIGHTROOM GALLERY SDN BHD')

'company'

In [338]:
get_answer('SWC ENTERPRISE SDN BHD')

'None'

In [339]:
list(zip(company, answer_df['company']))

[('BOOK TA .K(TAMAN DAYA) SDN BNDKF MODELLING CLAY KIDDY FISH',
  'BOOK TA .K (TAMAN DAYA) SDN BHD'),
 ('INDAH GIFT & HOME DECOROUNDING ADJ............', 'INDAH GIFT & HOME DECO'),
 ('MR D.T.Y. (JOHOR) SDN BHD', 'MR D.I.Y. (JOHOR) SDN BHD'),
 ('YONGFATT ENTERPRISE', 'YONGFATT ENTERPRISE'),
 ('MR D.I.Y. (M) SDN BHD', 'MR D.I.Y. (M) SDN BHD'),
 ('TAN CHAY YEEABC HO TRADINGTAMAN DESA HARMONI.', 'ABC HO TRADING'),
 ('TAN CHAY YEESOON HUAT MACHINERY ENTERPRISEWORKMANSHIP & SERVICEGIANT 606 OVERFLOW ASSY',
  'SOON HUAT MACHINERY ENTERPRISE'),
 ('TAN CHAY YEES.H.H. MOTOR (SUNGAI RENGIT) SDN. BHD.',
  'S.H.H. MOTOR (SUNGAI RENGIT) SDN. BHD.')]

In [340]:
address = [_['address'] for _ in predicted_list]
company = [_['company'] for _ in predicted_list]

In [341]:
list(zip(address, answer_df['address']))

[('NO.53 55,57 & 59, JALAN SAGU 18,TAMAN DAYA,81100 JOHOR BAHRU,',
  'NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR.'),
 ('27,JALAN DEDAP 13,TAMAN JOHOR JAYA,',
  '27, JALAN DEDAP 13, TAMAN JOHOR JAYA, 81100 JOHOR BAHRU, JOHOR.'),
 ('LOT 1851-A & 1851-B, JALAN KPB 6,KAWASAN PERINDUSTRIAN BALAKONG,XXXX',
  'LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR (MR DIY TESCO TERBAU)'),
 ('', 'NO 122.124. JALAN DEDAP 13 81100 JOHOR BAHRU'),
 ('LOT 1851-A & 1851-B, JALAN KPB 6,KAWASAN PERINDUSTRIAN BALAKONG,XXXXX',
  'LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR (TESCO PUTRA NILAI)'),
 ('NO.2&4,JALAN HARMONI 3/2,',
  'NO.2&4, JALAN HARMONI 3/2, TAMAN DESA HARMONI. 81100 JOHOR BAHRU JOHOR'),
 ('NO.53 JALAN PUTRA 1,TAMAN SRIPUTRA,',
  'NO.53 JALAN PUTRA 1, TAMAN SRI PUTRA, 81200 JOHOR BAHRU JOHOR'),
 ('NO. 343, JALAN KURAU, SUNGAI RENGIT,81620 PENGERANG, JOHOR.',
  'NO. 343,

In [342]:
sum(pred==ans for pred, ans in zip(address, answer_df.address))

7

In [343]:
sum(pred==ans for pred, ans in zip(company, answer_df.company))

226

In [344]:
len(answer_df.company)

626

In [345]:
226/ 626

0.3610223642172524