In [65]:
def parse_bbox_string(rect):
    rect_split = rect.split(',')
    x = int(rect_split[0])
    y = int(rect_split[1])
    w = int(rect_split[2])
    h = int(rect_split[3])
    return {'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h}

def get_text(token):
    if 'position' not in token or token['position'] == 'B' or token['position'] == 'U':
        if 'type' in token:
            return token['type']
        elif 'original' in token:
            return token['original'].lower()
        elif 'text' in token:
            return token['text'].lower()
    else:
        return None

def get_entity_text(json):
    regions = json['regions']
    pageWidth = json['pages'][0]['pageWidth']
    pageHeight = json['pages'][0]['pageHeight']
    text_lines = []
    for region in regions:
        if region['type'] == 'PARAGRAPH':
            for line in region['lines']:
                line_bbox = parse_bbox_string(line['rect'])
                # record bbox as x1, y1, x2, y2 percentage coordinates
                line_data = {'bbox': [line_bbox['x1'] / pageWidth, line_bbox['y1'] / pageHeight, line_bbox['x2'] / pageWidth, line_bbox['y2'] / pageHeight]}
                
                line_data['text'] = []
                for token in line['tokens']:
                    token_text = get_text(token)
                    if token_text is not None:
                        line_data['text'].append(token_text)
                if len(line_data['text']) > 0:
                    text_lines.append(line_data)
    return text_lines

In [55]:
def get_token_count(data):
    count = 0
    for line in data:
        count += len(line['text'])
    return count

def text_only(data):
    text_list = []
    for line in data:
        text_list.append(line['text'])
    return text_list

def text_and_bbox(data):
    line_list = []
    for line in data:
        line_list.append([*line['bbox'],*line['text']])
    return line_list

def print_by_line(list):
    for element in list:
        print(element)

In [66]:
import json
from os import listdir
from os.path import isfile, join

IMAGE_STUFF_DIR = 'image-stuff/'
image_stuff_file_list = [f for f in listdir(IMAGE_STUFF_DIR) if isfile(join(IMAGE_STUFF_DIR, f))]

entity_texts = []
total_word_count = 0
for file in image_stuff_file_list:
    with open(IMAGE_STUFF_DIR + file) as image_stuff:
        image_stuff_json = json.load(image_stuff)
    entity_text_list = get_entity_text(image_stuff_json)
    entity_texts.append(entity_text_list)
    total_word_count += get_token_count(entity_text_list)

print('Number of words: ' + str(total_word_count))
print('Number of image-stuff: ' + str(len(entity_texts)))
print('Average number of words per image-stuff: ' + str(total_word_count / len(entity_texts)))
print('Number of samples / Number of words per sample: ' + str(50 / 313.42))

# I would need 500,000 samples to make it over 1500 and follow the other flows.
# According to google, since (Number of samples / Number of words per sample) < 1500
# Tokenize the text as n-grams and use a
# simple multi-layer perceptron (MLP) model to classify them (left branch in the
# flowchart below):
#   a. Split the samples into word n-grams; convert the n-grams into vectors.
#   b. Score the importance of the vectors and then select the top 20K using the scores.
#   c. Build an MLP model.

Number of words: 15779
Number of image-stuff: 50
Average number of words per image-stuff: 315.58
Number of samples / Number of words per sample: 0.15953034267117605


In [67]:
print(entity_texts[0])

[{'bbox': [0.1278705636743215, 0.11868214152002995, 0.8580375782881002, 0.15761886933732686], 'text': ['STRUCTURE', 'STRUCTURE']}, {'bbox': [0.1714509394572025, 0.15761886933732686, 0.32724425887265135, 0.19880194683639085], 'text': ['DATE']}, {'bbox': [0.37447807933194155, 0.17521527517783603, 0.6056889352818372, 0.198427555222763], 'text': ['ficha', 'de', 'EVENT_VITAL']}, {'bbox': [0.6936325678496869, 0.16585548483713966, 0.7267745302713987, 0.2092849120179708], 'text': ['2']}, {'bbox': [0.7262526096033403, 0.18345189067764883, 0.7382567849686847, 0.18944215649569449], 'text': ['⌨']}, {'bbox': [0.7588726513569938, 0.16997379258704606, 0.8204592901878914, 0.20816173717708725], 'text': ['24']}, {'bbox': [0.8580375782881002, 0.16061400224634967, 0.8838726513569938, 0.19206289779108948], 'text': ['⌨']}, {'bbox': [0.7737473903966597, 0.19131411456383376, 0.8838726513569938, 0.2366154998128042], 'text': ['⌨', '⌨', '⌨', '⌨', '⌨']}, {'bbox': [0.12839248434237996, 0.2347435417446649, 0.433194

In [68]:
print_by_line(text_only(entity_texts[5]))

['olavida']
['⌨', 'PERSON']
['no. 1468']
['PERSON']
['PERSON']
['PERSON']
['en', 'la', 'ORGANIZATION_RELIGIOUS', ',', 'en', 'LOCALE']
['a', 'DATE']
['fue', 'EVENT_VITAL', 'solemnemente', 'por', 'el', 'suscrito', 'párroco']
['una', 'COREF_GENDER', 'que', 'EVENT_VITAL', 'en', 'LOCALE']
['a', 'DATE']
['a', 'COREF', 'llamó', 'PERSON']
['y', 'PERSON']
['FAMILY_MEMBER', ':', 'PERSON', 'y', 'PERSON']
['FAMILY_MEMBER', ':', 'PERSON', 'y', 'PERSON']
['⌨', '⌨', '⌨', '⌨']
['fueron', 'NONFAMILY', ':', 'PERSON', 'PERSON', 'y', 'PERSON', 'PERSON']
['⌨', '⌨', '⌨']
['⌨', '⌨', '⌨', '⌨']
['en', 'la', 'ORGANIZATION_RELIGIOUS', ',', 'en', 'l', 'DATE']
['↔']
['fue', 'EVENT_VITAL', 'solemnemente', 'por', 'el', 'suscrito', 'OCCUPATION']
['un', 'COREF_GENDER', 'que', 'EVENT_VITAL', 'en', 'LOCALE']
['el', 'DATE']
['-']
['a', 'COREF', 'llamó', 'PERSON']
['FAMILY_MEMBER']
['de', 'PERSON', 'y', 'PERSON']
['↔']
['FAMILY_MEMBER', ':', 'PERSON', 'y', 'PERSON']
['FAMILY_MEMBER', ':', 'PERSON', 'y', 'PERSON']
['fueron

In [69]:
print_by_line(text_and_bbox(entity_texts[6]))

[0.3205162678139285, 0.07967032967032966, 0.3498252218338263, 0.08987441130298274, '✍']
[0.1917182038182307, 0.10282574568288853, 0.4939499865555257, 0.13383045525902668, 'ORGANIZATION_RELIGIOUS', 'EVENT_VITAL', 'solemnemte']
[0.1874159720354934, 0.1389324960753532, 0.4966388814197365, 0.18053375196232338, '', 'en', 'esta', 'COREF', 'parroquia', 'un', 'COREF_GENDER', ',', 'que', 'dijeron']
[0.18311374025275612, 0.17032967032967034, 0.4939499865555257, 0.22684458398744112, '', 'haber', 'EVENT_VITAL', 'en', 'COREF_LOCATION', ',', 'a', 'las']
[0.18714708254907234, 0.20172684458398743, 0.4920677601505781, 0.260989010989011, 'TIME', ',', 'FAMILY_MEMBER']
[0.18714708254907234, 0.2382260596546311, 0.4829255176122614, 0.28532182103610676, 'de', 'PERSON', ',', 'EVENT_VITAL', 'LOCALE', ',', 'y']
[0.18472707717128262, 0.2802197802197802, 0.5004033342296316, 0.31907378335949765, 'de', 'TITLE', 'PERSON', ',', 'OCCUPATION', ',', 'EVENT_VITAL']
[0.17854261898359775, 0.3120094191522763, 0.492874428609

In [None]:
#1. Make N-grams
#2. Vectorize the N-grams