In [1]:
import glob
import json

from PIL import Image, ImageDraw
from IPython.display import display

docs = []

for f in glob.glob("data/*.json"):
    docs.append(json.load(open(f)))

In [94]:
from collections import Counter
from sklearn.base import TransformerMixin, BaseEstimator

class TokFeaturiser(BaseEstimator, TransformerMixin):
    def __init__(self, min_token_count=3, neighbourhood_size=(5, 5)):
        self.min_token_count = min_token_count
        self.lower_counts = None
        self.neighbourhood_size = neighbourhood_size
    
    def fit(self, X, y=None, **fit_args):
        self.lower_counts = dict(Counter(t['text'] for doc in X for t in doc['tokens']))
        return self

    @staticmethod
    def _get_shape(t):
        return "".join("d" if c.isdigit() else "X" if c.isupper() else "x" if c.islower() else c for c in t)

    def _unary_featurise(self, tok):
        bb = tok['bounding_box']
        if bb is not None:
            (x0,y0), (x1,y1) = bb
            yield from zip(('x0','x1','y0','y1'), (x0,x1,y0,y1))
            yield 'center_x', (x0+x1)/2
            yield 'center_y', (y0+y1)/2
            yield 'width', x1-x0
            yield 'height', y1-y0

        txt = tok['text']
        if self.lower_counts.get(txt.lower, -1) > self.min_token_count:
            yield 'text', txt
            yield 'text_lower', txt.lower()
            yield 'text[:3]', txt[:3]
            yield 'text[-3:]', text[-3:]
        
        shape = self._get_shape(txt)
        yield 'shape', shape
        yield 'shape_lower', shape.lower()
        yield 'shape[:3]', shape[:3]
        yield 'shape[-3:]', shape[-3:]
        
    def _add_window_features(self, doc):
        Xt = []
        doc_copy = [dict(f) for f in doc]
        left_window, right_window = self.neighbourhood_size
        for i, feats in enumerate(doc):
            left_nhood = doc_copy[max(0, i-left_window):i]
            for offset_ix, neighbour_feat in enumerate(reversed(left_nhood), 1):
                feats.update({f'wndw-{offset_ix}::{k}':v for k,v in neighbour_feat.items()})

            right_nhood = doc_copy[i+1:i+right_window]
            for offset_ix, neighbour_feat in enumerate(right_nhood, 1):
                feats.update({f'wndw+{offset_ix}::{k}':v for k,v in neighbour_feat.items()})
            Xt.append(feats)
        return Xt
        
        
    def transform(self, X):
        X = [dict(x) for x in X]
        Xt = []
        for doc in X:
            Xt.append([dict(self._unary_featurise(t)) for t in doc['tokens']])
        Xt = [self._add_window_features(doc) for doc in Xt]
        for x, f in zip(X, Xt):
            x['features'] = f
        return X

In [185]:
from itertools import chain

class InstanceGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, field_percentile=95, table_percentile=90):
        self.field_percentile = field_percentile
        self.table_percentile = table_percentile
    
    @staticmethod
    def get_percentile_window(label_values, percentile):
        left_lengths, right_lengths = [], []
        for label, value in label_values:
            if not label:
                continue
            if not value:
                left_lengths.append(0), right_lengths.append(0)
                continue
            left_lengths.append(max(0, min(label) - min(value)))
            right_lengths.append(max(0, max(value) - max(label)))
        return int(np.percentile(left_lengths, percentile)+1), int(np.percentile(right_lengths, percentile)+1)


    def fit(self, X, y=None, **fit_args):
        field_label_values = [(f['label'], f['value']) for annot in y for f in annot['field']]        
        table_label_values = [[column['label'], val] for annot in y
                                  for table in annot['table']
                                  for column in table['columns']
                                  for val in column['value']]
        self.field_window = self.get_percentile_window(field_label_values, self.field_percentile)
        self.table_window = self.get_percentile_window(table_label_values, self.table_percentile)
        return self
    
    @staticmethod
    def get_pairwise_features(a_features, b_features):
        features = {}
        for fname, feature in (('a', a_features),  ('b', b_features)):
            features.update({
                f'pairwise::{fname}[0]::{k}': v for k,v in feature[0].items()
            })
            if len(feature) > 1:
                features.update({
                    f'pairwise::{fname}[-1]::{k}': v for k,v in feature[-1].items()
                })
        return features
    
    def get_positional_features(a_ixs, b_ixs, tokens):
        pass
        
    
    def get_doc_candidates(self, values, value_type, doc, window):
        instances = []
        w_left, w_right = window
        for v in values:
            left_candidates = range(max(0, v[0]-w_left), v[0])
            rl = min(v[-1]+1, len(doc['features']) - 1)
            rr = min(v[-1]+1+w_right, len(doc['features']))
            right_candidates = range(rl, rr)
            
            for candidate in chain(left_candidates, right_candidates):
                features = self.get_pairwise_features(
                    [doc['features'][ix] for ix in v],
                    [doc['features'][candidate]]
                )
                features.update({'type': value_type})
                instances.append({
                    'doc_id': doc['doc_id'],
                    'key': v,
                    'value': candidate,
                    'features': features,
                    'type': value_type
                })
        return instances
        
    def transform(self, X):
        X = [dict(x) for x in X]
        for doc in X:
            instances = []
            instances += self.get_doc_candidates(doc['fields'], 'field', doc, self.field_window)
            instances += self.get_doc_candidates(doc['tables'], 'table', doc, self.table_window)
            doc['instances'] = instances
        return X

In [186]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict
import random
import numpy as np

class InstanceModel(LGBMClassifier):
    def fit(self, X, y=None, **fit_args):
        flat_X, flat_y = [], []
        for xi, yi in zip(X, y):
            label_map = {tuple(f['label']): set(f['value']) for f in yi['field'] if f['label']}
            label_map.update({tuple(c['label']):set(c for cv in c['value'] for c in cv) for t in yi['table'] for c in t['columns']})
            for i in xi['instances']:
                flat_X.append(i['features'])
                flat_y.append(i['value'] in label_map[tuple(i['key'])])
        self.vectorizer = DictVectorizer()
        flat_X = self.vectorizer.fit_transform(flat_X)
        return super().fit(flat_X, flat_y, **fit_args)
    
    def predict(self, X):
        X = [dict(x) for x in X]
        features = []
        lens = []
        preds = []
        for doc in X:
            xf = [i['features'] for i in doc['instances']]
            lens.append(len(xf))
            features += xf
        features = self.vectorizer.transform(features)
        flat_preds = super().predict(features)
        
        offset = 0
        for doc, nfeats in zip(X, lens):
            values = {'field': {}, 'table': {}}
            for instance, pred in zip(doc['instances'], flat_preds[offset:offset+nfeats]):
                k, v = tuple(instance['key']), instance['value']
                if k not in values[instance['type']]:
                    values[instance['type']][k] = set()
                if pred:
                    values[instance['type']][k].add(v)
            offset += nfeats
            preds.append(values)
        return preds
    
    @staticmethod
    def evaluate(ytrue, ypred):
        ent_types = ('field', 'table')
        results = []
        for yt, yp in zip(ytrue, ypred):
            label_map = {tuple(f['label']): set(f['value']) for f in yt['field'] if f['label']}
            label_map.update({tuple(c['label']):set(c for cv in c['value'] for c in cv) for t in yt['table'] for c in t['columns']})
            for t in ent_types:
                for k, pred_v in yp[t].items():
                    true_v = label_map[k]
                    results.append({
                        'type': t,
                        'k': k,
                        'pred_v': pred_v,
                        'true_v': true_v,
                        'iou': len(pred_v & true_v)/len(pred_v | true_v) if pred_v else int(pred_v == true_v),
                        'match': pred_v == true_v
                    })

        metrics = {}
        for met in ('iou', 'match'):
            for t in ('field', 'table'):
                metrics[f'{t}.{met}'] = np.mean([r[met] for r in results if r['type'] == t])
            metrics[f'micro.{met}'] = np.mean([r[met] for r in results])
        return metrics, results
    
    def score(self, X, y):
        metrics, results = self.evaluate(y, self.predict(X))
        return metrics['micro.iou']

In [179]:
import numpy as np

X = [{
    'tokens': d['tokens'],
    'fields': [f['label'] for f in d['annotations']['field'] if f['label']],
    'tables': [c['label'] for table in d['annotations']['table'] for c in table['columns'] if c['label']],
    'doc_id': d['doc_id']
} for d in docs]

y = [doc['annotations'] for doc in docs]

In [180]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline((
    ('featuriser', TokFeaturiser()),
    ('instance_generator', InstanceGenerator()),
    ('model', InstanceModel(verbose=1, n_jobs=-1))    
))

In [182]:
preds = pipeline.fit(X,y).predict(X)

In [183]:
metrics, results = InstanceModel.evaluate(y, preds)

In [184]:
metrics

{'field.iou': 0.5774592952558371,
 'table.iou': 0.3959119155673821,
 'micro.iou': 0.536086252945883,
 'field.match': 0.4977973568281938,
 'table.match': 0.26119402985074625,
 'micro.match': 0.44387755102040816}

In [187]:
param_grid = {
    'featuriser__min_token_count': [3,10],
    'featuriser__neighbourhood_size': [(3,3), (5,5), (10,10)],
    'instance_generator__field_percentile': [95,99,100],
    'instance_generator__table_percentile': [90, 95]
}

In [188]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, param_grid, verbose=True, cv=3, n_jobs=-1)

In [None]:
grid.fit(X,y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [None]:
def token_coords(token, width, height):
    (x1, y1), (x2, y2) = token["bounding_box"]                        
    x1, y1, x2, y2 = int(round(x1*width)), int(round(y1*height)), int(round(x2*width)), int(round(y2*height))
    return ((x1, y1), (x2, y2))

def show(doc):
    image = Image.open("data/{}.jpg".format(doc["doc_id"]))
    width, height = image.width, image.height
    
    annotations = doc["annotations"]
    tokens = doc["tokens"]
    
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    for field in annotations["field"]:
        for token_idx in field["label"]:
            token = tokens[token_idx]
            draw.rectangle(token_coords(token, width, height), fill=(0, 255, 0, 100))
        for token_idx in field["value"]:
            token = tokens[token_idx]
            draw.rectangle(token_coords(token, width, height), fill=(0, 0, 255, 100))
    for table in annotations["table"]:
        for column in table["columns"]:
            for token_idx in column["label"]:
                token = tokens[token_idx]
                draw.rectangle(token_coords(token, width, height), fill=(255, 255, 0, 100))
            for row in column["value"]:
                for token_idx in row:
                    token = tokens[token_idx]
                    draw.rectangle(token_coords(token, width, height), fill=(255, 0, 255, 100))

    image = image.convert("RGBA")
    image = Image.alpha_composite(image, overlay)
    display(image)