In [None]:
import os
import json

In [None]:
# Image directories for each dataset split.
TRAIN_IMAGE_FOLDER = "./train/images/"
TEST_IMAGE_FOLDER = "./test/images/"

# Annotation (label) directories for each dataset split.
TRAIN_LABEL_FOLDER = "./train/annotations/"
TEST_LABEL_FOLDER = "./test/annotations/"

In [None]:
# Collect the path of every training image and every training annotation file.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the listing (and everything derived from it) stable across runs.
image_paths = [
    os.path.join(TRAIN_IMAGE_FOLDER, name)
    for name in sorted(os.listdir(TRAIN_IMAGE_FOLDER))
]
label_paths = [
    os.path.join(TRAIN_LABEL_FOLDER, name)
    for name in sorted(os.listdir(TRAIN_LABEL_FOLDER))
]

print("Number of images: ", len(image_paths))
print("Number of labels: ", len(label_paths))

In [None]:
from tqdm import tqdm

def _axis_tick_labels(data, axis, coord):
    """Return the tick-label texts of one axis, ordered by a polygon coordinate.

    Args:
        data: Parsed annotation dict; reads data["axes"][axis]["ticks"] and
            data["text"].
        axis: Key of the axis under data["axes"] ("x-axis" or "y-axis").
        coord: Key inside each text item's "polygon" used as the sort key
            ("x0" or "y0").

    Returns:
        The "text" values of the text items whose "id" matches one of the
        axis' tick ids, sorted in ascending order of polygon[coord].
    """
    tick_ids = [tick["id"] for tick in data["axes"][axis]["ticks"]]
    tick_texts = [text for text in data["text"] if text["id"] in tick_ids]
    tick_texts.sort(key=lambda text: text["polygon"][coord])
    return [text["text"] for text in tick_texts]


ground_truths = []

for path in tqdm(label_paths):
    # Read explicitly as UTF-8 rather than relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(path, encoding="utf-8") as json_file:
        data = json.load(json_file)

    # The image name is the annotation's base name with a ".jpg" extension.
    # splitext() strips whatever extension is present instead of assuming it
    # is exactly five characters long.
    image_name = os.path.splitext(os.path.basename(path))[0] + ".jpg"

    # Round every float in the data points to 4 decimal places (in place).
    for point in data["data-series"]:
        for key, value in point.items():
            if isinstance(value, float):
                point[key] = round(value, 4)

    x_labels = _axis_tick_labels(data, "x-axis", "x0")
    y_labels = _axis_tick_labels(data, "y-axis", "y0")

    gt = {
        # Relative to the split folder, e.g. "images/<name>.jpg".
        "file_name": os.path.join("images", image_name),
        "ground_truth": {
            "gt_parse": {
                "class": data["chart-type"],
                "value": data["data-series"],
                "x_type": data["axes"]["x-axis"]["values-type"],
                "y_type": data["axes"]["y-axis"]["values-type"],
                "x_labels": x_labels,
                "y_labels": y_labels,
            }
        }
    }
    ground_truths.append(gt)

# TODO: for a multi-stage pipeline, we also need to save the points of each axis and of each data series.

In [None]:
# Split the ground truths into train (80%) and validation (20%) sets.
import random

SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Put the records in a canonical order first, then shuffle with a dedicated
# seeded generator. This yields the same split on every run, including when
# this cell is re-run on an already shuffled list. That matters because later
# cells copy the validation images to disk; a different split on a re-run
# would leave stale copies behind.
ground_truths.sort(key=lambda record: record["file_name"])
random.Random(SPLIT_SEED).shuffle(ground_truths)

split_index = int(len(ground_truths) * TRAIN_FRACTION)
train_ground_truths = ground_truths[:split_index]
val_ground_truths = ground_truths[split_index:]

print("Number of train ground truths: ", len(train_ground_truths))
print("Number of val ground truths: ", len(val_ground_truths))


In [None]:
# Uncomment to install the `jsonlines` dependency into the kernel's environment:
# %pip install jsonlines

In [None]:
# Copy every validation image from ./train into ./validation/images/.
# NOTE(review): the files are copied, not moved, so the validation images also
# remain under ./train/images/ -- confirm that this is intended.
import shutil

validation_image_dir = "./validation/images/"
os.makedirs(validation_image_dir, exist_ok=True)

for record in tqdm(val_ground_truths):
    # "file_name" is relative to the split folder ("images/<name>.jpg").
    source_path = os.path.join("./train", record["file_name"])
    shutil.copy2(source_path, validation_image_dir)


In [None]:
# Write each split's records to the metadata.jsonl file inside its folder.
import jsonlines

for metadata_path, records in (
    ("./train/metadata.jsonl", train_ground_truths),
    ("./validation/metadata.jsonl", val_ground_truths),
):
    with jsonlines.open(metadata_path, mode="w") as writer:
        writer.write_all(records)
    

In [None]:
# Number of points in each record's data series, and the largest such count
# (0 when there are no records).
lens = [
    len(record["ground_truth"]["gt_parse"]["value"])
    for record in ground_truths
]
max_len = max(lens, default=0)

print("Max number of data-series: ", max_len)

In [None]:
# Histogram of the number of points per data series.
import matplotlib.pyplot as plt

# Use an explicit Axes and label the figure so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(lens, bins=10)
ax.set(
    title="Length of data series",
    xlabel="Points per data series",
    ylabel="Number of charts",
)
# Render the figure without echoing the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Number of records whose data series has more than 32 points.
# Written in plain Python: numpy is only imported in a later cell, so using
# `np` here raises NameError on a fresh kernel (Restart & Run All).
sum(1 for length in lens if length > 32)

In [None]:
import numpy as np
import Levenshtein as lev
from sklearn.metrics import r2_score


def sigmoid2(x):
    """Evaluate ``2 - 2 / (1 + exp(-x))``.

    The result is 1 at ``x == 0`` and approaches 0 as ``x`` grows, so it maps
    a non-negative error onto a score in ``(0, 1]``.

    Args:
        x: A number or NumPy array.

    Returns:
        The transformed value(s), computed element-wise via ``np.exp``.
    """
    denominator = 1 + np.exp(-x)
    return 2 - 2 / denominator


def rmse(y_true, y_pred):
    """Root-mean-square error between two equally shaped numeric sequences.

    Args:
        y_true: Reference values (anything ``np.array`` accepts).
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        The square root of the mean of the squared element-wise differences.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.square(residuals).mean())


def nrmse(y_true, y_pred):
    """Normalised RMSE score; 1 means a perfect prediction.

    Computes ``sigmoid2(rmse(y_true, y_pred) / rmse(y_true, mean(y_true)))``.
    That ratio equals ``sqrt(1 - R^2)``, so it is derived from
    ``sklearn.metrics.r2_score`` instead of being computed by hand.

    Args:
        y_true: Reference numeric values.
        y_pred: Predicted numeric values.

    Returns:
        0 when the two sequences differ in length, otherwise the score
        produced by ``sigmoid2``.
    """
    if len(y_true) != len(y_pred):
        return 0
    # 1 - R^2 is SS_res / SS_tot, i.e. the *square* of the RMSE ratio, so the
    # square root is needed to obtain the ratio itself. Clamp at 0 so that a
    # tiny negative value caused by rounding cannot reach np.sqrt.
    unexplained = max(1 - r2_score(y_true, y_pred), 0)
    return sigmoid2(np.sqrt(unexplained))


def nlev(y_true, y_pred):
    """Normalised Levenshtein score; 1 means every string matches exactly.

    Computes ``sigmoid2(total edit distance / total length of y_true)``.

    Args:
        y_true: Reference strings.
        y_pred: Predicted values; each is converted with ``str()`` so that a
            non-string prediction is scored instead of raising ``TypeError``.

    Returns:
        0 when the two sequences differ in length, otherwise the score.
    """
    if len(y_true) != len(y_pred):
        return 0

    total_distance = sum(
        lev.distance(truth, str(guess)) for truth, guess in zip(y_true, y_pred)
    )
    total_length = sum(len(truth) for truth in y_true)

    if total_length == 0:
        # Every reference string is empty, so the ratio would divide by zero.
        # Use the limiting values instead: no edits -> sigmoid2(0) == 1,
        # any edits -> sigmoid2(inf) == 0.
        return 1.0 if total_distance == 0 else 0.0

    return sigmoid2(total_distance / total_length)


def _series_score(gt_values, pred_values):
    """Score one axis: Levenshtein-based for string references, RMSE-based otherwise."""
    if isinstance(gt_values[0], str):
        return nlev(gt_values, pred_values)
    return nrmse(gt_values, pred_values)


def calculate_score(pred, gt):
    """Score a predicted chart parse against its ground truth.

    Args:
        pred: Predicted parse with a "class" entry and a "value" list of
            ``{"x": ..., "y": ...}`` points.
        gt: Ground-truth parse with the same structure.

    Returns:
        0 if the chart class or the number of points differs, 1 if both
        series are empty, otherwise the mean of the x-axis and y-axis scores.
    """
    if pred["class"] != gt["class"]:
        return 0

    if len(pred["value"]) != len(gt["value"]):
        return 0

    # The lengths are equal at this point, so checking one side is enough.
    if len(gt["value"]) == 0:
        return 1

    pred_xs = [point["x"] for point in pred["value"]]
    pred_ys = [point["y"] for point in pred["value"]]

    gt_xs = [point["x"] for point in gt["value"]]
    gt_ys = [point["y"] for point in gt["value"]]

    # nlev/nrmse take (y_true, y_pred): the ground truth must come first,
    # otherwise the scores are normalised by the prediction instead of the
    # reference.
    x_score = _series_score(gt_xs, pred_xs)
    y_score = _series_score(gt_ys, pred_ys)

    return (x_score + y_score) / 2

In [None]:
# Sanity check on a scatter chart: prediction and ground truth share the same
# x values and differ only in the first y value (66.683 vs. 6.683).
example_xs = [1949.4201, 1954.6107, 1959.9936, 1964.7997]
example_pred_ys = [66.683, 66.2785, 65.6718, 64.0537]
example_gt_ys = [6.683, 66.2785, 65.6718, 64.0537]

calculate_score(
    {
        'class': 'scatter',
        'value': [{'x': x, 'y': y} for x, y in zip(example_xs, example_pred_ys)],
    },
    {
        'class': 'scatter',
        'value': [{'x': x, 'y': y} for x, y in zip(example_xs, example_gt_ys)],
    },
)

In [None]:
from PIL import Image

# Preview one training image. "file_name" already carries the "images/" prefix
# (it is relative to ./train), so only its base name is joined with
# TRAIN_IMAGE_FOLDER; joining the full value would point at
# ./train/images/images/<name>, which does not exist.
# The index is clamped so that datasets with fewer than 1001 records still work.
sample_index = min(1000, len(ground_truths) - 1)
Image.open(
    os.path.join(
        TRAIN_IMAGE_FOLDER,
        os.path.basename(ground_truths[sample_index]["file_name"]),
    )
)