In [1]:
import pandas as pd
import os
from PIL import Image

WINDOW_WIDTH = 1000
WINDOW_HEIGHT = 1000

In [2]:
original_df = pd.read_json('training_data/seeclick_web.json')
original_df.head(1)

Unnamed: 0,img_filename,url,elements
0,e4469ac844e851b42167f303c8c9278e.png,https://lunaweddingandeventsupplies.com.au/pro...,"[{'instruction': 'Product description', 'bbox'..."


In [3]:
original_df.columns, original_df.dtypes

(Index(['img_filename', 'url', 'elements'], dtype='object'),
 img_filename    object
 url             object
 elements        object
 dtype: object)

In [4]:
first_row = original_df.iloc[0]

# Get the 'elements' column from the first row
elements = first_row['elements']

# If 'elements' is not empty, display the keys of the first JSON object
if elements:
    print(elements[0].keys())
    print(elements[0])
else:
    print("The 'elements' column in the first row is empty.")

#bbox key, represented by [left, top, right, bottom].

dict_keys(['instruction', 'bbox', 'data_type'])
{'instruction': 'Product description', 'bbox': [0.521875, 0.932407407407407, 0.865625, 0.9842592592592591], 'data_type': 'text'}


In [5]:
#First filter to only have ones in dataset
image_names = set(os.listdir('training_data/cpfs01/user/chengkanzhi/seeclick_web_imgs_part/'))
print(image_names)
print('old size: ', len(original_df))
df = original_df[original_df['img_filename'].isin(image_names)]
print('new size: ', len(df))

{'68a7025e38a65dfb2579f92056ff9b55.png', 'd3220624f983cfd4da706e81af78db48.png', '37c1d7d003e9bce9e6c0482b8427ad2f.png', '212b66a0bef98630b8c3f029f6762d97.png', '2e38132e27a2f74c5deb41594b0fc558.png', '182383fecc39b32f12de8d502903df42.png', '705dd2dd9df27ba0bbcd88a744f72e26.png', '8fcfed535b2c32f159c4a4913a33cff0.png', 'c102b361aa9fd2e321d3bebbc40b40d8.png', '5d996870461c80e7b99efa056065bb2c.png', 'e8070466805575ab95fedf07a1ea7769.png', '66e995b8ff92d7792bfb7d53a2e025ec.png', 'e7d2596d69c7e02c44da88f0dbfabbdf.png', '18856ea7e09252fc9abcb8bcad94a0f5.png', '46fcca90924fe20ee5b147ac23fbd442.png', '9d3164ef516f2070f74cc11ac7863b8c.png', '0cfc4265096cb2f83a5f0a3a77964be5.png', 'b33a1b9172806f587cd7c5570556fc75.png', '93bb436b90c9ebaae697c0faaa763c05.png', '97f3ab9ac21236150029f521c13481a4.png', 'e159bbe644c124b92cf969ec9339bde2.png', 'eeff4ddfd86b4a647ae1ebcbeeba46f1.png', '80bc035d8e0e2a71b5847e215d6cf707.png', 'b865d1d972e315993503abffe12c680e.png', '6292a3182d026800f83ec5be21ed0fdd.png',

In [6]:
print('old_size: ', len(df))
exploded_df = df.explode('elements').reset_index(drop=True)

exploded_df['instruction'] = exploded_df['elements'].apply(lambda x: x['instruction'])

def get_point(bbox):
    bbox = bbox['bbox']
    l, t, r, b = bbox[0], bbox[1], bbox[2], bbox[3]
    return [(l+r)/2, (t+b)/2]

exploded_df['point'] = exploded_df['elements'].apply(get_point)

final_df = exploded_df.drop(columns=['elements', 'url'])
print('new_size: ', len(final_df))
final_df.head(10)

old_size:  10000
new_size:  113142


Unnamed: 0,img_filename,instruction,point
0,b1072ebd95d94a714a204ad0a44c2cb0.png,Composers -- Western Australia.,"[0.157552083333333, 0.8166666666666665]"
1,b1072ebd95d94a714a204ad0a44c2cb0.png,InterLibrary Loan,"[0.74140625, 0.2907407407407405]"
2,b1072ebd95d94a714a204ad0a44c2cb0.png,Bookmark link for this record,"[0.5882812499999995, 0.30000000000000004]"
3,b1072ebd95d94a714a204ad0a44c2cb0.png,Author,"[0.889583333333333, 0.2240740740740735]"
4,b1072ebd95d94a714a204ad0a44c2cb0.png,Classic Catalogue Home,"[0.05625, 0.12129629629629551]"
5,b1072ebd95d94a714a204ad0a44c2cb0.png,Search,"[0.554427083333333, 0.233796296296296]"
6,b1072ebd95d94a714a204ad0a44c2cb0.png,Feedback,"[0.889583333333333, 0.5074074074074065]"
7,b1072ebd95d94a714a204ad0a44c2cb0.png,e-Everything!,"[0.151302083333333, 0.12129629629629551]"
8,b1072ebd95d94a714a204ad0a44c2cb0.png,Record Number,"[0.889583333333333, 0.3074074074074066]"
9,b1072ebd95d94a714a204ad0a44c2cb0.png,Call number,"[0.889583333333333, 0.25740740740740653]"


In [7]:
def get_label_column(img_width, x):
    offset = img_width / WINDOW_WIDTH
    actual_x = x*img_width
    return int(actual_x / offset)

def get_label_row(img_height, y):
    offset = img_height / WINDOW_HEIGHT
    actual_y = y*img_height
    return int(actual_y / offset)


def get_window_label(input):
    x, y = input['point']
    img_name = input['img_filename']
    img = Image.open(os.path.join('training_data/cpfs01/user/chengkanzhi/seeclick_web_imgs_part/', img_name))
    width, height = img.size
    row, col  = get_label_row(height, y), get_label_column(width, x)
    label = row*WINDOW_WIDTH + col
    assert label < WINDOW_WIDTH*WINDOW_HEIGHT and label >= 0 
    return label

In [8]:
final_df['label'] = final_df.apply(get_window_label, axis=1)
final_df.head(10)

NameError: name 'Image' is not defined

In [8]:
#save df
final_df.to_csv('training_data/seeclick_web_train.csv', index=False)