Main notebook :3

In [None]:
import collections

from PIL import Image
import numpy as np

# Path template for the scanned images; '{}' is replaced by an identifier
# string (presumably a zero-padded page number -- confirm against book1/).
PATH_TEMPLATE = 'book1/family-tree-{}.png'
filepath = PATH_TEMPLATE.format('05')
image = Image.open(filepath)
# Pixel data as a NumPy array; the next cell indexes it as data[:, :, channel].
data = np.asarray(image)
print(image.mode) # prints the PIL mode string; 'RGB' for this scan
# image.show()  # uncomment to preview the raw scan in an external viewer

RGB


In [12]:
# Threshold channel 0 (red, since the image mode printed above is RGB):
# pixels whose red value is below 150 become 1, all others become 0.
# Pixels with a high red component (e.g. red or white) are therefore dropped,
# and only pixels with a low red component (e.g. dark ink) are kept.
grid = (data[:,:,0] < 150).astype(int) 
# grid is 2-D: one 0/1 entry per pixel.
rows, cols = grid.shape
print(rows, cols)

2200 3400


In [None]:
def show_image(grid):
	"""Display a binary grid as a grayscale image.

	Cells equal to 1 are rendered black (0) and cells equal to 0 are
	rendered white (255). The array is converted to uint8, wrapped in a
	PIL image and opened with ``Image.show()``. Nothing is returned.
	"""
	pixels = 255 - grid * 255
	Image.fromarray(np.uint8(pixels)).show()

In [None]:
# Cut the grid into two column ranges at a column 100 px right of the
# horizontal centre, then preview the left part.
split_col = cols // 2 + 100
left, right = grid[:, :split_col], grid[:, split_col:]
show_image(left)

In [22]:
# Find starting pixel of page border: take the middle row and pick the
# left-most set (non-zero) pixel in it.
i = rows // 2
set_cols = np.flatnonzero(grid[i])
if set_cols.size == 0:
    # Fail with a clear message instead of scanning past the end of the row.
    raise ValueError(f'no set pixel found in row {i}; cannot locate page border')
# Plain Python int so downstream printing/indexing matches a manual scan.
j = int(set_cols[0])

In [23]:
def get_page(grid, start_i, start_j, row_margin=50, col_margin=150):
  """Crop the region enclosed by the connected component at a start pixel.

  Runs a 4-connected breadth-first search over the set (non-zero) pixels of
  ``grid`` beginning at ``(start_i, start_j)``, records the extreme rows and
  columns the component reaches, and returns the part of ``grid`` inside
  those extremes after trimming a margin on every side.

  Args:
    grid: 2-D NumPy array; non-zero entries are treated as set pixels.
    start_i: Row index of the pixel to start the search from.
    start_j: Column index of the pixel to start the search from.
    row_margin: Rows trimmed from the top and bottom of the bounding box.
    col_margin: Columns trimmed from the left and right of the bounding box.

  Returns:
    A slice (view) of ``grid``. It is empty when the component is not larger
    than twice the margins, or when the start pixel is unset/out of bounds.

  Side effects:
    Prints the start position and the four extreme pixels that were found.
  """
  print(f'get_page {start_i} {start_j}')
  rows, cols = grid.shape

  visited = np.zeros((rows, cols), dtype=bool)
  queue = collections.deque()

  def enqueue(i, j):
    # Mark pixels when they are queued so each one enters the queue at most
    # once; the visiting order is the same as marking them when dequeued.
    if 0 <= i < rows and 0 <= j < cols and grid[i][j] and not visited[i][j]:
      visited[i][j] = True
      queue.append((i, j))

  enqueue(start_i, start_j)

  # Each extreme is stored as the (row, col) pixel where it was first reached;
  # the coordinate that is not being tracked starts out as None.
  min_i, max_i = (start_i, None), (start_i, None)
  min_j, max_j = (None, start_j), (None, start_j)

  while queue:
    i, j = queue.popleft()

    if i < min_i[0]:
      min_i = i, j
    elif i > max_i[0]:
      max_i = i, j
    if j < min_j[1]:
      min_j = i, j
    elif j > max_j[1]:
      max_j = i, j

    for next_i, next_j in ((i + 1, j), (i - 1, j), (i, j + 1), (i, j - 1)):
      enqueue(next_i, next_j)

  print('Page corners')
  print(min_i, max_i, min_j, max_j)

  # Cut off page boundaries. Clamp at 0 so that a component smaller than the
  # margins yields an empty page instead of a negative slice bound, which
  # NumPy would interpret as counting from the end of the axis.
  top = max(min_i[0] + row_margin, 0)
  bottom = max(max_i[0] - row_margin, 0)
  left = max(min_j[1] + col_margin, 0)
  right = max(max_j[1] - col_margin, 0)
  page = grid[top:bottom, left:right]
  return page

In [25]:
# Trace the border component starting at pixel (i, j) and crop the area
# inside it, then preview the result.
left_page = get_page(grid, i, j)
show_image(left_page)

get_page 1100 428
Page corners
(128, 441) (2080, 1525) (1992, 416) (142, 1721)


# CSV Parser

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame. keep_default_na=False keeps empty cells
# as '' instead of NaN, which the 'children' parsing below relies on.
df = pd.read_csv('data/zeng_google_sheet.csv', keep_default_na=False)
df['id'] = df['id'].astype(int)
# One independent empty list per row. `[[]] * len(df)` would store the same
# list object in every row, so appending to one row would change them all.
df['name_images'] = [[] for _ in range(len(df))]
df['generation'] = df['generation'].astype(int)
df['parent'] = df['parent'].astype(int)
# 'children' is a comma-separated string of ids; an empty string means no children.
df['children'] = df['children'].apply(lambda x: [int(child) for child in x.split(',')] if x else [])

# Set parent based on children relationships: build a child id -> parent id
# lookup in row order (a later row wins if a child is listed twice), then
# apply it in one pass. Rows that nobody lists as a child keep the parent
# value read from the CSV.
parent_of = {
    child_id: node_id
    for node_id, children in zip(df['id'], df['children'])
    for child_id in children
}
df['parent'] = df['id'].map(parent_of).fillna(df['parent']).astype(int)
df

Unnamed: 0,id,name,name_images,generation,parent,children,biography,notes
0,1,点,[],1,-1,[2],https://en.wikipedia.org/wiki/Zeng_Dian,
1,2,参,[],2,1,"[3, 4, 5]",https://en.wikipedia.org/wiki/Zengzi,是宗聖公
2,3,华,[],3,2,[],,
3,4,申,[],3,2,[],,
4,5,元,[],3,2,[6],,
5,6,西,[],4,5,[7],,
6,7,钦,[],5,6,[8],,
7,8,㝵,[],6,7,[9],,
8,9,羨,[],7,8,[10],,
9,10,遐,[],8,9,"[11, 12]",,


In [None]:
# Export DataFrame to JSONL format: one JSON record per row, one row per line.
# force_ascii=False writes non-ASCII characters as-is instead of \u escapes.
df.to_json('data/book1.jsonl', orient='records', lines=True, force_ascii=False)

In [15]:
import json

# Read the JSONL export back, one record per line. The file contains raw
# non-ASCII text, so decode it explicitly as UTF-8 instead of relying on the
# platform default encoding; blank lines are skipped.
with open('data/book1.jsonl', 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(records[0])


{'id': 1, 'name': '点', 'name_images': [], 'generation': 1, 'parent': -1, 'children': [2], 'biography': 'https://en.wikipedia.org/wiki/Zeng_Dian', 'notes': ''}
