Main notebook :3

In [None]:
import collections

from PIL import Image
import numpy as np

In [None]:
def get_image(filepath):
    """Load an image file and threshold it into a binary grid.

    Arrays with three dimensions are thresholded on channel 0 only (the red
    channel for RGB/RGBA input); any other array is thresholded as a whole.

    Args:
        filepath: Path of the image file, passed to ``Image.open``.

    Returns:
        An ``np.uint8`` array holding 1 where the sampled value is greater
        than 150 and 0 elsewhere.
    """
    # Image.open is lazy and keeps the underlying file open. Copy the pixels
    # out inside a context manager so the handle is released right away
    # instead of whenever the Image object happens to be garbage collected.
    with Image.open(filepath) as image:
        data = np.array(image)

    if data.ndim == 3:
        # Multi-channel image: only the first channel decides light vs dark.
        return (data[:, :, 0] > 150).astype(np.uint8)
    return (data > 150).astype(np.uint8)

def save_image(grid, filepath):
    """Write a binary grid to disk as an 8-bit grayscale image.

    Args:
        grid: Array of 0/1 values; 1 is written as 255 and 0 as 0.
        filepath: Destination path, passed to ``Image.save``.
    """
    # Cast to uint8 so PIL infers 8-bit grayscale from the array itself,
    # whatever dtype the 0/1 grid arrives in. The explicit ``mode`` argument
    # used before is deprecated in recent Pillow releases and is not needed.
    pixels = (np.asarray(grid) * 255).astype(np.uint8)
    img = Image.fromarray(pixels)
    img.save(filepath)
    print(f'saved to {filepath}')

In [None]:
def show_image(grid):
    """Open a binary grid in PIL's image viewer.

    The grid is multiplied by 255 and converted to uint8 before display.
    """
    scaled = grid * 255
    Image.fromarray(np.uint8(scaled)).show()

In [None]:
def remove_small_islands(orig_grid, max_size=10):
    """Fill small 4-connected regions of zeros with ones.

    Args:
        orig_grid: 2-D array; it is copied, never modified.
        max_size: Zero-regions with at most this many cells are set to 1.

    Returns:
        A copy of ``orig_grid`` with every small zero-region filled.
    """
    cleaned = orig_grid.copy()
    n_rows, n_cols = cleaned.shape
    seen = np.zeros_like(cleaned, dtype=bool)
    steps = ((1, 0), (-1, 0), (0, 1), (0, -1))

    for r0 in range(n_rows):
        for c0 in range(n_cols):
            # Only unvisited zero cells can seed a new region.
            if seen[r0, c0] or cleaned[r0, c0] != 0:
                continue

            # Breadth-first flood fill collecting every cell of the region.
            seen[r0, c0] = True
            island = [(r0, c0)]
            frontier = collections.deque(island)
            while frontier:
                r, c = frontier.popleft()
                for dr, dc in steps:
                    rr, cc = r + dr, c + dc
                    if not (0 <= rr < n_rows and 0 <= cc < n_cols):
                        continue
                    if seen[rr, cc] or cleaned[rr, cc] != 0:
                        continue
                    seen[rr, cc] = True
                    frontier.append((rr, cc))
                    island.append((rr, cc))

            # Regions at or below the size limit are erased.
            if len(island) <= max_size:
                for cell in island:
                    cleaned[cell] = 1

    return cleaned

In [None]:
def get_corners(grid, start_i, start_j):
    """Find the extreme corners of the zero-region containing the start cell.

    The 4-connected region of zero cells reachable from
    ``(start_i, start_j)`` is flood-filled breadth-first, tracking the cell
    that is extreme along each diagonal direction.

    Returns:
        ``(tl, tr, br, bl)`` as ``(row, col)`` tuples. All four equal the
        start cell when the start cell itself is non-zero.
    """
    n_rows, n_cols = grid.shape
    seen = np.zeros((n_rows, n_cols), dtype=bool)

    start = (start_i, start_j)
    top_left = top_right = bottom_right = bottom_left = start
    pending = collections.deque([start])

    while pending:
        r, c = pending.popleft()
        inside = 0 <= r < n_rows and 0 <= c < n_cols
        # Skip out-of-range cells, revisits, and non-zero cells.
        if not inside or seen[r, c] or grid[r, c]:
            continue
        seen[r, c] = True

        # Strict comparisons: the first cell reaching an extreme keeps it.
        if r + c < top_left[0] + top_left[1]:
            top_left = (r, c)
        if c - r > top_right[1] - top_right[0]:
            top_right = (r, c)
        if r - c > bottom_left[0] - bottom_left[1]:
            bottom_left = (r, c)
        if r + c > bottom_right[0] + bottom_right[1]:
            bottom_right = (r, c)

        # Neighbours are validated when dequeued, not when enqueued.
        pending.extend(((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)))

    return top_left, top_right, bottom_right, bottom_left

In [None]:
import cv2
def normalize_page(grid, page_corners, width=1300, height=1950):
    """Perspective-warp a page quadrilateral onto an upright rectangle.

    Args:
        grid: Image array handed to ``cv2.warpPerspective``.
        page_corners: ``(tl, tr, br, bl)``, each indexable as ``(row, col)``
            (the order returned by ``get_corners``).
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        The warped image produced by ``cv2.warpPerspective`` for an output
        size of ``(width, height)``.
    """
    tl, tr, br, bl = page_corners
    # Corners arrive as (row, col); swap each pair to (col, row) order
    # before handing the points to OpenCV.
    source = np.float32([[pt[1], pt[0]] for pt in (tl, tr, br, bl)])

    right, bottom = width - 1, height - 1
    target = np.float32([
        [0, 0],           # top-left
        [right, 0],       # top-right
        [right, bottom],  # bottom-right
        [0, bottom],      # bottom-left
    ])

    transform = cv2.getPerspectiveTransform(source, target)
    return cv2.warpPerspective(grid, transform, (width, height))

In [59]:
PATH_TEMPLATE = 'book1/family-tree-{}.png'


def _find_border_column(grid, row, start_col, step, run=4):
    """Scan along ``row`` for the first run of ``run`` consecutive zero pixels.

    Args:
        grid: 2-D binary array (0 = dark).
        row: Row index to scan.
        start_col: Column where the scan starts.
        step: +1 to scan rightwards, -1 to scan leftwards. The run that is
            tested extends from the current column in the scan direction.
        run: Number of consecutive zero pixels required.

    Returns:
        The column at which the run starts (the end nearest ``start_col``).

    Raises:
        ValueError: If the scan leaves the image before finding such a run.
            Without this check a negative index would silently wrap around
            to the other side of the image.
    """
    n_cols = grid.shape[1]
    col = start_col
    while True:
        far = col + step * (run - 1)
        lo, hi = min(col, far), max(col, far)
        if lo < 0 or hi >= n_cols:
            raise ValueError(
                f'no run of {run} dark pixels found in row {row} '
                f'when scanning from column {start_col}'
            )
        if not grid[row, lo:hi + 1].any():
            return col
        col += step


# The first page written gets index -1, so the numbering of the pages that
# follow starts at 0.
idx = -1
for image_num in range(4, 13):
    print(f'i: {image_num}')
    filepath = PATH_TEMPLATE.format(f'{image_num:02d}')
    grid = get_image(filepath)
    grid = remove_small_islands(grid)
    rows, cols = grid.shape
    mid_row = int(rows / 2)

    # Right-hand page: walk left from 20 px inside the right edge until a
    # 4-pixel dark run (taken to be the page border) is found.
    border_col = _find_border_column(grid, mid_row, cols - 20, step=-1)
    right_page_corners = get_corners(grid, mid_row, border_col)
    print('right_page_corners', right_page_corners)
    right_page = normalize_page(grid, right_page_corners)
    save_image(right_page, f'book1/pages/{idx}.png')
    idx += 1

    # Left-hand page: same search, walking right from the left edge.
    border_col = _find_border_column(grid, mid_row, 0, step=1)
    left_page_corners = get_corners(grid, mid_row, border_col)
    print('left_page_corners', left_page_corners)
    left_page = normalize_page(grid, left_page_corners)
    save_image(left_page, f'book1/pages/{idx}.png')
    idx += 1


i: 4
right_page_corners ((127, 1909), (110, 3182), (2043, 3199), (2059, 1927))
saved to book1/pages/-1.png
left_page_corners ((133, 425), (147, 1705), (2089, 1682), (2079, 402))
saved to book1/pages/0.png
i: 5


  img = Image.fromarray(grid * 255, mode='L')


right_page_corners ((131, 1915), (110, 3190), (2039, 3213), (2060, 1939))
saved to book1/pages/1.png
left_page_corners ((128, 441), (141, 1721), (2080, 1697), (2069, 416))
saved to book1/pages/2.png
i: 6
right_page_corners ((128, 1897), (107, 3170), (2046, 3194), (2065, 1922))
saved to book1/pages/3.png
left_page_corners ((126, 427), (137, 1707), (2073, 1688), (2064, 405))
saved to book1/pages/4.png
i: 7
right_page_corners ((124, 1903), (103, 3175), (2041, 3199), (2060, 1926))
saved to book1/pages/5.png
left_page_corners ((127, 436), (138, 1716), (2072, 1696), (2062, 414))
saved to book1/pages/6.png
i: 8
right_page_corners ((132, 1890), (112, 3163), (2047, 3185), (2065, 1912))
saved to book1/pages/7.png
left_page_corners ((128, 423), (142, 1704), (2081, 1682), (2070, 401))
saved to book1/pages/8.png
i: 9
right_page_corners ((126, 1898), (105, 3173), (2036, 3194), (2054, 1919))
saved to book1/pages/9.png
left_page_corners ((133, 433), (142, 1713), (2083, 1696), (2077, 416))
saved to boo

# Crop

In [65]:
def shrink_page(grid, search_radius=150, margin=10):
    """Crop a binary page around its central column of dark pixels.

    Columns: the column with the most dark (zero) pixels within
    ``search_radius`` of the horizontal centre is located, then the crop is
    widened left and right while the columns still contain dark pixels, and
    finally padded by ``margin``.

    Rows: blank rows below the last row containing a dark pixel are removed,
    keeping ``margin`` rows of padding. The top edge is not trimmed.

    Args:
        grid: 2-D array of 0/1 values, where 0 marks a dark pixel.
        search_radius: Half-width of the window around the centre column in
            which the starting column is searched.
        margin: Padding, in pixels, kept around the detected content.

    Returns:
        The cropped view of ``grid``.
    """
    rows, cols = grid.shape
    if rows == 0 or cols == 0:
        # Nothing to search in; argmax would fail on an empty array.
        return grid

    def find_shrink_start(arr):
        mid = len(arr) // 2
        # Clamp the lower bound: a negative slice start would wrap around
        # and produce a wrong (negative) column index on narrow inputs.
        lo = max(mid - search_radius, 0)
        window = arr[lo:mid + search_radius]
        return lo + int(np.argmax(window))

    # Cut left/right
    col_present = np.sum(1 - grid, axis=0)  # dark-pixel count per column
    min_col = find_shrink_start(col_present)
    max_col = min_col
    while min_col > 0 and col_present[min_col]:
        min_col -= 1
    while max_col < cols - 1 and col_present[max_col]:
        max_col += 1
    min_col = max(min_col - margin, 0)
    max_col = min(max_col + margin, cols - 1)
    print('Min and max cols')
    print(min_col, max_col)

    grid = grid[:, min_col:max_col + 1]

    # Cut top/bottom
    row_present = np.sum(1 - grid, axis=1)  # dark-pixel count per row
    min_row = 0
    max_row = row_present.shape[0] - 1
    # Bounded scan: a blank page must not run the index off the array.
    while max_row > 0 and not row_present[max_row]:
        max_row -= 1
    max_row = min(max_row + margin, rows - 1)
    print('Min and max rows')
    print(min_row, max_row)

    # max_row is inclusive, like max_col above, so slice up to max_row + 1;
    # otherwise content touching the bottom edge loses its last row.
    return grid[min_row:max_row + 1, :]

In [66]:
PATH_TEMPLATE = 'book1/pages/{}.png'
for i in range(17):
    print(f'i: {i}')
    filepath = PATH_TEMPLATE.format(i)
    grid = get_image(filepath)
    rows, cols = grid.shape
    # Always drop 50 px top and bottom. Even-numbered pages lose 150 px on
    # the left and 50 px on the right; odd-numbered pages the reverse.
    left_trim, right_trim = (150, 50) if i % 2 == 0 else (50, 150)
    grid = shrink_page(grid[50:rows - 50, left_trim:cols - right_trim])
    save_image(grid, f'book1/cropped/{i}.png')

i: 0
Min and max cols
360 637
Min and max rows
0 1699
saved to book1/cropped/0.png
i: 1
Min and max cols
512 686
Min and max rows
0 1708
saved to book1/cropped/1.png
i: 2
Min and max cols
359 638
Min and max rows
0 1702
saved to book1/cropped/2.png
i: 3
Min and max cols
308 891
Min and max rows
0 1704
saved to book1/cropped/3.png
i: 4
Min and max cols
360 637
Min and max rows
0 1702
saved to book1/cropped/4.png
i: 5
Min and max cols
462 738
Min and max rows
0 1754
saved to book1/cropped/5.png
i: 6
Min and max cols
309 688
Min and max rows
0 1758
saved to book1/cropped/6.png
i: 7
Min and max cols
411 790
Min and max rows
0 1756
saved to book1/cropped/7.png
i: 8
Min and max cols
0 999
Min and max rows
0 1755
saved to book1/cropped/8.png
i: 9
Min and max cols
565 636
Min and max rows
0 434
saved to book1/cropped/9.png
i: 10
Min and max cols
463 536
Min and max rows
0 440
saved to book1/cropped/10.png
i: 11
Min and max cols
514 688
Min and max rows
0 440
saved to book1/cropped/11.png
i: 12

  img = Image.fromarray(grid * 255, mode='L')


# attempt 2

In [None]:
# NOTE(review): `filepath` is not assigned in this cell; it is whatever an
# earlier cell left behind. Confirm which image this is meant to analyse.
img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
# cv2.imread signals failure by returning None rather than raising, so fail
# here with a clear message instead of deep inside the next OpenCV call.
if img is None:
    raise FileNotFoundError(f'cv2.imread could not read {filepath!r}')

# Optional blur to reduce noise
blur = cv2.GaussianBlur(img, (5, 5), 0)

# Binary inverse threshold — black border becomes white
_, thresh = cv2.threshold(blur, 50, 255, cv2.THRESH_BINARY_INV)

# Find contours
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Sort contours by area (largest first)
contours = sorted(contours, key=cv2.contourArea, reverse=True)

In [None]:
# Number of contours kept in `contours` (the area-sorted result of cv2.findContours).
print(len(contours))

In [None]:
page_corners = []

# Only the two largest contours are examined (one per page).
for cnt in contours[:2]:
    # Simplify the contour to a polygon, tolerance = 2% of its perimeter.
    perimeter = cv2.arcLength(cnt, True)
    approx = cv2.approxPolyDP(cnt, 0.02 * perimeter, True)
    print(approx)
    print(len(approx))

    # Keep the contour only when it simplifies to exactly four points.
    if len(approx) != 4:
        continue
    page_corners.append(approx.reshape(4, 2))

# Visualize results: mark every detected corner with a filled red dot
# on a BGR copy of the grayscale image.
vis = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
for quad in page_corners:
    for x, y in quad:
        cv2.circle(vis, (int(x), int(y)), 10, (0, 0, 255), -1)

cv2.imwrite("detected_corners.png", vis)

# Sort each corner set to TL, TR, BR, BL order
def order_points(pts):
    """Reorder corner points to (top-left, top-right, bottom-right, bottom-left).

    Args:
        pts: 2-D array with one ``(x, y)`` point per row.

    Returns:
        A float32 array whose rows are, in order: the point with the
        smallest ``x + y``, the smallest ``y - x``, the largest ``x + y``
        and the largest ``y - x``.
    """
    sums = pts.sum(axis=1)         # x + y for every point
    diffs = np.diff(pts, axis=1)   # y - x for every point
    order = (sums.argmin(), diffs.argmin(), sums.argmax(), diffs.argmax())
    return np.array([pts[k] for k in order], dtype=np.float32)

# Put every detected quadrilateral into TL, TR, BR, BL order.
ordered_pages = list(map(order_points, page_corners))

# Report the ordered corners, numbering pages from 1.
for i, corners in enumerate(ordered_pages):
    print(f"Page {i+1} corners (TL, TR, BR, BL):\n", corners)

In [None]:
# Dump the list of ordered corner arrays (one float32 array per page, rows in TL, TR, BR, BL order).
print(ordered_pages)

# CSV Parser

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame. keep_default_na=False keeps empty cells
# as empty strings instead of NaN, which the `children` parsing relies on.
df = pd.read_csv('data/zeng_google_sheet.csv', keep_default_na=False)
df['id'] = df['id'].astype(int)
# One independent empty list per row. `[[]] * len(df)` would put the *same*
# list object in every row, so appending to one row's list would change all.
df['name_images'] = [[] for _ in range(len(df))]
df['generation'] = df['generation'].astype(int)
# NOTE(review): astype(int) raises on empty strings — confirm that every row
# of the sheet has a `parent` value.
df['parent'] = df['parent'].astype(int)
# "3,4,5" -> [3, 4, 5]; an empty cell -> []
df['children'] = df['children'].apply(lambda x: [int(child) for child in x.split(',')] if x else [])

# Derive `parent` from the children lists: each listed child gets the listing
# row's id as its parent. Only `id` and `children` are read, and neither is
# modified by the loop, so the two columns can be iterated directly.
for node_id, children in zip(df['id'], df['children']):
    for child_id in children:
        df.loc[df['id'] == child_id, 'parent'] = node_id
df

In [None]:
# Export DataFrame to JSONL format: one JSON object per row, one row per line
# (orient='records', lines=True). force_ascii=False is passed so non-ASCII
# text is not escaped in the output file.
df.to_json('data/book1.jsonl', orient='records', lines=True, force_ascii=False)

In [None]:
import json

# The file is written with force_ascii=False, so it may contain raw non-ASCII
# text. Decode it explicitly as UTF-8 rather than with the platform's default
# encoding, and skip blank lines, which json.loads cannot parse.
with open('data/book1.jsonl', 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(records[0])
