Main notebook :3

# Get pages

In [None]:
import numpy as np

from utils import get_image, save_image, show_image, remove_small_islands, get_corners, normalize_page

In [None]:
# Split every two-page scan of book 1 into a right-page and a left-page image.
PATH_TEMPLATE = "book1/original/{}.png"
# Starting at -1 sends the very first saved page to "-1.png", so the pages
# that are kept are numbered from 0.
idx = -1 # skip first page
for i in range(3, 12):
    print(f"i: {i}")
    filepath = PATH_TEMPLATE.format(f"{i}")
    a = get_image(filepath)
    a = remove_small_islands(a)
    rows, cols = a.shape
    # Both border searches walk along the middle row of the scan. A dedicated
    # name keeps the scan number `i` from being overwritten by a row index.
    mid_row = rows // 2

    # Right page: start 20 px inside the right edge and walk left until four
    # consecutive pixels are all falsy (presumably the dark page border --
    # confirm against get_image's pixel convention).
    j = cols - 20
    while j >= 3 and (
        a[mid_row][j] or a[mid_row][j - 1] or a[mid_row][j - 2] or a[mid_row][j - 3]
    ):
        j -= 1
    if j < 3:
        # Without this guard the negative indices would wrap around to the
        # right-hand side of the image and could report a bogus border.
        raise ValueError(f"No right page border found in {filepath}")
    right_page_corners = get_corners(a, mid_row, j)
    print("right_page_corners", right_page_corners)
    right_page = normalize_page(a, right_page_corners)
    save_image(right_page, f"book1/pages/{idx}.png")
    idx += 1

    # Left page: same search, walking right from the left edge.
    j = 0
    while j + 3 < cols and (
        a[mid_row][j] or a[mid_row][j + 1] or a[mid_row][j + 2] or a[mid_row][j + 3]
    ):
        j += 1
    if j + 3 >= cols:
        raise ValueError(f"No left page border found in {filepath}")
    left_page_corners = get_corners(a, mid_row, j)
    print("left_page_corners", left_page_corners)
    left_page = normalize_page(a, left_page_corners)
    save_image(left_page, f"book1/pages/{idx}.png")
    idx += 1


In [None]:
# Split every two-page scan of book 2 into a right-page and a left-page image.
PATH_TEMPLATE = "book2/original/{}.png"
# Starting at -1 sends the very first saved page to "-1.png", so the pages
# that are kept are numbered from 0.
idx = -1
for i in range(68):
    print(f"i: {i}")
    filepath = PATH_TEMPLATE.format(f"{i}")
    a = get_image(filepath)
    a = remove_small_islands(a)
    rows, cols = a.shape
    # Both border searches walk along the middle row of the scan. A dedicated
    # name keeps the scan number `i` from being overwritten by a row index.
    mid_row = rows // 2

    # Right page: start 20 px inside the right edge and walk left until four
    # consecutive pixels are all falsy (presumably the dark page border --
    # confirm against get_image's pixel convention).
    j = cols - 20
    while j >= 3 and (
        a[mid_row][j] or a[mid_row][j - 1] or a[mid_row][j - 2] or a[mid_row][j - 3]
    ):
        j -= 1
    if j < 3:
        # Without this guard the negative indices would wrap around to the
        # right-hand side of the image and could report a bogus border.
        raise ValueError(f"No right page border found in {filepath}")
    right_page_corners = get_corners(a, mid_row, j)
    print("right_page_corners", right_page_corners)
    right_page = normalize_page(a, right_page_corners)
    save_image(right_page, f"book2/pages/{idx}.png")
    idx += 1

    # Left page: same search, walking right from the left edge.
    j = 0
    while j + 3 < cols and (
        a[mid_row][j] or a[mid_row][j + 1] or a[mid_row][j + 2] or a[mid_row][j + 3]
    ):
        j += 1
    if j + 3 >= cols:
        raise ValueError(f"No left page border found in {filepath}")
    left_page_corners = get_corners(a, mid_row, j)
    print("left_page_corners", left_page_corners)
    left_page = normalize_page(a, left_page_corners)
    save_image(left_page, f"book2/pages/{idx}.png")
    idx += 1


# Crop

In [None]:
from utils import trim_borders, shrink_page

# Trim and shrink each of the 17 extracted book-1 pages, one output image per page.
# NOTE(review): the results go to book1/graphs/<i>.png, the directory the graph
# parser later lists while expecting names of the form "<start>_<end>.png" --
# confirm this per-page export is still wanted.
PATH_TEMPLATE = "book1/pages/{}.png"
for i in range(17):
    print(f"i: {i}")
    filepath = PATH_TEMPLATE.format(i)
    a = shrink_page(trim_borders(get_image(filepath)))
    save_image(a, f"book1/graphs/{i}.png")

In [None]:
from utils import trim_borders, shrink_page, is_tree_start_page, merge_graphs

In [None]:
# Load every book-1 page, remember which pages start a new tree, then stitch
# each run of pages belonging to one tree into a single graph image.
PATH_TEMPLATE = "book1/pages/{}.png"
NUM_PAGES = 17  # number of page images to load (0 .. NUM_PAGES - 1)
pages = []
is_page_tree_start = []
for i in range(NUM_PAGES):
    filepath = PATH_TEMPLATE.format(i)
    a = get_image(filepath)
    a = trim_borders(a)
    # The tree-start test runs on the trimmed page, before it is shrunk.
    is_page_tree_start.append(is_tree_start_page(a))
    a = shrink_page(a)
    pages.append(a)

print("Merging and saving graphs")
i = 0
while i < len(pages):
    # Page `start_i` opens a group; every following page that is not itself a
    # tree start is merged into it.
    start_i = i
    graph = pages[i]
    i += 1
    while i < len(pages) and not is_page_tree_start[i]:
        print(f"---------Merge starting at i: {i}")
        # The newly read page is passed as the first argument, the graph
        # accumulated so far as the second.
        graph = merge_graphs(pages[i], graph)
        i += 1
    # The file name records the inclusive page range covered by the graph.
    save_image(graph, f"book1/graphs/{start_i}_{i-1}.png")


# Graph parsing heuristic

In [None]:
import os
import dataclasses
import json
from utils import find_lines, find_line_ends, Node, sort_nodes, infer_ends, verify_nodes, get_name_image, print_trees
from data.node import Node as DataNode

In [None]:
# Parse every merged graph image of book 1 into nodes, write one JSON record
# per node to data/book1.jsonl, save each node's name crop, and dump the
# per-graph adjacency to book1/trees/<graph>.json.

# Only .png files are graphs; the number before the first "_" gives the order.
graph_files = sorted(
    (name for name in os.listdir("book1/graphs") if name.endswith(".png")),
    key=lambda x: int(x.split('_')[0]),
)
print(graph_files)
node_idx = 1  # next globally unique node id, shared across all graph files

# `with` closes the JSONL file even when a graph fails to parse.
with open("data/book1.jsonl", "w") as data_file:
    for filepath in graph_files:
        filename = os.path.splitext(filepath)[0]
        a = get_image(os.path.join("book1/graphs", filepath))

        # Every detected line group must have exactly one parent end.
        raw_results = find_lines(a)
        results = []
        for raw_result in raw_results:
            parents, children = find_line_ends(raw_result)
            if len(parents) != 1:
                # Fires for zero parents as well as for several.
                raise ValueError(
                    f"Expected exactly one parent, found {len(parents)}", parents
                )
            parent = parents[0]
            results.append((parent, children))
            print(parent, children)

        # A parent end yields a node that only has `bot` set; each child end
        # yields a node that only has `top` set.
        nodes = []
        for parent, children in results:
            pn = Node(bot=parent)
            nodes.append(pn)
            for c in children:
                cn = Node(top=c)
                pn.children.append(cn)
                nodes.append(cn)
        print("Num nodes pre merge:", len(nodes))

        # Merge nodes: a node with only `top` and a node with only `bot` that
        # lie close together are the two halves of one node. After each merge
        # the search restarts because the list was modified.
        while True:
            made_progress = False
            for i in range(len(nodes)):
                for j in range(len(nodes)):
                    if i == j:
                        continue
                    p = nodes[i]
                    c = nodes[j]
                    # Only try the connection if p is missing bot and c is missing top
                    if p.bot or c.top:
                        continue
                    tx, ty = p.top
                    bx, by = c.bot
                    # Distance thresholds are in pixels of the graph image.
                    if 0 < bx - tx < 200 and abs(by - ty) < 20:
                        # Same node, merge: p absorbs c's bottom end and children.
                        p.bot = c.bot
                        p.children = c.children
                        del nodes[j]
                        made_progress = True
                        break
                if made_progress:
                    break
            if not made_progress:
                break

        print("Num nodes post merge:", len(nodes))
        infer_ends(nodes, a)
        verify_nodes(nodes)

        # Give nodes globally unique ids
        nodes = sort_nodes(nodes)
        for n in nodes:
            n.id = node_idx
            node_idx += 1

        # Child id -> parent id, used to fill in each record's parent field.
        c2p = {}
        for n in nodes:
            for c in n.children:
                c2p[c.id] = n.id

        # Position of each node in the sorted list, keyed by its (top, bot) pair.
        node_to_idx = {(n.top, n.bot): i for i, n in enumerate(nodes)}
        tree = {}
        for idx, n in enumerate(nodes):
            tree[node_to_idx[(n.top, n.bot)]] = [
                node_to_idx[(c.top, c.bot)] for c in n.children
            ]
            name_img = get_name_image(n, a)
            name_path = f"book1/names/{n.id}.png"
            data_node = DataNode(
                id=n.id,
                name_images=[name_path],
                generation=-1,
                parent=c2p.get(n.id, -1),  # -1 marks a node without a parent
                children=[c.id for c in n.children],
                notes=f"{filename}_{idx}"
            )
            data_file.write(json.dumps(dataclasses.asdict(data_node)) + "\n")
            save_image(name_img, name_path)

        # Close the tree file explicitly instead of leaking the handle.
        with open(f"book1/trees/{filename}.json", "w") as tree_file:
            json.dump(tree, tree_file)

In [None]:
# TODO: Merge adjacent pages if their lines go off the edge (and if they're missing the tag). Warn on orphans

In [None]:
# 13_37.png looks suspicious -- needs a manual check

# Graph parsing opencv

In [None]:
import cv2
import math

# Run the probabilistic Hough transform on `1 - img`
# (presumably so that dark strokes become the non-zero foreground -- confirm
# the value range returned by get_image).
img = get_image("book1/cropped/0.png")
hough_params = dict(
    rho=1,
    theta=math.pi/180,
    threshold=10,
    minLineLength=60,
    maxLineGap=10,
)
lines = cv2.HoughLinesP(1-img, **hough_params)

In [None]:
# Flatten the HoughLinesP result into a list of 4-element lists.
# reshape(-1, 4) keeps a single detected segment as one row (squeeze() would
# collapse it to a flat 1-D list), and a None result (no segments found)
# becomes an empty list.
lines = [] if lines is None else lines.reshape(-1, 4).tolist()

# Store each segment as [y1, x1, y2, x2] (x and y swapped relative to the
# detector output), with the endpoint that has the smaller coordinate sum first.
for line in lines:
    x1, y1, x2, y2 = line
    start, end = (y1, x1), (y2, x2)
    if sum(start) > sum(end):
        start, end = end, start
    line[:] = [*start, *end]

# Order the segments by the first stored coordinate of their first endpoint.
lines = sorted(lines, key=lambda x: x[0])

def deduplicate_lines(lines, threshold=10):
  """
  Drop near-duplicate rows from a 2D list.

  Rows are visited in order; a row is kept only if no previously kept row
  matches it, where two rows match when every pair of corresponding values
  differs by at most `threshold`. Returns a new list holding the kept rows
  (the row objects themselves are not copied).
  """
  if not lines:
    return []

  def _matches(candidate, kept_row):
    # Compare position by position over the candidate's length.
    return all(
      abs(candidate[pos] - kept_row[pos]) <= threshold
      for pos in range(len(candidate))
    )

  kept = []
  for candidate in lines:
    if any(_matches(candidate, kept_row) for kept_row in kept):
      continue
    kept.append(candidate)
  return kept

# Collapse near-identical segments, then print each one with the two values of
# every endpoint swapped back.
lines = deduplicate_lines(lines)
for first, second, third, fourth in lines:
  print([second, first, fourth, third])

In [None]:
# `lines` is a plain list once the normalisation cell has run, so it has no
# `.shape` attribute; np.shape reports the dimensions of a list or an array.
print(np.shape(lines))

In [None]:
# Draw every segment onto `img` in place.
# When the notebook is run top to bottom, `lines` is a list of
# [y1, x1, y2, x2] rows at this point (not the raw detector array), so it is
# iterated directly and the values are put back into (x, y) order for cv2.line.
for y1, x1, y2, x2 in lines:
    cv2.line(img, (x1, y1), (x2, y2), (0,0,255), 1)

In [None]:
# Show `1 - img`, the same expression that was passed to HoughLinesP; `img`
# includes whatever earlier cells have drawn onto it.
show_image(1-img)

In [None]:
# NOTE(review): `stats`, `centroids`, `num_labels` and `labels` are not
# assigned in any visible cell, so this fails on a fresh kernel. They look like
# the outputs of cv2.connectedComponentsWithStats -- confirm and restore the
# cell that produced them.
print(stats)
print(centroids)
print(num_labels)
print(labels)

In [None]:
# Cut every region whose area exceeds the threshold out of `img` and save it
# as names/name_<i>.png.
# NOTE(review): `stats` is not assigned in any visible cell -- presumably rows
# of (x, y, w, h, area) from cv2.connectedComponentsWithStats; confirm.
MIN_AREA = 50  # adjust threshold
for i, (x, y, w, h, area) in enumerate(stats):
    if area <= MIN_AREA:
        continue
    name_img = img[y:y+h, x:x+w]
    save_image(name_img, f"names/name_{i}.png")

# attempt 2

In [None]:
# NOTE(review): `filepath` is not set in this cell; it is whatever value an
# earlier cell's loop left behind -- confirm which image is meant here.
img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
if img is None:
    # cv2.imread signals an unreadable or missing file by returning None
    # rather than raising, which would otherwise surface as an obscure error
    # in GaussianBlur below.
    raise FileNotFoundError(f"cv2.imread could not read image: {filepath!r}")

# Optional blur to reduce noise
blur = cv2.GaussianBlur(img, (5, 5), 0)

# Binary inverse threshold — black border becomes white
_, thresh = cv2.threshold(blur, 50, 255, cv2.THRESH_BINARY_INV)

# Find contours (outermost contours only, with compressed point lists)
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Sort contours by area (largest first)
contours = sorted(contours, key=cv2.contourArea, reverse=True)

In [None]:
# Number of contours found in the thresholded image.
print(len(contours))

In [None]:
page_corners = []

# Only the first two contours are considered (two pages).
for cnt in contours[:2]:
    # Simplify the closed contour to a polygon, allowing a deviation of 2% of
    # its perimeter.
    approx = cv2.approxPolyDP(cnt, 0.02 * cv2.arcLength(cnt, True), True)
    print(approx)
    print(len(approx))

    # Keep only contours that simplify to a quadrilateral.
    if len(approx) != 4:
        continue
    page_corners.append(approx.reshape(4, 2))

# Visualize results: a filled dot on every detected corner, on a colour copy
# of the grayscale image.
vis = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
for quad in page_corners:
    for px, py in quad:
        cv2.circle(vis, (int(px), int(py)), 10, (0, 0, 255), -1)

cv2.imwrite("detected_corners.png", vis)


def order_points(pts):
    """Return the four corner points ordered TL, TR, BR, BL as float32.

    `pts` is an array of (x, y) rows. The corner with the smallest x + y is
    taken as top-left and the one with the largest x + y as bottom-right; the
    smallest y - x gives top-right and the largest y - x gives bottom-left.
    """
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1)  # y - x for every point
    picks = (
        np.argmin(coord_sum),   # TL
        np.argmin(coord_diff),  # TR
        np.argmax(coord_sum),   # BR
        np.argmax(coord_diff),  # BL
    )
    return np.array([pts[k] for k in picks], dtype=np.float32)


# Put every detected quadrilateral into TL, TR, BR, BL order and report it.
ordered_pages = list(map(order_points, page_corners))

for i, corners in enumerate(ordered_pages):
    print(f"Page {i + 1} corners (TL, TR, BR, BL):\n", corners)

In [None]:
# Dump the ordered corner arrays of all detected pages.
print(ordered_pages)

# CSV Parser

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame; keep_default_na=False keeps empty cells
# as empty strings instead of NaN.
df = pd.read_csv("data/zeng_google_sheet.csv", keep_default_na=False)
df["id"] = df["id"].astype(int)
# One fresh list per row: `[[]] * len(df)` would put the *same* list object in
# every row, so mutating one row's list would change all of them.
df["name_images"] = [[] for _ in range(len(df))]
df["generation"] = df["generation"].astype(int)
df["parent"] = df["parent"].astype(int)
# "1,2,3" -> [1, 2, 3]; an empty cell -> [].
df["children"] = df["children"].apply(
    lambda x: [int(child) for child in x.split(",")] if x else []
)

# Derive each node's parent from the children lists. When a child is listed
# under several nodes, the last such row wins.
child_to_parent = {
    child_id: node_id
    for node_id, children in zip(df["id"], df["children"])
    for child_id in children
}
# Rows that nobody lists as a child keep the parent value read from the CSV.
df["parent"] = df["id"].map(child_to_parent).fillna(df["parent"]).astype(int)
df

In [None]:
# Export DataFrame to JSONL format (one JSON record per line).
# force_ascii=False leaves non-ASCII text unescaped in the output.
# NOTE(review): this writes data/book1.jsonl, the same path the graph-parsing
# cell opens for writing, so whichever runs last wins -- confirm the overwrite
# is intended.
df.to_json("data/book1.jsonl", orient="records", lines=True, force_ascii=False)

In [None]:
import json

# Read the JSONL file back, one record per line.
# The export above uses force_ascii=False, so the file can contain raw
# non-ASCII characters: decode it explicitly as UTF-8 instead of relying on
# the platform default encoding. Blank lines are skipped.
with open("data/book1.jsonl", "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(records[0])
