In [17]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2 as cv2
from PIL import Image
import os
import re
import json
from tqdm import tqdm

In [18]:
# Input/output locations for the scanned org charts.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable base directory.
path = "/Users/satoshan/Library/CloudStorage/Box-Box/scan_org_charts/cropped/"
path_json = "/Users/satoshan/Library/CloudStorage/Box-Box/scan_org_charts/learning/output"
path_clean = "/Users/satoshan/Library/CloudStorage/Box-Box/scan_org_charts/clean/"


def _list_files(directory):
    """Return the names of the visible regular files in ``directory``, sorted.

    ``os.listdir`` yields entries in arbitrary, filesystem-dependent order.
    The processing loop pairs ``files[i]`` with ``files_json[i]`` by position,
    so both listings must be put into a deterministic order. Hidden entries
    (for example macOS ``.DS_Store``) are skipped because they are neither
    images nor prediction files and would shift the pairing.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(directory, name))
    )


# NOTE(review): positional pairing is only right if image and JSON names sort
# in the same relative order (presumably they share a common stem) - confirm.
files = _list_files(path)
files_json = _list_files(path_json)

In [23]:
# Build one proximity graph per org-chart image and record summary metrics.
#
# For every image, the detected boxes from the matching JSON file become graph
# nodes placed at their (normalized) centers; two nodes are linked when their
# centers are closer than ``distance_threshold``.

# Maximum center-to-center distance (in the 0-10 normalized coordinate space
# used below) for two boxes to be connected by an edge.
distance_threshold = 4.5

results = []

# NOTE(review): images and prediction files are paired purely by list
# position; this is only correct when both lists are in the same order.
for file_idx in tqdm(range(len(files)), desc="Processing files"):
    file = os.path.join(path, files[file_idx])
    file_json = os.path.join(path_json, files_json[file_idx])

    # Only the dimensions are needed; the context manager releases the file
    # handle instead of leaving one open per processed image.
    with Image.open(file) as image:
        image_width, image_height = image.size

    with open(file_json, "r") as f:
        data = json.load(f)

    # Coordinates of the center of each detected department box.
    # Assumes each box is [x_min, y_min, x_max, y_max] in pixels - TODO confirm.
    # NOTE(review): if the detector pads "pred_boxes" with all-zero boxes, they
    # all land on (0, 0) and are counted as departments - confirm whether they
    # should be filtered out first.
    centers = []
    for box_id, box in enumerate(data["pred_boxes"]):
        x_center = int((box[0] + box[2]) / 2)
        y_center = int((box[1] + box[3]) / 2)

        # Rescale to a 0-10 range on each axis so that the distance threshold
        # does not depend on the image resolution.
        x_normalized = (x_center / image_width) * 10
        y_normalized = (y_center / image_height) * 10
        centers.append({"id": box_id, "center": (x_normalized, y_normalized)})

    # Make the graph: one node per box center.
    G = nx.Graph()
    for center in centers:
        G.add_node(center["id"], pos=center["center"])

    # Link every pair of boxes whose centers are closer than the threshold.
    points = [np.array(center["center"]) for center in centers]
    for a in range(len(centers)):
        for b in range(a + 1, len(centers)):
            dist = np.linalg.norm(points[a] - points[b])
            if dist < distance_threshold:
                G.add_edge(centers[a]["id"], centers[b]["id"], weight=dist)

    # average_shortest_path_length raises for an empty or disconnected graph,
    # which would abort the whole batch; record NaN for those files instead.
    # No ``weight=`` is passed, so the metric counts hops, not the stored
    # edge distances.
    if G.number_of_nodes() > 0 and nx.is_connected(G):
        shortest_length_path = nx.average_shortest_path_length(G)
    else:
        shortest_length_path = float("nan")

    num_depart = len(centers)

    # File names look like "cropped_<year>_<code>..."; match on the file name
    # only so that directory names can never be picked up by mistake.
    match = re.search(r"cropped_(\d{4})_(\d{4})", files[file_idx])
    if match:
        year = match.group(1)  # Extracts the year (e.g., "2002")
        code = match.group(2)  # Extracts the code (e.g., "7003")
    else:
        year = None
        code = None

    results.append({"code": code, "year": year , "shortest_path_length": shortest_length_path, "num_depart": num_depart})


Processing files: 100%|██████████| 101/101 [00:00<00:00, 104.41it/s]


In [24]:
# Spot-check one prediction file: count the predicted boxes that have at
# least one non-zero coordinate.
i = 9
file_json = os.path.join(path_json, files_json[i])
with open(file_json, "r") as f:
    data = json.load(f)

non_zero_boxes = [
    candidate
    for candidate in data["pred_boxes"]
    if not all(value == 0 for value in candidate)
]

# Bare last expression so the notebook displays the count.
len(non_zero_boxes)

76

In [25]:
# Collect the per-file metrics into a table and save it next to the other
# cleaned outputs (the default integer index is written as the first column).
df = pd.DataFrame(data=results)
df.to_csv(
    os.path.join(path_clean, "org_data.csv"),
)