<a href="https://colab.research.google.com/github/wakachii/SI-Org-chart/blob/main/pipeline/master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Please run this notebook on Google Colab with a GPU runtime
<a href="https://colab.research.google.com/github/wakachii/SI-Org-chart/blob/main/pipeline/master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Detectron2 has not released pre-built binaries for the latest pytorch (https://github.com/facebookresearch/detectron2/issues/4053)
# so we install from source instead. This takes a few minutes.
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' -q

!pip install pyocr -q
!pip install layoutparser -q

# Install pre-built detectron2 that matches pytorch version, if released:
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
#!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/{CUDA_VERSION}/{TORCH_VERSION}/index.html

# exit(0)  # After installation, you may need to "restart runtime" in Colab. This line can also restart runtime

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /private/var/folders/mz/k5cvq67n1cdd51htpfqt0wbh0000gn/T/pip-req-build-dfz7zzpi
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /private/var/folders/mz/k5cvq67n1cdd51htpfqt0wbh0000gn/T/pip-req-build-dfz7zzpi
^C


In [None]:
import os
import re
import sys
import json
import pyocr
import cv2 as cv2
from tqdm import tqdm
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import layoutparser as lp
from google.colab import drive
drive.mount('/content/drive')

# import some common detectron2 utilities
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor, DefaultTrainer
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.utils.visualizer import Visualizer, ColorMode

In [None]:
# Root folder on the mounted Google Drive; it ends with "/" so the names below
# can be appended directly.
path = "/content/drive/MyDrive/scan_org_charts/"

# --- raw scans, one folder per year, plus the folder for cleaned outputs ---
path_2010 = f"{path}2010"
path_2006 = f"{path}2006"
path_2002 = f"{path}2002"
path_clean = f"{path}clean"

# --- intermediate images (trailing "/" kept: later cells concatenate file names) ---
path_renamed = f"{path}renamed/"
path_cropped = f"{path}cropped/"

# --- model training / inference ---
path_learn = f"{path}learning/"
path_train = f"{path_learn}data/train"
path_coco = f"{path_learn}Org_chart-1.json"
path_data = f"{path}cropped"
path_json = f"{path_learn}output"

## Load images and rename them with OCR

In [None]:
# 2002
# Read every scan in the 2002 folder, OCR the header strip, and save a copy of
# the full image as "2002_<4-digit number>.png" in the renamed folder.
files_2002 = sorted(
    f for f in os.listdir(path_2002) if os.path.isfile(os.path.join(path_2002, f))
)

# The OCR backend does not change between images, so look it up once.
tools = pyocr.get_available_tools()
if not tools:
    raise RuntimeError("pyocr found no OCR tool; install Tesseract before running this cell")
tool = tools[0]

# cv2.imwrite returns False instead of raising when the target folder is missing.
os.makedirs(path_renamed, exist_ok=True)

for name_2002 in tqdm(files_2002, desc="Processing files"):
    # load image
    file_2002 = os.path.join(path_2002, name_2002)
    img = cv2.imread(file_2002)
    if img is None:
        # cv2.imread returns None for files it cannot decode
        continue

    # preprocessing for OCR: top 350 rows, dropping the right-most 1500 columns
    header = img[0:350, 0:-1500]
    if header.size == 0:
        # image too small for the fixed header window
        continue
    # Grayscale conversion must come first: the blur and histogram equalisation
    # below operate on the grayscale header.
    gray_image = cv2.cvtColor(header, cv2.COLOR_BGR2GRAY)
    gray_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
    gray_image = cv2.equalizeHist(gray_image)
    binary_image = cv2.adaptiveThreshold(
        gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    gray_image_inv = cv2.bitwise_not(binary_image)
    pil_image = Image.fromarray(gray_image_inv)

    # OCR
    txt = tool.image_to_string(
        pil_image,
        lang="eng",
        builder=pyocr.builders.TextBuilder(tesseract_layout=6)
    )
    filtered_text = re.findall(r'\d+', txt)
    # NOTE(review): this requires at least two digit groups, whereas the 2010
    # loop accepts one -- confirm that the difference is intentional.
    if len(filtered_text) > 1 and len(filtered_text[0]) == 4:
        path_save_2002 = path_renamed + "2002_" + filtered_text[0] + ".png"
        cv2.imwrite(f"{path_save_2002}", img)


# 2010
# Read every scan in the 2010 folder, OCR the header strip, and save a copy of
# the full image as "2010_<4-digit number>.png" in the renamed folder.
files_2010 = sorted(
    f for f in os.listdir(path_2010) if os.path.isfile(os.path.join(path_2010, f))
)

# The OCR backend does not change between images, so look it up once.
tools = pyocr.get_available_tools()
if not tools:
    raise RuntimeError("pyocr found no OCR tool; install Tesseract before running this cell")
tool = tools[0]

# cv2.imwrite returns False instead of raising when the target folder is missing.
os.makedirs(path_renamed, exist_ok=True)

for name_2010 in tqdm(files_2010, desc="Processing files"):
    # load image
    file_2010 = os.path.join(path_2010, name_2010)
    img = cv2.imread(file_2010)
    if img is None:
        # cv2.imread returns None for files it cannot decode
        continue

    # preprocessing for OCR: rows 50-250, trimming 10 columns on each side
    header = img[50:250, 10:-10]
    if header.size == 0:
        # image too small for the fixed header window
        continue
    gray_image = cv2.cvtColor(header, cv2.COLOR_BGR2GRAY)
    # fixed global threshold: only pixels brighter than 245 become white
    _, binary_image = cv2.threshold(gray_image, 245, 255, cv2.THRESH_BINARY)
    gray_image_inv = cv2.bitwise_not(binary_image)
    pil_image = Image.fromarray(gray_image_inv)

    # OCR
    txt = tool.image_to_string(
        pil_image,
        lang="jpn",
        builder=pyocr.builders.TextBuilder(tesseract_layout=6)
    )
    filtered_text = re.findall(r'\d+', txt)
    # keep the scan only when the first digit group is a 4-digit number
    if len(filtered_text) > 0 and len(filtered_text[0]) == 4:
        path_save_2010 = path_renamed + "2010_" + filtered_text[0] + ".png"
        cv2.imwrite(f"{path_save_2010}", img)

## Crop org charts

In [None]:
# Detect the figure region of every renamed scan with a PubLayNet layout model
# and save the cropped chart as "cropped_<original file name>".
files_renamed = sorted(
    f for f in os.listdir(path_renamed) if os.path.isfile(os.path.join(path_renamed, f))
)
model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
                                 label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})

# cv2.imwrite returns False instead of raising when the target folder is missing.
os.makedirs(path_cropped, exist_ok=True)

# Every file is processed (the first one is no longer skipped); anything
# OpenCV cannot decode is skipped by the None check below.
for file_name in tqdm(files_renamed, desc="Processing files"):
    # load image from the renamed folder (the files listed above live there)
    file = os.path.join(path_renamed, file_name)
    img = cv2.imread(file)
    if img is None:
        continue

    # detection
    layout = model.detect(img)
    figures = [block for block in layout if block.type == "Figure"]

    # crop
    # The output name is built from the bare file name, not the full input path.
    path_save = path_cropped + "cropped_" + file_name
    for figure in figures:
        x_1, y_1, x_2, y_2 = map(int, figure.coordinates)
        # NOTE(review): the crop starts at the fixed offset (row 480, column 50)
        # rather than at the detected (x_1, y_1); only the lower-right corner
        # comes from the detection -- confirm this is intended.
        cropped_img = img[480:y_2, 50:x_2]
        if cropped_img.size == 0:
            # detected box ends before the fixed offset; nothing to save
            continue
        # All figures of one page share the same output name, so the last
        # non-empty crop is the one that remains on disk.
        cv2.imwrite(f"{path_save}", cropped_img)

## Detect departments

In [None]:
# Register the annotated training set (COCO-format JSON plus its image folder).
register_coco_instances("org_chart_train", {}, path_coco, path_train)

# Base config and pretrained checkpoint both come from the same model-zoo entry.
mask_rcnn_yaml = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()  # start from detectron2's defaults
cfg.merge_from_file(model_zoo.get_config_file(mask_rcnn_yaml))

# model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(mask_rcnn_yaml)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

# data: train on the registered set, no evaluation set during training
cfg.DATASETS.TRAIN = ("org_chart_train",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 2

# optimisation
cfg.SOLVER.IMS_PER_BATCH = 1
cfg.SOLVER.BASE_LR = 0.0004
cfg.SOLVER.MAX_ITER = 500

# train (the output folder must exist before the trainer is built)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

In [None]:
# Build the metadata dicts for the images to run inference on.
def get_test_dicts(img_dir):
    """Return one detectron2-style record per readable image in ``img_dir``.

    Only files whose names end in ``.jpg`` or ``.png`` are considered. They are
    visited in sorted order so that ``image_id`` is the same on every run.

    Args:
        img_dir: Directory containing the images.

    Returns:
        A list of dicts with the keys ``file_name`` (path inside ``img_dir``),
        ``image_id`` (consecutive integers starting at 0), ``height`` and
        ``width`` (taken from the decoded image). Files OpenCV cannot decode
        are reported and left out.
    """
    img_names = sorted(f for f in os.listdir(img_dir) if f.endswith((".jpg", ".png")))
    dataset_dicts = []
    for img_name in img_names:
        img_file = os.path.join(img_dir, img_name)
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread returns None instead of raising on a bad file
            print(f"Skipping unreadable image: {img_file}")
            continue
        height, width = img.shape[:2]
        dataset_dicts.append({
            "file_name": img_file,
            # numbered by position in the output so ids stay consecutive
            "image_id": len(dataset_dicts),
            "height": height,
            "width": width,
        })
    return dataset_dicts

# Register the images to analyse in the form detectron2 expects.
# Guarded so that re-running this cell does not fail on an already-registered name.
if "org_chart_data" not in DatasetCatalog.list():
    DatasetCatalog.register("org_chart_data", lambda: get_test_dicts(path_data))
MetadataCatalog.get("org_chart_data").set(thing_classes=["department"])

cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # load trained weights
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6  # minimum score for a detection to be kept
cfg.DATASETS.TEST = ("org_chart_data", )  # set the test data to the model

# detect departments
predictor = DefaultPredictor(cfg)
metadata = MetadataCatalog.get("org_chart_data")
dataset_dicts = DatasetCatalog.get("org_chart_data")

# open() does not create missing folders
os.makedirs(path_json, exist_ok=True)

for d in tqdm(dataset_dicts):
    img = cv2.imread(d["file_name"])
    outputs = predictor(img)
    instances = outputs["instances"]
    json_output = {
        "file_name": d["file_name"],
        "pred_boxes": instances.pred_boxes.tensor.cpu().numpy().tolist(),
        "scores": instances.scores.cpu().numpy().tolist(),
        "pred_classes": instances.pred_classes.cpu().numpy().tolist(),
    }
    # save JSON next to the other outputs, named after the image
    base_name = os.path.basename(d["file_name"])
    # splitext handles both .png and .jpg inputs
    json_name = os.path.splitext(base_name)[0] + ".json"
    json_path = os.path.join(path_json, json_name)
    with open(json_path, "w") as f:
        # write the predictions themselves, not the output path
        json.dump(json_output, f)

In [None]:
# For every cropped chart, load its detections, build a proximity graph over the
# box centres, and record the department count and the average shortest path length.
files_croppedd = sorted(
    f for f in os.listdir(path_cropped) if os.path.isfile(os.path.join(path_cropped, f))
)
files_json = sorted(
    f for f in os.listdir(path_json) if os.path.isfile(os.path.join(path_json, f))
)

# Two centres are linked when they are closer than this, measured in the
# normalised coordinates computed below (each axis rescaled to 0-10).
distance_threshold = 5

results = []

for file_name in tqdm(files_croppedd, desc="Processing files"):
    # Pair image and detections by name rather than by list position: the two
    # folders are listed independently and need not line up index by index.
    json_name = os.path.splitext(file_name)[0] + ".json"
    if json_name not in files_json:
        print(f"No detection file for {file_name}; skipping")
        continue
    file = os.path.join(path_cropped, file_name)
    file_json = os.path.join(path_json, json_name)

    # only the image size is needed, so close the file right away
    with Image.open(file) as image:
        image_width, image_height = image.size
    with open(file_json, "r") as f:
        detections = json.load(f)

    # centre of every detected box, rescaled so each image axis spans 0-10
    centers = []
    for box_id, box in enumerate(detections["pred_boxes"]):
        x_center = int((box[0] + box[2]) / 2)
        y_center = int((box[1] + box[3]) / 2)
        x_normalized = (x_center / image_width) * 10
        y_normalized = (y_center / image_height) * 10
        centers.append({"id": box_id, "center": (x_normalized, y_normalized)})

    # make graph: one node per box, an edge between every pair closer than the threshold
    G = nx.Graph()
    for center in centers:
        G.add_node(center["id"], pos=center["center"])
    for a in range(len(centers)):
        for b in range(a + 1, len(centers)):
            dist = np.linalg.norm(np.array(centers[a]["center"]) - np.array(centers[b]["center"]))
            if dist < distance_threshold:
                G.add_edge(centers[a]["id"], centers[b]["id"], weight=dist)

    num_depart = len(centers)
    # average_shortest_path_length raises on an empty or disconnected graph;
    # record NaN for those charts instead of aborting the whole run.
    # No weight argument is passed, so path lengths are counted in edges.
    if num_depart > 0 and nx.is_connected(G):
        shortest_length_path = nx.average_shortest_path_length(G)
    else:
        shortest_length_path = float("nan")

    match = re.search(r"cropped_(\d{4})_(\d{4})", file_name)
    if match:
        year = match.group(1)  # Extracts the year (e.g., "2002")
        code = match.group(2)  # Extracts the code (e.g., "7003")
    else:
        year = None
        code = None

    results.append({"code": code, "year": year , "shortest_path_length": shortest_length_path, "num_depart": num_depart})

data = pd.DataFrame(results)
os.makedirs(path_clean, exist_ok=True)
data.to_csv(os.path.join(path_clean, "org_data.csv"))