In [1]:
import json
import segment_lines
import annotation_filters
import score
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

In [4]:
def main(
    model="meta-llama/Llama-3.3-70B-Instruct",
    model_dir="Llama-3.3-70B-Instruct",
    benchmark_file_list_path="/Volumes/MyDataDrive/thesis/code-2/src/labelrix/benchmark-file-list.json",
    out_path="/Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/",
    max_workers=30,
    iou_thresh=0.5,
    overlap_thresh=0.9,
):
    """Run the extract -> filter -> score pipeline for one model.

    Steps:
      1. Load the benchmark file list (a JSON object whose values carry
         ``'pages'`` and ``'file_path'`` entries).
      2. Call ``segment_lines.extract_votes`` once per listed file in a thread
         pool, passing ``<out_path>/<model_dir>-per_page_votes`` as ``out_dir``.
      3. Call ``annotation_filters.process_directory_to_json`` with that
         directory as ``in_dir`` and
         ``<out_path>/<model_dir>-per_page_votes_merged`` as ``out_dir``.
      4. Call ``score.score_all_jsons_global`` on the merged directory with
         ``<out_path>/<model_dir>/`` as ``out_dir``.

    Calling ``main()`` with no arguments uses the same model, paths, worker
    count and thresholds that were previously hard-coded.

    Args:
        model: Model identifier forwarded to ``segment_lines.extract_votes``.
        model_dir: Prefix used to name the per-model output directories.
        benchmark_file_list_path: Path to the benchmark file-list JSON.
        out_path: Root directory under which all outputs are created.
        max_workers: Maximum number of concurrent extraction threads.
        iou_thresh: Forwarded to ``process_directory_to_json``.
        overlap_thresh: Forwarded to ``process_directory_to_json``.

    Raises:
        OSError, json.JSONDecodeError: If the file list cannot be read/parsed.
        KeyError: If an entry lacks ``'pages'`` or ``'file_path'``.
        Exception: Whatever the filtering or scoring step raises. Per-file
            extraction errors are logged and counted, not raised.
    """
    with open(benchmark_file_list_path, "r", encoding="utf-8") as benchmark_file_list_file:
        data = json.load(benchmark_file_list_file)

    # os.path.join keeps the paths correct whether or not out_path ends in "/".
    votes_out_dir = os.path.join(out_path, model_dir + "-per_page_votes")
    votes_merged_dir = os.path.join(out_path, model_dir + "-per_page_votes_merged")

    os.makedirs(votes_out_dir, exist_ok=True)
    os.makedirs(votes_merged_dir, exist_ok=True)

    def _extract_with_log(file_path, pages, model_name, out_dir):
        """Run extract_votes for one file, logging start, success and failure."""
        print(f"[PICKED] {file_path}")
        try:
            segment_lines.extract_votes(file_path, pages, model_name, out_dir=out_dir)
            print(f"[DONE]   {file_path}")
        except Exception as e:
            print(f"[ERROR]  {file_path} -> {e}")
            # Re-raise so the failure is also visible on the future.
            raise

    # Parallel per-file extraction (max_workers at a time). Each future is
    # mapped back to its file path so failures can be reported by name.
    failed_paths = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_path = {
            executor.submit(
                _extract_with_log,
                entry["file_path"],
                entry["pages"],
                model,
                votes_out_dir,
            ): entry["file_path"]
            for entry in data.values()
        }

        for fut in as_completed(future_to_path):
            try:
                fut.result()
            except Exception as e:
                # Best-effort: one bad file must not abort the whole batch.
                print(f"Extraction failed: {e}")
                failed_paths.append(future_to_path[fut])

    if failed_paths:
        # Surface the gap before filtering/scoring run on a partial vote set.
        print(
            f"[WARN]   {len(failed_paths)} of {len(future_to_path)} files failed "
            f"extraction; continuing with the votes that were produced:"
        )
        for failed_path in failed_paths:
            print(f"[WARN]     {failed_path}")

    # Here we apply filtering
    annotation_filters.process_directory_to_json(
        in_dir=votes_out_dir,
        out_dir=votes_merged_dir,
        iou_thresh=iou_thresh,
        overlap_thresh=overlap_thresh
    )

    # Trailing "" makes join emit the trailing separator the original path had.
    score_dir = os.path.join(out_path, model_dir, "")
    score.score_all_jsons_global(votes_merged_dir, out_dir=score_dir)

In [5]:
# Entry point: run the full pipeline with main()'s built-in settings.
# In a notebook kernel __name__ is "__main__", so this cell always calls main().
if __name__ == "__main__":
    main()

[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/e8d41204-64eb-9f4b-608e-5593933aca41.json[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/456300-sept-17-23-2012-11953-13474707086771-_-pdf.json

[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/eabf486e-2ff6-c060-68b7-fcf5363bde66.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/f5a1056c-ac79-aac8-6099-87821a840150.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/f9a59888-b508-9792-d356-1d36de82c212.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/f56e2fc1-7f41-68e3-bfe5-06176b9a2e8a.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/fdf168ea-c840-1465-710a-762427b285c3.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/ef640e66-1f79-701f-61d7-e968acb9e3fc.json
[PICKED] /Volumes/MyDataDrive/thesis/code-2/data/ad-buy-forms/source/eff55361-0e39-53a1-da0b-a337a3

KeyError: 'votes_0a32ce11-7ed9-14ee-8856-6a1edfad9ff3.png'

In [9]:
# Filter-and-score only: this cell does not run extraction, it works on
# whatever is already in the Qwen2.5-72B-Instruct per-page votes directory.
MODEL_DIR = "Qwen2.5-72B-Instruct"
out_path = "/Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/"

# Per-model directories: raw per-page votes (input) and merged votes (output).
votes_out_dir = f"{out_path}{MODEL_DIR}-per_page_votes"
votes_merged_dir = f"{out_path}{MODEL_DIR}-per_page_votes_merged"

# Create both directories up front; exist_ok makes re-running the cell safe.
os.makedirs(votes_out_dir, exist_ok=True)
os.makedirs(votes_merged_dir, exist_ok=True)

# Filtering step: same thresholds as the main() pipeline.
annotation_filters.process_directory_to_json(
    in_dir=votes_out_dir,
    out_dir=votes_merged_dir,
    iou_thresh=0.5,
    overlap_thresh=0.9,
)

# Scoring step: results go to a directory named after the model.
score_dir = f"{out_path}{MODEL_DIR}/"
score.score_all_jsons_global(votes_merged_dir, out_dir=score_dir)

Wrote 24 entities to /Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged/votes_f7635eed-5555-27b6-780f-8386863b25ca.pdf_page3.json
Wrote 8 entities to /Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged/votes_faa55a77-9090-22ac-fe9b-32ab3f026300.pdf_page1.json
Wrote 14 entities to /Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged/votes_f7635eed-5555-27b6-780f-8386863b25ca.pdf_page2.json
Wrote 16 entities to /Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged/votes_efca8764-0dfb-3f1c-beb9-f629991435bb.pdf_page1.json
Wrote 12 entities to /Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged/votes_ee19ec76-3531-254f-c21f-869b6cf0916c.pdf_page2.json
Wrote 34 entitie

KeyError: 'votes_e8d41204-64eb-9f4b-608e-5593933aca41.png'