In [28]:
# 기본 경로 설정
# ===============================================================
import os
import subprocess

PROJECT_NAME = "rag"

ROOT_DIR = os.getcwd()

try:
    from google.colab import drive, userdata
    IS_COLAB_MODE = True
    print("코랩 모드")

except ModuleNotFoundError as e:
    IS_COLAB_MODE = False
    ROOT_DIR = os.path.abspath(os.path.join(ROOT_DIR, ".."))
    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    print(f"로컬 모드")


    DATA_DIR = os.path.join(ROOT_DIR, "data")
    RAW_DIR = os.path.join(DATA_DIR, "raw")
    PDF_DIR = os.path.join(DATA_DIR, "pdf")
    os.makedirs(DATA_DIR, exist_ok=True)


# 환경변수 로드 설정
def get_secret(key_name: str):
    if IS_COLAB_MODE:
        return userdata.get(key_name)
    else:
        from dotenv import load_dotenv
        load_dotenv(dotenv_path=os.path.join(ROOT_DIR, ".env"))
        return os.getenv(key_name)


if IS_COLAB_MODE:
    drive.mount('/content/drive')

    DATA_DIR = os.path.join(ROOT_DIR, "drive", "MyDrive", "codeit-part3-team4", "data")
    RAW_DIR = os.path.join(DATA_DIR, "raw")
    PDF_DIR = os.path.join(DATA_DIR, "pdf")
    os.makedirs(DATA_DIR, exist_ok=True)

    SAVE_DIR = os.path.join(DATA_DIR, "runs", PROJECT_NAME)
    os.makedirs(SAVE_DIR, exist_ok=True)


# .DS_Store 파일 삭제
_ = subprocess.run(
    ["find", DATA_DIR, "-name", ".DS_Store", "-type", "f", "-delete"],
    check=True
)

로컬 모드


In [29]:
import json
from glob import glob

# 표
from img2table.document import PDF
from img2table.ocr import TesseractOCR
import pandas as pd

# 텍스트
from langchain_community.document_loaders import PyPDFLoader

In [30]:
metadata_json_path = os.path.join(DATA_DIR, "metadata.json")

with open(metadata_json_path, "r", encoding="utf-8") as f:
    metadata_dict = json.load(f)

PDF_PATH_LIST = glob(os.path.join(PDF_DIR, "*"))

sample_pdf = PDF_PATH_LIST[0]

In [31]:
ALL_DATA = dict()

In [32]:
def trim_table(df: pd.DataFrame):
    df.columns = df.iloc[0]
    df = df[1:]
    df.reset_index(drop=True, inplace=True)

    return df

def make_columns_unique(cols):
    seen = {}
    new_cols = []

    for c in cols:
        if c not in seen:
            seen[c] = 0
            new_cols.append(c)
        else:
            seen[c] += 1
            new_cols.append(f"{c}_{seen[c]}")

    return new_cols

In [33]:
def get_table_metadata(pdf_path: str):
    table_dict = dict()

    tables_by_page = PDF(pdf_path).extract_tables(
        ocr=TesseractOCR(n_threads=1, lang="eng"),
        min_confidence=1
    )

    for page_num, tables in tables_by_page.items():
        if not tables:
            continue

        table_dict[page_num] = []

        for i, table in enumerate(tables):

            df = trim_table(table.df)
            df.columns = make_columns_unique(df.columns)
            df = df.astype(str)
            table_content = df.to_dict(orient="split")
            table_content.pop("index", None)

            table_dict[page_num].append({
                "table_title": table.title,
                "table_num": float(f"{page_num}.{i}"),
                "table_content": table_content
            })

    return table_dict


table_dict = get_table_metadata(sample_pdf)

tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


In [34]:
title = sample_pdf.split("/")[-1]

ALL_DATA[title] = {
    "doc_id": title,
    "metadata": []
}

loader = PyPDFLoader(sample_pdf)
docs = loader.load()

for doc in docs:

    page = doc.metadata["page"]
    page_label = doc.metadata["page_label"]
    section = metadata_dict[title][str(page)]

    if page not in table_dict.keys():
        table = None
    else:
        table = table_dict[page]

    ALL_DATA[title]["metadata"].append({
        "page": page,
        "page_label": page_label,
        "title": title,
        "section": section,
        "table": table
    })

Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)


In [35]:
def merge_metadata(pdf_path: str, table_dict: dict):

    title = pdf_path.split("/")[-1]

    ALL_DATA[title] = {
        "doc_id": title,
        "metadata": []
    }

    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    for doc in docs:

        page = doc.metadata["page"]
        page_label = doc.metadata["page_label"]
        section = metadata_dict[title][str(page)]

        if page not in table_dict.keys():
            table = None
        else:
            table = table_dict[page]

        ALL_DATA[title]["metadata"].append({
            "page": page,
            "page_label": page_label,
            "title": title,
            "section": section,
            "table": table
        })

In [None]:
for pdf_path in PDF_PATH_LIST:
    table_dict = get_table_metadata(pdf_path=pdf_path)
    merge_metadata(pdf_path=pdf_path, table_dict=table_dict)

tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 106 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 92 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 182 0 (offset 0)
Ignoring wrong pointing object 215 0 (offset 0)
Ignoring wrong pointing object 236 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 25 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 36 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)


tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 173 0 (offset 0)


KeyError: '대한상공회의소_기업 재생에너지 지원센터 홈페이지 개편 및 시스템 고.pdf'

In [None]:
ALL_DATA