<a href="https://colab.research.google.com/github/wdkq2/aifixing/blob/main/bodo_pdf_colab_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 국토교통부 보도자료 수집/임베딩 노트북
사용 전 `USER CONFIG` 셀의 `CONFIG_PATH`를 본인의 설정 파일(env.json) 경로로 수정해 주세요. 서비스 키 등 나머지 값은 해당 JSON 파일에서 읽어 옵니다.

In [1]:
# ▶ TLS1.2 강제 설정 셀 (가장 먼저 실행)
import ssl
from requests.adapters import HTTPAdapter
import requests

# 1) Build a client-side SSLContext pinned to TLS 1.2: with the lower and the
#    upper protocol bound both set to TLSv1_2, no other version is negotiable.
ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_CLIENT)
ctx.minimum_version = ctx.maximum_version = ssl.TLSVersion.TLSv1_2

# 2) Transport adapter that hands the shared `ctx` to every connection pool.
class TLS12Adapter(HTTPAdapter):
    """HTTPAdapter whose connection pools use the module-level ``ctx``.

    Both the direct pool manager and the per-proxy managers receive the same
    SSLContext, so the protocol restriction configured on ``ctx`` applies
    whether or not a request is routed through a proxy.
    """

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with ``ctx`` injected as ``ssl_context``."""
        kwargs['ssl_context'] = ctx
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Create a proxy manager with ``ctx`` injected as ``ssl_context``.

        Without this override, requests sent through a proxy would be served
        by a manager built with the library's default TLS settings.
        """
        kwargs['ssl_context'] = ctx
        return super().proxy_manager_for(*args, **kwargs)

# 3) Create one shared session, route every https:// URL through the TLS 1.2
#    adapter, and rebind `requests.get` to the session's bound method so that
#    plain `requests.get(...)` calls made later also use this session.
tls12_adapter = TLS12Adapter()
session = requests.Session()
session.mount('https://', tls12_adapter)
setattr(requests, 'get', session.get)


# From here on, session.get(...) talks to HTTPS hosts through the adapter above.


In [2]:
# ▶ SINGLE "USER CONFIG" CELL
# Location, on the mounted Google Drive, of the JSON file holding the settings.
CONFIG_PATH = '/content/drive/MyDrive/boan_data/env.json'

In [3]:
# ▶ STEP 0 · SETUP
!pip install -q pdfplumber layoutparser[layoutmodels] sentence-transformers faiss-cpu
from google.colab import drive
import os, json
%load_ext autoreload
%autoreload 2
drive.mount('/content/drive')
# Read the user settings from the JSON file at CONFIG_PATH. The encoding is
# given explicitly so decoding does not depend on the runtime's locale.
with open(CONFIG_PATH, encoding='utf-8') as f:
    cfg = json.load(f)

# Mandatory entry: a config without SERVICE_KEY raises KeyError right here.
SERVICE_KEY = cfg['SERVICE_KEY']

# Optional entries, each falling back to the default shown when absent.
DCLSF_CD = cfg.get('DCLSF_CD', 'A00')
START_DATE = cfg.get('START_DATE', '2020-01-01')
END_DATE = cfg.get('END_DATE', '2025-07-08')
PAGE_SIZE = cfg.get('PAGE_SIZE', 1000)
DRIVE_DIR = cfg.get('DRIVE_DIR', '/content/drive/MyDrive/boan_data')
HF_HOME_DIR = cfg.get('HF_HOME_DIR', '/content/drive/.hf_cache')

# Export the cache location through the HF_HOME environment variable; this
# runs before the cell that imports sentence_transformers.
os.environ['HF_HOME'] = HF_HOME_DIR


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:

In [4]:
import requests, json, os, sqlite3, faiss, torch, pdfplumber
import layoutparser as lp
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np


In [5]:
# ▶ STEP 1 · FETCH PDF LIST
def get_pdf_items(timeout=30):
    """Collect every item returned by the listing API, following pagination.

    Pages are requested one after another, starting at page 1, with the
    module-level SERVICE_KEY, PAGE_SIZE, DCLSF_CD, START_DATE and END_DATE as
    query parameters. Paging stops at the first page that holds fewer than
    PAGE_SIZE items.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with the items of all fetched pages, in the order received.

    Raises:
        requests.HTTPError: If a page answers with an HTTP error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The endpoint never changes, so build it once outside the paging loop.
    url = 'https://apis.data.go.kr/1613000/genFldPriorInfoDsc/getGenFldList'
    items = []
    page = 1
    while True:
        params = {
            'serviceKey': SERVICE_KEY,
            'pageNo': page,
            'numOfRows': PAGE_SIZE,
            'dclsfCd': DCLSF_CD,
            'startDate': START_DATE,
            'endDate': END_DATE,
            'viewType': 'json'
        }
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        # `or []` covers a present-but-empty value (None, ''), which would
        # otherwise break extend()/len() below.
        cur = data.get('response', {}).get('body', {}).get('items') or []
        # NOTE(review): the response schema is not visible here. In case the
        # API wraps results as {"item": [...]} (or a single object), unwrap it
        # so callers always receive a list of item dicts — confirm against a
        # real response.
        if isinstance(cur, dict):
            cur = cur.get('item') or []
            if isinstance(cur, dict):
                cur = [cur]
        items.extend(cur)
        # A short page means there is nothing left to fetch.
        if len(cur) < PAGE_SIZE:
            break
        page += 1
    return items


In [6]:
# ▶ STEP 2 · PDF → STRUCTURED JSON
def pdf_to_paragraphs(pdf_url, min_chars=150, timeout=60):
    """Download a PDF and return its long text blocks, page by page.

    The document is parsed straight from memory, so no temporary file is
    created: nothing is left behind when parsing fails, and calls cannot
    overwrite each other's download.

    Args:
        pdf_url: URL of the PDF to download.
        min_chars: Blocks whose text is empty or shorter than this many
            characters are skipped.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        A list of dicts with the keys 'page' (1-based page number), 'text'
        and 'bbox', one per kept block, in document order.

    Raises:
        requests.HTTPError: If the download answers with an HTTP error status
            (previously the error page would have been handed to the parser).
        requests.RequestException: On connection problems or timeouts.
    """
    import io  # local import: only this function needs it

    resp = requests.get(pdf_url, timeout=timeout)
    resp.raise_for_status()

    paragraphs = []
    with pdfplumber.open(io.BytesIO(resp.content)) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            words = page.extract_words()
            # NOTE(review): `lp.PDFPageLayout.from_words` and the
            # `block.block.bbox` attribute used below could not be confirmed
            # against layoutparser's documented API — verify both exist in
            # the installed version before relying on this step.
            layout = lp.PDFPageLayout.from_words(words)
            for block in layout:
                text = block.text
                if not text or len(text) < min_chars:
                    continue
                paragraphs.append({'page': page_no, 'text': text, 'bbox': block.block.bbox})
    return paragraphs


In [7]:
# ▶ STEP 3 · EMBEDDING
# Choose the checkpoint by hardware: one model name when CUDA is available,
# another one otherwise, then load it.
if torch.cuda.is_available():
    model_name = 'upskyy/e5-large-korean'
else:
    model_name = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
model = SentenceTransformer(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# ▶ STEP 4 · STORAGE
import shutil

# Paragraph metadata goes to SQLite, the matching vectors go to FAISS.
db=sqlite3.connect('docs.db')
cur=db.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS docs (id INTEGER PRIMARY KEY, pdf_url TEXT, page INT, text TEXT, b0 REAL, b1 REAL, b2 REAL, b3 REAL)')
# The FAISS index below is rebuilt from scratch on every run, so rows left by
# an earlier run must go as well; otherwise re-running this cell appends
# duplicates and row ids stop lining up with vector positions. After the
# delete, ids restart at 1, i.e. row id == FAISS position + 1.
cur.execute('DELETE FROM docs')
db.commit()

# Ask the loaded model for its embedding width instead of hard-coding 1024:
# the checkpoint is chosen at runtime, and a mismatch makes index.add() fail.
index=faiss.IndexFlatIP(model.get_sentence_embedding_dimension())

items=get_pdf_items()
vecs_all=[]
for item in tqdm(items):
    pdf_url=item['downloadUrl']
    paras=pdf_to_paragraphs(pdf_url)
    texts=[p['text'] for p in paras]
    if not texts:
        continue
    vecs=model.encode(texts,batch_size=32,convert_to_numpy=True)
    # Unit-length vectors make the inner-product index rank by cosine similarity.
    faiss.normalize_L2(vecs)
    vecs_all.append(vecs)
    # One row per paragraph, in the same order as the vectors.
    cur.executemany('INSERT INTO docs (pdf_url, page, text, b0, b1, b2, b3) VALUES (?,?,?,?,?,?,?)',
                    [(pdf_url, p['page'], p['text'], *p['bbox']) for p in paras])
    # Commit per document so an interruption loses at most the current PDF.
    db.commit()

# np.vstack raises on an empty list, so only add when something was embedded.
if vecs_all:
    index.add(np.vstack(vecs_all))
faiss.write_index(index,'faiss_index.faiss')

# Make sure the target is a directory: if it did not exist, shutil.copy would
# treat DRIVE_DIR as a file name and the second copy would overwrite the first.
os.makedirs(DRIVE_DIR, exist_ok=True)
shutil.copy('docs.db',DRIVE_DIR)
shutil.copy('faiss_index.faiss',DRIVE_DIR)
print('docs and index saved to',DRIVE_DIR)


SSLError: HTTPSConnectionPool(host='apis.data.go.kr', port=443): Max retries exceeded with url: /1613000/genFldPriorInfoDsc/getGenFldList?serviceKey=<REDACTED>&pageNo=1&numOfRows=1000&dclsfCd=A00&startDate=2020-01-01&endDate=2025-07-08&viewType=json (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_ILLEGAL_PARAMETER] sslv3 alert illegal parameter (_ssl.c:1016)')))