# 국토교통부 보도자료 수집/임베딩 노트북
사용 전 첫 번째 셀의 `CONFIG_PATH`를 본인의 설정 JSON 파일 경로로 수정해 주세요. 이 JSON 파일에는 `SERVICE_KEY`가 반드시 포함되어야 하며, 나머지 항목은 생략하면 기본값이 사용됩니다.

In [None]:
# ▶ SINGLE “USER CONFIG” CELL
# The only value to edit by hand. Every other setting is read from this JSON
# file in STEP 0: SERVICE_KEY is required, the remaining keys have defaults.
CONFIG_PATH = "/content/drive/MyDrive/boan_data/env.json"  # path to config JSON on Drive

In [None]:
# ▶ STEP 0 · SETUP
!pip install -q pdfplumber layoutparser[layoutmodels] sentence-transformers faiss-cpu
from google.colab import drive
import os, json
%load_ext autoreload
%autoreload 2
drive.mount('/content/drive')
with open(CONFIG_PATH) as f:
    cfg = json.load(f)
SERVICE_KEY = cfg['SERVICE_KEY']
DCLSF_CD = cfg.get('DCLSF_CD', 'A00')
START_DATE = cfg.get('START_DATE', '2020-01-01')
END_DATE = cfg.get('END_DATE', '2025-07-08')
PAGE_SIZE = cfg.get('PAGE_SIZE', 1000)
DRIVE_DIR = cfg.get('DRIVE_DIR', '/content/drive/MyDrive/boan_data')
HF_HOME_DIR = cfg.get('HF_HOME_DIR', '/content/drive/.hf_cache')
os.environ['HF_HOME'] = HF_HOME_DIR


In [None]:
import requests, json, os, sqlite3, faiss, torch, pdfplumber
import layoutparser as lp
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np


In [None]:
# ▶ STEP 1 · FETCH PDF LIST
def get_pdf_items(timeout=30):
    """Fetch every item from the paginated list endpoint.

    Pages are requested one after another, starting at page 1, using the
    module-level SERVICE_KEY, PAGE_SIZE, DCLSF_CD, START_DATE and END_DATE.
    Paging stops at the first page that returns fewer than PAGE_SIZE items.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list with the entries found under response.body.items of every
        page, in the order they were returned.

    Raises:
        requests.HTTPError: If a page responds with an HTTP error status.
        requests.Timeout: If a page does not answer within ``timeout``.
    """
    url = 'https://apis.data.go.kr/1613000/genFldPriorInfoDsc/getGenFldList'
    items = []
    page = 1
    while True:
        params = {
            'serviceKey': SERVICE_KEY,
            'pageNo': page,
            'numOfRows': PAGE_SIZE,
            'dclsfCd': DCLSF_CD,
            'startDate': START_DATE,
            'endDate': END_DATE,
            'viewType': 'json',
        }
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        # Any level may be missing or null (e.g. an empty result set); treat
        # all of those as "no items" instead of crashing in .get()/extend().
        body = (data.get('response') or {}).get('body') or {}
        cur = body.get('items') or []
        items.extend(cur)
        # A short page means this was the last one.
        if len(cur) < PAGE_SIZE:
            break
        page += 1
    return items


In [None]:
# ▶ STEP 2 · PDF → STRUCTURED JSON
def pdf_to_paragraphs(pdf_url):
    """Download a PDF and return its long text blocks, page by page.

    The PDF is saved to a uniquely named temporary file, opened with
    pdfplumber, and each page's words are grouped into layout blocks.
    Blocks with no text or fewer than 150 characters are dropped.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A list of dicts with keys 'page' (1-based page number), 'text'
        (the block text) and 'bbox' (the block's bounding box).

    Raises:
        requests.HTTPError: If the download responds with an error status.
        requests.Timeout: If the download does not complete in time.
    """
    import tempfile  # local import keeps this cell independent of the import cell

    # Download first and fail on HTTP errors, so an error page is never
    # written to disk and handed to the PDF parser.
    resp = requests.get(pdf_url, timeout=60)
    resp.raise_for_status()

    # A unique temp file avoids clashes between concurrent runs; the finally
    # clause removes it even when parsing raises.
    fd, local_path = tempfile.mkstemp(suffix='.pdf')
    paragraphs = []
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(resp.content)
        with pdfplumber.open(local_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                words = page.extract_words()
                # NOTE(review): confirm PDFPageLayout.from_words and the
                # block.block.bbox attribute exist in the installed
                # layoutparser version.
                layout = lp.PDFPageLayout.from_words(words)
                for block in layout:
                    text = block.text
                    # Skip empty blocks and short fragments.
                    if not text or len(text) < 150:
                        continue
                    paragraphs.append({'page': page_no, 'text': text, 'bbox': block.block.bbox})
    finally:
        os.remove(local_path)
    return paragraphs


In [None]:
# ▶ STEP 3 · EMBEDDING
# Pick the checkpoint by hardware: one model when CUDA is available,
# a different one when running on CPU only.
if torch.cuda.is_available():
    model_name = 'upskyy/e5-large-korean'
else:
    model_name = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
model = SentenceTransformer(model_name)


In [None]:
# ▶ STEP 4 · STORAGE
# Embeds every extracted paragraph, stores the paragraph rows in SQLite and
# the normalized vectors in a FAISS inner-product index, then copies both
# files to DRIVE_DIR.
import shutil

db = sqlite3.connect('docs.db')
cur = db.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS docs (id INTEGER PRIMARY KEY, pdf_url TEXT, page INT, text TEXT, b0 REAL, b1 REAL, b2 REAL, b3 REAL)')
# Size the index from the loaded model instead of assuming 1024: the model is
# chosen at runtime, and a width mismatch makes index.add() fail.
index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())
items = get_pdf_items()
vecs_all = []
for item in tqdm(items):
    paras = pdf_to_paragraphs(item['downloadUrl'])
    texts = [p['text'] for p in paras]
    if not texts:
        continue
    vecs = model.encode(texts, batch_size=32, convert_to_numpy=True)
    # Unit-length vectors make the inner product equal to cosine similarity.
    faiss.normalize_L2(vecs)
    vecs_all.append(vecs)
    # Rows are inserted in the same order as their vectors are collected.
    # NOTE(review): FAISS positions start at 0 while SQLite assigns `id`
    # itself; they only line up (id == position + 1) if docs.db started
    # empty. Confirm before relying on that mapping.
    cur.executemany(
        'INSERT INTO docs (pdf_url, page, text, b0, b1, b2, b3) VALUES (?,?,?,?,?,?,?)',
        [(item['downloadUrl'], p['page'], p['text'], *p['bbox']) for p in paras],
    )
    # Commit per document so finished work survives a later failure.
    db.commit()
# np.vstack raises on an empty list; with no paragraphs the index stays empty.
if vecs_all:
    index.add(np.vstack(vecs_all))
faiss.write_index(index, 'faiss_index.faiss')
# Without the directory, shutil.copy would create a *file* named DRIVE_DIR
# and the second copy would overwrite it.
os.makedirs(DRIVE_DIR, exist_ok=True)
shutil.copy('docs.db', DRIVE_DIR)
shutil.copy('faiss_index.faiss', DRIVE_DIR)
print('docs and index saved to', DRIVE_DIR)
