In [2]:
import json
import pandas as pd
from pathlib import Path

import yake
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [12]:
# Configuration: the gold-set inputs and the keyword outputs both live under
# <parent of the current working directory>/data/keywords.
DATA_DIR = Path.cwd().parent / "data" / "keywords"
# Alternative input set covering both gold sets:
#INPUT_FILES = [DATA_DIR / "goldset_10.jsonl", DATA_DIR / "goldset_50.jsonl"]
INPUT_FILES = [DATA_DIR / "goldset_10.jsonl"]
# parents=True so that a missing intermediate "data" directory is created too
# instead of raising FileNotFoundError.
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Number of keywords requested from each extractor.
TOP_K = 10
print(INPUT_FILES)
print(DATA_DIR)

[PosixPath('/home/user/workspace/redfin/redfin_label_api/data/keywords/goldset_10.jsonl')]
/home/user/workspace/redfin/redfin_label_api/data/keywords


In [13]:
# Shared keyword extractors used by the cells below: KeyBERT (backed by a
# sentence-transformers embedding model) and YAKE.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
kb = KeyBERT(model=embed_model)

# YAKE's result count (`top`) is fixed here, at construction time.
yake_ex = yake.KeywordExtractor(
    lan="en",
    n=3,
    dedupLim=0.9,
    windowsSize=2,
    top=TOP_K,
)

In [14]:
def extract_keywords(text: str, top_k: int = TOP_K) -> dict:
    """
    Return keyword candidates produced by both YAKE and KeyBERT for one text.

    Args:
        text: Document text to extract keywords from.
        top_k: Maximum number of keywords to return per extractor. For YAKE
            the effective cap is also bounded by the `top` value the shared
            `yake_ex` extractor was constructed with.

    Returns:
        A dict with two keys, "yake" and "keybert", each holding a list of
        keyword strings (scores are dropped). Both lists are empty when
        `text` is empty or whitespace-only.
    """
    if not text or not text.strip():
        return {"yake": [], "keybert": []}

    # yake_ex has its result count fixed at construction, so trim here to
    # honour a smaller top_k passed by the caller.
    keys_yake = [kw for kw, _ in yake_ex.extract_keywords(text)][:top_k]
    # Max-sum selection requires nr_candidates >= top_n; scale the candidate
    # pool with top_k so values above 20 do not raise.
    # NOTE(review): per the KeyBERT docs `diversity` is only used with
    # use_mmr=True; it is kept unchanged here but appears to have no effect.
    keys_kb = [kw for kw, _ in kb.extract_keywords(
        text, keyphrase_ngram_range=(1, 3), top_n=top_k,
        use_maxsum=True, nr_candidates=max(20, top_k), diversity=0.6
    )]
    return {"yake": keys_yake, "keybert": keys_kb}

In [15]:
# For every input JSONL file: extract keywords per record and write one
# "<input stem>_keywords.json" file mapping document id -> keyword dict.
for f in INPUT_FILES:
    out_map = {}
    with open(f, "r", encoding="utf-8") as fin:
        for line_no, ln in enumerate(fin, start=1):
            if not ln.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                obj = json.loads(ln)
                # Prefer "_id", then "guid"; fall back to a per-line id so
                # records lacking both do not overwrite each other under a
                # single None key.
                doc_id = obj.get("_id") or obj.get("guid") or f"line_{line_no}"
                # `or ""` also covers fields that are present but null, which
                # would otherwise make str.join raise TypeError.
                text = " ".join(
                    obj.get(field) or ""
                    for field in ("title", "summary", "body_text")
                ).strip()
                kws = extract_keywords(text, top_k=TOP_K)
                print(kws)
                out_map[doc_id] = kws
            except Exception as e:
                # Best-effort: report the offending line and keep going.
                print(f"[warn] failed line {line_no}: {e}")

    # Save results
    out_path = DATA_DIR / f"{Path(f).stem}_keywords.json"
    with open(out_path, "w", encoding="utf-8") as fout:
        json.dump(out_map, fout, ensure_ascii=False, indent=2)
    print(f"Saved: {out_path}")

Saved: /home/user/workspace/redfin/redfin_label_api/data/keywords/goldset_10_keywords.json
