# Keyword Extraction

## KeyBERT
- [KeyBERT](https://github.com/MaartenGr/KeyBERT)
- [sentence-transformers](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)

In [1]:
import json
from keybert import KeyBERT

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# KeyBERT is backed by a multilingual sentence-transformers model.
# Model card: https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
kw_model = KeyBERT(model=model_name)

# Load the 100 Chinese sample posts.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which is not UTF-8 everywhere and would fail on
# (or garble) the Chinese text.
with open("data/cn_100_tc.json", "r", encoding="utf-8") as infile:
    json_data = json.load(infile)

# Strip leading/trailing whitespace from every post.
# NOTE(review): assumes the JSON file holds an array of strings -- confirm.
doc_tc_list = [d.strip() for d in json_data]

In [3]:
# Unigram keyword extraction with KeyBERT on a preview of the samples.
sample_len = min(len(doc_tc_list), 20)  # preview at most 20 posts
sample_docs = doc_tc_list[:sample_len]

# (1, 1) restricts candidates to single-token keyphrases; top_n=5 asks for
# five keywords per post.
unigram_options = {"keyphrase_ngram_range": (1, 1), "top_n": 5}

for doc_tc in sample_docs:
    kws_tc = kw_model.extract_keywords(doc_tc, **unigram_options)
    # Each post prints as one list of (keyword, score) tuples.
    print(kws_tc)

[('2022', 0.5925), ('參賽', 0.54), ('去年', 0.4853), ('推薦', 0.4373), ('我要', 0.4157)]
[('stm32', 0.5403), ('參考手冊', 0.4746), ('手冊', 0.4217), ('書籍', 0.3949), ('開發板', 0.3702)]
[('發表', 0.4258), ('ithome', 0.408), ('內容', 0.3948), ('負載平衡', 0.3894), ('本頁', 0.3626)]
[('grid', 0.5448), ('彈性', 0.3578), ('定位問題', 0.319), ('flexbox', 0.3157), ('壓縮', 0.3132)]
[('學習效果', 0.5201), ('課程', 0.454), ('新課程', 0.4396), ('單元測試', 0.4385), ('教學', 0.4253)]
[('shadow', 0.423), ('陰影', 0.3948), ('跨平台', 0.3866), ('平台', 0.3428), ('platform', 0.2936)]
[('foundryup', 0.3489), ('synchronization', 0.3445), ('conclusion', 0.3323), ('鏈上', 0.3289), ('過程', 0.3271)]
[('開發板', 0.5563), ('pcb', 0.5439), ('pcbdoc', 0.5195), ('電腦', 0.4536), ('軟體', 0.4532)]
[('uselayouteffect', 0.4324), ('點擊', 0.4244), ('按鈕', 0.4176), ('useeffect', 0.3855), ('toggle', 0.3607)]
[('組織', 0.4294), ('數位', 0.4123), ('公司', 0.399), ('基礎設施', 0.3931), ('平台', 0.3892)]
[('熱導管', 0.6773), ('導熱管', 0.664), ('散熱器', 0.6462), ('ar12', 0.4989), ('熱量', 0.4798)]
[('寫出', 0.573