## BERTScore

https://haticeozbolat17.medium.com/text-summarization-how-to-calculate-bertscore-771a51022964

In [None]:
!pip install transformers # If you are using collab, "!" is required to download
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

# Example texts
reference = """
建議將400W水銀燈泡更換為150W節能陶瓷燈泡，預估可節省約63%的用電。此外，陶瓷燈泡使用壽命較長且環保，不易產生汞污染。
"""

candidate = """

將400W水銀燈泡更換為150W節能陶瓷燈泡，預估可節省約63%的用電。

"""

# BERTScore calculation
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score([candidate], [reference])
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTScore Precision: 0.9599, Recall: 0.9037, F1: 0.9309


In [None]:
# Example texts
candidate = """
可先盤點運轉時間長且老舊的馬達，這些馬達更換成高效率機台後節能效果最明顯；另外也可評估有無加裝變頻控制的需求。
"""
reference = """
將運轉時間長、負載率高且老舊的馬達汰換成高效率機種；並請專業人士評估有無加裝變頻控制的需求。
"""
# BERTScore calculation
scorer = BERTScorer(model_type='bert-base-chinese', lang='zh')
P, R, F1 = scorer.score([candidate], [reference])
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

BERTScore Precision: 0.8195, Recall: 0.8470, F1: 0.8330


In [None]:
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

# Example texts
reference = """建立隨手關燈的節能習慣，減少無謂的照明耗電。​保持冰箱儲存量適中，不塞滿食物，並定期清潔內部和檢查密封條"""

candidate = """

1. 確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。​

2.搭配電風扇協助室內空氣循環

3. 選購符合能效標準的冷氣機

4. 將冷氣溫度設定在26至28度

5. 定期清洗冷氣機濾網"""

# BERTScore calculation
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score([candidate], [reference])
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

### Outputs : BERTScore Precision: 0.9258, Recall: 0.9258, F1: 0.9258

BERTScore Precision: 0.7880, Recall: 0.8584, F1: 0.8217


In [None]:
# Step 1: Import the required libraries
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Step 2: Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Step 3: Define the two texts to compare
text1 = "建立隨手關燈的節能習慣，減少無謂的照明耗電。"
text2 = "確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。"

# Step 4: Prepare the texts for BERT
inputs1 = tokenizer(text1, return_tensors="pt", padding=True, truncation=True)
inputs2 = tokenizer(text2, return_tensors="pt", padding=True, truncation=True)

# Step 5: Feed the texts to the BERT model
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)

# Step 6: Obtain the representation vectors
embeddings1 = outputs1.last_hidden_state.mean(dim=1).detach().numpy()
embeddings2 = outputs2.last_hidden_state.mean(dim=1).detach().numpy()

# Step 7: Calculate cosine similarity
similarity = np.dot(embeddings1, embeddings2.T) / (np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2))

# Step 8: Print the result
print("Similarity between the texts: {:.4f}".format(similarity[0][0]))

### Output: Similarity between the texts: 0.9000

Similarity between the texts: 0.9412


## GPT 生成計算相似度

In [None]:
!pip install rouge-score
!pip install sacrebleu
!pip install bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5646a9610e9f914f18436a20f97df3fcec00996fdef94f46ce7a5abdf0874a56
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-

In [None]:
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import sacrebleu
import bert_score

In [None]:


# ------------- Helper Functions -------------
def extract_action_object(title):
    """
    簡單根據預定義關鍵詞抽取建議標題中的「動作」與「受詞」。
    這裡假設若建議標題以某些動作詞開頭，則第一部分為動作，其餘為受詞。
    """
    known_actions = ["選購", "確保", "節省", "定期", "關", "保持", "調整", "使用", "採用", "推動", "建立", "節約", "清洗", "設定"]
    for action in known_actions:
        if title.startswith(action):
            object_part = title[len(action):].strip("，：：")
            return action, object_part
    # 若沒有符合，則取前兩個字為動作，餘下為受詞（僅作示範用）
    return title[:2], title[2:].strip()

def parse_suggestions(text):
    """
    將答案文本拆分成建議列表。
    假設文本中每個建議包含兩行：
      第一行：如 "1. 建議標題"
      第二行：節電效果描述
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    suggestions = []
    i = 0
    while i < len(lines):
        # 判斷行是否以數字. 開頭
        if lines[i][0].isdigit():
            parts = lines[i].split(".", 1)
            title = parts[1].strip() if len(parts) == 2 else lines[i]
            effect = lines[i+1] if i+1 < len(lines) else ""
            action, obj = extract_action_object(title)
            suggestions.append({
                "title": title,
                "effect": effect,
                "action": action,
                "object": obj
            })
            i += 2
        else:
            i += 1
    return suggestions

def compute_action_object_metrics(ref_sugs, gen_sugs):
    """
    針對每筆建議（假設標準答案與生成答案皆有 5 筆建議），
    比對每筆建議的「動作」與「受詞」是否完全匹配。

    - Accuracy: 完全正確（動作與受詞均正確）的建議比例。
    - Precision/Recall/F1: 每筆建議包含2個元素，計算總正確元素數占總元素數。
    """
    total_elements = 0
    correct_elements = 0
    fully_correct = 0
    count = min(len(ref_sugs), len(gen_sugs))
    for i in range(count):
        total_elements += 2
        corr = 0
        if ref_sugs[i]["action"] == gen_sugs[i]["action"]:
            corr += 1
        if ref_sugs[i]["object"] == gen_sugs[i]["object"]:
            corr += 1
        correct_elements += corr
        if corr == 2:
            fully_correct += 1
    precision = correct_elements / total_elements if total_elements else 0
    recall = correct_elements / total_elements if total_elements else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = fully_correct / count if count else 0
    return accuracy, precision, recall, f1

In [None]:


# ------------- 模擬測試資料 -------------
# 這裡以一筆示範資料，實際上你會有多筆測試樣本，並存成一個 list，每筆資料包含 "reference" 與 "generated" 兩個欄位
test_samples = [
    {
        "reference": """1. 將冷氣溫度設定在26至28度，同時搭配電風扇協助室內空氣循環。
      每調高1°C可節省約6%空調用電，全年可大幅降低電費支出。
      2. 選購符合國家能效標準的冷氣機
      以560公升冷氣機為例，第一級冷氣機較低等級可省約40%耗電，一年可省304度電及863元。
      3. 定期清洗冷氣機
      保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
      4. 確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。
      良好散熱可使冷氣運轉效率提升，降低耗電量約5%-10%。
      5. 使用智能節能設備輔助
      如智能插座或節能控制系統，可在空閒時自動關機降低待機耗電。""",
              "generated": """1. 選購級數小、效率高的冷氣機
      以6坪空間常用冷氣機為例，第一級冷氣機比第五級能節省約37%的耗電量，一年可省約470度電和1,335元。
      2. 選購符合能效標準的冷氣機
      以560公升冷氣機為例，第一級冷氣機較低等級可省40%耗電，一年可省304度電及863元。
      3. 定期清洗冷氣機
      保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
      4. 建立隨手關燈的節能習慣，減少無謂的照明耗電。
      以5盞13瓦省電燈泡為例，一年可節省約1,213元，減少225公斤CO2排放
      5. 保持冰箱儲存量適中，不塞滿食物，並定期清潔內部和檢查密封條。
      正確使用可降低冰箱耗電約5%，延長設備壽命。"""
    },
    # 可在此添加更多測試樣本……
]

# ------------- 指標計算 -------------
# 初始化存放各項指標的列表
action_object_acc = []
action_object_prec = []
action_object_rec = []
action_object_f1 = []

bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
chrf_scores = []

# 為 ROUGE 建立 scorer
rouge_scorer_inst = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
smooth = SmoothingFunction().method1

for sample in test_samples:
    ref_text = sample["reference"]
    gen_text = sample["generated"]

    # 解析建議
    ref_sugs = parse_suggestions(ref_text)
    gen_sugs = parse_suggestions(gen_text)

    # 計算動作/受詞匹配指標
    acc, prec, rec, f1 = compute_action_object_metrics(ref_sugs, gen_sugs)
    action_object_acc.append(acc)
    action_object_prec.append(prec)
    action_object_rec.append(rec)
    action_object_f1.append(f1)

    # BLEU: 使用字元級 tokenization（可根據需要調整成詞級分詞）
    ref_tokens = list(ref_text.replace("\n", ""))
    gen_tokens = list(gen_text.replace("\n", ""))
    bleu = sentence_bleu([ref_tokens], gen_tokens, smoothing_function=smooth)
    bleu_scores.append(bleu)

    # ROUGE
    scores = rouge_scorer_inst.score(ref_text, gen_text)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

    # CLEU: 這裡使用 sacreBLEU 的 chrf 分數作為類似 CLEU 的度量
    chrf = sacrebleu.corpus_chrf([gen_text], [[ref_text]])
    chrf_scores.append(chrf.score)

# BERTScore: 跨所有測試樣本計算（此處計算 F1 分數）
generated_texts = [sample["generated"] for sample in test_samples]
reference_texts = [sample["reference"] for sample in test_samples]
P, R, F1 = bert_score.score(generated_texts, reference_texts, lang="zh", verbose=True)
bert_f1_list = F1.tolist()

# 平均各項指標
avg_acc = np.mean(action_object_acc)
avg_prec = np.mean(action_object_prec)
avg_rec = np.mean(action_object_rec)
avg_f1 = np.mean(action_object_f1)
avg_bleu = np.mean(bleu_scores)
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)
avg_chrf = np.mean(chrf_scores)
avg_bert_f1 = np.mean(bert_f1_list)

print("Action/Object Accuracy:", avg_acc)
print("Action/Object Precision:", avg_prec)
print("Action/Object Recall:", avg_rec)
print("Action/Object F1:", avg_f1)
print("BLEU:", avg_bleu)
print("ROUGE-1:", avg_rouge1)
print("ROUGE-2:", avg_rouge2)
print("ROUGE-L:", avg_rougeL)
print("CLEU (chrf):", avg_chrf)
print("BERTScore F1:", avg_bert_f1)


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 3.71 seconds, 0.27 sentences/sec
Action/Object Accuracy: 0.2
Action/Object Precision: 0.3
Action/Object Recall: 0.3
Action/Object F1: 0.3
BLEU: 0.5088866712533766
ROUGE-1: 0.6486486486486486
ROUGE-2: 0.4
ROUGE-L: 0.5945945945945946
CLEU (chrf): 41.420608504376474
BERTScore F1: 0.8030058145523071


In [None]:
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import sacrebleu
import bert_score

# 新增：安裝與引入 SentenceTransformer 與 sklearn 的 cosine_similarity
# 若尚未安裝，可先執行：!pip install sentence-transformers scikit-learn
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ------------- Helper Functions -------------
def extract_action_object(title):
    """
    簡單根據預定義關鍵詞抽取建議標題中的「動作」與「受詞」。
    這裡假設若建議標題以某些動作詞開頭，則第一部分為動作，其餘為受詞。
    """
    known_actions = ["選購", "定期", "調整", "使用", "採用", "推動", "建立", "節約", "清洗", "設定"]
    for action in known_actions:
        if title.startswith(action):
            object_part = title[len(action):].strip("，：：")
            return action, object_part
    # 若沒有符合，則取前兩個字為動作，餘下為受詞（僅作示範用）
    return title[:2], title[2:].strip()

def parse_suggestions(text):
    """
    將答案文本拆分成建議列表。
    假設文本中每個建議包含兩行：
      第一行：如 "1. 建議標題"
      第二行：節電效果描述
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    suggestions = []
    i = 0
    while i < len(lines):
        if lines[i][0].isdigit():
            parts = lines[i].split(".", 1)
            title = parts[1].strip() if len(parts) == 2 else lines[i]
            effect = lines[i+1] if i+1 < len(lines) else ""
            action, obj = extract_action_object(title)
            suggestions.append({
                "title": title,
                "effect": effect,
                "action": action,
                "object": obj
            })
            i += 2
        else:
            i += 1
    return suggestions

def compute_action_object_metrics(ref_sugs, gen_sugs):
    """
    針對每筆建議（假設標準答案與生成答案皆有 5 筆建議），
    比對每筆建議的「動作」與「受詞」是否完全匹配。

    - Accuracy: 完全正確（動作與受詞均正確）的建議比例。
    - Precision/Recall/F1: 每筆建議包含2個元素，計算總正確元素數占總元素數。
    """
    total_elements = 0
    correct_elements = 0
    fully_correct = 0
    count = min(len(ref_sugs), len(gen_sugs))
    for i in range(count):
        total_elements += 2
        corr = 0
        if ref_sugs[i]["action"] == gen_sugs[i]["action"]:
            corr += 1
        if ref_sugs[i]["object"] == gen_sugs[i]["object"]:
            corr += 1
        correct_elements += corr
        if corr == 2:
            fully_correct += 1
    precision = correct_elements / total_elements if total_elements else 0
    recall = correct_elements / total_elements if total_elements else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = fully_correct / count if count else 0
    return accuracy, precision, recall, f1

# ------------- 模擬測試資料 -------------
# 這裡以一筆示範資料，實際上你會有多筆測試樣本
test_samples = [
    {
        "reference": """1. 將冷氣溫度設定在26至28度，同時搭配電風扇協助室內空氣循環。
      每調高1°C可節省約6%空調用電，全年可大幅降低電費支出。
      2. 選購符合國家能效標準的冷氣機
      以560公升冷氣機為例，第一級冷氣機較低等級可省約40%耗電，一年可省304度電及863元。
      3. 定期清洗冷氣機
      保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
      4. 確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。
      良好散熱可使冷氣運轉效率提升，降低耗電量約5%-10%。
      5. 使用智能節能設備輔助
      如智能插座或節能控制系統，可在空閒時自動關機降低待機耗電。""",
              "generated": """1. 選購級數小、效率高的冷氣機
      以6坪空間常用冷氣機為例，第一級冷氣機比第五級能節省約37%的耗電量，一年可省約470度電和1,335元。
      2. 選購符合能效標準的冷氣機
      以560公升冷氣機為例，第一級冷氣機較低等級可省40%耗電，一年可省304度電及863元。
      3. 定期清洗冷氣機
      保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
      4. 建立隨手關燈的節能習慣，減少無謂的照明耗電。
      以5盞13瓦省電燈泡為例，一年可節省約1,213元，減少225公斤CO2排放
      5. 保持冰箱儲存量適中，不塞滿食物，並定期清潔內部和檢查密封條。
      正確使用可降低冰箱耗電約5%，延長設備壽命。"""
    },
    # 可在此添加更多測試樣本……
]

# ------------- 指標計算 -------------
action_object_acc = []
action_object_prec = []
action_object_rec = []
action_object_f1 = []

bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
chrf_scores = []

# 新增：計算 Cosine Similarity
cosine_similarities = []

# 為 ROUGE 建立 scorer
rouge_scorer_inst = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
smooth = SmoothingFunction().method1

# 建立 SentenceTransformer 模型 (此處以 all-MiniLM-L6-v2 為例)
st_model = SentenceTransformer('all-MiniLM-L6-v2')

for sample in test_samples:
    ref_text = sample["reference"]
    gen_text = sample["generated"]

    # 解析建議
    ref_sugs = parse_suggestions(ref_text)
    gen_sugs = parse_suggestions(gen_text)

    # 計算動作/受詞匹配指標
    acc, prec, rec, f1 = compute_action_object_metrics(ref_sugs, gen_sugs)
    action_object_acc.append(acc)
    action_object_prec.append(prec)
    action_object_rec.append(rec)
    action_object_f1.append(f1)

    # BLEU：使用字元級 tokenization（中文情況可直接用每個字）
    ref_tokens = list(ref_text.replace("\n", ""))
    gen_tokens = list(gen_text.replace("\n", ""))
    bleu = sentence_bleu([ref_tokens], gen_tokens, smoothing_function=smooth)
    bleu_scores.append(bleu)

    # ROUGE
    scores = rouge_scorer_inst.score(ref_text, gen_text)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

    # CLEU (使用 sacreBLEU 的 chrf 分數)
    chrf = sacrebleu.corpus_chrf([gen_text], [[ref_text]])
    chrf_scores.append(chrf.score)

    # Cosine Similarity：先將參考文本與生成文本轉成向量，再計算 cosine similarity
    ref_embedding = st_model.encode(ref_text, convert_to_tensor=True)
    gen_embedding = st_model.encode(gen_text, convert_to_tensor=True)
    cos_sim = cosine_similarity(ref_embedding.cpu().reshape(1, -1), gen_embedding.cpu().reshape(1, -1))[0][0]
    cosine_similarities.append(cos_sim)

# BERTScore：跨所有測試樣本計算（此處計算 F1 分數）
generated_texts = [sample["generated"] for sample in test_samples]
reference_texts = [sample["reference"] for sample in test_samples]
P, R, F1 = bert_score.score(generated_texts, reference_texts, lang="zh", verbose=True)
bert_f1_list = F1.tolist()

# 平均各項指標
avg_acc = np.mean(action_object_acc)
avg_prec = np.mean(action_object_prec)
avg_rec = np.mean(action_object_rec)
avg_f1 = np.mean(action_object_f1)
avg_bleu = np.mean(bleu_scores)
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)
avg_chrf = np.mean(chrf_scores)
avg_bert_f1 = np.mean(bert_f1_list)
avg_cos_sim = np.mean(cosine_similarities)

print("Action/Object Accuracy:", avg_acc)
print("Action/Object Precision:", avg_prec)
print("Action/Object Recall:", avg_rec)
print("Action/Object F1:", avg_f1)
print("BLEU:", avg_bleu)
print("ROUGE-1:", avg_rouge1)
print("ROUGE-2:", avg_rouge2)
print("ROUGE-L:", avg_rougeL)
print("CLEU (chrf):", avg_chrf)
print("BERTScore F1:", avg_bert_f1)
print("Cosine Similarity:", avg_cos_sim)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.32 seconds, 0.76 sentences/sec
Action/Object Accuracy: 0.2
Action/Object Precision: 0.3
Action/Object Recall: 0.3
Action/Object F1: 0.3
BLEU: 0.5098449691788256
ROUGE-1: 0.6486486486486486
ROUGE-2: 0.4
ROUGE-L: 0.5945945945945946
CLEU (chrf): 41.38845495508588
BERTScore F1: 0.8032856583595276
Cosine Similarity: 0.8277864


In [None]:
import jieba.posseg as pseg
import numpy as np

# ---------------- Helper Functions ----------------
def extract_v_o_adj(text):
    """
    利用 jieba 進行詞性標註，從輸入的文本中抽取第一個動詞（v）、
    第一個名詞（n）作為受詞，以及在受詞之前出現的所有形容詞（a）。

    若找不到，則返回空字串。
    """
    words = list(pseg.cut(text))
    verb = ""
    obj = ""
    adjectives = []
    found_verb = False
    for word, flag in words:
        if not found_verb and flag.startswith('v'):
            verb = word
            found_verb = True
        elif found_verb and obj == "":
            # 在找到動詞後，先搜集形容詞
            if flag.startswith('a'):
                adjectives.append(word)
            elif flag.startswith('n'):
                obj = word
                break
    return verb, obj, " ".join(adjectives)

def parse_suggestions(text):
    """
    解析文本中每筆建議，假設每筆建議占兩行：
    第一行為 "數字. 建議標題"，第二行為效果描述（此處僅抽取標題部分）。

    對每筆建議，使用 extract_v_o_adj 來抽取動詞、受詞與受詞的形容詞。
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    suggestions = []
    i = 0
    while i < len(lines):
        # 判斷是否以數字開頭
        if lines[i][0].isdigit():
            parts = lines[i].split(".", 1)
            title = parts[1].strip() if len(parts) == 2 else lines[i]
            verb, obj, adj = extract_v_o_adj(title)
            suggestions.append({
                "title": title,
                "verb": verb,
                "object": obj,
                "adj": adj
            })
            i += 2  # 假設每筆建議有兩行
        else:
            i += 1
    return suggestions

def compute_v_o_adj_metrics(ref_sugs, gen_sugs):
    """
    針對每筆建議，計算參考答案與生成答案在動詞、受詞和受詞形容詞三個元素的匹配情況。

    定義：
      - 每個建議有3個元素：動詞、受詞、受詞的形容詞
      - Accuracy: 完全正確（3/3）建議所占比例
      - Precision/Recall: (正確抽取的元素數) / (總抽取元素數)
      - F1: Precision 與 Recall 的調和平均值
    """
    total_elements = 0
    correct_elements = 0
    fully_correct = 0
    count = min(len(ref_sugs), len(gen_sugs))
    for i in range(count):
        # 取出參考與生成的元素
        ref_verb, ref_obj, ref_adj = ref_sugs[i]["verb"], ref_sugs[i]["object"], ref_sugs[i]["adj"]
        gen_verb, gen_obj, gen_adj = gen_sugs[i]["verb"], gen_sugs[i]["object"], gen_sugs[i]["adj"]
        correct = 0
        total_elements += 3
        if ref_verb == gen_verb:
            correct += 1
        if ref_obj == gen_obj:
            correct += 1
        if ref_adj == gen_adj:
            correct += 1
        correct_elements += correct
        if correct == 3:
            fully_correct += 1
    precision = correct_elements / total_elements if total_elements else 0
    recall = correct_elements / total_elements if total_elements else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = fully_correct / count if count else 0
    return accuracy, precision, recall, f1

# ---------------- 模擬測試資料 ----------------
test_sample = {
    "reference": """1. 將冷氣溫度設定在26至28度，同時搭配電風扇協助室內空氣循環。
每調高1°C可節省約6%空調用電，全年可大幅降低電費支出。
2. 選購符合國家能效標準的冷氣機
以560公升冷氣機為例，第一級冷氣機較低等級可省約40%耗電，一年可省304度電及863元。
3. 定期清洗冷氣機
保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
4. 確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。
良好散熱可使冷氣運轉效率提升，降低耗電量約5%-10%。
5. 使用智能節能設備輔助
如智能插座或節能控制系統，可在空閒時自動關機降低待機耗電。""",
    "generated": """1. 選購級數小、效率高的冷氣機
以6坪空間常用冷氣機為例，第一級冷氣機比第五級能節省約37%的耗電量，一年可省約470度電和1,335元。
2. 選購符合能效標準的冷氣機
以560公升冷氣機為例，第一級冷氣機較低等級可省40%耗電，一年可省304度電及863元。
3. 定期清洗冷氣機
保持冷氣機濾網清潔能維持運行效率，進一步降低耗電。
4. 建立隨手關燈的節能習慣，減少無謂的照明耗電。
以5盞13瓦省電燈泡為例，一年可節省約1,213元，減少225公斤CO2排放
5. 保持冰箱儲存量適中，不塞滿食物，並定期清潔內部和檢查密封條。
正確使用可降低冰箱耗電約5%，延長設備壽命。"""
}

# ---------------- 指標計算 ----------------
# 解析參考答案與生成答案中的建議
ref_suggestions = parse_suggestions(test_sample["reference"])
gen_suggestions = parse_suggestions(test_sample["generated"])

# 印出各建議抽取結果（供檢查用）
print("Reference suggestions:")
for sug in ref_suggestions:
    print(sug)
print("\nGenerated suggestions:")
for sug in gen_suggestions:
    print(sug)

# 計算動詞、受詞與受詞形容詞的指標
acc, prec, rec, f1 = compute_v_o_adj_metrics(ref_suggestions, gen_suggestions)
print("\n【動詞、受詞及形容詞匹配指標】")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 2.371 seconds.
DEBUG:jieba:Loading model cost 2.371 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Reference suggestions:
{'title': '將冷氣溫度設定在26至28度，同時搭配電風扇協助室內空氣循環。', 'verb': '設定', 'object': '電風扇', 'adj': ''}
{'title': '選購符合國家能效標準的冷氣機', 'verb': '選購', 'object': '國家', 'adj': ''}
{'title': '定期清洗冷氣機', 'verb': '定期', 'object': '氣機', 'adj': '冷'}
{'title': '確保冷氣機背部及側邊有足夠空間散熱，避免貼牆安裝。', 'verb': '確保', 'object': '冷氣', 'adj': ''}
{'title': '使用智能節能設備輔助', 'verb': '使用', 'object': '智能', 'adj': ''}

Generated suggestions:
{'title': '選購級數小、效率高的冷氣機', 'verb': '選購', 'object': '級數', 'adj': ''}
{'title': '選購符合能效標準的冷氣機', 'verb': '選購', 'object': '能效', 'adj': ''}
{'title': '定期清洗冷氣機', 'verb': '定期', 'object': '氣機', 'adj': '冷'}
{'title': '建立隨手關燈的節能習慣，減少無謂的照明耗電。', 'verb': '建立', 'object': '習慣', 'adj': ''}
{'title': '保持冰箱儲存量適中，不塞滿食物，並定期清潔內部和檢查密封條。', 'verb': '保持', 'object': '冰箱', 'adj': ''}

【動詞、受詞及形容詞匹配指標】
Accuracy: 0.2
Precision: 0.5333333333333333
Recall: 0.5333333333333333
F1: 0.5333333333333333


In [None]:
# ---------------- 模擬測試資料 ----------------
test_sample = {
    "reference": """購買級數小、效率高的冷氣機""",
    "generated": """購買級數小、效率高的冷氣機"""
}

# ---------------- 指標計算 ----------------
# 解析參考答案與生成答案中的建議
ref_suggestions = parse_suggestions(test_sample["reference"])
gen_suggestions = parse_suggestions(test_sample["generated"])

# 印出各建議抽取結果（供檢查用）
print("Reference suggestions:")
for sug in ref_suggestions:
    print(sug)
print("\nGenerated suggestions:")
for sug in gen_suggestions:
    print(sug)

# 計算動詞、受詞與受詞形容詞的指標
acc, prec, rec, f1 = compute_v_o_adj_metrics(ref_suggestions, gen_suggestions)
print("\n【動詞、受詞及形容詞匹配指標】")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)

Reference suggestions:

Generated suggestions:

【動詞、受詞及形容詞匹配指標】
Accuracy: 0
Precision: 0
Recall: 0
F1: 0


In [None]:
# ---------------- 模擬測試資料 ----------------
test_sample = {
    "reference": """1. 購買級數小、效率高的冷氣機。
每調高1°C可節省約6%空調用電，全年可大幅降低電費支出。
""",
    "generated": """1. 選購符合能效標準的冷氣機
以6坪空間常用冷氣機為例，第一級冷氣機比第五級能節省約37%的耗電量，一年可省約470度電和1,335元。
。"""
}

# ---------------- 指標計算 ----------------
# 解析參考答案與生成答案中的建議
ref_suggestions = parse_suggestions(test_sample["reference"])
gen_suggestions = parse_suggestions(test_sample["generated"])

# 印出各建議抽取結果（供檢查用）
print("Reference suggestions:")
for sug in ref_suggestions:
    print(sug)
print("\nGenerated suggestions:")
for sug in gen_suggestions:
    print(sug)

# 計算動詞、受詞與受詞形容詞的指標
acc, prec, rec, f1 = compute_v_o_adj_metrics(ref_suggestions, gen_suggestions)
print("\n【動詞、受詞及形容詞匹配指標】")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)

Reference suggestions:
{'title': '購買級數小、效率高的冷氣機。', 'verb': '購買', 'object': '級數', 'adj': ''}

Generated suggestions:
{'title': '選購符合能效標準的冷氣機', 'verb': '選購', 'object': '能效', 'adj': ''}

【動詞、受詞及形容詞匹配指標】
Accuracy: 0.0
Precision: 0.3333333333333333
Recall: 0.3333333333333333
F1: 0.3333333333333333


In [None]:
# ---------------- 模擬測試資料 ----------------
test_sample = {
    "reference": """1. 選購符合能效標準的冷氣機
每調高1°C可節省約6%空調用電，全年可大幅降低電費支出。
""",
    "generated": """1. 購買符合能效標準的冷氣機
以6坪空間常用冷氣機為例，第一級冷氣機比第五級能節省約37%的耗電量，一年可省約470度電和1,335元。
。"""
}

# ---------------- 指標計算 ----------------
# 解析參考答案與生成答案中的建議
ref_suggestions = parse_suggestions(test_sample["reference"])
gen_suggestions = parse_suggestions(test_sample["generated"])

# 印出各建議抽取結果（供檢查用）
print("Reference suggestions:")
for sug in ref_suggestions:
    print(sug)
print("\nGenerated suggestions:")
for sug in gen_suggestions:
    print(sug)

# 計算動詞、受詞與受詞形容詞的指標
acc, prec, rec, f1 = compute_v_o_adj_metrics(ref_suggestions, gen_suggestions)
print("\n【動詞、受詞及形容詞匹配指標】")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)

Reference suggestions:
{'title': '選購符合能效標準的冷氣機', 'verb': '選購', 'object': '能效', 'adj': ''}

Generated suggestions:
{'title': '購買符合能效標準的冷氣機', 'verb': '購買', 'object': '能效', 'adj': ''}

【動詞、受詞及形容詞匹配指標】
Accuracy: 0.0
Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1: 0.6666666666666666
