In [3]:
import jsonlines
import numpy as np
import sqlite3
import jieba
from rouge_chinese import Rouge
# from rouge import Rouge
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from tqdm import tqdm

In [5]:
# Load the reference summaries and one model's predictions for scoring.
# Pick which model's test output to evaluate.
model_name = 'chatglm2'
# model_name = 'baichuan2'
# model_name = 'bloomz'

# Reference summaries: one JSON object per line, of which the 'summary' field is used.
# The reader is used as a context manager so the file handle is closed afterwards.
with jsonlines.open('dev.json') as reader:
    labels = [line['summary'] for line in reader]

# Predictions: one per line, where line i belongs to sample i.
# The encoding is given explicitly so the result does not depend on the platform's
# locale default (assumes the prediction file is UTF-8 -- TODO confirm for files
# produced on other machines).
with open(f'./test_output_{model_name}.txt', encoding='utf-8') as f:
    preds = f.read().split('\n')

# Empty lines in the middle are kept on purpose: removing them would shift every
# later prediction onto the wrong label. Only surplus blank lines at the end
# (e.g. the one produced by the final newline) are removed.
while len(preds) > len(labels) and not preds[-1]:
    preds.pop()

# zip() would silently truncate to the shorter list, hiding a misalignment.
if len(preds) != len(labels):
    raise ValueError(
        f"Got {len(preds)} predictions but {len(labels)} labels for model "
        f"'{model_name}'; the prediction file must contain exactly one line per sample "
        "(an empty line for an empty prediction, no blank separator lines)."
    )

In [6]:
# Compute per-sample ROUGE / BLEU / METEOR (scaled to 0-100, rounded to 4 decimals)
# for each (prediction, label) pair, then report the mean of every metric.
METRIC_NAMES = ("rouge-1", "rouge-2", "rouge-l", "bleu-2", "bleu-4", "meteor")
per_sample_scores = {name: [] for name in METRIC_NAMES}

# These objects do not change between samples, so they are built once
# instead of on every iteration.
rouge = Rouge()
smoothing = SmoothingFunction().method3

num_empty_refs = 0

for pred, label in tqdm(list(zip(preds, labels))):
    # Chinese output: tokenize with jieba.
    hypothesis = ' '.join(jieba.cut(pred))
    reference = ' '.join(jieba.cut(label))
    # English output: tokenize with nltk instead.
    # hypothesis = ' '.join(word_tokenize(pred))
    # reference = ' '.join(word_tokenize(label))

    if not reference.strip():
        # Without a reference there is nothing to compare against, so the sample
        # is left out of the averages; the count is reported after the loop.
        num_empty_refs += 1
        continue

    if not hypothesis.strip():
        # An empty prediction is a failed prediction: it counts as 0 on every
        # metric rather than being dropped, which would inflate the averages.
        for name in METRIC_NAMES:
            per_sample_scores[name].append(0.0)
        continue

    # ROUGE is computed on the space-joined word tokens; only the F-score is kept.
    rouge_result = rouge.get_scores(hypothesis, reference)[0]
    for k, v in rouge_result.items():
        per_sample_scores[k].append(round(v["f"] * 100, 4))

    # BLEU and METEOR are computed on character sequences (list(str) splits a
    # string into single characters), not on the word tokens used for ROUGE.
    ref_chars = [list(label)]
    hyp_chars = list(pred)
    bleu_score2 = sentence_bleu(ref_chars, hyp_chars, weights=(0.5, 0.5), smoothing_function=smoothing)
    bleu_score4 = sentence_bleu(ref_chars, hyp_chars, smoothing_function=smoothing)
    meteor = meteor_score(ref_chars, hyp_chars)
    per_sample_scores["bleu-2"].append(round(bleu_score2 * 100, 4))
    per_sample_scores["bleu-4"].append(round(bleu_score4 * 100, 4))
    per_sample_scores["meteor"].append(round(meteor * 100, 4))

if num_empty_refs:
    print(f"Skipped {num_empty_refs} sample(s) with an empty reference.")

# Mean of each metric over the scored samples; NaN when no sample was scored.
score_dict = {
    name: float(np.mean(values)) if values else float("nan")
    for name, values in per_sample_scores.items()
}

print(score_dict)

  0%|          | 0/1070 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\asus\AppData\Local\Temp\jieba.cache
Loading model cost 0.822 seconds.
Prefix dict has been built successfully.
100%|██████████| 1070/1070 [00:09<00:00, 117.09it/s]

{'rouge-1': 30.556886074766354, 'rouge-2': 7.87192046728972, 'rouge-l': 25.391477663551402, 'bleu-2': 23.517177943925237, 'bleu-4': 9.11495429906542, 'meteor': 32.56504878504672}





In [None]:
# Optional, currently disabled: write every test input, each model's prediction and
# the gold summary side by side to ./results/all_outputs.txt for manual comparison.
# NOTE(review): this snippet reads predictions from ./results/, whereas the scoring
# cell above reads ./test_output_{model_name}.txt -- confirm the path before enabling.
# all_outputs_f = open('./results/all_outputs.txt', 'a')
# all_outputs_f.truncate(0)
# test_samples = [line for line in jsonlines.open('dev.json')]
# preds = {}
# model_names = ['bloomz', 'baichuan2', 'chatglm2']
# for model_name in model_names:
#     with open(f'./results/test_output_{model_name}.txt') as f:
#         text = f.read()
#         preds[model_name] = [line for line in text.split('\n') if len(line) > 0]
# for i in range(len(test_samples)):
#     all_outputs_f.write(f"输入：{test_samples[i]['content']}\n")
#     for model_name in model_names:
#         all_outputs_f.write(f"{model_name}输出：{preds[model_name][i]}\n")
#     all_outputs_f.write(f"标注：{test_samples[i]['summary']}\n")
#     all_outputs_f.write('\n')