In [1]:
import os
import json
import sys
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import paddle

# The installed paddlenlp package must be overridden by the local checkout:
# its directory is put at the front of sys.path BEFORE paddlenlp is imported.
sys.path.insert(0, r"G:\code\github\PaddleNLP")
from paddlenlp import Taskflow
from paddlenlp.transformers import UTC, AutoTokenizer

# Sanity check: display the file Taskflow was actually imported from.
import inspect
inspect.getfile(Taskflow)

'G:\\code\\github\\PaddleNLP\\paddlenlp\\taskflow\\taskflow.py'

In [2]:
# Data directory; label.txt is read from here.
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"

# Candidate labels: one per line of label.txt, with surrounding whitespace stripped.
label_path = os.path.join(data_dir, "label.txt")
with open(label_path, "r", encoding="utf-8") as f:
    choices = [raw_line.strip() for raw_line in f]
print(len(choices))

15


In [3]:
# A label id is needed as well (looked up by the Chinese label name).
raw_dir = r"G:\dataset\text_classify\tnews\raw"

# Every line of the file is parsed on its own as one JSON object that carries
# the keys "label" (the id) and "label_zh" (the Chinese name).
label_index_path = os.path.join(raw_dir, "label_index2en2zh.json")
with open(label_index_path, "r", encoding="utf-8") as f:
    label_list = [json.loads(raw_line) for raw_line in f]

# Chinese label name -> label id; a repeated name keeps the id seen last.
zh2label_id = {entry["label_zh"]: entry["label"] for entry in label_list}

In [4]:
# Display the Chinese label name -> label id mapping.
zh2label_id

{'故事': '100',
 '文化': '101',
 '娱乐': '102',
 '体育': '103',
 '财经': '104',
 '房产': '106',
 '汽车': '107',
 '教育': '108',
 '科技': '109',
 '军事': '110',
 '旅游': '112',
 '国际': '113',
 '股票': '114',
 '农业': '115',
 '电竞': '116'}

In [5]:
# Print the Chinese label names in insertion order.
print([*zh2label_id])

['故事', '文化', '娱乐', '体育', '财经', '房产', '汽车', '教育', '科技', '军事', '旅游', '国际', '股票', '农业', '电竞']


# 我对世界的理解又深刻了一点

如果用 Taskflow 可以有两种方式.

1. 使用训练目录下的 plm 目录加载

2. 使用训练目录加载, 需要先设置:

```python
UTC.base_model_prefix = ""
```

总结就是 plm 保存的是 UTC 模型, 训练目录下保存的是 PromptModelForSequenceClassification 模型.

In [4]:
# Loading directly from the "plm" sub-directory also seems to work and gives the same result.
model_dir = r"G:\code\github\PaddleNLP\outputs\tnews\plm"

# Adding just this one line solves the weight-prefix problem:
# UTC.base_model_prefix = ""
taskflow_kwargs = dict(
    model="utc-base",
    schema=choices,          # candidate labels read from label.txt
    task_path=model_dir,     # load weights from the local directory
    precision="fp32",
    single_label=True,
    batch_size=32,
    max_seq_len=512,
)
task = Taskflow("zero_shot_text_classification", **taskflow_kwargs)
task_instance = task.task_instance

# Smoke test on a single sentence.
print(task("农村依然很重视土葬"))

[32m[2023-07-18 22:38:33,160] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'G:\code\github\PaddleNLP\outputs\tnews\plm'.[0m
[32m[2023-07-18 22:38:33,179] [    INFO][0m - Assigning ['[O-MASK]'] to the additional_special_tokens key of the tokenizer[0m
[32m[2023-07-18 22:38:33,921] [    INFO][0m - loading configuration file G:\code\github\PaddleNLP\outputs\tnews\plm\config.json[0m
[32m[2023-07-18 22:38:35,654] [    INFO][0m - All model checkpoint weights were used when initializing UTC.
[0m
[32m[2023-07-18 22:38:35,654] [    INFO][0m - All the weights of UTC were initialized from the model checkpoint at G:\code\github\PaddleNLP\outputs\tnews\plm.
If your task is similar to the task the model of the checkpoint was trained on, you can already use UTC for predictions without further training.[0m
[32m[2023-07-18 22:38:35,656] [    INFO][0m - Converting to the inference model cost a little time.[0m
[32m[2023-07-18 22:38

[{'text_a': '农村依然很重视土葬', 'predictions': [{'label': '农业', 'score': 0.8404078741764125}]}]


In [21]:
from paddlenlp.transformers import UTC, AutoTokenizer

# Finally understood: the weights in the training output directory should not be
# loaded with UTC but with PromptModelForSequenceClassification,
# which is why Taskflow cannot be pointed at this directory directly.
# NOTE(review): model_dir is rebound here to the training output root; cells run
# afterwards see this value, not the ".../plm" path assigned earlier.
model_dir = r"G:\code\github\PaddleNLP\outputs\tnews"
# model_dir = r"G:\code\github\PaddleNLP\outputs\medical"
# NOTE(review): this sets an attribute on the UTC class itself, so it stays in
# effect for the rest of the kernel session unless reset; presumably it also
# affects any later Taskflow/UTC loading -- verify before re-running other cells.
UTC.base_model_prefix = "plm"
utc = UTC.from_pretrained(model_dir)

[32m[2023-07-18 21:11:54,100] [    INFO][0m - loading configuration file G:\code\github\PaddleNLP\outputs\tnews\config.json[0m
[32m[2023-07-18 21:11:55,117] [    INFO][0m - All model checkpoint weights were used when initializing UTC.
[0m
[32m[2023-07-18 21:11:55,118] [    INFO][0m - All the weights of UTC were initialized from the model checkpoint at G:\code\github\PaddleNLP\outputs\tnews.
If your task is similar to the task the model of the checkpoint was trained on, you can already use UTC for predictions without further training.[0m


In [40]:
# Predict a label for every sample in test.txt and write the submission file,
# one JSON object per line: {"id": <sample id>, "label": <label id>}.
input_file = os.path.join(data_dir, "test.txt")
output_file = os.path.join(data_dir, "../submit/tnewsf_predict.json")

# Number of predictions echoed to the notebook; printing all of them floods the output.
PREVIEW_COUNT = 10

# Read every sample first so the input file is closed before inference starts.
# Blank lines are skipped because json.loads("") would raise.
with open(input_file, "r", encoding="utf-8") as fr:
    data_list = [json.loads(line) for line in fr if line.strip()]

result = task(data_list)
# zip() below would silently drop samples if the counts differed.
assert len(result) == len(data_list), (
    f"got {len(result)} predictions for {len(data_list)} samples"
)

# Open (and thereby truncate) the output only once predictions exist, and make
# sure the target directory is there: open(..., "w") does not create it.
# abspath() collapses the ".." component before the directory is created.
os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    for idx, (data, item) in enumerate(zip(data_list, result)):
        if idx < PREVIEW_COUNT:
            print(item)
        index_id = data["id"]
        predict_label = item["predictions"][0]["label"]
        # The submission expects the label id, not the Chinese label name.
        label_id = zh2label_id[predict_label]

        f.write(json.dumps({"id": index_id, "label": label_id}, ensure_ascii=False) + "\n")


{'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华', 'predictions': [{'label': '汽车', 'score': 0.9951346821444802}]}
{'text_a': '许广平的坎坷婚恋：逃婚来北京，初恋横死，追鲁迅多年、登报示爱', 'predictions': [{'label': '故事', 'score': 0.44362494972218863}]}
{'text_a': '监管出手，仁东控股被暂停融资买入，股价惨遭12连跌停丨热公司', 'predictions': [{'label': '股票', 'score': 0.9960702662046722}]}
{'text_a': '四口之家开着房车去旅行，来到南岭之中，正式告别江西省', 'predictions': [{'label': '旅游', 'score': 0.7539242709515069}]}
{'text_a': '库克的决心：把苹果代工移出中国', 'predictions': [{'label': '科技', 'score': 0.829261843152815}]}
{'text_a': '卓越的生命之花需要用美育浇灌', 'predictions': [{'label': '教育', 'score': 0.7943661277581486}]}
{'text_a': '艾草什么时间收割？', 'predictions': [{'label': '农业', 'score': 0.993191101994448}]}
{'text_a': '《品读诗人毛泽东》：诗意人生与革命精神', 'predictions': [{'label': '文化', 'score': 0.98355172147113}]}
{'text_a': '宏光PLUS牛了，空间堪比“小货车”，能轻松载7人，6年免检', 'predictions': [{'label': '汽车', 'score': 0.9960673669977747}]}
{'text_a': '徽班进京是中国文化的大事件', 'predictions': [{'label': '文化', 'score': 0.9530882372672479}]}
{'text_a'

In [5]:
# Run this model on test_public.txt to see how well it does.
input_file = os.path.join(data_dir, "test_public.txt")

# Number of predictions echoed to the notebook; printing all of them floods the output.
PREVIEW_COUNT = 10

# Read every sample first so the file is closed before inference starts.
# Blank lines are skipped because json.loads("") would raise.
with open(input_file, "r", encoding="utf-8") as f:
    data_list = [json.loads(line) for line in f if line.strip()]

# Gold label of a sample: the entry of its "choices" selected by its first "labels" index.
true_list = [sample["choices"][sample["labels"][0]] for sample in data_list]

result = task(data_list)
# Fail here, with a clear message, rather than later inside the metric functions.
assert len(result) == len(data_list), (
    f"got {len(result)} predictions for {len(data_list)} samples"
)

# Predicted label: the first entry of each item's "predictions".
predict_list = [item["predictions"][0]["label"] for item in result]

for item in result[:PREVIEW_COUNT]:
    print(item)

{'text_a': '农村依然很重视土葬', 'predictions': [{'label': '农业', 'score': 0.8405696633942297}]}
{'text_a': '传奇侠盗燕子李三，下场凄凉，死后更是无人埋葬', 'predictions': [{'label': '娱乐', 'score': 0.3807101534140575}]}
{'text_a': '婆婆，我是倒贴媳妇，想用我的房子给你儿子结婚，没门！', 'predictions': [{'label': '故事', 'score': 0.8580266424717592}]}
{'text_a': '参加同学婚礼被嘲笑大龄单身，郁闷想逃男神搂住我：考虑一下我', 'predictions': [{'label': '故事', 'score': 0.792271182970018}]}
{'text_a': '女子半夜上厕所，发现坑里有眼睛盯着她，警察赶来忍不住笑了', 'predictions': [{'label': '故事', 'score': 0.43689689071999}]}
{'text_a': '它是一艘星际战舰，觉醒了人类意识，即将面临着被拆解', 'predictions': [{'label': '军事', 'score': 0.9526999495623545}]}
{'text_a': '母亲为我遮挡所有风雨', 'predictions': [{'label': '故事', 'score': 0.23993578488685385}]}
{'text_a': '那个家境不好的女孩，请你明白，婚姻不是扶贫', 'predictions': [{'label': '故事', 'score': 0.36812590489576563}]}
{'text_a': '照顾隔壁独居老人七年，没想到老人临终前送我一大礼', 'predictions': [{'label': '故事', 'score': 0.9955318061934706}]}
{'text_a': '什么时候嫁人最合适', 'predictions': [{'label': '故事', 'score': 0.4581652795760519}]}
{'text_a': '外出打工，家

In [6]:
# Peek at the first ten predicted labels.
predict_list[:10]

['农业', '娱乐', '故事', '故事', '故事', '军事', '故事', '故事', '故事', '故事']

In [8]:
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Printed in this order: micro-F1, macro-F1, accuracy
# (predictions in predict_list scored against the gold labels in true_list).
for average in ("micro", "macro"):
    print(f1_score(true_list, predict_list, average=average))
print(accuracy_score(true_list, predict_list))

0.5985074626865672
0.5939150663178421
0.5985074626865672


# 跑一下看看 test_public.txt 的效果

In [56]:
import os
import subprocess
import sys

# Try to override the installed paddlenlp with the local checkout.
# The environment variable is inherited by the subprocess started below.
# sys.path.insert(0, r"G:\code\github\PaddleNLP")
os.environ["PYTHONPATH"] = r"G:\code\github\PaddleNLP"

cur_dir = r"G:\code\github\PaddleNLP\applications\zero_shot_text_classification"
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"
train_output_dir = r"G:\code\github\PaddleNLP\outputs\tnews"

# Checkpoints that can be evaluated; choose one by assigning it to model_dir.
# For the fine-tuned model, the training output directory is what is needed here,
# not the model export directory.
finetuned_model_dir = os.path.join(train_output_dir, "")
# Locally cached pretrained utc-base model.
pretrained_model_dir = r"C:\Users\zhenh\.paddlenlp\models\utc-base"
model_dir = pretrained_model_dir

test_path = os.path.join(data_dir, "test_public.txt")
output_dir = os.path.join(train_output_dir, "eval_results")

cmd_list = [
    sys.executable,
    os.path.join(cur_dir, "run_eval.py"),
    "--device=gpu",
    "--single_label=True",
    # Commenting out model_path gives the zero-shot evaluation.
    f"--model_path={model_dir}",
    f"--test_path={test_path}",
    "--per_device_eval_batch_size=32",
    "--max_seq_len=512",
    f"--output_dir={output_dir}",
]
print(cmd_list)
# errors="replace": the child's output may not be valid UTF-8; a decode error
# must not hide the real result of the run.
result = subprocess.run(
    cmd_list,
    capture_output=True,
    text=True,
    encoding="utf-8",
    errors="replace",
)
# Surface failures instead of discarding them with the captured output.
if result.returncode != 0:
    print(f"run_eval.py exited with code {result.returncode}")
    print(result.stdout)
    print(result.stderr)

['g:\\code\\github\\PaddleNLP\\venv\\Scripts\\python.exe', 'G:\\code\\github\\PaddleNLP\\applications\\zero_shot_text_classification\\run_eval.py', '--device=gpu', '--single_label=True', '--model_path=C:\\Users\\zhenh\\.paddlenlp\\models\\utc-base', '--test_path=G:\\dataset\\text_classify\\tnews\\paddlenlp\\test_public.txt', '--per_device_eval_batch_size=32', '--max_seq_len=512', '--output_dir=G:\\code\\github\\PaddleNLP\\outputs\\tnews\\eval_results']


# 原来在 test_public.txt 上的指标就是这么低的.

test.txt 的预测结果在 `FewCLUE小样本学习榜` 上的 TNEWSF 得分是 `74.4`.

PaddleNLP-AutoPrompt 有 `78.93`. 还是有点差距.

FewCLUE 的 github 上有个 2021 年的结果, 也是在 test_public.txt 上的效果, 最好的是 PET 的 `56.4`.

In [None]:
# Pasted evaluation results, kept for reference; each dict below is a bare
# expression and is not assigned to any name.
# Result for "plm"
# NOTE(review): test_loss and test_accuracy are identical to the zero-shot
# numbers below -- presumably the fine-tuned weights were not picked up in this
# run; verify.
{
	"test_loss": 3.076721429824829,
	"test_accuracy": 0.5407960199004975,
	"test_runtime": 5.0955,
	"test_samples_per_second": 394.465,
	"test_steps_per_second": 12.364
}
# Result for the training output root directory
{
	"test_loss": 1.8031879663467408,
	"test_accuracy": 0.5985074626865672,
	"test_runtime": 5.111,
	"test_samples_per_second": 393.273,
	"test_steps_per_second": 12.326
}
# Zero-shot result
{
	"test_loss": 3.076721429824829,
	"test_accuracy": 0.5407960199004975,
	"test_runtime": 5.1101,
	"test_samples_per_second": 393.34,
	"test_steps_per_second": 12.329
}

# 我需要一个单体预测的, 且针对 onnx 模型的

1. 将模型转换成 onnx
2. 加上各种前后处理, 进行单样本预测