In [12]:
import os
import json
import sys
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

# 必须覆盖掉已经安装的 paddlenlp
sys.path.insert(0, r"G:\code\github\PaddleNLP")
from paddlenlp import Taskflow

# 检查下路径对不对
import inspect
inspect.getfile(Taskflow)

'G:\\code\\github\\PaddleNLP\\paddlenlp\\taskflow\\taskflow.py'

In [4]:
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"

choices = []
with open(os.path.join(data_dir, "label.txt"), "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        choices.append(line)
print(len(choices))

15


In [5]:
# 还需要一个 label_id
raw_dir = r"G:\dataset\text_classify\tnews\raw"

label_list = []
zh2label_id = dict()
with open(os.path.join(raw_dir, "label_index2en2zh.json"), "r", encoding="utf-8") as f:
    for line in f:
        line = json.loads(line)
        label_list.append(line)
        label_id = line["label"]
        label_zh = line["label_zh"]
        zh2label_id[label_zh] = label_id

In [6]:
zh2label_id

{'故事': '100',
 '文化': '101',
 '娱乐': '102',
 '体育': '103',
 '财经': '104',
 '房产': '106',
 '汽车': '107',
 '教育': '108',
 '科技': '109',
 '军事': '110',
 '旅游': '112',
 '国际': '113',
 '股票': '114',
 '农业': '115',
 '电竞': '116'}

In [27]:
model_dir = r"G:\code\github\PaddleNLP\outputs\tnews"

task = Taskflow(
    "zero_shot_text_classification", 
    model="utc-base",
    schema=choices,
    task_path=model_dir,
    precision="fp32",
    single_label=True,
    batch_size=32,
)
task_instance = task.task_instance
print(task("加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华"))

[32m[2023-07-17 23:19:05,855] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'G:\code\github\PaddleNLP\outputs\tnews'.[0m


[{'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华', 'predictions': [{'label': '汽车', 'score': 0.06766963982969058}]}]


In [28]:
from paddlenlp.transformers import UTC, AutoTokenizer

# TODO: 就是这里加载失败了. 
# 我终于明白了, 不应该用 UTC 加载权重, 而是要用 PromptModelForSequenceClassification 加载权重
# 所以不支持直接用 Taskflow
# TODO: 可以看下, 把 key 中的 plm. 前缀去掉, 或者可以用 UTC.from_pretrained(model_dir) 加载
model_dir = r"G:\code\github\PaddleNLP\outputs\tnews"
# model_dir = r"G:\code\github\PaddleNLP\outputs\medical"
utc = UTC.from_pretrained(model_dir)

[32m[2023-07-17 23:19:23,653] [    INFO][0m - loading configuration file G:\code\github\PaddleNLP\outputs\medical\config.json[0m
- This IS expected if you are initializing UTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing UTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.[0m


In [32]:
input_file = os.path.join(data_dir, "test.txt")
output_file = os.path.join(data_dir, "../submit/tnewsf_predict.json")

with open(input_file, "r", encoding="utf-8") as fr, open(output_file, "w", encoding="utf-8") as f:
    data_list = []
    for line in fr:
        line = json.loads(line)
        data_list.append(line)
    
    result = task(data_list)
    for data, item in zip(data_list, result):
        print(item)
        index_id = data["id"]
        predict_label = item["predictions"][0]["label"]
        label_id = zh2label_id[predict_label]
        
        f.write(json.dumps({"id": index_id, "label": label_id}, ensure_ascii=False) + "\n")


{'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华', 'predictions': [{'label': '旅游', 'score': 0.06786891624645669}]}
{'text_a': '许广平的坎坷婚恋：逃婚来北京，初恋横死，追鲁迅多年、登报示爱', 'predictions': [{'label': '旅游', 'score': 0.06772600824675352}]}
{'text_a': '监管出手，仁东控股被暂停融资买入，股价惨遭12连跌停丨热公司', 'predictions': [{'label': '旅游', 'score': 0.06782641007951017}]}
{'text_a': '四口之家开着房车去旅行，来到南岭之中，正式告别江西省', 'predictions': [{'label': '旅游', 'score': 0.06792042343429254}]}
{'text_a': '库克的决心：把苹果代工移出中国', 'predictions': [{'label': '旅游', 'score': 0.0684527185022983}]}
{'text_a': '卓越的生命之花需要用美育浇灌', 'predictions': [{'label': '旅游', 'score': 0.06872708156094062}]}
{'text_a': '艾草什么时间收割？', 'predictions': [{'label': '旅游', 'score': 0.06923226403920667}]}
{'text_a': '《品读诗人毛泽东》：诗意人生与革命精神', 'predictions': [{'label': '旅游', 'score': 0.06825760847781517}]}
{'text_a': '宏光PLUS牛了，空间堪比“小货车”，能轻松载7人，6年免检', 'predictions': [{'label': '旅游', 'score': 0.06782024153918739}]}
{'text_a': '徽班进京是中国文化的大事件', 'predictions': [{'label': '旅游', 'score': 0.06875298356645156}

In [12]:
line

{'id': '0',
 'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华',
 'text_b': '',
 'question': '',
 'choices': ['故事',
  '文化',
  '娱乐',
  '体育',
  '财经',
  '房产',
  '汽车',
  '教育',
  '科技',
  '军事',
  '旅游',
  '国际',
  '股票',
  '农业',
  '电竞']}

# 跑一下看看 test_public.txt 的效果

In [24]:
import os
import subprocess
import sys

# 尝试覆盖掉已经安装的 paddlenlp
# sys.path.insert(0, r"G:\code\github\PaddleNLP")
os.environ["PYTHONPATH"] = r"G:\code\github\PaddleNLP"

cur_dir = r"G:\code\github\PaddleNLP\applications\zero_shot_text_classification"
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"
train_output_dir = r"G:\code\github\PaddleNLP\outputs\tnews"
# 这里需要的是训练输出目录, 而不是模型导出目录
model_dir = os.path.join(train_output_dir, "")
model_dir = r"C:\Users\zhenh\.paddlenlp\models\utc-base"
test_path = os.path.join(data_dir, "test_public.txt")
output_dir = os.path.join(train_output_dir, "eval_results")

cmd_list = [
    sys.executable,
    os.path.join(cur_dir, "run_eval.py"),
    "--device=gpu",
    # 如果将 model_path 注释掉, 就是 zero-shot
    f"--model_path={model_dir}",
    f"--test_path={test_path}",
    "--per_device_eval_batch_size=32",
    "--max_seq_len=512",
    f"--output_dir={output_dir}",
]
print(cmd_list)
result = subprocess.run(cmd_list, capture_output=True, text=True, encoding="utf-8")
if result.returncode == 0:
    print(result.stdout)
    print(result.stderr)

['g:\\code\\github\\PaddleNLP\\venv\\Scripts\\python.exe', 'G:\\code\\github\\PaddleNLP\\applications\\zero_shot_text_classification\\run_eval.py', '--device=gpu', '--model_path=C:\\Users\\zhenh\\.paddlenlp\\models\\utc-base', '--test_path=G:\\dataset\\text_classify\\tnews\\paddlenlp\\test_public.txt', '--per_device_eval_batch_size=32', '--max_seq_len=512', '--output_dir=G:\\code\\github\\PaddleNLP\\outputs\\tnews\\eval_results']

[32m[2023-07-17 23:14:34,997] [    INFO][0m - The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).[0m
[32m[2023-07-17 23:14:34,998] [    INFO][0m -      Model Configuration Arguments      [0m
[32m[2023-07-17 23:14:34,998] [    INFO][0m - paddle commit id              : 0e92adceae06b6b7463f2dc7790ffb0601730009[0m
[32m[2023-07-17 23:14:34,9

In [None]:
# plm 的结果
{
	"test_loss": 3.076721429824829,
	"test_micro_f1": 0.9416915422885572,
	"test_macro_f1": 0.7575248111199813,
	"test_runtime": 4.9484,
	"test_samples_per_second": 406.192,
	"test_steps_per_second": 12.731
}
# 输出根目录的结果
{
	"test_loss": 1.8031879663467408,
	"test_micro_f1": 0.9516086235489221,
	"test_macro_f1": 0.7886135964319503,
	"test_runtime": 4.9238,
	"test_samples_per_second": 408.22,
	"test_steps_per_second": 12.795
}
# zero-shot 的结果
{
	"test_loss": 3.076721429824829,
	"test_micro_f1": 0.9416915422885572,
	"test_macro_f1": 0.7575248111199813,
	"test_runtime": 4.9209,
	"test_samples_per_second": 408.46,
	"test_steps_per_second": 12.802
}