In [9]:
import os
import json
import sys
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

# Put the local PaddleNLP checkout first on sys.path so that the import below
# resolves to it instead of any already-installed paddlenlp package.
# This must run before `paddlenlp` is imported.
sys.path.insert(0, r"G:\code\github\PaddleNLP")
from paddlenlp import Taskflow

In [2]:
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"

# Candidate labels, one per line in label.txt.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so
# that an empty string never ends up as a candidate label.
with open(os.path.join(data_dir, "label.txt"), "r", encoding="utf-8") as f:
    choices = [label for label in (raw.strip() for raw in f) if label]
print(len(choices))

15


In [3]:
# The label ids are needed as well: build a mapping from the Chinese label
# name ("label_zh") to its id ("label").
raw_dir = r"G:\dataset\text_classify\tnews\raw"

label_list = []       # every parsed label record, in file order
zh2label_id = dict()  # label_zh -> label
with open(os.path.join(raw_dir, "label_index2en2zh.json"), "r", encoding="utf-8") as f:
    # The file is parsed as one JSON object per line.
    for raw_line in f:
        raw_line = raw_line.strip()
        if not raw_line:
            # json.loads("") raises, so tolerate blank lines.
            continue
        label_record = json.loads(raw_line)
        label_list.append(label_record)
        zh2label_id[label_record["label_zh"]] = label_record["label"]

In [4]:
zh2label_id

{'故事': '100',
 '文化': '101',
 '娱乐': '102',
 '体育': '103',
 '财经': '104',
 '房产': '106',
 '汽车': '107',
 '教育': '108',
 '科技': '109',
 '军事': '110',
 '旅游': '112',
 '国际': '113',
 '股票': '114',
 '农业': '115',
 '电竞': '116'}

In [12]:
model_dir = r"G:\code\github\PaddleNLP\outputs\tnews\plm"

# Keyword settings for the zero-shot text classification Taskflow.
# `schema` is the list of candidate labels loaded earlier and `task_path`
# points the pipeline at the local model directory above.
taskflow_kwargs = dict(
    model="utc-base",
    schema=choices,
    task_path=model_dir,
    precision="fp32",
    single_label=True,
    batch_size=32,
)
task = Taskflow("zero_shot_text_classification", **taskflow_kwargs)
task_instance = task.task_instance

# Quick smoke test on a single headline.
sample_text = "加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华"
print(task(sample_text))

[32m[2023-07-17 22:14:25,346] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'G:\code\github\PaddleNLP\outputs\tnews\plm'.[0m
[32m[2023-07-17 22:14:25,363] [    INFO][0m - Assigning ['[O-MASK]'] to the additional_special_tokens key of the tokenizer[0m


[{'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华', 'predictions': [{'label': '汽车', 'score': 0.9951342415991393}]}]


In [8]:
input_file = os.path.join(data_dir, "test.txt")
output_file = os.path.join(data_dir, "../submit/tnewsf_predict.json")

# 1) Read every test record (one JSON object per line). The input file is
#    closed before inference starts.
data_list = []
with open(input_file, "r", encoding="utf-8") as fr:
    for raw_line in fr:
        if not raw_line.strip():
            # json.loads would raise on a blank line.
            continue
        # `line` is kept as the name of the parsed record because a later
        # cell displays it.
        line = json.loads(raw_line)
        data_list.append(line)

# 2) Run inference before touching the output file, so that a failure here
#    does not leave a truncated/empty submission behind.
result = task(data_list)
if len(result) != len(data_list):
    # zip() below would silently drop records on a length mismatch.
    raise ValueError(
        f"expected {len(data_list)} predictions, got {len(result)}"
    )

# 3) Write one {"id", "label"} JSON object per line, mapping the predicted
#    Chinese label back to its label id.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    for data, item in zip(data_list, result):
        index_id = data["id"]
        predict_label = item["predictions"][0]["label"]
        label_id = zh2label_id[predict_label]

        f.write(json.dumps({"id": index_id, "label": label_id}, ensure_ascii=False) + "\n")


In [12]:
# Debug cell: display whatever record is currently bound to `line`.
line

{'id': '0',
 'text_a': '加长3.4米，玛莎拉蒂Ghibli奇特改装，内饰极尽奢华',
 'text_b': '',
 'question': '',
 'choices': ['故事',
  '文化',
  '娱乐',
  '体育',
  '财经',
  '房产',
  '汽车',
  '教育',
  '科技',
  '军事',
  '旅游',
  '国际',
  '股票',
  '农业',
  '电竞']}

# 跑一下看看 test_public.txt 的效果

In [29]:
import os
import subprocess
import sys

# Try to make the evaluation script import the local PaddleNLP checkout instead
# of the installed package. The script runs in a child process, so this is done
# through the PYTHONPATH environment variable (inherited by the child) rather
# than through this kernel's sys.path.
os.environ["PYTHONPATH"] = r"G:\code\github\PaddleNLP"

cur_dir = r"G:\code\github\PaddleNLP\applications\zero_shot_text_classification"
data_dir = r"G:\dataset\text_classify\tnews\paddlenlp"
train_output_dir = r"G:\code\github\PaddleNLP\outputs\tnews"

# Candidate model directories for --model_path.
# run_eval.py needs a training output directory here, not an exported model
# directory.
finetuned_model_dir = os.path.join(train_output_dir, "")
pretrained_model_dir = r"C:\Users\zhenh\.paddlenlp\models\utc-base"
# Explicitly select the directory to evaluate. (Previously `model_dir` was
# assigned twice in a row and only the second value was ever used.)
model_dir = pretrained_model_dir

test_path = os.path.join(data_dir, "test_public.txt")
output_dir = os.path.join(train_output_dir, "eval_results")

cmd_list = [
    sys.executable,
    os.path.join(cur_dir, "run_eval.py"),
    "--device=gpu",
    # Commenting out --model_path gives the zero-shot run.
    f"--model_path={model_dir}",
    f"--test_path={test_path}",
    "--per_device_eval_batch_size=32",
    "--max_seq_len=512",
    f"--output_dir={output_dir}",
]
print(cmd_list)
# errors="replace": the child may emit bytes that are not valid UTF-8 (e.g. the
# console code page on Windows); without it, decoding the captured output can
# raise UnicodeDecodeError and hide the real error message.
result = subprocess.run(
    cmd_list,
    capture_output=True,
    text=True,
    encoding="utf-8",
    errors="replace",
)
# The captured output is only shown when the script fails.
if result.returncode != 0:
    print(result.stdout)
    print(result.stderr)

['g:\\code\\github\\PaddleNLP\\venv\\Scripts\\python.exe', 'G:\\code\\github\\PaddleNLP\\applications\\zero_shot_text_classification\\run_eval.py', '--device=gpu', '--model_path=C:\\Users\\zhenh\\.paddlenlp\\models\\utc-base', '--test_path=G:\\dataset\\text_classify\\tnews\\paddlenlp\\test_public.txt', '--per_device_eval_batch_size=32', '--max_seq_len=512', '--output_dir=G:\\code\\github\\PaddleNLP\\outputs\\tnews\\eval_results']


In [None]:
# Recorded evaluation metrics, kept for reference.
# NOTE(review): the "plm" and "zero-shot" blocks below have identical
# test_loss / test_micro_f1 / test_macro_f1 values — possibly the same weights
# were evaluated in both runs; confirm which model was actually loaded.

# Results for "plm"
{
	"test_loss": 3.076721429824829,
	"test_micro_f1": 0.9416915422885572,
	"test_macro_f1": 0.7575248111199813,
	"test_runtime": 4.9484,
	"test_samples_per_second": 406.192,
	"test_steps_per_second": 12.731
}
# Results for the output root directory
{
	"test_loss": 1.8031879663467408,
	"test_micro_f1": 0.9516086235489221,
	"test_macro_f1": 0.7886135964319503,
	"test_runtime": 4.9238,
	"test_samples_per_second": 408.22,
	"test_steps_per_second": 12.795
}
# Zero-shot results
{
	"test_loss": 3.076721429824829,
	"test_micro_f1": 0.9416915422885572,
	"test_macro_f1": 0.7575248111199813,
	"test_runtime": 4.9209,
	"test_samples_per_second": 408.46,
	"test_steps_per_second": 12.802
}