In [16]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  #（保证程序cuda序号与实际cuda序号对应）
# os.environ['CUDA_VISIBLE_DEVICES'] = "1,2"  #（代表仅使用第1，2号GPU）

import pandas as pd
import time
import functools
import numpy as np
import paddle
import paddlenlp
import paddle.nn.functional as F
from paddle.io import DataLoader, BatchSampler
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from paddlenlp.data import DataCollatorWithPadding

# Evaluation helper for the validation set
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Run one evaluation pass over ``data_loader`` and return the metric value.

    The model is switched to eval mode, every batch is fed through it, the
    mean loss and the accumulated metric are printed, and finally the model is
    switched back to train mode and the metric is reset.

    Args:
        model: called as ``model(input_ids, token_type_ids)``; returns logits.
        criterion: called as ``criterion(logits, labels)``; the returned loss
            must expose ``.numpy()``.
        metric: object providing ``reset``, ``compute``, ``update`` and
            ``accumulate``.
        data_loader: iterable of batches indexable by ``'input_ids'``,
            ``'token_type_ids'`` and ``'labels'``.

    Returns:
        The value of ``metric.accumulate()`` (reported as accuracy).
    """
    model.eval()
    metric.reset()

    batch_losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = (
            batch[key] for key in ('input_ids', 'token_type_ids', 'labels')
        )
        logits = model(input_ids, token_type_ids)
        batch_losses.append(criterion(logits, labels).numpy())
        metric.update(metric.compute(logits, labels))

    accu = metric.accumulate()
    mean_loss = np.mean(batch_losses)
    print("eval loss: %.5f, accuracy: %.5f" % (mean_loss, accu))

    # Hand the model back in training mode with a clean metric.
    model.train()
    metric.reset()
    return accu

# Preprocessing: use the tokenizer to turn raw text into integer id sequences
def preprocess_function(examples, tokenizer, max_seq_length, is_test=False):
    """Tokenize one example and, outside test mode, attach its label.

    Args:
        examples: mapping with a ``"text"`` entry (and a ``"label"`` entry
            unless ``is_test`` is true).
        tokenizer: callable invoked as ``tokenizer(text=..., max_seq_len=...)``.
        max_seq_length: value forwarded to the tokenizer as ``max_seq_len``.
        is_test: when true, no ``"labels"`` entry is added to the output.

    Returns:
        The tokenizer's output, with ``"labels"`` set when ``is_test`` is false.
    """
    encoded = tokenizer(text=examples["text"], max_seq_len=max_seq_length)
    if is_test:
        return encoded
    encoded["labels"] = examples["label"]
    return encoded


In [17]:
# Read the evaluation CSV (delimiter is sniffed by the python engine) and wrap
# each row as a dict with an empty placeholder label.
testdata = pd.read_csv(
    '/kaggle/input/datav4/dev.csv',
    sep=None,
    header=0,
    encoding='utf-8',
    engine='python',
)
test_ds = MapDataset([
    {'text': row.text, 'label': "", 'qid': str(row.qid)}
    for row in testdata.itertuples()
])

In [18]:
# Load the pretrained model and its matching tokenizer by name.
# num_classes=7 sizes the classification head; it matches the 7 entries of the
# label_map used at prediction time.
model_name = "ernie-3.0-xbase-zh"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=7)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load a saved parameter file from disk and apply it to the model
# (presumably the fine-tuned weights for this task - confirm against the training run).
params_path="/kaggle/input/modeld/model_state.pdparams"
state_dict = paddle.load(params_path)
model.set_dict(state_dict)


[32m[2022-11-24 03:46:35,734] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-xbase-zh'.[0m
[32m[2022-11-24 03:46:35,737] [    INFO][0m - Already cached /root/.paddlenlp/models/ernie-3.0-xbase-zh/ernie_3.0_xbase_zh.pdparams[0m
[32m[2022-11-24 03:46:39,895] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-xbase-zh'.[0m
[32m[2022-11-24 03:46:39,896] [    INFO][0m - Already cached /root/.paddlenlp/models/ernie-3.0-xbase-zh/ernie_3.0_xbase_zh_vocab.txt[0m
[32m[2022-11-24 03:46:39,926] [    INFO][0m - tokenizer config file saved in /root/.paddlenlp/models/ernie-3.0-xbase-zh/tokenizer_config.json[0m
[32m[2022-11-24 03:46:39,927] [    INFO][0m - Special tokens file saved in /root/.paddlenlp/models/ernie-3.0-xbase-zh/special_tokens_map.json[0m


In [19]:

# Build the collate function: pads the sequences of a batch to the longest one
# in that batch, then stacks them.
collate_fn = DataCollatorWithPadding(tokenizer)

# Tokenize the test set (text -> integer id sequences); test mode adds no labels.
trans_func_test = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128, is_test=True)
test_ds_trans = test_ds.map(trans_func_test)

# Batch the test set without shuffling so predictions stay aligned with the rows.
# The test collator is the same padding collator, so reuse it instead of
# constructing a second identical one.
collate_fn_test = collate_fn
test_batch_sampler = BatchSampler(test_ds_trans, batch_size=16, shuffle=False)
test_data_loader = DataLoader(dataset=test_ds_trans, batch_sampler=test_batch_sampler, collate_fn=collate_fn_test)

# Predict a label and keep the full class-probability vector for every example.
label_map={0:"none",1:"happiness",2:"sadness",3:"anger",4:"surprise",5:"fear",6:"disgust"}
results = []      # predicted label name, one per test example
probs_for_8=[]    # softmax probabilities, one list (indexed like label_map) per example
model.eval()
# Inference only: model.eval() does not turn off autograd, so disable gradient
# tracking explicitly to avoid recording autograd state for every batch.
with paddle.no_grad():
    for k, batch in enumerate(test_data_loader):
        # Lightweight progress indicator: print the batch index every 200 batches.
        if k%200==0:
            print(k)
        logits = model(batch['input_ids'], batch['token_type_ids'])
        probs = F.softmax(logits, axis=-1)
        # Store the predicted labels
        idx = paddle.argmax(probs, axis=-1).numpy().tolist()
        results.extend(label_map[i] for i in idx)
        # Store the probabilities
        probs_for_8.extend(probs.numpy().tolist())


0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [24]:
# Save the predictions.
# Re-read the CSV and rebuild the dataset, this time also carrying the 'date'
# and 'bid' columns that are written to the output file.
testdata = pd.read_csv('/kaggle/input/datav4/dev.csv', sep=None, header=0, encoding='utf-8', engine='python')
test_ds=MapDataset([{'text':d.text,'label':"",'qid':str(d.qid),'date':str(d.date),'bid':d.bid} for d in testdata.itertuples()])

# Every prediction must belong to exactly one input row; fail loudly instead of
# writing a file whose rows and predictions are misaligned.
assert len(results) == len(probs_for_8) == len(test_ds), \
    "predictions and input rows differ in length: %d / %d / %d" % (len(results), len(probs_for_8), len(test_ds))

res_dir = "./results"
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(res_dir, exist_ok=True)
with open(os.path.join(res_dir, "weibosenti"+str(time.time())+".csv"), 'w', encoding="utf8") as f:
    f.write("qid@@text@@prediction@@date@@bid@@none@@happiness@@sadness@@anger@@surprise@@fear@@disgust\n")
    for i, pred in enumerate(results):
        row = test_ds[i]  # look the example up once per output line
        fields = [row['qid'], row['text'], pred, row['date'], row['bid']]
        # One probability column per class, in the same order as the header.
        fields.extend(probs_for_8[i][c] for c in range(7))
        # str() every field: 'text' / 'bid' come straight from pandas and may be
        # non-string (e.g. numeric or NaN), which plain '+' concatenation rejects.
        f.write("@@".join(str(field) for field in fields) + "\n")

In [25]:
# Sanity check: display the stored probability list of the test example at index 1.
probs_for_8[1]

[0.0006404470186680555,
 0.8849683403968811,
 0.008866476826369762,
 0.06725722551345825,
 0.012095137499272823,
 0.0008301167981699109,
 0.025342129170894623]