In [2]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [3]:
import os, sys

# Put the project checkout on the import path. Presumably this is what makes
# `nets.them_classifier` importable in a later cell -- TODO confirm.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
sys.path.extend(['/root/xiaoda/query_topic/'])

In [17]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    """Expected Calibration Error over equal-width confidence bins.

    Each sample is placed in a bin according to the confidence of its top
    prediction. The metric is the sum over non-empty bins of
    |mean confidence - accuracy|, weighted by the fraction of samples that
    fall in the bin.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        # Bin k is the half-open interval (bin_lowers[k], bin_uppers[k]].
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels, mode='logits'):
        """Return the ECE as a tensor of shape (1,).

        logits: 2-D tensor of per-class scores (softmax is taken over dim 1).
        labels: tensor of class indices compared against the arg-max prediction.
        mode:   'logits' applies softmax first; any other value treats
                ``logits`` as already-normalised probabilities.
        """
        probs = F.softmax(logits, dim=1) if mode == 'logits' else logits
        confidences, predictions = probs.max(dim=1)
        correct = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            members = (confidences > lower) & (confidences <= upper)
            weight = members.float().mean()
            # Written as "not > 0" so that a NaN weight (empty input) is skipped too.
            if not weight.item() > 0:
                continue
            gap = confidences[members].mean() - correct[members].float().mean()
            ece += torch.abs(gap) * weight

        return ece

In [4]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    """Load a label vocabulary from a text file holding one label per line.

    The file is read as UTF-8 regardless of the process locale, because the
    label names are not ASCII. Surrounding whitespace is stripped from each
    line, and the zero-based line index becomes the label id.

    Args:
        filepath: path of the label file.

    Returns:
        (label2id, id2label): two dicts mapping label -> index and
        index -> label. If a label occurs twice, label2id keeps the last index.
    """
    with open(filepath, 'r', encoding='utf-8') as frobj:
        label_list = [line.strip() for line in frobj]

    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = dict(enumerate(label_list))
    return label2id, id2label

class RiskInfer(object):
    """Inference wrapper around a multi-task text classifier.

    A single RoFormer encoder (wrapped in ``MyBaseModel``) feeds one
    ``RobertaClassifier`` head per label schema listed in the config. The
    constructor only builds the network from its config; trained weights are
    loaded afterwards with :meth:`reload`.
    """

    def __init__(self, config_path):
        """Build the tokenizer, the label schemas and the network.

        Args:
            config_path: path of the ``.ini`` file, joined onto the
                module-level ``cur_dir_path``. Its ``[paths]`` and ``[para]``
                sections are merged; ``model_path`` and ``label_path`` are read
                from the merged view, ``out_dropout_rate`` and ``dropout_type``
                from ``[para]``. ``label_path`` is a comma-separated list of
                ``schema_type:relative_label_file`` entries.
        """
        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        from collections import OrderedDict
        # Insertion order matters: head i of the network belongs to the i-th schema.
        self.schema_dict = OrderedDict({})

        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            schema_type, schema_path = schema_info.split(':')
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id':label2id,
                'id2label':id2label,
                'label_index':label_index
            }
            print(self.schema_dict[schema_type], '==schema_type==', schema_type)

        from roformer import RoFormerModel, RoFormerConfig

        # The encoder is instantiated from its config only (no from_pretrained),
        # so meaningful weights arrive via reload().
        config = RoFormerConfig.from_pretrained(args_path["model_path"])
        encoder = RoFormerModel(config=config)

        encoder_net = MyBaseModel(encoder, config)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # One classification head per schema, in schema_dict order.
        classifier_list = nn.ModuleList([
            RobertaClassifier(
                hidden_size=config.hidden_size,
                dropout_prob=con.getfloat('para', 'out_dropout_rate'),
                num_labels=len(schema['label2id']),
                dropout_type=con.get('para', 'dropout_type'))
            for schema in self.schema_dict.values()
        ])

        class MultitaskClassifier(nn.Module):
            """Shared transformer followed by a list of per-task heads."""

            def __init__(self, transformer, classifier_list):
                super().__init__()

                self.transformer = transformer
                self.classifier_list = classifier_list

            def forward(self, input_ids, input_mask,
                        segment_ids=None,
                        transformer_mode='mean_pooling',
                        dt_idx=None):
                """Return (list of head outputs, encoder output).

                When ``dt_idx`` is given, only the head at that index is run,
                so the returned list has a single element.
                """
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []

                for idx, classifier in enumerate(self.classifier_list):

                    if dt_idx is not None and idx != dt_idx:
                        continue

                    ce_logits = classifier(hidden_states)
                    outputs_list.append(ce_logits)
                return outputs_list, hidden_states

        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)
        # This object is inference-only: disable dropout from the start instead
        # of only after the first reload().
        self.net.eval()

    def reload(self, model_path):
        """Load a checkpoint's state dict into the network and switch to eval mode.

        Args:
            model_path: path of a file readable by ``torch.load`` whose content
                is a state dict matching ``self.net``.
        """
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()

    def predict(self, text, max_length=256):
        """Score ``text`` against every schema (extract the types it contains).

        Args:
            text: the input string to classify.
            max_length: maximum number of tokens; longer inputs are truncated
                explicitly rather than through the tokenizer's implicit fallback.

        Returns:
            dict mapping each schema type to a list of ``[label, probability]``
            pairs, ordered by label index.
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=max_length, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)

        with torch.no_grad():
            logits_list, _ = self.net(input_ids,
                attention_mask, token_type_ids, transformer_mode='cls')

        scores_dict = {}
        # Heads were created in schema_dict order, so zip pairs them up correctly.
        for schema_type, logits in zip(self.schema_dict.keys(), logits_list):
            scores = torch.softmax(logits, dim=1)[0].detach().cpu().numpy()
            id2label = self.schema_dict[schema_type]['id2label']
            scores_dict[schema_type] = [[id2label[index], float(score)]
                                        for index, score in enumerate(scores)]
        return scores_dict

# Build the shared inference object. The constructor prints each label schema
# as it is loaded; no checkpoint is loaded here -- call risk_api.reload(path)
# before trusting any prediction.
risk_api = RiskInfer('./risk_data/config.ini')




senti /root/xiaoda/query_topic/risk_data/senti_label.txt ===schema-path===
{'label2id': {'负向': 0, '正向': 1}, 'id2label': {0: '负向', 1: '正向'}, 'label_index': 0} ==schema_type== senti
bias /root/xiaoda/query_topic/risk_data/bias_label.txt ===schema-path===
{'label2id': {'偏见': 0, '正常': 1}, 'id2label': {0: '偏见', 1: '正常'}, 'label_index': 1} ==schema_type== bias
ciron /root/xiaoda/query_topic/risk_data/ciron_label.txt ===schema-path===
{'label2id': {'讽刺': 0, '正常': 1}, 'id2label': {0: '讽刺', 1: '正常'}, 'label_index': 2} ==schema_type== ciron
intent /root/xiaoda/query_topic/risk_data/intention_label_v0.txt ===schema-path===
{'label2id': {'主观评价/比较/判断': 0, '寻求建议/帮助': 1, '其它': 2}, 'id2label': {0: '主观评价/比较/判断', 1: '寻求建议/帮助', 2: '其它'}, 'label_index': 3} ==schema_type== intent
offensive /root/xiaoda/query_topic/risk_data/offensive_label.txt ===schema-path===
{'label2id': {'冒犯': 0, '正常': 1}, 'id2label': {0: '冒犯', 1: '正常'}, 'label_index': 4} ==schema_type== offensive
query_risk /root/xiaoda/query_topic/ri

12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/16/2022 17:23:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++


In [9]:

def _read_jsonl(path):
    """Read a JSON-lines file and return its records as a list (one per line)."""
    with open(path, encoding='utf-8') as frobj:
        return [json.loads(line.strip()) for line in frobj]


# Evaluation sets. Only `offensive_test` comes from the test split; every
# other list is read from the dev split.
offensive = _read_jsonl('/data/albert.xht/sentiment/dev/offensive_cold.json')
offensive_test = _read_jsonl('/data/albert.xht/sentiment/test/offensive_cold.json')
cdia_bias = _read_jsonl('/data/albert.xht/sentiment/dev/cdial_bias.json')
senti_copr = _read_jsonl('/data/albert.xht/sentiment/dev/senti_copr.json')
ciron = _read_jsonl('/data/albert.xht/sentiment/dev/chinese_ciron.json')
senti_smp = _read_jsonl('/data/albert.xht/sentiment/dev/senti_smp_usual.json')
senti_smpecisa = _read_jsonl('/data/albert.xht/sentiment/dev/senti_smpecisa.json')
        

In [14]:
from sklearn.metrics import classification_report
from tqdm import tqdm

def eval_all(data, model, key):
    """Run ``model.predict`` over ``data`` and print a classification report.

    Args:
        data: iterable of records; each needs a ``'label'`` entry (its first
            element is used as the gold label) and a ``'text'`` entry that is
            either a string or a list of strings (joined with newlines).
        model: object whose ``predict(text)`` returns a dict of
            ``[label, score]`` lists per schema.
        key: schema name selecting which entry of the prediction to evaluate.

    Returns:
        (pred, gold, pred_score): predicted labels, gold labels, and the raw
        ``[label, score]`` list for ``key`` for every record.
    """
    pred, gold, pred_score = [], [], []
    for sample in tqdm(data):
        gold.append(sample['label'][0])
        raw_text = sample['text']
        text = "\n".join(raw_text) if isinstance(raw_text, list) else raw_text
        label_scores = model.predict(text)[key]
        # Highest score first; the stable sort keeps the earlier label on ties.
        ranked = sorted(label_scores, key=lambda pair: pair[1], reverse=True)
        pred.append(ranked[0][0])
        pred_score.append(label_scores)
    print(classification_report(gold, pred, digits=4))
    return pred, gold, pred_score
    


In [44]:

def evaluation_ece(pred_score, gold, n_bins=15):
    """Compute, print and return the Expected Calibration Error of predictions.

    Args:
        pred_score: one entry per sample, each a sequence of ``[label, prob]``
            pairs. The position of a label within that sequence is used as its
            class index (first occurrence wins).
        gold: gold label names, one per sample; each must appear among the
            labels seen in ``pred_score``.
        n_bins: number of confidence bins passed to ``ECE``.

    Returns:
        The tensor produced by ``ECE`` in 'probs' mode. The same value is
        printed, followed by the ``==ece==`` marker.
    """
    mapping_dict = {}
    prob_rows = []
    for item in pred_score:
        row = []
        for idx, (label, prob) in enumerate(item):
            mapping_dict.setdefault(label, idx)
            row.append(prob)
        prob_rows.append(row)
    pred_score_l = torch.tensor(prob_rows)
    gold_l = torch.tensor([mapping_dict[item] for item in gold])

    ece_fn = ECE(n_bins=n_bins)
    ece = ece_fn(pred_score_l, gold_l, mode='probs')
    print(ece, '==ece==')
    return ece
# pred, gold, pred_score = eval_all(offensive_test, risk_api, 'offensive')
# evaluation_ece(pred_score, gold)


{'冒犯': 0, '正常': 1}
tensor([0.1119]) ==ece==


In [40]:
# NOTE(review): scratch inspection cell. No visible cell binds `p` at notebook
# scope (it only appears as a loop variable inside evaluation_ece), so this
# presumably depended on an earlier inline run -- remove before sharing.
p

['正常', 0.9768216013908386]

In [37]:
# NOTE(review): scratch inspection cell. The value shown depends on which
# cell last assigned `mapping_dict` in the kernel -- remove before sharing.
mapping_dict

{}

In [28]:
# NOTE(review): scratch inspection cell. No visible cell binds `item` at
# notebook scope, so this presumably relied on leftover kernel state --
# remove before sharing.
item

'正常'

In [45]:
def evaluation(model_path):
    """Load a checkpoint into ``risk_api`` and evaluate it on every benchmark.

    For each benchmark a banner is printed, ``eval_all`` prints the
    classification report, and ``evaluation_ece`` prints the calibration error.

    Args:
        model_path: checkpoint path forwarded to ``risk_api.reload``.
    """
    risk_api.reload(model_path)

    # (banner, dataset, schema key). The three sentiment sets share the
    # 'senti' head.
    benchmarks = [
        ('===offensive===', offensive_test, 'offensive'),
        ('===cdia-bias===', cdia_bias, 'bias'),
        ('===ciron===', ciron, 'ciron'),
        ('===chsenti===', senti_copr, 'senti'),
        ('===senti_smpecisa===', senti_smpecisa, 'senti'),
        ('===senti_smp===', senti_smp, 'senti'),
    ]
    for banner, dataset, schema_key in benchmarks:
        print(banner)
        _, gold, pred_score = eval_all(dataset, risk_api, schema_key)
        evaluation_ece(pred_score, gold)
    

In [46]:
# Benchmark checkpoint file `multitask_cls.pth.9` from the
# `multitask_raw_all_intent_v1` output directory.
evaluation('/data/albert.xht/xiaodao/risk_classification/multitask_raw_all_intent_v1/multitask_cls.pth.9')

===offensive===


100%|██████████| 5304/5304 [00:42<00:00, 125.32it/s]


              precision    recall  f1-score   support

          冒犯     0.7050    0.8794    0.7826      2106
          正常     0.9051    0.7577    0.8249      3198

    accuracy                         0.8060      5304
   macro avg     0.8051    0.8185    0.8037      5304
weighted avg     0.8257    0.8060    0.8081      5304

tensor([0.1119]) ==ece==
===cdia-bias===


100%|██████████| 2829/2829 [00:25<00:00, 111.92it/s]


              precision    recall  f1-score   support

          偏见     0.6208    0.4903    0.5479       718
          正常     0.8382    0.8982    0.8671      2111

    accuracy                         0.7946      2829
   macro avg     0.7295    0.6942    0.7075      2829
weighted avg     0.7830    0.7946    0.7861      2829

tensor([0.0231]) ==ece==
===ciron===


100%|██████████| 875/875 [00:08<00:00, 98.27it/s]


              precision    recall  f1-score   support

          正常     0.9255    0.9730    0.9487       779
          讽刺     0.6250    0.3646    0.4605        96

    accuracy                         0.9063       875
   macro avg     0.7753    0.6688    0.7046       875
weighted avg     0.8925    0.9063    0.8951       875

tensor([0.0363]) ==ece==
===chsenti===


100%|██████████| 1200/1200 [00:12<00:00, 96.01it/s]


              precision    recall  f1-score   support

          正向     0.8838    0.9106    0.8970       593
          负向     0.9100    0.8830    0.8963       607

    accuracy                         0.8967      1200
   macro avg     0.8969    0.8968    0.8967      1200
weighted avg     0.8971    0.8967    0.8967      1200

tensor([0.0226]) ==ece==
===senti_smpecisa===


100%|██████████| 2529/2529 [00:22<00:00, 111.83it/s]


              precision    recall  f1-score   support

          正向     0.8053    0.8160    0.8106      1201
          负向     0.8316    0.8215    0.8265      1328

    accuracy                         0.8189      2529
   macro avg     0.8184    0.8188    0.8186      2529
weighted avg     0.8191    0.8189    0.8190      2529

tensor([0.0208]) ==ece==
===senti_smp===


100%|██████████| 2844/2844 [00:22<00:00, 125.04it/s]


              precision    recall  f1-score   support

          正向     0.8540    0.8366    0.8452      1126
          负向     0.8943    0.9063    0.9003      1718

    accuracy                         0.8787      2844
   macro avg     0.8742    0.8714    0.8727      2844
weighted avg     0.8784    0.8787    0.8785      2844

tensor([0.0247]) ==ece==


In [47]:
# Benchmark checkpoint file `multitask_cls.pth.8` from the same
# `multitask_raw_all_intent_v1` output directory, for comparison.
evaluation('/data/albert.xht/xiaodao/risk_classification/multitask_raw_all_intent_v1/multitask_cls.pth.8')

===offensive===


100%|██████████| 5304/5304 [00:42<00:00, 125.25it/s]


              precision    recall  f1-score   support

          冒犯     0.7100    0.8742    0.7836      2106
          正常     0.9023    0.7649    0.8279      3198

    accuracy                         0.8083      5304
   macro avg     0.8061    0.8195    0.8057      5304
weighted avg     0.8259    0.8083    0.8103      5304

tensor([0.1089]) ==ece==
===cdia-bias===


100%|██████████| 2829/2829 [00:22<00:00, 124.64it/s]


              precision    recall  f1-score   support

          偏见     0.6448    0.4652    0.5405       718
          正常     0.8338    0.9128    0.8716      2111

    accuracy                         0.7992      2829
   macro avg     0.7393    0.6890    0.7060      2829
weighted avg     0.7859    0.7992    0.7875      2829

tensor([0.0267]) ==ece==
===ciron===


100%|██████████| 875/875 [00:06<00:00, 125.62it/s]


              precision    recall  f1-score   support

          正常     0.9210    0.9730    0.9463       779
          讽刺     0.5962    0.3229    0.4189        96

    accuracy                         0.9017       875
   macro avg     0.7586    0.6480    0.6826       875
weighted avg     0.8854    0.9017    0.8885       875

tensor([0.0286]) ==ece==
===chsenti===


100%|██████████| 1200/1200 [00:09<00:00, 121.02it/s]


              precision    recall  f1-score   support

          正向     0.8873    0.9157    0.9012       593
          负向     0.9150    0.8863    0.9004       607

    accuracy                         0.9008      1200
   macro avg     0.9011    0.9010    0.9008      1200
weighted avg     0.9013    0.9008    0.9008      1200

tensor([0.0237]) ==ece==
===senti_smpecisa===


100%|██████████| 2529/2529 [00:20<00:00, 124.04it/s]


              precision    recall  f1-score   support

          正向     0.8112    0.8193    0.8152      1201
          负向     0.8351    0.8276    0.8313      1328

    accuracy                         0.8236      2529
   macro avg     0.8232    0.8234    0.8233      2529
weighted avg     0.8238    0.8236    0.8237      2529

tensor([0.0197]) ==ece==
===senti_smp===


100%|██████████| 2844/2844 [00:22<00:00, 124.53it/s]


              precision    recall  f1-score   support

          正向     0.8557    0.8428    0.8492      1126
          负向     0.8980    0.9069    0.9024      1718

    accuracy                         0.8815      2844
   macro avg     0.8769    0.8748    0.8758      2844
weighted avg     0.8813    0.8815    0.8813      2844

tensor([0.0279]) ==ece==


In [48]:
# Benchmark checkpoint file `multitask_cls.pth.9` from the
# `multitask_raw_all_focal` output directory.
evaluation('/data/albert.xht/xiaodao/risk_classification/multitask_raw_all_focal/multitask_cls.pth.9')

===offensive===


100%|██████████| 5304/5304 [00:42<00:00, 125.80it/s]


              precision    recall  f1-score   support

          冒犯     0.7068    0.8746    0.7818      2106
          正常     0.9021    0.7611    0.8256      3198

    accuracy                         0.8062      5304
   macro avg     0.8045    0.8179    0.8037      5304
weighted avg     0.8246    0.8062    0.8082      5304

tensor([0.0195]) ==ece==
===cdia-bias===


100%|██████████| 2829/2829 [00:22<00:00, 125.39it/s]


              precision    recall  f1-score   support

          偏见     0.5951    0.4749    0.5283       718
          正常     0.8329    0.8901    0.8605      2111

    accuracy                         0.7847      2829
   macro avg     0.7140    0.6825    0.6944      2829
weighted avg     0.7725    0.7847    0.7762      2829

tensor([0.1170]) ==ece==
===ciron===


100%|██████████| 875/875 [00:06<00:00, 125.42it/s]


              precision    recall  f1-score   support

          正常     0.9230    0.9692    0.9455       779
          讽刺     0.5789    0.3438    0.4314        96

    accuracy                         0.9006       875
   macro avg     0.7510    0.6565    0.6884       875
weighted avg     0.8852    0.9006    0.8891       875

tensor([0.1106]) ==ece==
===chsenti===


100%|██████████| 1200/1200 [00:09<00:00, 121.55it/s]


              precision    recall  f1-score   support

          正向     0.8807    0.9089    0.8946       593
          负向     0.9082    0.8797    0.8937       607

    accuracy                         0.8942      1200
   macro avg     0.8944    0.8943    0.8942      1200
weighted avg     0.8946    0.8942    0.8942      1200

tensor([0.1561]) ==ece==
===senti_smpecisa===


100%|██████████| 2529/2529 [00:20<00:00, 126.43it/s]


              precision    recall  f1-score   support

          正向     0.8163    0.8251    0.8207      1201
          负向     0.8403    0.8321    0.8362      1328

    accuracy                         0.8288      2529
   macro avg     0.8283    0.8286    0.8284      2529
weighted avg     0.8289    0.8288    0.8288      2529

tensor([0.1556]) ==ece==
===senti_smp===


100%|██████████| 2844/2844 [00:22<00:00, 125.37it/s]


              precision    recall  f1-score   support

          正向     0.8627    0.8313    0.8467      1126
          负向     0.8920    0.9133    0.9025      1718

    accuracy                         0.8808      2844
   macro avg     0.8773    0.8723    0.8746      2844
weighted avg     0.8804    0.8808    0.8804      2844

tensor([0.1674]) ==ece==


In [24]:
# Stand-alone ECE module with 15 confidence bins, used for manual checks.
ece_fn = ECE(n_bins=15)

In [45]:
# Manual ECE check for the offensive task.
# NOTE(review): `gold` and `pred_score` are not assigned at notebook scope by
# any visible cell; they presumably come from an earlier
# `pred, gold, pred_score = eval_all(...)` run -- confirm before re-running.
mapping_dict = {
    '冒犯':0,
    '正常':1
}
# Label names ordered by class id, so probability columns follow mapping_dict
# instead of whatever order the prediction list happens to use.
labels_by_id = sorted(mapping_dict, key=mapping_dict.get)

gold_l = torch.tensor([mapping_dict[item] for item in gold])
pred_score_l = torch.tensor([[dict(item)[label] for label in labels_by_id] for item in pred_score])

ece_fn(pred_score_l, gold_l, mode='probs')

tensor([0.1104])

In [43]:
# Manual ECE check for the sentiment task.
# NOTE(review): `gold` and `pred_score` are not assigned at notebook scope by
# any visible cell; they presumably come from an earlier
# `pred, gold, pred_score = eval_all(...)` run -- confirm before re-running.
mapping_dict = {
    '负向':0,
    '正向':1
}
# Label names ordered by class id, so probability columns follow mapping_dict
# instead of whatever order the prediction list happens to use.
labels_by_id = sorted(mapping_dict, key=mapping_dict.get)

gold_l = torch.tensor([mapping_dict[item] for item in gold])
pred_score_l = torch.tensor([[dict(item)[label] for label in labels_by_id] for item in pred_score])

ece_fn(pred_score_l, gold_l, mode='probs')

tensor([0.0198])