In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

# Make the project packages (e.g. ``nets``) importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate entries.
if '/root/xiaoda/query_topic/' not in sys.path:
    sys.path.append('/root/xiaoda/query_topic/')

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    """Expected Calibration Error computed over equal-width confidence bins.

    For every bin ``(lower, upper]`` the absolute gap between the mean
    confidence and the accuracy of the samples falling in that bin is weighted
    by the fraction of samples in the bin; the weighted gaps are summed.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        # Consecutive edge pairs delimit the bins.
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels, mode='logits'):
        """Return the ECE as a tensor of shape ``(1,)`` on ``logits.device``.

        logits: per-class scores; softmax is applied over dim 1 when
            ``mode == 'logits'``, otherwise the values are used as-is.
        labels: class indices compared against the arg-max prediction.
        """
        probs = F.softmax(logits, dim=1) if mode == 'logits' else logits
        confidences, predictions = probs.max(dim=1)
        hits = predictions.eq(labels).float()

        total = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            # Samples whose confidence lies in the half-open interval (lower, upper].
            mask = (confidences > lower) & (confidences <= upper)
            weight = mask.float().mean()
            if weight.item() > 0:
                gap = torch.abs(confidences[mask].mean() - hits[mask].mean())
                total += gap * weight

        return total

In [25]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier
from nets.simcse import MLPLayer, Similarity

import configparser
from tqdm import tqdm

# Project root directory; RiskInfer joins the config path and the label-list
# paths onto this directory (absolute paths are left unchanged by os.path.join).
cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    """Read a label list (one label per line) and build the id mappings.

    Each line is stripped of surrounding whitespace and receives its
    zero-based line index as id. The file is decoded as UTF-8 explicitly so
    that non-ASCII labels load identically regardless of the platform locale.

    Args:
        filepath: path of the label list file.

    Returns:
        ``(label2id, id2label)``: a dict from label to index and a dict from
        index to label. If a label occurs on several lines, ``label2id`` keeps
        the last index while ``id2label`` keeps every line.
    """
    with open(filepath, 'r', encoding='utf-8') as frobj:
        label_list = [line.strip() for line in frobj]

    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = dict(enumerate(label_list))
    return label2id, id2label

class RiskInfer(object):
    def __init__(self, config_path):
        """Build the tokenizer, the label schemas and the multi-task network.

        The encoder is constructed from its config only, so the network holds
        freshly initialised weights after construction; call ``reload`` with a
        checkpoint path before running inference.

        Args:
            config_path: path of the INI file, joined onto ``cur_dir_path``.
                Its ``paths`` and ``para`` sections are merged (``para`` wins
                on duplicate keys) and must provide ``model_path`` and
                ``label_path``; ``para`` must also provide
                ``out_dropout_rate`` and ``dropout_type``. ``model_type`` is
                optional and defaults to ``'bert'``.

        Raises:
            ValueError: if ``model_type`` is not ``'bert'``, ``'roformer'``
                or ``'erine'``.
        """
        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = {**dict(con.items('paths')), **dict(con.items("para"))}
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        from collections import OrderedDict
        # Schema order defines the order of the classifier heads below.
        self.schema_dict = OrderedDict()
        self.schema2schema_id = {}
        self.schema_id2schema = {}

        # ``label_path`` is a comma separated list of ``schema_type:label_file``.
        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            # Split on the first colon only so label paths may contain colons.
            schema_type, schema_path = schema_info.split(':', 1)
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id':label2id,
                'id2label':id2label,
                'label_index':label_index
            }
            self.schema2schema_id[schema_type] = label_index
            self.schema_id2schema[label_index] = schema_type

        # Encoder backends are imported lazily so only the selected one is required.
        model_type = args_path.get('model_type', 'bert')
        if model_type == 'bert':
            from transformers import BertModel, BertConfig
            config = BertConfig.from_pretrained(args_path["model_path"])
            encoder = BertModel(config=config)
        elif model_type == 'roformer':
            from roformer import RoFormerModel, RoFormerConfig
            config = RoFormerConfig.from_pretrained(args_path["model_path"])
            encoder = RoFormerModel(config=config)
        elif model_type == 'erine':
            from nets.erine import ErnieConfig, ErnieModel
            config = ErnieConfig.from_pretrained(args_path["model_path"])
            encoder = ErnieModel(config=config)
        else:
            # Previously this fell through and failed later with a NameError.
            raise ValueError(
                "unsupported model_type {!r}; expected 'bert', 'roformer' or 'erine'".format(model_type))

        print(model_type)

        encoder_net = MyBaseModel(encoder, config)

        if torch.cuda.is_available():
            # GPU 1 is preferred as before; fall back to GPU 0 on single-GPU hosts
            # where "cuda:1" is not a valid device.
            self.device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
        else:
            self.device = "cpu"

        dropout_prob = con.getfloat('para', 'out_dropout_rate')
        dropout_type = con.get('para', 'dropout_type')

        # One classification head per schema, in schema order.
        classifier_list = nn.ModuleList([
            RobertaClassifier(
                hidden_size=config.hidden_size,
                dropout_prob=dropout_prob,
                num_labels=len(schema['label2id']),
                dropout_type=dropout_type)
            for schema in self.schema_dict.values()
        ])

        class MultitaskClassifier(nn.Module):
            """Shared encoder followed by one classifier head per schema."""

            def __init__(self, transformer, classifier_list):
                super().__init__()

                self.transformer = transformer
                self.classifier_list = classifier_list

                self.pooler_mlp = MLPLayer(config.hidden_size, 256)

            def forward(self, input_ids, input_mask,
                        segment_ids=None,
                        transformer_mode='mean_pooling',
                        dt_idx=None, mode='predict'):
                """Run the encoder and the selected heads.

                dt_idx: optional container of head indices to evaluate; when
                    empty or None every head runs. Skipped heads contribute an
                    empty list so positions still line up with the schemas.
                mode: ``'predict'`` applies a softmax over dim 1 to each
                    head's scores; any other value returns the raw scores.

                Returns ``(outputs_list, hidden_states, embeddings,
                pooler_output_embeddings)`` where the last two are the
                encoder output and the MLP-projected output, each divided by
                its L2 norm along dim 1.
                """
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []

                for idx, classifier in enumerate(self.classifier_list):
                    if dt_idx and idx not in dt_idx:
                        outputs_list.append([])
                        continue

                    scores = classifier(hidden_states)
                    if mode == 'predict':
                        scores = torch.softmax(scores, dim=1)
                    outputs_list.append(scores)
                pooler_output = self.pooler_mlp(hidden_states)
                embeddings = hidden_states / hidden_states.norm(dim=1, keepdim=True)
                pooler_output_embeddings = pooler_output / pooler_output.norm(dim=1, keepdim=True)
                return outputs_list, hidden_states, embeddings, pooler_output_embeddings

        # Checkpoint weights are loaded separately through ``reload``.
        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)
        
    def reload(self, model_path):
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()
        self.net = self.net.half()

    def predict(self, text, allowed_schema_type={}):

        """抽取输入text所包含的类型
        """
        # start = time.time()
        # encoder_txt = self.tokenizer.encode_plus(text, max_length=256)
        # input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        # token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        # attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)
        # print(time.time() - start, '====tokenization====')
        
        start = time.time()
        encoder_txt = self.tokenizer([text], max_length=512)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).to(self.device)
        # print(time.time() - start, '====tokenization====')
        
        allowed_schema_type_ids = {}
        for schema_type in allowed_schema_type:
            allowed_schema_type_ids[self.schema2schema_id[schema_type]] = schema_type
        
        scores_dict = {}
        start = time.time()
        with torch.no_grad():
            [logits_list, 
            hidden_states, 
             embeddings, 
             pooler_output_embeddings] = self.net(input_ids, 
                attention_mask, token_type_ids, transformer_mode='cls', dt_idx=allowed_schema_type_ids)
        # print(time.time() - start, '====inference====')
        
        old_start = time.time()
        
        for schema_idx, (schema_type, scores) in enumerate(zip(list(self.schema_dict.keys()), logits_list)):
            if allowed_schema_type:
                if schema_type not in allowed_schema_type:
                    continue
            scores = scores[0].data.cpu().numpy()
            scores_dict[schema_type] = []
            for index, score in enumerate(scores):
                scores_dict[schema_type].append([self.schema_dict[schema_type]['id2label'][index], 
                                        float(score)])
            if len(scores_dict[schema_type]) >= 5:
                schema_type_scores = sorted(scores_dict[schema_type], key=lambda item:item[1], reverse=True)
                scores_dict[schema_type] = schema_type_scores[0:5]
        return scores_dict, embeddings.data.cpu().numpy(), pooler_output_embeddings.data.cpu().numpy()
    
    def get_logitnorm(self, text):
        """抽取输入text所包含的类型
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=512)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)
        
        scores_dict = {}
        logits_norm_list = []
        with torch.no_grad():
            [logits_list, 
            hidden_states] = self.net(input_ids, 
                attention_mask, token_type_ids, transformer_mode='cls')
            for logits in logits_list:
                logits_norm_list.append(logits/torch.norm(logits, p=2, dim=-1, keepdim=True) + 1e-7)
        for schema_type, logit_norm in zip(list(self.schema_dict.keys()), logits_norm_list):
            scores_dict[schema_type] = logit_norm[0].data.cpu().numpy()
        return scores_dict
            
    
    def predict_batch(self, text, allowed_schema_type={}):
        if isinstance(text, list):
            text_list = text
        else:
            text_list = [text]
        model_input = self.tokenizer(text_list, max_length=512, truncation=True, return_tensors="pt",padding=True)
        for key in model_input:
            model_input[key] = model_input[key].to(self.device)
        
        allowed_schema_type_ids = {}
        for schema_type in allowed_schema_type:
            allowed_schema_type_ids[self.schema2schema_id[schema_type]] = schema_type
            
        with torch.no_grad():
            [logits_list, 
            hidden_states,
            embeddings, 
             pooler_output_embeddings] = self.net(model_input['input_ids'], 
                                                model_input['attention_mask'], 
                                                model_input['token_type_ids'], 
                                                transformer_mode='cls', 
                                                dt_idx=allowed_schema_type_ids)
        score_dict_list = []
        embedding_array = []
        pooler_output_embeddings_array = []
        for idx, text in enumerate(text_list):
            scores_dict = {}
            for schema_idx, (schema_type, scores) in enumerate(zip(list(self.schema_dict.keys()), logits_list)):
                if allowed_schema_type:
                    if schema_type not in allowed_schema_type:
                        continue
                # scores = torch.nn.Softmax(dim=1)(logits)[idx].data.cpu().numpy()
                scores = scores[idx].data.cpu().numpy()
                scores_dict[schema_type] = []
                for index, score in enumerate(scores):
                    scores_dict[schema_type].append([self.schema_dict[schema_type]['id2label'][index], 
                                            float(score)])
                if len(scores_dict[schema_type]) >= 5:
                    schema_type_scores = sorted(scores_dict[schema_type], key=lambda item:item[1], reverse=True)
                    scores_dict[schema_type] = schema_type_scores[0:5]
            score_dict_list.append(scores_dict)
        embedding_array = embeddings.data.cpu().numpy()
        pooler_output_embeddings_array = pooler_output_embeddings.data.cpu().numpy()
        return score_dict_list, embedding_array, pooler_output_embeddings_array




In [26]:
# Build the multi-task classifier from its INI config. The config path is absolute,
# so the os.path.join with cur_dir_path inside RiskInfer leaves it unchanged.
erine_green_green_topic_risk_open_all_politics_detail_api = RiskInfer('/root/xiaoda/query_topic/resources_open_all_politics_detail_yewu_en_erine_update_simcse/topic_query_risk/config.ini')
# Checkpoint holding the trained weights for the network built above.
model_path = '/data/albert.xht/xiaoda/risk_classification/erine_simcse/opic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_add_nli_instruct_politics_detail_mtdnn_v39_simcse_zh_en_1e-4_odps//multitask_cls.pth.9'
# reload() loads the state dict, switches the network to eval mode and casts it to half precision.
erine_green_green_topic_risk_open_all_politics_detail_api.reload(model_path)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ErnieTokenizer'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ErnieTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.


toxicity /data/albert.xht/xiaoda/sentiment/jigsaw-unintended-bias-in-toxicity-classification/toxicity_multiclass_label_list.txt ===schema-path===
stackoverflow_topic /data/albert.xht/xiaoda/sentiment/stackoverflow_topic/stackoverflow_topic_label_list.txt ===schema-path===
nli /data/albert.xht/xiaoda/sentiment/classification/cmnli/cmnli_label_list.txt ===schema-path===
lcqmc /data/albert.xht/xiaoda/sentiment/classification/paws-x-zh/paws_label_list.txt ===schema-path===
yewu /data/albert.xht/xiaoda/sentiment/yewu_v1/yewu_label_list.txt ===schema-path===
risk_news /data/albert.xht/xiaoda/sentiment/risk_news/risk_news_label_list.txt ===schema-path===
tnews /data/albert.xht/xiaoda/sentiment/classification/tnews_v1/tnews_label_list.txt ===schema-path===
title2event /data/albert.xht/xiaoda/sentiment/classification/title2event_v1/title2event_label_list.txt ===schema-path===
fewfc_2022 /data/albert.xht/xiaoda/sentiment/classification/fewfc_2022/fewfc_2022_label_list.txt ===schema-path===
duee 

05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
05/04/2023 10:31:16 - INFO - n

erine


In [147]:
import time
from pprint import pprint

# An empty filter scores every schema head. To restrict scoring, add schema
# names as keys (they must exist in the model's schema list), e.g. the names
# from the earlier experiments: 'topic', 'query_risk', 'porn', 'abusive',
# 'offensive', 'intent', 'teenager', 'politics'.
allowed_schema_type = {}

# Time one batched forward pass over three sample inputs.
start = time.time()
resp = erine_green_green_topic_risk_open_all_politics_detail_api.predict_batch(
    ["阿斯顿了看回放个可视电话风口浪尖啊都是发就是的卡",'I really hate her', 'I hate you[SEP]I love you'],
    allowed_schema_type=allowed_schema_type,
)
print(time.time()-start)



0.01972198486328125


In [106]:
# Whitespace separated dataset names and the path template of their dataset directories.
s = 'nli porn query_risk intent teenager senti stackoverflow_topic genocide politics tnews query_resposne_risk humiliate offensive title2event risk_news ciron attack_defend violence cmid senti_query lcqmc yewu bias topic hatespeech fewfc_2022 insult toxicity dehumanize abusive duee ethics_common'
root = '/mnt/albert.xht/xiaoda/sentiment/huggingface_dataset/dataset_{}'

# One "name:path" entry per dataset, in the order listed above.
sss = [d + ':' + root.format(d) for d in s.split()]

In [148]:
# resp[1] is the embedding array returned by predict_batch; the network divides each
# embedding by its L2 norm, so this sum of element-wise products is the cosine
# similarity between the first two input texts.
np.sum(resp[1][0]*resp[1][1])

0.723