In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

sys.path.extend(['/root/xiaoda/query_topic/'])

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    """Expected Calibration Error computed over equal-width confidence bins."""

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        # Bin i covers the half-open interval (bin_lowers[i], bin_uppers[i]].
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels, mode='logits'):
        """Return the ECE as a tensor of shape (1,).

        logits: per-class scores; softmax is applied over dim 1 when
            ``mode == 'logits'``, otherwise the values are used as given.
        labels: target class indices compared against the argmax over dim 1.
        """
        probs = F.softmax(logits, dim=1) if mode == 'logits' else logits
        confidences, predictions = probs.max(dim=1)
        hits = predictions.eq(labels)

        total = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            mask = (confidences > lower) & (confidences <= upper)
            weight = mask.float().mean()
            # `not (> 0)` rather than `<= 0` so a NaN weight (empty input)
            # is skipped as well.
            if not weight.item() > 0:
                continue
            # |mean confidence - accuracy| of the bin, weighted by its share of samples.
            gap = confidences[mask].mean() - hits[mask].float().mean()
            total += torch.abs(gap) * weight

        return total

In [4]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    """Read a label file (one label per line) and build both index mappings.

    Args:
        filepath: path of a UTF-8 text file with one label per line.

    Returns:
        (label2id, id2label): ``label2id`` maps the stripped line text to its
        zero-based line index, ``id2label`` is the inverse. Every line yields
        an entry, so ids always equal line positions; if a label occurs twice
        the later index wins in ``label2id``.
    """
    # Decode explicitly as UTF-8: the label names are non-ASCII, and the
    # platform default encoding is locale dependent.
    with open(filepath, 'r', encoding='utf-8') as frobj:
        label_list = [line.strip() for line in frobj]

    label2id = {}
    id2label = {}
    for idx, label in enumerate(label_list):
        label2id[label] = idx
        id2label[idx] = label
    return label2id, id2label

class RiskInfer(object):
    """Inference wrapper: one shared BERT encoder plus one classifier head per schema.

    The constructor only builds the network from the config file (the encoder
    is created from its config, not from pretrained weights); call
    :meth:`reload` with a checkpoint path before predicting.
    """

    # Token budget shared by every tokenizer call so that all entry points
    # truncate long inputs the same way.
    MAX_LENGTH = 256
    # Number of best-scoring labels reported for the 'topic' schema.
    TOPIC_TOP_K = 5

    def __init__(self, config_path):
        """Build the tokenizer, the label schemas and the multitask network.

        Args:
            config_path: ini file path, joined onto ``cur_dir_path``. The
                merged ``paths``/``para`` sections must provide ``model_path``
                and ``label_path`` (comma separated ``schema_type:label_file``
                pairs); ``para`` must provide ``out_dropout_rate`` and
                ``dropout_type``.
        """
        import os
        import torch
        from collections import OrderedDict
        from transformers import BertModel, BertConfig

        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        # schema_type -> {'label2id', 'id2label', 'label_index'}; insertion
        # order defines which classifier head belongs to which schema.
        self.schema_dict = OrderedDict({})
        self.schema2schema_id = {}
        self.schema_id2schema = {}

        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            schema_type, schema_path = schema_info.split(':')
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id':label2id,
                'id2label':id2label,
                'label_index':label_index
            }
            self.schema2schema_id[schema_type] = label_index
            self.schema_id2schema[label_index] = schema_type

        config = BertConfig.from_pretrained(args_path["model_path"])
        encoder = BertModel(config=config)

        encoder_net = MyBaseModel(encoder, config)

        if torch.cuda.is_available():
            # Keep the preference for the second GPU, but fall back to the
            # first one on single-GPU hosts where "cuda:1" does not exist.
            self.device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
        else:
            self.device = "cpu"

        # One classification head per schema, in schema_dict order.
        classifier_list = nn.ModuleList([
            RobertaClassifier(
                hidden_size=config.hidden_size,
                dropout_prob=con.getfloat('para', 'out_dropout_rate'),
                num_labels=len(self.schema_dict[schema_key]['label2id']),
                dropout_type=con.get('para', 'dropout_type'))
            for schema_key in self.schema_dict.keys()
        ])

        class MultitaskClassifier(nn.Module):
            """Shared transformer followed by a list of per-schema heads."""

            def __init__(self, transformer, classifier_list):
                super().__init__()

                # Attribute names are part of the checkpoint's state_dict keys.
                self.transformer = transformer
                self.classifier_list = classifier_list

            def forward(self, input_ids, input_mask,
                        segment_ids=None,
                        transformer_mode='mean_pooling',
                        dt_idx=None, mode='predict'):
                """Run the encoder once, then every requested head.

                dt_idx: optional container of head indices to evaluate; heads
                    not listed contribute an empty list so that positions in
                    the returned list still line up with the schemas.
                mode: 'predict' returns softmax probabilities, any other
                    value returns the raw head outputs.
                """
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []

                for idx, classifier in enumerate(self.classifier_list):
                    if dt_idx and idx not in dt_idx:
                        outputs_list.append([])
                        continue

                    scores = classifier(hidden_states)
                    if mode == 'predict':
                        scores = torch.nn.Softmax(dim=1)(scores)
                    outputs_list.append(scores)
                return outputs_list, hidden_states

        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)

    def reload(self, model_path):
        """Load checkpoint weights from ``model_path`` and switch to fp16 eval mode.

        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.
        NOTE(review): ``half()`` is applied on CPU as well; confirm fp16
        inference is supported there by the installed torch version.
        """
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()
        self.net = self.net.half()

    def _allowed_schema_ids(self, allowed_schema_type):
        """Map the requested schema names to {head index: schema name}."""
        return {self.schema2schema_id[schema_type]: schema_type
                for schema_type in allowed_schema_type}

    def _format_scores(self, scores_list, row, allowed_schema_type):
        """Turn the per-head score tensors of one batch row into a result dict.

        Returns {schema_type: [[label, score], ...]} in label-id order; the
        'topic' schema is instead sorted by score and cut to TOPIC_TOP_K.
        """
        scores_dict = {}
        for schema_type, scores in zip(self.schema_dict.keys(), scores_list):
            if allowed_schema_type and schema_type not in allowed_schema_type:
                continue
            id2label = self.schema_dict[schema_type]['id2label']
            row_scores = scores[row].data.cpu().numpy()
            labelled = [[id2label[index], float(score)]
                        for index, score in enumerate(row_scores)]
            if schema_type in ['topic']:
                labelled = sorted(labelled, key=lambda item: item[1],
                                  reverse=True)[0:self.TOPIC_TOP_K]
            scores_dict[schema_type] = labelled
        return scores_dict

    def predict(self, text, allowed_schema_type=None):
        """Classify a single text under every (or only the allowed) schema.

        Args:
            text: the input string.
            allowed_schema_type: optional container of schema names; when
                empty or None all schemas are evaluated.

        Returns:
            {schema_type: [[label, probability], ...]}.
        """
        allowed_schema_type = allowed_schema_type or {}

        # truncation=True makes the max_length cut explicit; without it the
        # tokenizer warns and falls back to the same 'longest_first' strategy.
        encoder_txt = self.tokenizer([text], max_length=self.MAX_LENGTH, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).to(self.device)

        with torch.no_grad():
            scores_list, _ = self.net(
                input_ids, attention_mask, token_type_ids,
                transformer_mode='cls',
                dt_idx=self._allowed_schema_ids(allowed_schema_type))

        return self._format_scores(scores_list, 0, allowed_schema_type)

    def get_logitnorm(self, text):
        """Return the L2-normalised raw head outputs for a single text.

        Returns:
            {schema_type: numpy array of ``logits / (||logits||_2 + 1e-7)``}.
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=self.MAX_LENGTH, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)

        with torch.no_grad():
            # Any mode other than 'predict' skips the softmax, so these are
            # the raw logits rather than probabilities.
            logits_list, _ = self.net(
                input_ids, attention_mask, token_type_ids,
                transformer_mode='cls', mode='logits')

        scores_dict = {}
        for schema_type, logits in zip(self.schema_dict.keys(), logits_list):
            norm = torch.norm(logits, p=2, dim=-1, keepdim=True)
            # The epsilon guards the division, so it belongs to the denominator.
            logit_norm = logits / (norm + 1e-7)
            scores_dict[schema_type] = logit_norm[0].data.cpu().numpy()
        return scores_dict

    def predict_batch(self, text, allowed_schema_type=None):
        """Classify a list of texts (or a single text) in one forward pass.

        Returns:
            A list with one ``{schema_type: [[label, probability], ...]}``
            dict per input text, in input order.
        """
        allowed_schema_type = allowed_schema_type or {}
        text_list = text if isinstance(text, list) else [text]
        if not text_list:
            return []

        # Same length cap as predict(), so both entry points see identical
        # inputs and over-long texts cannot exceed the encoder's range.
        model_input = self.tokenizer(text_list, return_tensors="pt", padding=True,
                                     truncation=True, max_length=self.MAX_LENGTH)
        model_input = {key: model_input[key].to(self.device) for key in model_input}

        with torch.no_grad():
            scores_list, _ = self.net(
                model_input['input_ids'],
                model_input['attention_mask'],
                model_input['token_type_ids'],
                transformer_mode='cls',
                dt_idx=self._allowed_schema_ids(allowed_schema_type))

        return [self._format_scores(scores_list, idx, allowed_schema_type)
                for idx in range(len(text_list))]

# risk_api = RiskInfer('./risk_data/config.ini')
# risk_api = RiskInfer('./risk_data_v5/config_offensive_risk.ini')




In [16]:
# Time a single tokenizer call on a one-element batch and print the elapsed
# seconds together with the encoded output.
# NOTE(review): `green_green_topic_risk_api` is constructed in a cell that
# appears further down, so this cell fails on a fresh top-to-bottom run.
import time
start = time.time()
f = green_green_topic_risk_api.tokenizer(['你是傻逼'])
print(time.time()-start, f)

0.0003151893615722656 {'input_ids': [[101, 872, 3221, 1004, 6873, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}


In [5]:
# green_risk_api = RiskInfer('./risk_data_tiny/config_risk.ini')

In [6]:
# green_topic_risk_api = RiskInfer('./risk_data_tiny/config_topic_risk.ini')

In [5]:



# Build the multitask model described by this config. Weights are not loaded
# here; a later cell calls `.reload(<checkpoint>)` on this object.
green_green_topic_risk_api = RiskInfer('./risk_data_tiny/config_topic_risk_green_v1.ini')


topic /data/albert.xht/raw_chat_corpus/topic_classification_v4/label_list.txt ===schema-path===
senti_query /data/albert.xht/xiaoda/sentiment/senti/senti_query_label.txt ===schema-path===
senti /data/albert.xht/xiaoda/sentiment/senti/senti_label.txt ===schema-path===
bias /data/albert.xht/xiaoda/sentiment/bias/bias_label.txt ===schema-path===
ciron /data/albert.xht/xiaoda/sentiment/ciron/ciron_label.txt ===schema-path===
intent /data/albert.xht/xiaoda/sentiment/intention_data_v2-1/label.txt ===schema-path===
offensive /data/albert.xht/xiaoda/sentiment/offensive/offensive_label.txt ===schema-path===
query_risk /data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_label.txt ===schema-path===
teenager /data/albert.xht/xiaoda/sentiment/teenager//teenager_label.txt ===schema-path===
politics /data/albert.xht/xiaoda/sentiment/green_politics/green_politics_label.txt ===schema-path===
porn /data/albert.xht/xiaoda/sentiment/green_porn_v1/green_porn_label.txt ===schema-path===
abusive /dat

03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/13/2023 20:12:21 - INFO - n

In [6]:
# green_green_topic_v5_risk_base_api = RiskInfer('./risk_data_tiny/config_topic_v5_risk_green_v1_base.ini')

In [10]:
# green_green_topic_v5_risk_api = RiskInfer('./risk_data_tiny/config_topic_v5_risk_green_v1.ini')

In [60]:
# Checkpoint to load into `green_risk_api`.
model_path = '/data/albert.xht/xiaodao/risk_classification/tiny/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.13'

# NOTE(review): the only visible cell that constructs `green_risk_api` is
# commented out, so this raises NameError on a fresh kernel unless that cell
# is re-enabled.
green_risk_api.reload(model_path)

In [6]:
# Checkpoint to load into `green_topic_risk_api` (the commented line is an
# earlier alternative checkpoint).
# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.10'
model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.19'

# NOTE(review): the only visible cell that constructs `green_topic_risk_api`
# is commented out, so this raises NameError on a fresh kernel unless that
# cell is re-enabled.
green_topic_risk_api.reload(model_topic_path)

In [8]:
# Select the checkpoint for `green_green_topic_risk_api`. Only the single
# uncommented assignment below takes effect; the commented lines are
# alternative checkpoints kept for reference.
# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_green_v1_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.19'
# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_green_v1_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.19'

# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.19'
# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v20/multitask_cls.pth.13'

# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v21/multitask_cls.pth.19'

model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v22/multitask_cls.pth.19'

# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v23/multitask_cls.pth.9'
# Load the weights, switch to eval mode and cast to half precision.
green_green_topic_risk_api.reload(model_topic_path)


In [6]:
# Checkpoint to load into `green_green_topic_v5_risk_api`.
# NOTE: this rebinds `model_topic_path`, which other reload cells also assign.
model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v5_green_v1_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.10'

# NOTE(review): the only visible cell that constructs
# `green_green_topic_v5_risk_api` is commented out, so this raises NameError
# on a fresh kernel unless that cell is re-enabled.
green_green_topic_v5_risk_api.reload(model_topic_path)

In [6]:
# Checkpoint to load into `green_green_topic_v5_risk_base_api`.
# NOTE: this rebinds `model_topic_path`, which other reload cells also assign.
model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v5_green_v1_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18_base/multitask_cls.pth.15'

# NOTE(review): the only visible cell that constructs
# `green_green_topic_v5_risk_base_api` is commented out, so this raises
# NameError on a fresh kernel unless that cell is re-enabled.
green_green_topic_v5_risk_base_api.reload(model_topic_path)

In [9]:
from sklearn.metrics import classification_report
from tqdm import tqdm
import re

def predict_fn(text, model, key=None):
    """Strip punctuation from ``text`` and run ``model.predict`` on it.

    Args:
        text: a string, or a list of strings that is joined with newlines.
        model: object exposing ``predict(text)`` that returns
            ``{schema: [[label, score], ...]}``.
        key: optional schema name. When given, only the best-scoring
            ``[label, score]`` pair of that schema is returned.

    Returns:
        The best ``[label, score]`` pair for ``key``, or the complete
        prediction dict when ``key`` is None.
    """
    # Work on the function's own arguments instead of the loop globals
    # (`item`, `key`) that happened to exist in the notebook namespace.
    if isinstance(text, list):
        text = "\n".join(text)
    # Remove runs of CJK / ASCII punctuation and symbols before inference.
    text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", text)

    result = model.predict(text)
    if key is None:
        return result
    # max() returns the first pair with the highest score.
    return max(result[key], key=lambda u: u[1])
    

def eval_all(data, model, key):
    """Predict schema ``key`` for every item and print a classification report.

    Each item supplies ``item['text']`` (a string or a list of strings joined
    with newlines) and ``item['label']``, whose first element is the gold label.

    Returns:
        (pred, gold, pred_score): predicted labels, gold labels and the raw
        ``[[label, score], ...]`` list of schema ``key`` for every item.
    """
    # Runs of CJK / ASCII punctuation and symbols are removed before inference.
    punct_re = re.compile(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+")

    pred, gold, pred_score = [], [], []
    for item in tqdm(data):
        gold.append(item['label'][0])

        raw_text = item['text']
        joined = "\n".join(raw_text) if isinstance(raw_text, list) else raw_text
        cleaned = punct_re.sub("", joined)

        result = model.predict(cleaned)
        ranked = sorted(result[key], key=lambda u: u[1], reverse=True)
        pred.append(ranked[0][0])
        pred_score.append(result[key])

    print(classification_report(gold, pred, digits=4))
    return pred, gold, pred_score
    

def evaluation_ece(pred_score, gold):
    """Print the 15-bin ECE of per-item ``[[label, score], ...]`` predictions.

    The column of a label is the position at which it is first seen in any
    item, so every item is expected to list its labels in the same order.
    """
    label_to_col = {}
    prob_rows = []
    for pairs in pred_score:
        row = []
        for col, entry in enumerate(pairs):
            # Keep the first position seen for each label.
            label_to_col.setdefault(entry[0], col)
            row.append(entry[1])
        prob_rows.append(row)

    probs = torch.tensor(prob_rows)
    targets = torch.tensor([label_to_col[name] for name in gold])

    calibration = ECE(n_bins=15)
    print(calibration(probs, targets, mode='probs'), '==ece==')
# pred, gold, pred_score = eval_all(offensive_test, risk_api, 'offensive')
# evaluation_ece(pred_score, gold)


In [11]:

def predict_fn(text, model):
    """Strip punctuation from ``text`` and return ``model.predict`` for it.

    ``text`` may be a string or a list of strings; a list is joined with
    newlines before cleaning.
    """
    joined = "\n".join(text) if isinstance(text, list) else text
    # Remove runs of CJK / ASCII punctuation and symbols before inference.
    cleaned = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", joined)
    return model.predict(cleaned)

from tqdm import tqdm
import re, time

# Attach model risk scores to every translated prompt / continuation pair and
# write one JSON object per line to `merge.txt.resp`.
# - Both files are opened as UTF-8 explicitly: the records hold non-ASCII text
#   and are dumped with ensure_ascii=False.
# - The input is opened first, so a missing input file no longer truncates an
#   existing output file.
with open('/data/albert.xht/hh-rlhf/translate_youdao_real_toxicity_update/merge.txt', 'r', encoding='utf-8') as frobj, \
        open('/data/albert.xht/hh-rlhf/translate_youdao_real_toxicity_update/merge.txt.resp', 'w', encoding='utf-8') as fwobj:
    for line in tqdm(frobj):
        content = json.loads(line.strip())
        # Records lacking a translation on either side are dropped from the output.
        if 'text_translate' not in content['prompt']:
            continue
        if 'text_translate' not in content['continuation']:
            continue
        content['prompt']['risk'] = predict_fn(content['prompt']['text_translate'], green_green_topic_risk_api)
        # The continuation is scored in the context of its prompt.
        content['continuation']['risk'] = predict_fn(content['prompt']['text_translate']+content['continuation']['text_translate'],
                                                     green_green_topic_risk_api)
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
            

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
99442it [19:54, 83.25it/s]


In [None]:
# Binary reference flags derived from the dataset's own toxicity attributes.
gold = {
    'prompt':[],
    'continuation':[]
}

# Binary flags derived from the model's risk predictions.
risk = {
    'prompt':[],
    'continuation':[]
}

with open('/data/albert.xht/hh-rlhf/translate_youdao_real_toxicity_update/merge.txt.resp', 'r', encoding='utf-8') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        for key in ['prompt', 'continuation']:
            # Gold flag: any toxicity attribute above 0.6. The loop variable
            # is used for the lookup (the literal key 'label' was a bug).
            flag = False
            for label in ['sexually_explicit', 'flirtation', 'identity_attack', 'threat', 'insult',
                       'severe_toxicity', 'toxicity', 'profanity']:
                # assumes an attribute can be absent or null for some
                # records; such values are treated as not toxic (TODO confirm)
                attribute_score = content[key].get(label)
                if attribute_score is not None and attribute_score > 0.6:
                    flag = True
            gold[key].append(1 if flag else 0)

            # Model flag: any non-excluded schema whose first label scores
            # above 0.6. Predictions are stored under content[key]['risk'] as
            # [[label, score], ...], so the score is entry [0][1].
            # NOTE(review): this takes the first label of each schema to be
            # the risky class; confirm for schemas such as 'intent'.
            flag = False
            for label in content[key]['risk']:
                if label in ['topic', 'senti', 'senti_query', 'query_risk']:
                    continue
                if content[key]['risk'][label][0][1] > 0.6:
                    flag = True
            # Model flags go to `risk`, not `gold`, so both lists stay aligned.
            risk[key].append(1 if flag else 0)
        
        

In [8]:

# Load the offensive test set (one JSON object per line).
offensive = []
with open('/data/albert.xht/sentiment/test/offensive_cold.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        offensive.append(content)
        
# pred, gold, pred_score = eval_all(offensive, green_risk_api, 'offensive')
# Print the classification report for the 'offensive' schema, then the ECE.
# NOTE(review): the only visible cell that constructs
# `green_green_topic_v5_risk_base_api` is commented out; re-enable it before
# running this on a fresh kernel.
pred, gold, pred_score = eval_all(offensive, green_green_topic_v5_risk_base_api, 'offensive')

evaluation_ece(pred_score, gold)

  0%|          | 0/5304 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 5304/5304 [01:01<00:00, 86.93it/s]

              precision    recall  f1-score   support

          冒犯     0.7418    0.8390    0.7874      2106
          正常     0.8840    0.8077    0.8441      3198

    accuracy                         0.8201      5304
   macro avg     0.8129    0.8234    0.8158      5304
weighted avg     0.8275    0.8201    0.8216      5304

tensor([0.1579]) ==ece==





In [9]:
# Time one predict() call and display the full result.
import time
start = time.time()
# Example of restricting the evaluated schemas (all entries are disabled here).
allowed_schema_type = {
    # 'topic':'',
    # 'query_risk':'',
    # 'porn':'',
    # 'abusive':'',
    # # 'offensive':'',
    # 'intent':'',
    # 'teenager':'',
    # 'politics':''
}
# An empty dict means no filtering: every schema is evaluated.
allowed_schema_type = {}
resp = green_green_topic_risk_api.predict('天安门', allowed_schema_type=allowed_schema_type)
print(time.time()-start)
resp

0.007134437561035156


{'topic': [['城市', 0.1534423828125],
  ['社会', 0.07763671875],
  ['博彩', 0.0726318359375],
  ['交通出行', 0.0640869140625],
  ['股票', 0.0638427734375]],
 'senti_query': [['负向', 0.1453857421875],
  ['中性', 0.7294921875],
  ['正向', 0.1248779296875]],
 'senti': [['负向', 0.444091796875], ['正向', 0.5556640625]],
 'bias': [['偏见', 0.133056640625], ['正常', 0.86669921875]],
 'ciron': [['讽刺', 0.325439453125], ['正常', 0.67431640625]],
 'intent': [['主观评价/比较/判断', 0.0014944076538085938],
  ['寻求建议/帮助', 0.0008478164672851562],
  ['其它', 0.99755859375]],
 'offensive': [['冒犯', 0.1903076171875], ['正常', 0.8095703125]],
 'query_risk': [['风险', 0.712890625],
  ['个人信息', 0.005039215087890625],
  ['正常', 0.2822265625]],
 'teenager': [['不良', 0.2044677734375], ['正常', 0.79541015625]],
 'politics': [['涉政', 0.99462890625], ['正常', 0.005138397216796875]],
 'porn': [['色情', 0.01248931884765625],
  ['低俗', 0.1416015625],
  ['色情违禁', 0.0006265640258789062],
  ['正常', 0.84521484375]],
 'abusive': [['辱骂', 0.00989532470703125],
  ['口头语', 0.019

In [11]:
# Example request payload, serialized without escaping non-ASCII characters.
input_dict = {"inputs":
{
  "utterance": "他爸爸是谁",
   "query_rewritten": "姚明爸爸是谁",
   "response": "姚明的父亲是姚志源",
   "detect_mode": "strict"
}}

json.dumps(input_dict, ensure_ascii=False)





'{"inputs": {"utterance": "他爸爸是谁", "query_rewritten": "姚明爸爸是谁", "response": "姚明的父亲是姚志源", "detect_mode": "strict"}}'

In [17]:
s = """
{"safety_value": "safe", "response": "\u4e60\u8fd1\u5e73\u7684\u7238\u7238", "algo_result": {"algo_version": "v_20230201", "result_details": [{"topic": ["\u65f6\u4e8b\u653f\u6cbb: 0.5701584815979004", "\u6559\u80b2/\u79d1\u5b66: 0.0786704272031784"], "senti_query": "\u6b63\u5411: 0.9916815161705017", "senti": "\u6b63\u5411: 0.9883805513381958", "bias": "\u6b63\u5e38: 0.9747865796089172", "ciron": "\u6b63\u5e38: 0.8475391864776611", "intent": "\u5176\u5b83: 0.9999520778656006", "offensive": "\u6b63\u5e38: 0.951887309551239", "query_risk": "\u6b63\u5e38: 0.792395293712616", "teenager": "\u6b63\u5e38: 0.9767218828201294", "politics": "\u6d89\u653f: 0.9111408591270447", "porn": "\u6b63\u5e38: 0.5156128406524658", "abusive": "\u6b63\u5e38: 0.9579949378967285", "text": "\u59da\u660e\u7238\u7238\u4e60\u8fd1\u5e73\u7684\u7238\u7238\u662f\u8c01", "vote_result": {"topic & intent": 1.0}}, {"topic": ["\u65f6\u4e8b\u653f\u6cbb: 0.4016653895378113", "\u793e\u4f1a: 0.1217777281999588"], "senti_query": "\u6b63\u5411: 0.9440419673919678", "senti": "\u6b63\u5411: 0.9897205233573914", "bias": "\u6b63\u5e38: 0.9701516032218933", "ciron": "\u6b63\u5e38: 0.7880784273147583", "intent": "\u5176\u5b83: 0.9996641874313354", "offensive": "\u6b63\u5e38: 0.928941547870636", "query_risk": "\u98ce\u9669: 0.549170732498169", "teenager": "\u6b63\u5e38: 0.9699081778526306", "politics": "\u6d89\u653f: 0.995159924030304", "porn": "\u6b63\u5e38: 0.8542316555976868", "abusive": "\u6b63\u5e38: 0.8744667172431946", "text": "\u4e60\u8fd1\u5e73\u7684\u7238\u7238", "vote_result": []}]}, "success": true, "ext_msg": ""}
"""

from pprint import pprint
# `s` is a str, so pprint only re-wraps its repr across lines; it does not
# parse the JSON. NOTE(review): pprint(json.loads(s)) may have been intended.
pprint(s)

('\n'
 '{"safety_value": "safe", "response": "习近平的爸爸", "algo_result": '
 '{"algo_version": "v_20230201", "result_details": [{"topic": ["时事政治: '
 '0.5701584815979004", "教育/科学: 0.0786704272031784"], "senti_query": "正向: '
 '0.9916815161705017", "senti": "正向: 0.9883805513381958", "bias": "正常: '
 '0.9747865796089172", "ciron": "正常: 0.8475391864776611", "intent": "其它: '
 '0.9999520778656006", "offensive": "正常: 0.951887309551239", "query_risk": '
 '"正常: 0.792395293712616", "teenager": "正常: 0.9767218828201294", "politics": '
 '"涉政: 0.9111408591270447", "porn": "正常: 0.5156128406524658", "abusive": "正常: '
 '0.9579949378967285", "text": "姚明爸爸习近平的爸爸是谁", "vote_result": {"topic & '
 'intent": 1.0}}, {"topic": ["时事政治: 0.4016653895378113", "社会: '
 '0.1217777281999588"], "senti_query": "正向: 0.9440419673919678", "senti": "正向: '
 '0.9897205233573914", "bias": "正常: 0.9701516032218933", "ciron": "正常: '
 '0.7880784273147583", "intent": "其它: 0.9996641874313354", "offensive": "正常: '
 '0.928941547870636", "que

In [None]:
curl com.taobao.search.cro.gen.security:25530 -d '{"inputs": {"utterance": "习近平的爸爸", "query_rewritten": "姚明爸爸习近平的爸爸是谁", "response": "习近平的爸爸", "detect_mode": "strict"}}'  | jq

In [13]:

# Inner request payload only (no "inputs" wrapper), serialized without
# escaping non-ASCII characters.
d =   {
    "utterance": "他爸爸是谁",
     "query_rewritten": "姚明爸爸是谁",
     "response": "姚明的父亲是姚志源",
     "detect_mode": "strict"
  }
json.dumps(d, ensure_ascii=False)


'{"utterance": "他爸爸是谁", "query_rewritten": "姚明爸爸是谁", "response": "姚明的父亲是姚志源", "detect_mode": "strict"}'

In [15]:
# Score every distinct query of the harmless split once and keep the first
# record seen for each query in `data_dict` (query -> record with 'query_risk').
data_list = []
data_dict = {}
with open('/data/albert.xht/hh-rlhf/translate_youdao_single/merge.json.harmless', encoding='utf-8') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        query = content['human_translate']
        # Skip duplicates before running inference: their prediction would be
        # discarded anyway because only the first occurrence is stored.
        if query in data_dict:
            continue
        resp = green_green_topic_risk_api.predict(query, allowed_schema_type={})
        content['query_risk'] = resp
        data_dict[query] = content

In [41]:
# Write one `query&&label` line per entry of `data_dict`. The label is '风险'
# when the score of the first 'query_risk' entry exceeds 0.6, otherwise '正常'.
# NOTE(review): assumes the first 'query_risk' entry is the risk class; confirm
# against the label file order.
with open('/data/albert.xht/hh-rlhf/translate_youdao_single/merge.json.harmless.query', 'w') as fwobj:
    for key in data_dict:
        if data_dict[key]['query_risk']['query_risk'][0][1] > 0.6:
            label = '风险'
        else:
            label = '正常'
        fwobj.write(key+'&&'+label+'\n')

In [27]:
from sklearn.metrics import classification_report

# Every entry gets gold label 1, so the report effectively measures how often
# the first 'query_risk' score exceeds 0.6 (recall of class 1); class 0 has no
# support.
gold = []
pred = []

for key in data_dict:
    gold.append(1)
    if data_dict[key]['query_risk']['query_risk'][0][1] > 0.6:
        pred.append(1)
    else:
        pred.append(0)

print(classification_report(gold, pred, digits=4))





              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.2821    0.4401     12210

    accuracy                         0.2821     12210
   macro avg     0.5000    0.1411    0.2201     12210
weighted avg     1.0000    0.2821    0.4401     12210



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
from collections import Counter
# The original import line was cut off ("from tqdm im"), which is a SyntaxError.
from tqdm import tqdm

# Count the first label of every training record whose text mentions '德州扑克'.
t = Counter()
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.update', 'r') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        if '德州扑克' in content['text']:
            t[content['label'][0]] += 1

In [22]:
t

Counter({'道德伦理': 1,
         '创业投资': 2,
         '教育/科学': 12,
         '人际交往': 1,
         '游戏': 9,
         '股票': 2,
         '职场职业': 1,
         '体育/运动': 2,
         '影视': 1,
         '女性': 1,
         '电脑/网络': 4,
         '阅读': 2,
         '期货': 1,
         '情感': 1,
         '恋爱': 1})

In [72]:
112/(112+169)

0.398576512455516

In [11]:
# Load the labeled risk queries (one JSON object per line) and evaluate the
# 'query_risk' schema on them.
risk_query = []
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/offensive_select_labeled.txt.paraphrase') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        # for d in content['paraphrase']:
        #     p = {
        #         'text':d[0],
        #         'label':content['label']
        #     }
        #     if d:
        #         risk_query.append(p)
        # Reuse the record parsed above instead of decoding the line twice.
        risk_query.append(content)
# NOTE(review): the only visible cell that constructs `green_topic_risk_api`
# is commented out; re-enable it before running this on a fresh kernel.
pred, gold, pred_score = eval_all(risk_query, green_topic_risk_api, 'query_risk')
evaluation_ece(pred_score, gold)

100%|██████████| 20641/20641 [02:00<00:00, 171.74it/s]


              precision    recall  f1-score   support

          正常     0.6942    0.7164    0.7051      5514
          风险     0.8954    0.8850    0.8902     15127

    accuracy                         0.8399     20641
   macro avg     0.7948    0.8007    0.7976     20641
weighted avg     0.8416    0.8399    0.8407     20641

tensor([0.0473]) ==ece==


In [37]:
import pandas as pd

# Score every distinct, human-labelled query of the iTag spreadsheet with the
# 'query_risk' head and collect binary predictions next to the gold labels.
df = pd.read_excel('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/itag_labl_data_deal_20230112.xlsx')
gold, pred = [], []
result_list = []
data_dict = {}  # queries already seen; used only for membership tests
for idx in tqdm(range(df.shape[0])):
    content = df.loc[idx]
    query = content['query']

    # Apply the cheap filters before the expensive model call. The order of the
    # two checks is kept: a query is marked as seen even when its row is then
    # dropped for having no human label.
    # NOTE(review): assumes predict() has no side effects that skipped rows
    # relied on — confirm.
    if query in data_dict:
        continue
    data_dict[query] = ''
    if content['human_risk'] in [-1]:
        continue

    score_list = green_green_topic_risk_api.predict(query)
    result_list.append((content, score_list))
    # The score of the first 'query_risk' entry is thresholded at 0.5.
    if score_list['query_risk'][0][1] > 0.5:
        pred.append(1)
    else:
        pred.append(0)

    gold.append(int(content['human_risk']))

  0%|          | 0/5128 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 5128/5128 [00:31<00:00, 163.20it/s]


In [41]:
# classification_report is otherwise only imported in a later cell of this
# notebook; import it here so this cell also runs on a fresh kernel.
from sklearn.metrics import classification_report

print(classification_report(gold, pred, digits=4))

              precision    recall  f1-score   support

           0     0.9832    0.9203    0.9507      4378
           1     0.6087    0.8873    0.7221       612

    accuracy                         0.9162      4990
   macro avg     0.7960    0.9038    0.8364      4990
weighted avg     0.9372    0.9162    0.9226      4990



In [39]:
# NOTE(review): same report call as the previous cell. The NameError recorded
# in this cell's output shows classification_report was not yet defined when
# it was executed.
print(classification_report(gold, pred, digits=4))

NameError: name 'classification_report' is not defined

In [15]:

def _load_jsonl(path):
    """Read a JSON-lines file and return its records as a list.

    Args:
        path: path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(path) as frobj:
        for line in frobj:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            records.append(json.loads(line))
    return records


# Evaluation sets, one list of decoded records per benchmark file.
offensive = _load_jsonl('/data/albert.xht/sentiment/dev/offensive_cold.json')
offensive_test = _load_jsonl('/data/albert.xht/sentiment/test/offensive_cold.json')
cdia_bias = _load_jsonl('/data/albert.xht/sentiment/dev/cdial_bias.json')
senti_copr = _load_jsonl('/data/albert.xht/sentiment/dev/senti_copr.json')
ciron = _load_jsonl('/data/albert.xht/sentiment/dev/chinese_ciron.json')
senti_smp = _load_jsonl('/data/albert.xht/sentiment/dev/senti_smp_usual.json')
senti_smpecisa = _load_jsonl('/data/albert.xht/sentiment/dev/senti_smpecisa.json')
senti_query = _load_jsonl('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_valid.json.filter.0.7')

def evaluation(risk_api, model_path):
    """Reload ``risk_api`` from ``model_path`` and evaluate it on every benchmark.

    For each benchmark a ``===name===`` banner is printed, ``eval_all`` is run
    on the benchmark's records with the matching task key, and the returned
    scores and gold labels are passed to ``evaluation_ece``.

    Args:
        risk_api: object exposing ``reload(model_path)``; it is also forwarded
            to ``eval_all`` as the model to evaluate.
        model_path: checkpoint path handed to ``risk_api.reload``.

    Returns:
        None.
    """
    risk_api.reload(model_path)

    # (banner, records, task key) for each benchmark, in reporting order.
    benchmarks = (
        ('offensive', offensive_test, 'offensive'),
        ('cdia-bias', cdia_bias, 'bias'),
        ('ciron', ciron, 'ciron'),
        ('chsenti', senti_copr, 'senti'),
        ('senti_smpecisa', senti_smpecisa, 'senti'),
        ('senti_smp', senti_smp, 'senti'),
        ('senti_query', senti_query, 'senti'),
    )
    for banner, records, task_key in benchmarks:
        print('===' + banner + '===')
        pred, gold, pred_score = eval_all(records, risk_api, task_key)
        evaluation_ece(pred_score, gold)
    

In [17]:
import time

import translators as ts
import translators.server as tss

# Translate one English query to Chinese via Bing and report the latency.
wyw_text = 'How do you embezzle money?'
# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments.
start = time.perf_counter()
print(tss.bing(wyw_text, professional_field='common', from_language='en', to_language='zh'))
print(time.perf_counter() - start)

你怎么挪钱？
1.1863462924957275


In [40]:
# evaluation(green_risk_api,
#            '/data/albert.xht/xiaodao/risk_classification/tiny/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.13')

In [17]:
# model_topic_path = '/data/albert.xht/xiaodao/risk_classification/tiny/topic_v4_teenager_v1_multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.19'

# evaluation(green_topic_risk_api,model_topic_path
#           )

In [19]:
# Spot-check the multitask model on a single query. The displayed result maps
# each task name (e.g. 'topic', 'query_risk') to a list of [label, score] pairs.
green_green_topic_risk_api.predict('基督教都是傻子')

{'topic': [['宗教', 0.7294921875],
  ['心理健康', 0.028839111328125],
  ['文化/艺术', 0.0240478515625],
  ['历史', 0.0200958251953125],
  ['社会', 0.01422119140625]],
 'senti_query': [['负向', 0.970703125],
  ['中性', 0.0277557373046875],
  ['正向', 0.0014486312866210938]],
 'senti': [['负向', 0.98974609375], ['正向', 0.01019287109375]],
 'bias': [['偏见', 0.184326171875], ['正常', 0.8154296875]],
 'ciron': [['讽刺', 0.359619140625], ['正常', 0.640625]],
 'intent': [['主观评价/比较/判断', 0.865234375],
  ['寻求建议/帮助', 0.0011014938354492188],
  ['其它', 0.1336669921875]],
 'offensive': [['冒犯', 0.9443359375], ['正常', 0.05560302734375]],
 'query_risk': [['风险', 0.99267578125],
  ['个人信息', 1.1563301086425781e-05],
  ['正常', 0.00717926025390625]],
 'teenager': [['不良', 0.74658203125], ['正常', 0.25341796875]],
 'politics': [['涉政', 0.7822265625], ['正常', 0.217529296875]],
 'porn': [['色情', 0.00012981891632080078],
  ['低俗', 0.01152801513671875],
  ['色情违禁', 2.9981136322021484e-05],
  ['正常', 0.98828125]],
 'abusive': [['辱骂', 0.50048828125],
  ['口

In [113]:
# Load the v5 topic-classification validation split (one JSON object per line)
# and keep the raw texts in a parallel list.
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v5/biake_qa_web_text_zh_valid.json') as frobj:
    v5_valid = [json.loads(raw_line.strip()) for raw_line in frobj]
v5_text = [record['text'] for record in v5_valid]
# Label ids are not collected: the lookup through topic_api.label2id is disabled,
# so this list stays empty.
v5_label = []

from sklearn.metrics import classification_report
import numpy as np

def eval_all(data, model, top_n=5, batch_size=128):
    """Evaluate the 'topic' predictions of ``model`` on ``data`` in batches.

    Texts are sent to ``model.predict_batch`` in chunks of ``batch_size``.
    For every sample the ``result['topic']`` entries are ranked by score; the
    best label is used for the classification report, and the sample counts as
    a top-n hit when any of its gold labels is among the ``top_n`` best labels.
    The report and the top-n hit rate are printed.

    NOTE(review): other cells of this notebook call
    ``eval_all(data, api, task_key)`` and unpack ``(pred, gold, pred_score)``.
    That is a different definition from this one: here the third positional
    argument is ``top_n`` and the return order differs.

    Args:
        data: iterable of dicts with ``'text'`` (a string, or a list of strings
            that is joined with newlines) and ``'label'`` (an indexable
            collection of gold topics; its first entry is the reported label).
        model: object exposing ``predict_batch(list_of_texts)``, returning one
            dict per text whose ``'topic'`` value is a list of (label, score)
            pairs.
        top_n: number of best-ranked labels considered for the hit rate.
        batch_size: number of texts per ``predict_batch`` call.

    Returns:
        Tuple ``(pred_score, total_pred, gold)``: the raw per-sample results,
        the top-1 predicted labels, and the first gold label of each sample.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    gold = []
    pred_score = []
    total_pred = []
    hits = 0

    def score_batch(texts, items):
        """Predict one batch, record its results, and return its top-n hit count."""
        batch_hits = 0
        for result, item in zip(model.predict_batch(texts), items):
            ranked = sorted(result['topic'], key=lambda u: u[1], reverse=True)
            total_pred.append(ranked[0][0])
            top_labels = {p[0] for p in ranked[:top_n]}
            if set(item['label']) & top_labels:
                batch_hits += 1
            pred_score.append(result)
        return batch_hits

    pending_texts = []
    pending_items = []
    for item in tqdm(data):
        gold.append(item['label'][0])
        text = item['text']
        if isinstance(text, list):
            text = "\n".join(text)
        pending_texts.append(text)
        pending_items.append(item)
        if len(pending_texts) >= batch_size:
            hits += score_batch(pending_texts, pending_items)
            # Start fresh lists instead of clearing, so a model that keeps a
            # reference to the batch it received is not affected.
            pending_texts, pending_items = [], []
    # Flush the final, partially filled batch.
    if pending_texts:
        hits += score_batch(pending_texts, pending_items)

    # Nothing was scored: skip the report and avoid dividing by zero.
    if not pred_score:
        print('eval_all: no samples to evaluate')
        return pred_score, total_pred, gold

    print(classification_report(gold, total_pred, digits=4), '===', top_n)
    print(hits / len(pred_score))
    return pred_score, total_pred, gold