In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

# Put the project checkout on the import path (presumably where the `nets`
# package imported further down lives). Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
PROJECT_ROOT = '/root/xiaoda/query_topic/'
if PROJECT_ROOT not in sys.path:
    sys.path.extend([PROJECT_ROOT])

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    """Expected Calibration Error over equal-width confidence bins.

    For every bin ``(lower, upper]`` the absolute gap between the mean
    confidence and the accuracy of the samples falling in it is weighted by
    the fraction of samples in that bin; the weighted gaps are summed.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels, mode='logits'):
        """Return the ECE as a tensor of shape ``(1,)`` on ``logits.device``.

        logits: per-class scores, reduced over dim 1.
        labels: target class indices compared against the arg-max prediction.
        mode: ``'logits'`` applies a softmax first; any other value treats
            ``logits`` as already-normalised probabilities.
        """
        probabilities = F.softmax(logits, dim=1) if mode == 'logits' else logits
        confidences, predictions = probabilities.max(dim=1)
        accuracies = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            # Samples whose confidence lies in (lower, upper].
            in_bin = (confidences > lower) & (confidences <= upper)
            prop_in_bin = in_bin.float().mean()
            # Written as "not > 0" so that a NaN proportion (empty input) is skipped too.
            if not prop_in_bin.item() > 0:
                continue
            bin_accuracy = accuracies[in_bin].float().mean()
            bin_confidence = confidences[in_bin].mean()
            ece += torch.abs(bin_confidence - bin_accuracy) * prop_in_bin

        return ece

In [77]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

# Base directory that RiskInfer joins (os.path.join) with the config path and
# with the label-file paths read from that config.
cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    """Read a label file (one label per line) and build both lookup tables.

    Args:
        filepath: path of a text file; every line, stripped of surrounding
            whitespace, is one label and its 0-based line number is its id.

    Returns:
        (label2id, id2label): ``label -> id`` and ``id -> label`` dicts. When a
        label occurs more than once, ``label2id`` keeps the last id.
    """
    # Decode explicitly as UTF-8 (the config file is read the same way) instead
    # of depending on the locale's default encoding.
    with open(filepath, 'r', encoding='utf-8') as frobj:
        label_list = [line.strip() for line in frobj]

    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = dict(enumerate(label_list))
    return label2id, id2label

class RiskInfer(object):
    """Inference wrapper around a multi-task BERT classifier.

    One shared encoder feeds one classification head per schema. The
    constructor builds the tokenizer, the label tables and the network from an
    INI config; the encoder is created from its config only, so call
    ``reload`` with a checkpoint path before predicting.

    Attributes:
        tokenizer: ``BertTokenizerFast`` loaded from the config's ``model_path``.
        schema_dict: ordered ``schema name -> {'label2id', 'id2label', 'label_index'}``.
        schema2schema_id / schema_id2schema: schema name <-> head index.
        device: torch device string the network is placed on.
        net: encoder plus an ``nn.ModuleList`` of per-schema heads.
    """

    # A schema with at least this many labels is reduced to its _TOP_K
    # highest-scoring labels (sorted descending); smaller schemas are returned
    # unsorted, in label-id order.
    _TOP_K = 5

    def __init__(self, config_path, device=None):
        """Build tokenizer, label tables and network from an INI config.

        Args:
            config_path: config file path, joined onto ``cur_dir_path``. The
                file must have a ``paths`` and a ``para`` section providing
                ``model_path``, ``label_path`` (comma-separated
                ``schema:label_file`` pairs), ``out_dropout_rate`` and
                ``dropout_type``.
            device: optional torch device string. When omitted, ``cuda:1`` is
                used if a second GPU exists, else ``cuda:0``, else ``cpu``.

        Raises:
            FileNotFoundError: if the config file cannot be read.
        """
        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        # ConfigParser.read() silently skips unreadable files; fail here with a
        # clear message instead of a NoSectionError further down.
        if not con.read(con_path, encoding='utf8'):
            raise FileNotFoundError('config file not found or unreadable: {}'.format(con_path))

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        from collections import OrderedDict
        self.schema_dict = OrderedDict()
        self.schema2schema_id = {}
        self.schema_id2schema = {}

        # The position of a schema in label_path is also the index of its head.
        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            schema_type, schema_path = schema_info.split(':', 1)
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id': label2id,
                'id2label': id2label,
                'label_index': label_index
            }
            self.schema2schema_id[schema_type] = label_index
            self.schema_id2schema[label_index] = schema_type

        from transformers import BertModel, BertConfig

        config = BertConfig.from_pretrained(args_path["model_path"])
        encoder = BertModel(config=config)

        encoder_net = MyBaseModel(encoder, config)

        if device is None:
            if not torch.cuda.is_available():
                device = "cpu"
            elif torch.cuda.device_count() > 1:
                device = "cuda:1"
            else:
                # Single-GPU host: "cuda:1" would be an invalid device ordinal.
                device = "cuda:0"
        self.device = device

        classifier_list = nn.ModuleList([
            RobertaClassifier(
                hidden_size=config.hidden_size,
                dropout_prob=con.getfloat('para', 'out_dropout_rate'),
                num_labels=len(self.schema_dict[schema_key]['label2id']),
                dropout_type=con.get('para', 'dropout_type'))
            for schema_key in self.schema_dict
        ])

        class MultitaskClassifier(nn.Module):
            """Shared transformer followed by one classifier head per schema."""

            def __init__(self, transformer, classifier_list):
                super().__init__()

                self.transformer = transformer
                self.classifier_list = classifier_list

            def forward(self, input_ids, input_mask,
                        segment_ids=None,
                        transformer_mode='mean_pooling',
                        dt_idx=None, mode='predict'):
                """Run the encoder once, then every requested head.

                dt_idx: optional container of head indices to evaluate; heads
                    not in it contribute an empty list placeholder so that
                    positions still line up with the schema order. A falsy
                    value runs all heads.
                mode: ``'predict'`` returns softmax probabilities; any other
                    value returns the heads' raw outputs.

                Returns ``(outputs_list, hidden_states)``.
                """
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []

                for idx, classifier in enumerate(self.classifier_list):
                    if dt_idx and idx not in dt_idx:
                        outputs_list.append([])
                        continue

                    scores = classifier(hidden_states)
                    if mode == 'predict':
                        scores = torch.softmax(scores, dim=1)
                    outputs_list.append(scores)
                return outputs_list, hidden_states

        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)

    def reload(self, model_path):
        """Load a checkpoint state dict into the network and switch to fp16 eval mode."""
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()
        # NOTE(review): the half-precision cast is applied on every device;
        # confirm that fp16 inference is also intended when running on CPU.
        self.net = self.net.half()

    def _allowed_schema_ids(self, allowed_schema_type):
        """Map the requested schema names to ``{head index: schema name}``."""
        return {self.schema2schema_id[schema_type]: schema_type
                for schema_type in allowed_schema_type}

    def _collect_scores(self, scores_per_schema, row, allowed_schema_type):
        """Turn row ``row`` of every head output into ``{schema: [[label, score], ...]}``."""
        scores_dict = {}
        for schema_type, scores in zip(self.schema_dict.keys(), scores_per_schema):
            if allowed_schema_type and schema_type not in allowed_schema_type:
                continue
            id2label = self.schema_dict[schema_type]['id2label']
            row_scores = scores[row].detach().cpu().numpy()
            pairs = [[id2label[index], float(score)] for index, score in enumerate(row_scores)]
            if len(pairs) >= self._TOP_K:
                # sorted() is stable, so tied scores keep their label-id order.
                pairs = sorted(pairs, key=lambda item: item[1], reverse=True)[:self._TOP_K]
            scores_dict[schema_type] = pairs
        return scores_dict

    def predict(self, text, allowed_schema_type=None):
        """Classify one text with every (or only the allowed) schema head.

        Args:
            text: the input string.
            allowed_schema_type: optional container of schema names to
                evaluate; empty or ``None`` evaluates all schemas.

        Returns:
            dict ``schema name -> [[label, probability], ...]`` (see ``_TOP_K``).
        """
        allowed_schema_type = allowed_schema_type or {}

        # truncation=True is needed for max_length to actually take effect.
        encoder_txt = self.tokenizer([text], max_length=512, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).to(self.device)

        with torch.no_grad():
            logits_list, _ = self.net(
                input_ids, attention_mask, token_type_ids,
                transformer_mode='cls',
                dt_idx=self._allowed_schema_ids(allowed_schema_type))

        return self._collect_scores(logits_list, 0, allowed_schema_type)

    def get_logitnorm(self, text):
        """Return, per schema, the L2-normalised head output for one text.

        Returns:
            dict ``schema name -> numpy array`` holding ``x / (||x||_2 + 1e-7)``.
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=512, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)

        with torch.no_grad():
            # NOTE(review): the net is called with its default mode='predict',
            # so what gets normalised here are softmax probabilities, not raw
            # logits. Pass a different mode if raw logits are intended.
            logits_list, _ = self.net(
                input_ids, attention_mask, token_type_ids, transformer_mode='cls')
            # The epsilon belongs to the denominator: it guards the division
            # against a zero norm rather than shifting every output.
            logits_norm_list = [
                logits / (torch.norm(logits, p=2, dim=-1, keepdim=True) + 1e-7)
                for logits in logits_list
            ]

        return {
            schema_type: logit_norm[0].detach().cpu().numpy()
            for schema_type, logit_norm in zip(self.schema_dict.keys(), logits_norm_list)
        }

    def predict_batch(self, text, allowed_schema_type=None):
        """Classify a list of texts (or a single text) in one forward pass.

        Args:
            text: a list of strings, or one string treated as a 1-element batch.
            allowed_schema_type: optional container of schema names to
                evaluate; empty or ``None`` evaluates all schemas.

        Returns:
            A list with one ``predict``-style dict per input text, in order.
        """
        allowed_schema_type = allowed_schema_type or {}
        text_list = text if isinstance(text, list) else [text]
        if not text_list:
            return []

        model_input = self.tokenizer(text_list, max_length=512, truncation=True,
                                     return_tensors="pt", padding=True)
        for key in model_input:
            model_input[key] = model_input[key].to(self.device)

        with torch.no_grad():
            logits_list, _ = self.net(
                model_input['input_ids'],
                model_input['attention_mask'],
                model_input['token_type_ids'],
                transformer_mode='cls',
                dt_idx=self._allowed_schema_ids(allowed_schema_type))

        return [self._collect_scores(logits_list, idx, allowed_schema_type)
                for idx in range(len(text_list))]

# risk_api = RiskInfer('./risk_data/config.ini')
# risk_api = RiskInfer('./risk_data_v5/config_offensive_risk.ini')




In [5]:

# Build the classifier from the "green v1" topic/risk config, then load the
# trained weights. reload() also switches the net to eval mode and fp16.
green_green_topic_risk_api = RiskInfer('/root/xiaoda/query_topic/risk_data_tiny_query_response_cmid/risk_data_tiny/config_topic_risk_green_v1.ini')
model_path = '/data/albert.xht/xiaodao/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v24/multitask_cls.pth.8'

green_green_topic_risk_api.reload(model_path)

query_resposne_risk /data/albert.xht/xiaoda/query_response/red_team/query_response/query_response_label.txt ===schema-path===
cmid /data/albert.xht/xiaoda/sentiment/CMID-main/cmid_label_list.txt ===schema-path===
topic /data/albert.xht/raw_chat_corpus/topic_classification_v4/label_list.txt ===schema-path===
senti_query /data/albert.xht/xiaoda/sentiment/senti/senti_query_label.txt ===schema-path===
senti /data/albert.xht/xiaoda/sentiment/senti/senti_label.txt ===schema-path===
bias /data/albert.xht/xiaoda/sentiment/bias/bias_label.txt ===schema-path===
ciron /data/albert.xht/xiaoda/sentiment/ciron/ciron_label.txt ===schema-path===
intent /data/albert.xht/xiaoda/sentiment/intention_data_v2-1/label.txt ===schema-path===
offensive /data/albert.xht/xiaoda/sentiment/offensive/offensive_label.txt ===schema-path===
query_risk /data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_label.txt ===schema-path===
teenager /data/albert.xht/xiaoda/sentiment/teenager//teenager_label.txt ===schema-

03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/23/2023 06:25:11 - INFO - n

In [15]:
# Encode one (text, text_pair) tuple without special tokens to inspect the
# resulting input_ids / token_type_ids / attention_mask.
green_green_topic_risk_api.tokenizer([('我是谁', '你是谁')], add_special_tokens=False)

{'input_ids': [[2769, 3221, 6443, 872, 3221, 6443]], 'token_type_ids': [[0, 0, 0, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}

In [5]:

# Build the "open" variant of the classifier and load its checkpoint.
# reload() also switches the net to eval mode and fp16.
green_green_topic_risk_open_api = RiskInfer('/root/xiaoda/query_topic/risk_data_tiny_query_response_cmid_open/topic_query_risk/config.ini')
model_path = '/data/albert.xht/xiaodao/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v26_duplicate_with_opensource/multitask_cls.pth.7'

green_green_topic_risk_open_api.reload(model_path)


tnews /data/albert.xht/xiaoda/sentiment/classification/tnews/tnews_label_list.txt ===schema-path===
title2event /data/albert.xht/xiaoda/sentiment/classification/title2event/title2event_label_list.txt ===schema-path===
fewfc_2022 /data/albert.xht/xiaoda/sentiment/classification/fewfc_2022/fewfc_2022_label_list.txt ===schema-path===
duee /data/albert.xht/xiaoda/sentiment/classification/DuEE1.0/duee_label_list.txt ===schema-path===
query_resposne_risk /data/albert.xht/xiaoda/query_response/red_team/query_response/query_response_label.txt ===schema-path===
cmid /data/albert.xht/xiaoda/sentiment/CMID-main/cmid_label_list.txt ===schema-path===
topic /data/albert.xht/raw_chat_corpus/topic_classification_v4/label_list.txt ===schema-path===
senti_query /data/albert.xht/xiaoda/sentiment/senti/senti_query_label.txt ===schema-path===
senti /data/albert.xht/xiaoda/sentiment/senti/senti_label.txt ===schema-path===
bias /data/albert.xht/xiaoda/sentiment/bias/bias_label.txt ===schema-path===
ciron /da

03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/17/2023 20:51:07 - INFO - n

In [None]:
# Inspect the tokenizer held by the API object. RiskInfer stores it as
# `tokenizer`; the previous spelling `tokenier` raised AttributeError.
green_green_topic_risk_open_api.tokenizer

In [13]:
# NOTE: this rebinds `green_green_topic_risk_open_api`, replacing the instance
# created by the earlier cell that loaded the "..._v26_duplicate_with_opensource"
# checkpoint.
green_green_topic_risk_open_api = RiskInfer('/root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open/topic_query_risk/config.ini')
# NOTE(review): this directory name contains "teenager_v1_potitask_raw" where
# every other checkpoint path in this notebook has "teenager_v1_porn_multitask_raw",
# and it ends in a double slash. Confirm the path exists as written.
model_path = '/data/albert.xht/xiaodao/risk_classification/topic_v4_update_green_v1_teenager_v1_potitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_v24//multitask_cls.pth.9'

green_green_topic_risk_open_api.reload(model_path)


tnews /data/albert.xht/xiaoda/sentiment/classification/tnews/tnews_label_list.txt ===schema-path===
title2event /data/albert.xht/xiaoda/sentiment/classification/title2event/title2event_label_list.txt ===schema-path===
fewfc_2022 /data/albert.xht/xiaoda/sentiment/classification/fewfc_2022/fewfc_2022_label_list.txt ===schema-path===
duee /data/albert.xht/xiaoda/sentiment/classification/DuEE1.0/duee_label_list.txt ===schema-path===
ethics_common /root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open/topic_query_risk/ethics_common_label_list.txt ===schema-path===
insult /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.insult ===schema-path===
humiliate /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.humiliate ===schema-path===
dehumanize /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.dehumanize ===schema-path===
violence /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.violence ===schem

03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/20/2023 10:55:17 - INFO - n

In [78]:
# Build the "open_all" variant of the classifier and load its checkpoint.
# reload() also switches the net to eval mode and fp16.
green_green_topic_risk_open_all_api = RiskInfer('/root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open_all/topic_query_risk/config.ini')
model_path = '/data/albert.xht/xiaodao/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_v24/multitask_cls.pth.9'
green_green_topic_risk_open_all_api.reload(model_path)


yewu /data/albert.xht/xiaoda/sentiment/yewu/yewu_label_list.txt ===schema-path===
risk_news /data/albert.xht/xiaoda/sentiment/risk_news/risk_news_label_list.txt ===schema-path===
tnews /data/albert.xht/xiaoda/sentiment/classification/tnews/tnews_label_list.txt ===schema-path===
title2event /data/albert.xht/xiaoda/sentiment/classification/title2event/title2event_label_list.txt ===schema-path===
fewfc_2022 /data/albert.xht/xiaoda/sentiment/classification/fewfc_2022/fewfc_2022_label_list.txt ===schema-path===
duee /data/albert.xht/xiaoda/sentiment/classification/DuEE1.0/duee_label_list.txt ===schema-path===
ethics_common /root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open/topic_query_risk/ethics_common_label_list.txt ===schema-path===
insult /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.insult ===schema-path===
humiliate /data/albert.xht/xiaoda/measuring_hate_speech/hate_speech_label_list.txt.humiliate ===schema-path===
dehumanize /data/albert.xht/xiaod

03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:55:51 - INFO - n

In [43]:
# Build the "open + NLI" variant of the classifier. The commented-out
# assignments below are alternative checkpoints; only the last, uncommented
# `model_path` is loaded.
green_green_topic_risk_open_all_nli_api = RiskInfer('/root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open_nli/topic_query_risk/config.ini')
# model_path = '/data/albert.xht/xiaoda/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_add_nli_v26/multitask_cls.pth.9'
# model_path = '/data/albert.xht/xiaoda/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_add_nli_cls_nli_all_v26/multitask_cls.pth.7'
# model_path = '/data/albert.xht/xiaoda/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_add_nli_instruct_cls_nli_v26/multitask_cls.pth.7'

# NOTE(review): these paths start with /data/albert.xht/xiaoda/ whereas the
# other loading cells use /data/albert.xht/xiaodao/. Confirm which is correct.
model_path = '/data/albert.xht/xiaoda/risk_classification/topic_v4_update_green_v1_teenager_v1_porn_multitask_raw_filter_senti_query_risk_v13_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_add_hate_speech_detail_ethics_open_all_add_nli_instruct_v27/multitask_cls.pth.9'
green_green_topic_risk_open_all_nli_api.reload(model_path)


nli /data/albert.xht/xiaoda/sentiment/classification/cmnli/cmnli_label_list.txt ===schema-path===
lcqmc /data/albert.xht/xiaoda/sentiment/classification/paws-x-zh/paws_label_list.txt ===schema-path===
yewu /data/albert.xht/xiaoda/sentiment/yewu/yewu_label_list.txt ===schema-path===
risk_news /data/albert.xht/xiaoda/sentiment/risk_news/risk_news_label_list.txt ===schema-path===
tnews /data/albert.xht/xiaoda/sentiment/classification/tnews/tnews_label_list.txt ===schema-path===
title2event /data/albert.xht/xiaoda/sentiment/classification/title2event/title2event_label_list.txt ===schema-path===
fewfc_2022 /data/albert.xht/xiaoda/sentiment/classification/fewfc_2022/fewfc_2022_label_list.txt ===schema-path===
duee /data/albert.xht/xiaoda/sentiment/classification/DuEE1.0/duee_label_list.txt ===schema-path===
ethics_common /root/xiaoda/query_topic/risk_data_tiny_query_reseponse_open/topic_query_risk/ethics_common_label_list.txt ===schema-path===
insult /data/albert.xht/xiaoda/measuring_hate_sp

03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
03/29/2023 11:23:47 - INFO - n

In [97]:
import time
start = time.time()
# Restrict inference to specific heads by listing their schema names as keys,
# e.g. 'topic', 'query_risk', 'porn', 'abusive', 'offensive', 'intent',
# 'teenager', 'politics'. An empty dict runs every head. (The former first
# assignment was dead code: it was overwritten by `{}` on the next statement.)
allowed_schema_type = {}
resp = green_green_topic_risk_open_all_nli_api.predict("""
习近平好人
""",allowed_schema_type=allowed_schema_type)
# Wall-clock seconds for tokenisation + inference + result formatting.
print(time.time()-start)
resp

0.013921260833740234


{'nli': [['neutral', 0.276611328125],
  ['entailment', 0.35595703125],
  ['contradiction', 0.367431640625]],
 'lcqmc': [['不相似', 0.88330078125], ['相似', 0.116943359375]],
 'yewu': [['导购', 0.95068359375],
  ['售后', 0.022705078125],
  ['广告外投', 0.01043701171875],
  ['权益公平性', 0.00957489013671875],
  ['个性化退出', 0.0032138824462890625]],
 'risk_news': [['无', 0.908203125],
  ['实控人变更', 0.045257568359375],
  ['重大诉讼仲裁', 0.01546478271484375],
  ['破产重整', 0.0118408203125],
  ['主板/创业板/中小板/债券退市', 0.00586700439453125]],
 'tnews': [['时政', 0.50439453125],
  ['科技', 0.1949462890625],
  ['游戏', 0.07177734375],
  ['家居', 0.05718994140625],
  ['娱乐', 0.041900634765625]],
 'title2event': [['时事', 0.82861328125],
  ['社会', 0.07183837890625],
  ['军事', 0.0228271484375],
  ['历史', 0.01971435546875],
  ['娱乐', 0.016082763671875]],
 'fewfc_2022': [['高层失联/死亡', 0.278076171875],
  ['高层涉嫌违法', 0.0953369140625],
  ['员工罢工示威', 0.0576171875],
  ['高层变更', 0.05487060546875],
  ['监管入驻', 0.05224609375]],
 'duee': [['人生-死亡', 0.194580078125],

In [75]:

# Parse the .result file line by line, one JSON document per line.
# The encoding is given explicitly so decoding does not depend on the locale.
with open('/data/albert.xht/hh-rlhf/generated_train_1M_CN_CN.json.result', encoding='utf-8') as frobj:
    data_list = [json.loads(line.strip()) for line in tqdm(frobj)]



2000000it [00:19, 103918.04it/s]


In [32]:
import re

# Collapse every run of real newlines and/or escaped newline sequences (one or
# more backslashes followed by "n") into a single space. The previous pattern
# used the character class [\\n\n], which also matched the plain letter "n"
# and would have deleted it from ordinary words.
NEWLINE_RUN_RE = re.compile(r"(?:\\+n|\n)+")
text = NEWLINE_RUN_RE.sub(" ", 'asdkfhjk\\\\\\\\n')
text

'asdkfhjk '

In [93]:
# Scratch arithmetic: the fraction 5,000 / 2,000,000
# (presumably a sample size relative to the 2,000,000 lines loaded above; confirm).
5000/2000000

0.0025

In [92]:

import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

path = '/data/albert.xht/xiaoda/query_response/red_team/小达风险提问_2.xlsx'
df = pd.read_excel(path)

# Every row of the sheet is counted as a positive (gold label 1), so the
# report below effectively measures recall on this set.
gold_query = []
pred_query = []

for i in tqdm(range(df.shape[0])):
    content = df.loc[i]
    resp = green_green_topic_risk_api.predict(str(content['query']), allowed_schema_type={})
    query_risk_scores = resp['query_risk']
    # NOTE(review): predict() sorts a schema's entries by score only when the
    # schema has at least 5 labels, so which labels positions 0 and 1 refer to
    # depends on the size of the query_risk label set. Confirm this is intended.
    first_two_score = query_risk_scores[0][1] + query_risk_scores[1][1]
    pred_query.append(1 if first_two_score > 0.6 else 0)
    gold_query.append(1)

print(classification_report(gold_query, pred_query, digits=4))



100%|██████████| 1088/1088 [00:06<00:00, 156.08it/s]

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.7454    0.8541      1088

    accuracy                         0.7454      1088
   macro avg     0.5000    0.3727    0.4271      1088
weighted avg     1.0000    0.7454    0.8541      1088




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# Placeholder: a list containing one empty path string.
# NOTE(review): this rebinds `path`, which an earlier cell uses as a str, to a list.
path = [
    ''
]



In [53]:
import _pickle as pkl

# Load the (theme, intent) -> risk-level table. The `with` block closes the
# file handle deterministically.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open('/root/xiaoda/safety_detection/resources/dict_data/theme_intent_risk.pkl', 'rb') as frobj:
    theme_intent = pkl.load(frobj)

In [54]:
# Load the (theme, intent) -> risk-level table from disk again. The `with`
# block closes the file handle deterministically.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open('/root/xiaoda/safety_detection/resources/dict_data/theme_intent_risk.pkl', 'rb') as frobj:
    theme_intent = pkl.load(frobj)

In [56]:
# Values assigned per theme, in the order of THEME_INTENT_NAMES:
# (subjective evaluation/comparison/judgement, asking for advice/help, other).
THEME_INTENT_NAMES = ('主观评价/比较/判断', '寻求建议/帮助', '其他')
THEME_INTENT_LEVELS = {
    '健康': (2, 2, 0),
    '公司': (2, 1, 0),
    '历史': (2, 1, 0),
    '股票': (2, 2, 0),
    '金融': (2, 2, 0),
    '保险': (2, 2, 0),
    # NOTE(review): '基金' is the only theme whose '其他' entry is 2 rather
    # than 0. Confirm this is intentional.
    '基金': (2, 2, 2),
    '法律': (2, 2, 0),
}

for theme, levels in THEME_INTENT_LEVELS.items():
    for intent, level in zip(THEME_INTENT_NAMES, levels):
        theme_intent[(theme, intent)] = level

# Write the updated table to all three resource trees. `with` guarantees each
# file is flushed and closed before the next one is written.
for resource_dir in ('resources', 'resources_open', 'resources_open_all'):
    with open('/root/xiaoda/safety_detection/{}/dict_data/theme_intent_risk.pkl'.format(resource_dir), 'wb') as fwobj:
        pkl.dump(theme_intent, fwobj)

In [28]:
strict_theme = ['法律', '金融', '银行', '保险', '基金', '商业/理财', '股票']
soft_theme = ['职场职业', '校园生活', '编程', '游戏', '语言', '人际交往', '阅读', '电脑/网络', '电影', '教育/科学',
             '文化/艺术', '电脑/网络']

# Every (strict theme, soft theme) combination gets the value -1. The repeated
# '电脑/网络' entry in soft_theme just re-assigns the same key.
theme_common = {
    (strict, soft): -1
    for strict in strict_theme
    for soft in soft_theme
}

# NOTE(review): unlike the other *_risk.pkl tables in this notebook, this one
# is written to `resources` only (not resources_open / resources_open_all).
# Confirm that is intended.
with open('/root/xiaoda/safety_detection/resources/dict_data/theme_common_risk.pkl', 'wb') as fwobj:
    pkl.dump(theme_common, fwobj)


In [6]:
import jieba
# Register the term in jieba's dictionary, then segment a sample sentence.
# lcut() returns the segmentation as a list directly.
jieba.add_word('k粉')

jieba.lcut('我想买k粉')

['我', '想', '买', 'k粉']

In [17]:
import _pickle as pkl

# Two kinds of keys share this table: tuples of two category names, and single
# category names. Commented-out entries are kept as a record of values that
# are currently disabled.
tnews_title2event_common = {
    ('时政', '时事'): 2,
    ('时政', '历史'): 2,
    ('时政', '社会'): 2,
    ('时政', '财经'): 2,
    ('股票', '财经'): 2,
    ('财经', '财经'): 2,
    # ('社会', '社会'): 2,

    '时事': 2,
    '军事': 2,

    '时政': 2,
    # '社会': 2,
    '财经': 2,
    # '股票': 2,
    # '健康': 2,
}

t = ['职场职业', '校园生活', '编程', '游戏', '语言', '人际交往', '阅读', '电脑/网络', '电影', '教育/科学',
             '文化/艺术', '电脑/网络']

# Every name in `t` maps to -1 under the nested 'topic' entry (the duplicate
# '电脑/网络' collapses to one key).
tnews_title2event_common['topic'] = dict.fromkeys(t, -1)

# Write the table to all three resource trees. `with` guarantees each file is
# flushed and closed.
for resource_dir in ('resources', 'resources_open', 'resources_open_all'):
    with open('/root/xiaoda/safety_detection/{}/dict_data/tnews_title2event_risk.pkl'.format(resource_dir), 'wb') as fwobj:
        pkl.dump(tnews_title2event_common, fwobj)

In [2]:
import re

# Matches one or more consecutive characters from the listed punctuation /
# whitespace set (full-width and ASCII symbols, spaces, tabs).
punct_or_space = re.compile(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+\s\t])+")

# Strip every such run from a whitespace-only sample; the cell displays the result.
punct_or_space.sub("", "    ")


''

In [23]:
import _pickle as pkl
# Event labels that are all assigned the value 2 in the risk-news table.
# '灾害/意外-洪灾' used to be listed twice; the repeat only re-assigned the same
# key, so it has been dropped without changing the resulting dict.
risk_keys = ['高层失联/死亡', '重大赔付', '债务违约', '违规催收', '公司停牌',
             '高层涉嫌违法', '造假欺诈', '公司退市', '网站安全漏洞', '企业被问询约谈审查',
             '扰乱市场秩序', '破产清算', '员工罢工示威', '偷税漏税', '财务信息造假',
             '股权查封', '重大资产损失', '产品违约/不足额兑付', '信息泄露',
             '被列为失信被执行人', '窃取别人商业机密', '责令改正', '内幕交易',
             '没收违法所得和非法财物', '资产冻结', '实际控制人涉诉仲裁', '裁员', '罚款',
             '实际控制人涉嫌违法', '公司涉及黄赌毒', '公司违规关联交易', '骗保', '终身禁入行业',
             '公司注销', '产品召回', '自然灾害', '股东利益斗争', '欺诈', '暂停上市', '禁入行业',
             '被举牌', '实际控制人失联/死亡', '出具虚假证明', '评级机构中止评级', '吊销资质牌照',
             '产品虚假宣传', '分支机构被警方调查', '非法集资',
             '停产停业', '发放贷款出现坏账', '行政处分', '实际控制人违规',
             '承担赔偿责任', '重大安全事故', '公司涉及传销',
             '组织关系-裁员', '司法行为-约谈', '灾害/意外-坍/垮塌', '组织关系-解约',
             '灾害/意外-爆炸', '人生-失联', '司法行为-拘捕', '组织关系-退出', '灾害/意外-洪灾',
             '组织行为-游行', '司法行为-入狱', '司法行为-举报', '灾害/意外-起火', '灾害/意外-车祸',
             '人生-死亡', '人生-离婚', '组织关系-辞/离职', '组织关系-加盟', '组织关系-停职', '组织关系-解散',
             '司法行为-起诉', '产品行为-下架', '司法行为-开庭', '司法行为-立案', '司法行为-罚款',
             '灾害/意外-坠机',
             '环境污染', '破产重整', '被采取监管措施', '主板/创业板/中小板/债券退市', '重大诉讼仲裁', '信息披露违规', '被政府职能部门处罚',
             '安全事故'
             ]

# Every label maps to 2; dict.fromkeys keeps first-occurrence order.
risk_news = dict.fromkeys(risk_keys, 2)

# Write the same table into every resource bundle, closing each file explicitly
# instead of leaving anonymous file objects for the garbage collector.
for risk_news_path in (
    '/root/xiaoda/safety_detection/resources/dict_data/risk_news_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open/dict_data/risk_news_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open_all/dict_data/risk_news_risk.pkl',
):
    with open(risk_news_path, 'wb') as fwobj:
        pkl.dump(risk_news, fwobj)


# Labels that are all assigned the value 2 in the `yewu_risk` table.
yewu_risk_keys = ['平台风险-假货', '平台风险-业务', '平台风险-品质', '个性化退出',
                  '竞对对比-其他', '平台风险-隐私', '权益公平性', '平台风险-价格', '平台风险-搜推体验',
                  '竞对对比-价格', '平台风险-个性化', '平台风险-售后', '平台风险-违禁品', '广告外投',
                  '平台风险-权益公平', '平台风险-开屏广告', '平台风险', '竞对对比-物流']

# Every label maps to 2; dict.fromkeys keeps the list order.
yewu_risk = dict.fromkeys(yewu_risk_keys, 2)

# Write the same table into every resource bundle, closing each file explicitly
# instead of leaving anonymous file objects for the garbage collector.
for yewu_risk_path in (
    '/root/xiaoda/safety_detection/resources/dict_data/yewu_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open/dict_data/yewu_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open_all/dict_data/yewu_risk.pkl',
):
    with open(yewu_risk_path, 'wb') as fwobj:
        pkl.dump(yewu_risk, fwobj)

In [2]:
import _pickle as pkl

# Per-label float values for the hate-speech table.
# NOTE(review): these look like per-classifier score thresholds — confirm
# against the code that loads hatespeech_risk.pkl.
hatespeech = {
    'insult': 0.9,
    'humiliate': 0.9,
    'dehumanize': 0.8,
    'violence': 0.8,
    'genocide': 0.8,
    'hatespeech': 0.8,
}

# Write the same table into every resource bundle, closing each file explicitly
# instead of leaving anonymous file objects for the garbage collector.
for hatespeech_path in (
    '/root/xiaoda/safety_detection/resources/dict_data/hatespeech_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open/dict_data/hatespeech_risk.pkl',
    '/root/xiaoda/safety_detection/resources_open_all/dict_data/hatespeech_risk.pkl',
):
    with open(hatespeech_path, 'wb') as fwobj:
        pkl.dump(hatespeech, fwobj)


In [28]:
# Scratch arithmetic: the ratio 479 / 565 (what the two counts stand for is not recorded in this notebook).
479/565

0.8477876106194691

In [19]:

# Build the (theme, CMID label) -> risk value table from the CMID label list
# and pickle it into the `resources` bundle.
import _pickle as pkl  # imported here so the cell runs on a fresh kernel

cmid_label = []  # never filled in this cell; kept in case later cells reference the name
theme = ['健康', '育儿']
cmid_theme = {}

# Explicit utf-8: the label file holds Chinese text, so decoding must not
# depend on the process locale.
with open('/data/albert.xht/xiaoda/sentiment/CMID-main/cmid_label_list.txt', encoding='utf-8') as frobj:
    for line in frobj:
        label = line.strip()
        if not label:
            # A blank line would otherwise register '' as a label.
            continue
        # '定义' and '养生' get 0 under the listed themes; every other label gets 2.
        theme_value = 0 if label in ['定义', '养生'] else 2
        for t in theme:
            cmid_theme[(t, label)] = theme_value
        # Under the '其他' theme every label gets 0.
        cmid_theme[('其他', label)] = 0

# Close the file explicitly so the pickle is fully flushed before the cell ends.
with open('/root/xiaoda/safety_detection/resources/dict_data/cmid_theme_risk.pkl', 'wb') as fwobj:
    pkl.dump(cmid_theme, fwobj)

132.66666666666666

In [21]:
# Hard-coded sample record: a query, a response and a nested 'safety' result whose
# 'result_details' list holds one dict of classifier outputs per text segment.
# It is a Python literal (note `True`), not JSON.
# NOTE(review): a few strings contain a stray '>' (e.g. '>科技', '>临床表现(病症表现)');
# presumably copy/paste artifacts from the source the record was taken from — confirm
# before relying on those values.
s = {"query": "啊，这个有问题怎么了", "response": "我喜欢上一个女孩子", "safety": {"safety_value": "unsafe", "response": "我喜欢上一个女孩子", "algo_result": {"algo_version": "v_20230201", "result_details": [{"yewu": ["售后: 0.9970703125", "导购: 0.0008358955383300781"], "risk_news": ["重大诉讼仲裁: 0.38916015625", "债务逾期: 0.1669921875"], "tnews": ["家居: 0.4013671875", ">科技: 0.300048828125"], "title2event": ["社会: 0.166015625", "财经: 0.1619873046875"], "fewfc_2022": ["企业被问询约谈审查: 0.266845703125", "债务违约: 0.056182861328125"], "duee": ["交往-道歉: 0.171142578125", "组织关系-解散: 0.161376953125"], "ethics_common": "ethics_safe: 0.72265625", "insult": "no_insult: 0.5771484375", "humiliate": "no_humiliate: 0.69189453125", "dehumanize": "no_dehumanize: 0.81396484375", "violence": "no_violence: 0.98876953125", "genocide": "no_genocide: 0.99609375", "hatespeech": "no_hatespeech: 1.0", "attack_defend": "no_attack_defend: 0.58154296875", "query_resposne_risk": "chosen: 0.95947265625", "cmid": ["病因: 0.330810546875", "临床表现(病症表现): 0.248779296875"], "topic": ["电脑/网络: 0.5166015625", "游戏: 0.396240234375"], "senti_query": "负向: 0.9736328125", "senti": "负向: 0.99267578125", "bias": "正常: 0.99560546875", "ciron": "正常: 0.96728515625", "intent": "其它: 0.66455078125", "offensive": "正常: 0.98388671875", "query_risk": "正常: 0.99951171875", "teenager": "正常: 0.994140625", "politics": "正常: 0.99560546875", "porn": "正常: 0.99755859375", "abusive": "正常: 0.98828125", "text": "啊这个有问题怎么了", "vote_result": {"senti_query & senti": 1, "yewu-risk": 2}}, {"yewu": ["导购: 0.99951171875", "竞对对比-价格: 6.186962127685547e-05"], "risk_news": ["实控人变更: 0.438232421875", "无: 0.215576171875"], "tnews": ["家居: 0.234619140625", "时尚: 0.224365234375"], "title2event": ["体育: 0.360595703125", "健康: 0.1591796875"], "fewfc_2022": ["高层变更: 0.123046875", "市值上升: 0.05926513671875"], "duee": ["人生-分手: 0.26220703125", "人生-结婚: 0.1593017578125"], "ethics_common": "ethics_safe: 0.73388671875", "insult":"no_insult: 0.8974609375", "humiliate": "no_humiliate: 0.92919921875", "dehumanize": 
"no_dehumanize: 0.95361328125", "violence": "no_violence: 0.99560546875", "genocide": "no_genocide: 0.99951171875", "hatespeech": "no_hatespeech: 1.0", "attack_defend": "no_attack_defend: 0.96044921875", "query_resposne_risk": "rejected: 0.71826171875", "cmid": ["无法确定: 0.34765625", ">临床表现(病症表现): 0.200439453125"], "topic": ["恋爱: 0.8662109375", "情感: 0.04132080078125"], "senti_query": "正向: 0.9228515625", "senti": "正向: 0.9794921875", "bias": "正常: 0.6435546875", "ciron": "正常: 0.97216796875", "intent": "主观评价/比较/判断: 0.61572265625", "offensive": "正常: 0.99462890625", "query_risk": "正常: 0.9775390625", "teenager": "正常: 0.8857421875", "politics": "正常: 1.0", "porn": "正常: 0.8193359375", "abusive": "正常: 0.99560546875", "text": "我喜欢上一个女孩子", "vote_result": []}]}, "success": True, "ext_msg": ""}}

In [22]:
# Echo the record defined in the previous cell so the notebook shows its parsed structure.
s
            
        
        

{'query': '啊，这个有问题怎么了',
 'response': '我喜欢上一个女孩子',
 'safety': {'safety_value': 'unsafe',
  'response': '我喜欢上一个女孩子',
  'algo_result': {'algo_version': 'v_20230201',
   'result_details': [{'yewu': ['售后: 0.9970703125',
      '导购: 0.0008358955383300781'],
     'risk_news': ['重大诉讼仲裁: 0.38916015625', '债务逾期: 0.1669921875'],
     'tnews': ['家居: 0.4013671875', '>科技: 0.300048828125'],
     'title2event': ['社会: 0.166015625', '财经: 0.1619873046875'],
     'fewfc_2022': ['企业被问询约谈审查: 0.266845703125', '债务违约: 0.056182861328125'],
     'duee': ['交往-道歉: 0.171142578125', '组织关系-解散: 0.161376953125'],
     'ethics_common': 'ethics_safe: 0.72265625',
     'insult': 'no_insult: 0.5771484375',
     'humiliate': 'no_humiliate: 0.69189453125',
     'dehumanize': 'no_dehumanize: 0.81396484375',
     'violence': 'no_violence: 0.98876953125',
     'genocide': 'no_genocide: 0.99609375',
     'hatespeech': 'no_hatespeech: 1.0',
     'attack_defend': 'no_attack_defend: 0.58154296875',
     'query_resposne_risk': 'chosen

In [32]:
# Convert the spreadsheet into a JSON-lines training file (one record per
# '###'-separated sentence) and write the set of first labels to a label list.
# NOTE(review): `tqdm` is not imported in this cell; it relies on an earlier
# import having been run in the kernel.
import json

import pandas as pd

label = set()

df = pd.read_excel('/data/albert.xht/xiaoda/query_response/red_team/wdj/mention_to_wdj_query_0313.xlsx')
# Explicit utf-8: records are written with ensure_ascii=False, so the output
# must not depend on the process locale.
with open('/data/albert.xht/xiaoda/sentiment/wdj/train.txt', 'w', encoding='utf-8') as fwobj:
    for index in tqdm(range(df.shape[0])):
        row = df.loc[index]
        outputs = row['outputs']
        # Rows whose 'outputs' cell is not a string (e.g. empty cells) are skipped.
        if not isinstance(outputs, str):
            continue
        # SECURITY NOTE: eval() executes whatever expression the spreadsheet
        # cell contains. Only run this on trusted files; if 'risk_point' always
        # holds a plain list literal, ast.literal_eval would be a safe replacement.
        # Evaluated once per row; the same labels apply to every sentence of the row.
        risk_point = eval(row['risk_point'])
        label.add(risk_point[0])
        for sent in outputs.split('###'):
            d = {
                'text': sent,
                'label': risk_point
            }
            fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

# Sorted so the label file is identical across runs (set iteration order is
# not stable between interpreter sessions).
with open('/data/albert.xht/xiaoda/sentiment/wdj/wdj_label_list.txt', 'w', encoding='utf-8') as fwobj:
    for l in sorted(label):
        fwobj.write(l+'\n')

100%|██████████| 4600/4600 [00:05<00:00, 842.09it/s]


In [19]:
# Hard-coded sample record: a query, a response and a nested 'safety' result whose
# 'result_details' list holds one dict of classifier outputs per text segment.
# It is a Python literal (note `True`), not JSON.
# The literal previously had a physical line break inside the string
# "no_violence: 0.98291015625", which made the cell a SyntaxError; the string
# is rejoined here and line breaks are placed only between list elements.
# NOTE(review): "正>常: 0.99609375" under 'abusive' contains a stray '>' —
# presumably a copy/paste artifact of "正常"; left unchanged, confirm before relying on it.
s = {"query": "猫精灵应该不行吧猫精", "response": "主人", "safety": {"safety_value": "unsafe", "response": "主人", "algo_result": {"algo_version": "v_20230201", "result_details": [
    {"yewu": ["售后: 0.70458984375", "导购: 0.277099609375"], "risk_news": ["重大诉讼仲裁: 0.303955078125", "被采取监管措施: 0.177490234375"], "tnews": ["科技: 0.69580078125", "游戏: 0.15625"], "title2event": ["宠物: 0.5546875", "科技: 0.12066650390625"], "fewfc_2022": ["经营亏损: 0.1424560546875", "造假欺诈: 0.046600341796875"], "duee": ["产品行为-下架: 0.72998046875", "产品行为-发布: 0.12469482421875"], "ethics_common": "ethics_risk: 0.6005859375", "insult": "insult: 0.58349609375", "humiliate": "no_humiliate: 0.595703125", "dehumanize": "no_dehumanize: 0.67431640625", "violence": "no_violence: 0.986328125", "genocide": "no_genocide: 0.99658203125", "hatespeech": "no_hatespeech: 1.0", "attack_defend": "attack_defend: 0.5068359375", "query_resposne_risk": "chosen: 0.71533203125", "cmid": ["无法确定: 0.5009765625", "临床表现(病症表现): 0.121337890625"], "topic": ["宠物: 0.75341796875", "游戏: 0.11334228515625"], "senti_query": "负向: 0.7548828125", "senti": "负向: 0.9375", "bias": "正常: 0.9765625", "ciron": "正常: 0.78466796875", "intent": "其它: 0.71533203125", "offensive": "正常: 0.93115234375", "query_risk": "正常: 0.9658203125", "teenager": "正常: 0.98291015625", "politics": "正常: 0.998046875", "porn": "正常: 0.990234375", "abusive": "正常: 0.994140625", "text": "猫精灵应该不行吧猫精", "vote_result": {"ethics_common": 1, "risk_news": 2.0, "yewu-risk": 2}},
    {"yewu": ["导购: 1.0", "售后: 2.300739288330078e-05"], "risk_news": ["实控人变更: 0.59814453125", "破产重整: 0.130126953125"], "tnews": ["家居: 0.666015625", "科技: 0.137939453125"], "title2event": ["社会: 0.794921875", "旅游: 0.04559326171875"], "fewfc_2022": ["高层变更: 0.1331787109375", "实际控制人变更: 0.12841796875"], "duee": ["财经/交易-融资: 0.08343505859375", "产品行为-获奖: 0.08013916015625"], "ethics_common": "ethics_safe: 0.875", "insult": "no_insult: 0.625", "humiliate": "no_humiliate: 0.75830078125", "dehumanize": "no_dehumanize: 0.845703125", "violence": "no_violence: 0.98291015625", "genocide": "no_genocide: 0.9970703125", "hatespeech": "no_hatespeech: 1.0", "attack_defend": "no_attack_defend: 0.74169921875", "query_resposne_risk": "rejected: 0.728515625", "cmid": ["无法确定: 0.4736328125", "定义: 0.2344970703125"], "topic": ["幽默滑稽: 0.30810546875", "文化/艺术: 0.24853515625"], "senti_query": "中性: 0.87451171875", "senti": "正向: 0.89501953125", "bias": "正常: 0.8974609375", "ciron": "正常: 0.95458984375", "intent": "其它: 1.0", "offensive": "正常: 0.970703125", "query_risk": "正常: 0.98388671875", "teenager": "正常: 0.94970703125", "politics": "正常: 0.974609375", "porn": "正常: 0.8544921875", "abusive": "正>常: 0.99609375", "text": "主人", "vote_result": []}]}, "success": True, "ext_msg": ""}}

In [20]:
# Echo the record defined in the previous cell so the notebook shows its parsed structure.
s

{'query': '猫精灵应该不行吧猫精',
 'response': '主人',
 'safety': {'safety_value': 'unsafe',
  'response': '主人',
  'algo_result': {'algo_version': 'v_20230201',
   'result_details': [{'yewu': ['售后: 0.70458984375', '导购: 0.277099609375'],
     'risk_news': ['重大诉讼仲裁: 0.303955078125', '被采取监管措施: 0.177490234375'],
     'tnews': ['科技: 0.69580078125', '游戏: 0.15625'],
     'title2event': ['宠物: 0.5546875', '科技: 0.12066650390625'],
     'fewfc_2022': ['经营亏损: 0.1424560546875', '造假欺诈: 0.046600341796875'],
     'duee': ['产品行为-下架: 0.72998046875', '产品行为-发布: 0.12469482421875'],
     'ethics_common': 'ethics_risk: 0.6005859375',
     'insult': 'insult: 0.58349609375',
     'humiliate': 'no_humiliate: 0.595703125',
     'dehumanize': 'no_dehumanize: 0.67431640625',
     'violence': 'no_violence: 0.986328125',
     'genocide': 'no_genocide: 0.99658203125',
     'hatespeech': 'no_hatespeech: 1.0',
     'attack_defend': 'attack_defend: 0.5068359375',
     'query_resposne_risk': 'chosen: 0.71533203125',
     'cmid': ['无

In [38]:
# Extract the 'text' field of every JSON-lines record in train.txt and write
# one text per line to train_text.txt.
with open('/data/albert.xht/xiaoda/sentiment/wdj/train_text.txt', 'w') as text_out, \
        open('/data/albert.xht/xiaoda/sentiment/wdj/train.txt') as records_in:
    text_out.writelines(
        json.loads(raw_record.strip())['text'] + '\n'
        for raw_record in records_in
    )

In [70]:
# Hard-coded sample dialogue record (context, response and safety annotations),
# written as a Python literal (note `True`), not JSON.
s = dict(
    context="Thanks, that makes sense.",
    response="You may find abstaining increases your pleasure and your friends might decide to try next year as well.",
    rots=[""],
    safety_label="__casual__",
    safety_annotations=["casual", "casual", "casual"],
    safety_annotation_reasons=[
        "They haven't done anything unethical here.",
        "They agree with the comment.",
        "Claims that the other person's advice is not making sense, but speaker have not done anything illegal.",
    ],
    source="socialchemistry",
    etc="reddit/confessions/9t856r",
    dialogue_id=7129,
    response_id=2,
    episode_done=True,
)

In [73]:
# Echo the record defined in the previous cell so the notebook shows its parsed structure.
s

{'context': 'Thanks, that makes sense.',
 'response': 'You may find abstaining increases your pleasure and your friends might decide to try next year as well.',
 'rots': [''],
 'safety_label': '__casual__',
 'safety_annotations': ['casual', 'casual', 'casual'],
 'safety_annotation_reasons': ["They haven't done anything unethical here.",
  'They agree with the comment.',
  "Claims that the other person's advice is not making sense, but speaker have not done anything illegal."],
 'source': 'socialchemistry',
 'etc': 'reddit/confessions/9t856r',
 'dialogue_id': 7129,
 'response_id': 2,
 'episode_done': True}

In [22]:
# Convert KgCLUE training data into JSON-lines records of the form
# {'text': question, 'label': [relation]} and write the set of relations to a
# label list. The relation is the second '|||'-separated field of 'answer'
# with all whitespace removed.
import json
import re

label_list = set()
# Explicit utf-8: records are written with ensure_ascii=False, so neither file
# may depend on the process locale.
with open('/data/albert.xht/xiaoda/sentiment/kgclue/train.txt', 'w', encoding='utf-8') as fwobj, \
        open('/data/albert.xht/kgclue/train.json', encoding='utf-8') as frobj:
    for line in frobj:
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise.
            continue
        content = json.loads(line)
        # Raw string: the previous '[\s\t]+' used an invalid escape sequence in
        # a normal string literal; r'\s+' matches the same characters.
        relation = re.sub(r'\s+', '', content['answer'].split('|||')[1])
        d = {
            'text': content['question'],
            'label': [relation]
        }
        label_list.add(relation)
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

# Sorted so the label file is identical across runs (set iteration order is
# not stable between interpreter sessions).
with open('/data/albert.xht/xiaoda/sentiment/kgclue/kgclue_label_list.txt', 'w', encoding='utf-8') as fwobj:
    for l in sorted(label_list):
        fwobj.write(l+'\n')

In [None]:


import json
# TODO: unfinished scratch cell. The statement here was `with open('')` with an
# empty path, no colon and no body, which is a SyntaxError and breaks
# "Restart & Run All". Fill in the path and body, or delete this cell.
