In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

sys.path.extend(['/root/xiaoda/query_topic/'])

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    """Expected Calibration Error over equal-width confidence bins.

    Each sample is assigned to a bin by the probability of its predicted
    (arg-max) class. The score is the sum over non-empty bins of
    ``|mean confidence - accuracy|`` weighted by the fraction of samples
    that fall into the bin.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        # Consecutive edges delimit the bins (lower, upper].
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels, mode='logits'):
        """Return the ECE as a tensor of shape (1,) on ``logits.device``.

        logits: 2-D tensor with one row per sample and one column per class.
        labels: gold class index for every row.
        mode:   'logits' applies a softmax over dim 1 first; any other value
                treats ``logits`` as already-normalised probabilities.
        """
        probs = F.softmax(logits, dim=1) if mode == 'logits' else logits
        confidences, predictions = probs.max(dim=1)
        hits = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for bin_idx in range(len(self.bin_lowers)):
            lower = self.bin_lowers[bin_idx].item()
            upper = self.bin_uppers[bin_idx].item()
            in_bin = confidences.gt(lower) & confidences.le(upper)
            prop_in_bin = in_bin.float().mean()
            # Written as "not > 0" so that a NaN proportion (empty input) is skipped too.
            if not prop_in_bin.item() > 0:
                continue
            accuracy_in_bin = hits[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            # Calibration gap of this bin, weighted by its share of the samples.
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

In [11]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

# Project root: RiskInfer joins the config path, label files and output path onto this directory.
cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    """Read a label vocabulary file with one label per line.

    The file is decoded as UTF-8 explicitly (the labels are Chinese), so the
    result no longer depends on the platform's default encoding.

    Args:
        filepath: path of the text file; surrounding whitespace is stripped
            from every line and the line order defines the label ids.

    Returns:
        (label2id, id2label): a dict mapping label -> 0-based line index and
        the dict mapping index -> label.
    """
    with open(filepath, 'r', encoding='utf-8') as frobj:
        label_list = [line.strip() for line in frobj]

    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = dict(enumerate(label_list))
    return label2id, id2label

class RiskInfer(object):
    """Inference wrapper around a multi-task classifier (shared encoder, one head per schema).

    The constructor only builds the tokenizer, the label schemas and a freshly
    initialised network; trained weights are loaded with :meth:`reload`.
    """

    def __init__(self, config_path):
        """Build tokenizer, label schemas and the network from an INI config.

        Args:
            config_path: INI file path, joined onto ``cur_dir_path``. The
                ``paths`` and ``para`` sections are merged; the keys read here
                are ``model_path``, ``label_path`` (comma separated
                ``schema_type:label_file`` entries), ``out_dropout_rate`` and
                ``dropout_type``.
        """
        from collections import OrderedDict
        from roformer import RoFormerModel, RoFormerConfig

        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        # Insertion order matters: head i of the network belongs to the i-th schema.
        self.schema_dict = OrderedDict()
        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            schema_type, schema_path = schema_info.split(':')
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id': label2id,
                'id2label': id2label,
                'label_index': label_index
            }
            print(self.schema_dict[schema_type], '==schema_type==', schema_type)

        # The encoder is built from the config only (random weights) until reload() is called.
        config = RoFormerConfig.from_pretrained(args_path["model_path"])
        encoder = RoFormerModel(config=config)
        encoder_net = MyBaseModel(encoder, config)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        classifier_list = nn.ModuleList([
            RobertaClassifier(
                hidden_size=config.hidden_size,
                dropout_prob=con.getfloat('para', 'out_dropout_rate'),
                num_labels=len(self.schema_dict[schema_key]['label2id']),
                dropout_type=con.get('para', 'dropout_type'))
            for schema_key in self.schema_dict
        ])

        class MultitaskClassifier(nn.Module):
            """Shared transformer followed by one classification head per schema."""

            def __init__(self, transformer, classifier_list):
                super().__init__()

                # Attribute names are part of the checkpoint's state-dict keys; do not rename.
                self.transformer = transformer
                self.classifier_list = classifier_list

            def forward(self, input_ids, input_mask,
                        segment_ids=None,
                        transformer_mode='mean_pooling',
                        dt_idx=None):
                """Return ``(outputs_list, hidden_states)``.

                When ``dt_idx`` is given only the head at that index is run,
                otherwise every head contributes one entry to ``outputs_list``.
                """
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []

                for idx, classifier in enumerate(self.classifier_list):

                    if dt_idx is not None and idx != dt_idx:
                        continue

                    ce_logits = classifier(hidden_states)
                    outputs_list.append(ce_logits)
                return outputs_list, hidden_states

        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)
        # This class is inference-only: keep dropout disabled even before reload().
        self.net.eval()

    def reload(self, model_path):
        """Load a state dict from ``model_path`` into the network and switch to eval mode."""
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()

    def predict(self, text, max_length=256):
        """Score ``text`` against every schema.

        Args:
            text: input string.
            max_length: maximum number of tokens; longer inputs are truncated
                explicitly (previously this relied on the tokenizer's implicit
                fallback, which emits a warning).

        Returns:
            dict mapping schema type -> list of ``[label, probability]`` pairs
            in label-id order.
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=max_length, truncation=True)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)

        scores_dict = {}
        with torch.no_grad():
            logits_list, hidden_states = self.net(
                input_ids, attention_mask, token_type_ids, transformer_mode='cls')
            for schema_type, logits in zip(self.schema_dict.keys(), logits_list):
                # Batch size is 1, so row 0 holds the class probabilities.
                scores = torch.softmax(logits, dim=1)[0].detach().cpu().numpy()
                id2label = self.schema_dict[schema_type]['id2label']
                scores_dict[schema_type] = [
                    [id2label[index], float(score)]
                    for index, score in enumerate(scores)
                ]
        return scores_dict

# Shared inference wrapper used by the cells below; checkpoint weights are loaded later through risk_api.reload(...).
risk_api = RiskInfer('./risk_data/config.ini')




senti /root/xiaoda/query_topic/risk_data/senti_label.txt ===schema-path===
{'label2id': {'负向': 0, '正向': 1}, 'id2label': {0: '负向', 1: '正向'}, 'label_index': 0} ==schema_type== senti
bias /root/xiaoda/query_topic/risk_data/bias_label.txt ===schema-path===
{'label2id': {'偏见': 0, '正常': 1}, 'id2label': {0: '偏见', 1: '正常'}, 'label_index': 1} ==schema_type== bias
ciron /root/xiaoda/query_topic/risk_data/ciron_label.txt ===schema-path===
{'label2id': {'讽刺': 0, '正常': 1}, 'id2label': {0: '讽刺', 1: '正常'}, 'label_index': 2} ==schema_type== ciron
intent /root/xiaoda/query_topic/risk_data/intention_label_v0.txt ===schema-path===
{'label2id': {'主观评价/比较/判断': 0, '寻求建议/帮助': 1, '其它': 2}, 'id2label': {0: '主观评价/比较/判断', 1: '寻求建议/帮助', 2: '其它'}, 'label_index': 3} ==schema_type== intent
offensive /root/xiaoda/query_topic/risk_data/offensive_label.txt ===schema-path===
{'label2id': {'冒犯': 0, '正常': 1}, 'id2label': {0: '冒犯', 1: '正常'}, 'label_index': 4} ==schema_type== offensive
query_risk /root/xiaoda/query_topic/ri

12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
12/23/2022 10:58:26 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++


In [12]:

def _read_jsonl(path):
    """Load a JSON-lines file (UTF-8) into a list of decoded records, skipping blank lines."""
    records = []
    with open(path, encoding='utf-8') as frobj:
        for line in frobj:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


# Evaluation sets, one JSON record per line.
offensive = _read_jsonl('/data/albert.xht/sentiment/dev/offensive_cold.json')
offensive_test = _read_jsonl('/data/albert.xht/sentiment/test/offensive_cold.json')
cdia_bias = _read_jsonl('/data/albert.xht/sentiment/dev/cdial_bias.json')
senti_copr = _read_jsonl('/data/albert.xht/sentiment/dev/senti_copr.json')
ciron = _read_jsonl('/data/albert.xht/sentiment/dev/chinese_ciron.json')
senti_smp = _read_jsonl('/data/albert.xht/sentiment/dev/senti_smp_usual.json')
senti_smpecisa = _read_jsonl('/data/albert.xht/sentiment/dev/senti_smpecisa.json')
        

In [13]:
from sklearn.metrics import classification_report
from tqdm import tqdm

def eval_all(data, model, key):
    """Run ``model`` over ``data`` for one schema and print a classification report.

    Args:
        data: iterable of records with a 'label' list (first entry is the gold
            label) and a 'text' that is either a string or a list of strings
            (joined with newlines).
        model: object exposing ``predict(text)`` that returns a dict of
            ``[label, score]`` lists per schema.
        key: schema whose scores are evaluated.

    Returns:
        (pred, gold, pred_score): predicted labels, gold labels and the raw
        ``[label, score]`` lists for ``key``, one entry per record.
    """
    gold, pred, pred_score = [], [], []
    for item in tqdm(data):
        gold.append(item['label'][0])

        raw_text = item['text']
        text = "\n".join(raw_text) if isinstance(raw_text, list) else raw_text

        task_scores = model.predict(text)[key]
        # Highest score first; the top entry's label is the prediction.
        ranked = sorted(task_scores, key=lambda pair: pair[1], reverse=True)
        pred.append(ranked[0][0])
        pred_score.append(task_scores)

    print(classification_report(gold, pred, digits=4), '===', key)
    return pred, gold, pred_score
    

def evaluation_ece(pred_score, gold):
    """Print the expected calibration error (15 bins) of per-sample scores.

    Args:
        pred_score: one list of ``[label, probability]`` pairs per sample.
        gold: gold label name per sample.

    A label's class index is the position at which it is first seen inside
    any sample's pair list. Nothing is returned; the ECE tensor is printed
    followed by '==ece=='.
    """
    label_to_column = {}
    prob_rows = []
    for scored_labels in pred_score:
        row = []
        for column, pair in enumerate(scored_labels):
            # Only the first sighting of a label fixes its column.
            label_to_column.setdefault(pair[0], column)
            row.append(pair[1])
        prob_rows.append(row)

    prob_tensor = torch.tensor(prob_rows)
    gold_tensor = torch.tensor([label_to_column[name] for name in gold])

    calibration = ECE(n_bins=15)
    # Scores are already probabilities, so ECE must not apply a softmax again.
    print(calibration(prob_tensor, gold_tensor, mode='probs'), '==ece==')
# pred, gold, pred_score = eval_all(offensive_test, risk_api, 'offensive')
# evaluation_ece(pred_score, gold)


In [16]:
def evaluation(model_path):
    """Reload ``risk_api`` from ``model_path`` and evaluate it on every benchmark.

    For each benchmark a banner is printed, then ``eval_all`` prints the
    classification report and ``evaluation_ece`` prints the calibration error.
    """
    print(model_path)
    risk_api.reload(model_path)

    # (banner, dataset, schema key) in reporting order.
    benchmark_plan = (
        ('===offensive===', offensive_test, 'offensive'),
        ('===cdia-bias===', cdia_bias, 'bias'),
        ('===ciron===', ciron, 'ciron'),
        ('===chsenti===', senti_copr, 'senti'),
        ('===senti_smpecisa===', senti_smpecisa, 'senti'),
        ('===senti_smp===', senti_smp, 'senti'),
    )
    for banner, dataset, schema_key in benchmark_plan:
        print(banner)
        pred, gold, pred_score = eval_all(dataset, risk_api, schema_key)
        evaluation_ece(pred_score, gold)
    

In [15]:
# Evaluate the multitask_raw_filter_senti_query_risk_v4_intent_v2 checkpoint on all benchmarks.
# NOTE(review): the recorded run below stops with KeyError: 'bias#1' during the cdia-bias step;
# presumably the loaded config/schema keys did not match this checkpoint — confirm before re-running.
model_path = '/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v4_intent_v2/multitask_cls.pth.9'

evaluation(model_path)


/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v4_intent_v2/multitask_cls.pth.9
===offensive===


  0%|          | 0/5304 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 5304/5304 [00:44<00:00, 120.47it/s]


              precision    recall  f1-score   support

          冒犯     0.7338    0.8680    0.7953      2106
          正常     0.9012    0.7927    0.8435      3198

    accuracy                         0.8226      5304
   macro avg     0.8175    0.8303    0.8194      5304
weighted avg     0.8347    0.8226    0.8243      5304
 === offensive
tensor([0.1152]) ==ece==
===cdia-bias===


  0%|          | 0/2829 [00:00<?, ?it/s]


KeyError: 'bias#1'

In [324]:
# Evaluate the query_risk_v2 checkpoint on all benchmarks; the recorded output below is from this run.
model_path = '/data/albert.xht/xiaodao/risk_classification/query_risk_v2/multitask_raw_all_risk_v2_10/multitask_cls.pth.3'
evaluation(model_path)

/data/albert.xht/xiaodao/risk_classification/query_risk_v2/multitask_raw_all_risk_v2_10/multitask_cls.pth.3
===offensive===


100%|██████████| 5304/5304 [00:49<00:00, 106.89it/s]


              precision    recall  f1-score   support

          冒犯     0.7057    0.8837    0.7847      2106
          正常     0.9081    0.7573    0.8259      3198

    accuracy                         0.8075      5304
   macro avg     0.8069    0.8205    0.8053      5304
weighted avg     0.8278    0.8075    0.8096      5304

tensor([0.1108]) ==ece==
===cdia-bias===


100%|██████████| 2829/2829 [00:26<00:00, 106.62it/s]


              precision    recall  f1-score   support

          偏见     0.6466    0.4178    0.5076       718
          正常     0.8233    0.9223    0.8700      2111

    accuracy                         0.7943      2829
   macro avg     0.7349    0.6701    0.6888      2829
weighted avg     0.7784    0.7943    0.7780      2829

tensor([0.0118]) ==ece==
===ciron===


100%|██████████| 875/875 [00:08<00:00, 107.29it/s]


              precision    recall  f1-score   support

          正常     0.9118    0.9692    0.9396       779
          讽刺     0.4894    0.2396    0.3217        96

    accuracy                         0.8891       875
   macro avg     0.7006    0.6044    0.6307       875
weighted avg     0.8655    0.8891    0.8718       875

tensor([0.0205]) ==ece==
===chsenti===


100%|██████████| 1200/1200 [00:11<00:00, 105.07it/s]


              precision    recall  f1-score   support

          正向     0.9294    0.9325    0.9310       593
          负向     0.9339    0.9308    0.9323       607

    accuracy                         0.9317      1200
   macro avg     0.9316    0.9317    0.9317      1200
weighted avg     0.9317    0.9317    0.9317      1200

tensor([0.0193]) ==ece==
===senti_smpecisa===


100%|██████████| 2529/2529 [00:23<00:00, 107.95it/s]


              precision    recall  f1-score   support

          正向     0.8765    0.8035    0.8384      1201
          负向     0.8347    0.8976    0.8650      1328

    accuracy                         0.8529      2529
   macro avg     0.8556    0.8505    0.8517      2529
weighted avg     0.8546    0.8529    0.8524      2529

tensor([0.0264]) ==ece==
===senti_smp===


100%|██████████| 2844/2844 [00:26<00:00, 107.31it/s]

              precision    recall  f1-score   support

          正向     0.9084    0.8721    0.8899      1126
          负向     0.9183    0.9424    0.9302      1718

    accuracy                         0.9146      2844
   macro avg     0.9134    0.9072    0.9100      2844
weighted avg     0.9144    0.9146    0.9142      2844

tensor([0.0252]) ==ece==





In [None]:
# Manual spot check on one short input; returns the [label, probability] lists for every schema.
risk_api.predict('傻逼')

In [166]:
# Score every line of both corpora with the currently loaded checkpoint.
# Order matters: later cells address result_list by position (in the recorded run the
# first corpus contributed 56443 entries and the second 10001).
result_list = []
for corpus_path in (
    '/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/疑似有风险query_from对话预训练数据-20221130.txt',
    '/root/xiaoda/risk_detection/data/tmjl_1w.txt',
):
    # Decode explicitly as UTF-8: the corpora contain Chinese text.
    with open(corpus_path, encoding='utf-8') as frobj:
        for line in tqdm(frobj):
            text = line.strip()
            result = risk_api.predict(text)
            result_list.append((text, result))


56443it [08:43, 107.87it/s]
10001it [01:31, 109.30it/s]


In [313]:
# One binary feature per schema head: 1 when the probability of the head's first-listed
# label (label id 0) is at least 0.75, else 0. Rows stay aligned with result_list.
result_matrix = [
    [
        1 if entry[1][key][0][1] >= 0.75 else 0
        for key in ['senti#1', 'senti#2', 'senti#3', 'senti#4', 'senti#5',
                    'senti#6', 'bias#1', 'bias#2', 'ciron', 'offensive', 'teenager', 'query_risk']
    ]
    for entry in result_list
]

In [219]:
# Second name for the same list object as result_matrix (no copy is made).
predict_matrix = result_matrix

In [312]:

from sklearn.ensemble import RandomForestClassifier
import random


# Fit only on the first corpus (its 56443 rows come first in result_matrix).
# Work on a copy: truncating and shuffling result_matrix itself would destroy its row
# alignment with result_list and break the later cell that scores all rows.
weak_rows = list(result_matrix[:56443])
# Private, seeded generator: reproducible order without touching the global random state.
random.Random(0).shuffle(weak_rows)
weak_result_matrix = np.array(weak_rows)

def SIMPLE(result_matrix, min_votes=5, n_rounds=5):
    """Self-train a small random forest from majority-vote weak labels.

    Initial labels are 1 for rows whose feature sum reaches ``min_votes`` and
    0 otherwise. The forest is then fitted ``n_rounds`` times, each round
    replacing the labels with the forest's own predictions on the same rows.

    Args:
        result_matrix: 2-D array-like of 0/1 weak-classifier votes, one row per sample.
        min_votes: minimum number of positive votes for an initial label of 1.
        n_rounds: number of fit/relabel iterations (must be >= 1).

    Returns:
        (probs, clf): the class probabilities from the last round and the fitted forest.

    Raises:
        ValueError: if ``n_rounds`` is smaller than 1.
    """
    if n_rounds < 1:
        raise ValueError("n_rounds must be >= 1")

    votes = np.sum(result_matrix, axis=-1)
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    labels = np.asarray(votes >= min_votes).astype(int)
    clf = RandomForestClassifier(max_depth=2, random_state=0)

    probs = None
    for _ in range(n_rounds):
        clf.fit(result_matrix, labels)
        probs = clf.predict_proba(result_matrix)
        print(probs.shape)
        # predict_proba columns follow clf.classes_, so map the column index back to the
        # label (the raw index is wrong when only one class is present).
        labels = clf.classes_[np.argmax(probs, axis=-1)]
    return probs, clf

# Fit the weak-label forest on the shuffled training rows; SIMPLE prints probs.shape once per round.
probs, clf = SIMPLE(weak_result_matrix)
        

(56443, 2)
(56443, 2)
(56443, 2)
(56443, 2)
(56443, 2)


In [314]:
# Re-score result_matrix with the fitted forest; in the recorded run this covered all
# 66444 rows (both corpora), which requires result_matrix to hold the full, unshuffled matrix.
probs = clf.predict_proba(np.array(result_matrix))
probs.shape

(66444, 2)

In [350]:
# Split the rows after the first corpus into flagged (risk probability >= 0.7) and clean.
# Each entry is (row index, (text, per-schema scores), class probabilities).
# NOTE(review): rows 0..56443 are skipped, i.e. the 56443 rows of the first corpus plus
# row 56443, which looks like the first line of the second corpus — confirm that dropping
# it is intended (e.g. a header line) and not an off-by-one.
ff = []
clean = []
for idx in range(56444, probs.shape[0]):
    record = (idx, result_list[idx], probs[idx])
    if probs[idx, 1] >= 0.7:
        ff.append(record)
    else:
        clean.append(record)

len(ff), len(clean)

(262, 9738)

In [None]:
# Inspect the first flagged entry: (row index, (text, per-schema scores), class probabilities).
ff[0]

In [354]:
# Export the pseudo-labelled texts as JSON lines: clean rows first, then flagged rows.
# The file is written as UTF-8 explicitly because ensure_ascii=False emits raw Chinese text.
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/tmjl_1w_clean.json', 'w', encoding='utf-8') as fwobj:
    for label, bucket in (('正常', clean), ('风险', ff)):
        for item in bucket:
            tmp = {
                'text': item[1][0],  # item[1] is the (text, scores) pair from result_list
                'label': [label]
            }
            fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')
        

In [319]:
import joblib
# Persist the fitted forest in the query_risk_v2 checkpoint directory.
joblib.dump(clf, "/data/albert.xht/xiaodao/risk_classification/query_risk_v2/multitask_raw_all_risk_v2_10/random_forest.joblib")

['/data/albert.xht/xiaodao/risk_classification/query_risk_v2/multitask_raw_all_risk_v2_10/random_forest.joblib']

In [155]:
from tqdm import tqdm

# Run the module-level predict(text) helper over every line of tmjl_1w.txt.
# NOTE: predict is defined in a later cell of this notebook, so that cell must be run first.
tmjl_1w_resp = []    # {'正常': p, '风险': p} dict per line
tmjl_1w_detail = []  # raw per-schema scores per line
# Decode explicitly as UTF-8: the corpus is Chinese text.
with open('/root/xiaoda/risk_detection/data/tmjl_1w.txt', encoding='utf-8') as frobj:
    for line in tqdm(frobj):
        content = line.strip()
        resp, detail = predict(content)
        tmjl_1w_resp.append(resp)
        tmjl_1w_detail.append(detail)


10001it [03:01, 55.06it/s]


In [322]:

def predict(text):
    """Score ``text`` with the multitask model and the weak-label forest.

    Returns:
        (risk, detail): ``risk`` maps '正常' and '风险' to the forest's two class
        probabilities; ``detail`` is the raw per-schema output of
        ``risk_api.predict``.
    """
    schema_scores = risk_api.predict(text)
    feature_keys = ('senti#1', 'senti#2', 'senti#3', 'senti#4', 'senti#5',
                    'senti#6', 'bias#1', 'bias#2', 'ciron', 'offensive', 'teenager', 'query_risk')
    # One vote per head, based on the probability of its first-listed label.
    # NOTE(review): the cut-off here is > 0.7 while the cell that builds result_matrix
    # uses >= 0.75 — confirm which threshold the forest was actually trained with.
    votes = [1 if schema_scores[name][0][1] > 0.7 else 0 for name in feature_keys]
    risk_probs = clf.predict_proba(np.array([votes])).tolist()[0]
    return dict(zip(['正常', '风险'], risk_probs)), schema_scores

In [349]:
# Example call: returns ({'正常': p, '风险': p}, per-schema score lists) for one query.
predict('ABB有哪些')

({'正常': 0.8240041961898732, '风险': 0.17599580381012703},
 {'senti#1': [['负向', 0.7929189205169678], ['正向', 0.20708103477954865]],
  'senti#2': [['负向', 0.6586761474609375], ['正向', 0.3413238227367401]],
  'senti#3': [['负向', 0.9517019987106323], ['正向', 0.04829796031117439]],
  'senti#4': [['负向', 0.6687620282173157], ['正向', 0.3312379717826843]],
  'senti#5': [['负向', 0.9325788021087646], ['正向', 0.06742116808891296]],
  'senti#6': [['负向', 0.584587574005127], ['正向', 0.41541245579719543]],
  'bias#1': [['偏见', 0.0816979855298996], ['正常', 0.9183019995689392]],
  'bias#2': [['偏见', 0.032336749136447906], ['正常', 0.9676632881164551]],
  'ciron': [['讽刺', 0.06859628111124039], ['正常', 0.9314036965370178]],
  'intent': [['主观评价/比较/判断', 2.3500122097175336e-06],
   ['寻求建议/帮助', 0.00012920489825773984],
   ['其它', 0.9998683929443359]],
  'offensive': [['冒犯', 0.014626090414822102], ['正常', 0.985373854637146]],
  'query_risk': [['风险', 0.17795933783054352],
   ['个人信息', 0.0015156776644289494],
   ['正常', 0.8205249905

In [158]:




from urllib import request, parse
import json
# Request body sent to the dialog service: the query, an (empty) history and both personas.
data = {
    "query":"ABB形式都有哪些",
    "history":[ 
        # {
        #     "role":"human",
        #     "utterance":"天空为什么是蓝色",
        #     "rewritten_utterance":"天空为什么是蓝色"
        # },
        # {
        #     "utterance":"因为大气对太阳光的散射作用,使我们看到的天空呈现蓝色",
        #     "role":"bot"
        # }
        
    ],
    "user_profile":{
        "persona":["你叫小明"]
    },
    "bot_profile":{
        "persona":[ 
            "我是小达;我是女生;我今年21岁;我生日是2001年11月11日;我是天蝎座;我现在在复旦大学上学;我现在在上海;我的专业是工商管理;我大三了;我还没有工作;我还没有毕业;我是浙江杭州人;我从小在杭州长大;我喜欢阅读，尤其是诗歌;我是个小吃货;我最爱巧克力了"
        ]
    }
}

data = json.dumps(data).encode('utf-8')
req =  request.Request('http://39.105.47.48:8003', data=data) # this will make the method "POST"
# Bound the wait and always close the connection; a hung service would otherwise block the kernel.
with request.urlopen(req, timeout=30) as http_resp:
    resp = http_resp.read()
resp = json.loads(resp)
print(resp)


{'debug_info': {}, 'dialog_state': {}, 'grounding_evidence': {'obj': {'sc_name': 'structure_web_info', 'score': 0.0, 'snippet': '沉甸甸、白茫茫、孤零零、黑乎乎、眼睁睁、绿油油、 慢吞吞、急匆匆、笑眯眯、懒洋洋、齐刷刷、笑盈盈、 黑沉沉、亮闪闪、亮晶晶、香喷喷、静悄悄、金灿灿、', 'url': 'http://m.xuexila.com/ciyu/636003.html'}, 'score': 1.0, 'source': 'openweb'}, 'response': '黑沉沉、亮闪闪、亮晶晶、香喷喷、静悄悄、金灿灿', 'response_type': ''}


In [None]:


# Empty placeholder list; nothing is appended to it in the visible part of the notebook.
reject_response = []
