In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

sys.path.extend(['/root/xiaoda/query_topic/'])

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    
    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super(ECE, self).__init__()
        bin_boundaries = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = bin_boundaries[:-1]
        self.bin_uppers = bin_boundaries[1:]

    def forward(self, logits, labels, mode='logits'):
        if mode == 'logits':
            softmaxes = F.softmax(logits, dim=1)
        else:
            softmaxes = logits
        # softmaxes = F.softmax(logits, dim=1)
        confidences, predictions = torch.max(softmaxes, 1)
        accuracies = predictions.eq(labels)
        
        ece = torch.zeros(1, device=logits.device)
        for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
            # Calculated |confidence - accuracy| in each bin
            in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
            prop_in_bin = in_bin.float().mean()
            if prop_in_bin.item() > 0:
                accuracy_in_bin = accuracies[in_bin].float().mean()
                avg_confidence_in_bin = confidences[in_bin].mean()
                ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

In [27]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

cur_dir_path = '/root/xiaoda/query_topic/'

def load_label(filepath):
    label_list = []
    with open(filepath, 'r') as frobj:
        for line in frobj:
            label_list.append(line.strip())
        n_classes = len(label_list)

        label2id = {}
        id2label = {}
        for idx, label in enumerate(label_list):
            label2id[label] = idx
            id2label[idx] = label
        return label2id, id2label

class RiskInfer(object):
    def __init__(self, config_path):

        import torch, os, sys

        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        from collections import OrderedDict
        self.schema_dict = OrderedDict({})

        for label_index, schema_info in enumerate(args_path["label_path"].split(',')):
            schema_type, schema_path = schema_info.split(':')
            schema_path = os.path.join(cur_dir_path, schema_path)
            print(schema_type, schema_path, '===schema-path===')
            label2id, id2label = load_label(schema_path)
            self.schema_dict[schema_type] = {
                'label2id':label2id,
                'id2label':id2label,
                'label_index':label_index
            }
            print(self.schema_dict[schema_type], '==schema_type==', schema_type)
        
        output_path = os.path.join(cur_dir_path, args_path['output_path'])

        from roformer import RoFormerModel, RoFormerConfig

        config = RoFormerConfig.from_pretrained(args_path["model_path"])
        encoder = RoFormerModel(config=config)
        
        encoder_net = MyBaseModel(encoder, config)

        self.device = "cuda:2" if torch.cuda.is_available() else "cpu"

        classifier_list = []

        schema_list = list(self.schema_dict.keys())

        for schema_key in schema_list:
            classifier = RobertaClassifier(
                hidden_size=config.hidden_size, 
                dropout_prob=con.getfloat('para', 'out_dropout_rate'),
                num_labels=len(self.schema_dict[schema_key]['label2id']), 
                dropout_type=con.get('para', 'dropout_type'))
            classifier_list.append(classifier)

        classifier_list = nn.ModuleList(classifier_list)

        class MultitaskClassifier(nn.Module):
            def __init__(self, transformer, classifier_list):
                super().__init__()

                self.transformer = transformer
                self.classifier_list = classifier_list

            def forward(self, input_ids, input_mask, 
                        segment_ids=None, 
                        transformer_mode='mean_pooling', 
                        dt_idx=None, mode='predict'):
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                outputs_list = []
                
                for idx, classifier in enumerate(self.classifier_list):
                    
                    if dt_idx:
                        if idx not in dt_idx:
                            continue
                    
                    scores = classifier(hidden_states)
                    if mode == 'predict':
                        scores = torch.nn.Softmax(dim=1)(scores)
                    outputs_list.append(scores)
                return outputs_list, hidden_states

        self.net = MultitaskClassifier(encoder_net, classifier_list).to(self.device)

        # eo = 9
        # ckpt = torch.load(os.path.join(output_path, 'multitask_cls.pth.{}.raw'.format(eo)), map_location=self.device)
        # # ckpt = torch.load(os.path.join(output_path, 'multitask_cls.pth.{}.raw.focal'.format(eo)), map_location=self.device)
        # # ckpt = torch.load(os.path.join(output_path, 'multitask_contrast_cls.pth.{}'.format(eo)), map_location=self.device)
        # self.net.load_state_dict(ckpt)
        # self.net.eval()
        
    def reload(self, model_path):
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval()

    def predict(self, text, allowed_schema_type_ids={}):

        """抽取输入text所包含的类型
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=256)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)
        
        scores_dict = {}
        with torch.no_grad():
            [logits_list, 
            hidden_states] = self.net(input_ids, 
                attention_mask, token_type_ids, transformer_mode='cls', dt_idx=allowed_schema_type_ids)
        for schema_idx, (schema_type, scores) in enumerate(zip(list(self.schema_dict.keys()), logits_list)):
            print(scores, allowed_schema_type_ids, schema_idx)
            if allowed_schema_type_ids:
                if schema_idx not in allowed_schema_type_ids:
                    continue
            # scores = torch.nn.Softmax(dim=1)(logits)[0].data.cpu().numpy()
            scores = scores[0].data.cpu().numpy()
            scores_dict[schema_type] = []
            for index, score in enumerate(scores):
                scores_dict[schema_type].append([self.schema_dict[schema_type]['id2label'][index], 
                                        float(score)])
        return scores_dict
    
    def get_logitnorm(self, text):
        """抽取输入text所包含的类型
        """
        encoder_txt = self.tokenizer.encode_plus(text, max_length=256)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)
        
        scores_dict = {}
        logits_norm_list = []
        with torch.no_grad():
            [logits_list, 
            hidden_states] = self.net(input_ids, 
                attention_mask, token_type_ids, transformer_mode='cls')
            for logits in logits_list:
                logits_norm_list.append(logits/torch.norm(logits, p=2, dim=-1, keepdim=True) + 1e-7)
        for schema_type, logit_norm in zip(list(self.schema_dict.keys()), logits_norm_list):
            scores_dict[schema_type] = logit_norm[0].data.cpu().numpy()
        return scores_dict
            
    
    def predict_batch(self, text, allowed_schema_type_ids={}):
        if isinstance(text, list):
            text_list = text
        else:
            text_list = [text]
        model_input = self.tokenizer(text_list, return_tensors="pt",padding=True)
        for key in model_input:
            model_input[key] = model_input[key].to(self.device)
        with torch.no_grad():
            [logits_list, 
            hidden_states] = self.net(model_input['input_ids'], 
                model_input['attention_mask'], 
                model_input['token_type_ids'], transformer_mode='cls', dt_idx=allowed_schema_type_ids)
        score_dict_list = []
        for idx, text in enumerate(text_list):
            scores_dict = {}
            for schema_idx, (schema_type, scores) in enumerate(zip(list(self.schema_dict.keys()), logits_list)):
                if allowed_schema_type_ids:
                    if schema_idx not in allowed_schema_type_ids:
                        continue
                # scores = torch.nn.Softmax(dim=1)(logits)[idx].data.cpu().numpy()
                scores = scores[idx].data.cpu().numpy()
                scores_dict[schema_type] = []
                for index, score in enumerate(scores):
                    scores_dict[schema_type].append([self.schema_dict[schema_type]['id2label'][index], 
                                            float(score)])
            score_dict_list.append(scores_dict)
        return score_dict_list

# risk_api = RiskInfer('./risk_data/config.ini')
# risk_api = RiskInfer('./risk_data_v5/config_offensive_risk.ini')




In [5]:
risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v5/multitask_cls.pth.9')

In [28]:
green_risk_api = RiskInfer('./risk_data_v5/config_offensive_risk_green.ini')

senti_query /data/albert.xht/xiaoda/sentiment/senti/senti_query_label.txt ===schema-path===
{'label2id': {'负向': 0, '中性': 1, '正向': 2}, 'id2label': {0: '负向', 1: '中性', 2: '正向'}, 'label_index': 0} ==schema_type== senti_query
senti /data/albert.xht/xiaoda/sentiment/senti/senti_label.txt ===schema-path===
{'label2id': {'负向': 0, '正向': 1}, 'id2label': {0: '负向', 1: '正向'}, 'label_index': 1} ==schema_type== senti
bias /data/albert.xht/xiaoda/sentiment/bias/bias_label.txt ===schema-path===
{'label2id': {'偏见': 0, '正常': 1}, 'id2label': {0: '偏见', 1: '正常'}, 'label_index': 2} ==schema_type== bias
ciron /data/albert.xht/xiaoda/sentiment/ciron/ciron_label.txt ===schema-path===
{'label2id': {'讽刺': 0, '正常': 1}, 'id2label': {0: '讽刺', 1: '正常'}, 'label_index': 3} ==schema_type== ciron
intent /data/albert.xht/xiaoda/sentiment/intention_data_v2-1/label.txt ===schema-path===
{'label2id': {'主观评价/比较/判断': 0, '寻求建议/帮助': 1, '其它': 2}, 'id2label': {0: '主观评价/比较/判断', 1: '寻求建议/帮助', 2: '其它'}, 'label_index': 4} ==schema_typ

01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++
01/20/2023 09:13:48 - INFO - n

In [None]:
green_risk_api_base = RiskInfer('./risk_data_v5/config_offensive_risk_green_base.ini')

In [10]:
green_risk_api_base.reload('/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v17_base_3090/multitask_cls.pth.8')

In [29]:
# green_risk_api.reload('/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v6/multitask_cls.pth.6')
# green_risk_api.reload('/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v14/multitask_cls.pth.9')

# green_risk_api.reload('/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18_base_distill/multitask_cls.pth.9')
green_risk_api.reload('/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v18/multitask_cls.pth.9')

In [12]:
from sklearn.metrics import classification_report
from tqdm import tqdm
import re

def eval_all(data, model, key):
    pred = []
    gold = []
    pred_score = []
    for item in tqdm(data):
        gold.append(item['label'][0])
        if isinstance(item['text'], list):
            text = "\n".join(item['text'])
        else:
            text = item['text']
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", text)   # 合并正文中过多的空格

        result = model.predict(text)
        score = sorted(result[key], key=lambda u:u[1], reverse=True)
        pred.append(score[0][0])
        pred_score.append(result[key])
    print(classification_report(gold, pred, digits=4))
    return pred, gold, pred_score
    

def evaluation_ece(pred_score, gold):
    pred_score_l = []
    mapping_dict = {}
    for item in pred_score:
        pred_score_l.append([])
        for idx, p in enumerate(item):
            if p[0] not in mapping_dict:
                mapping_dict[p[0]] = idx
            pred_score_l[-1].append(p[1])
    pred_score_l = torch.tensor(pred_score_l)
    gold_l = torch.tensor([mapping_dict[item] for item in gold])

    ece_fn = ECE(n_bins=15)
    print(ece_fn(pred_score_l, gold_l, mode='probs'), '==ece==')
# pred, gold, pred_score = eval_all(offensive_test, risk_api, 'offensive')
# evaluation_ece(pred_score, gold)


In [22]:
offensive = []
with open('/data/albert.xht/sentiment/test/offensive_cold.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        offensive.append(content)
        
pred, gold, pred_score = eval_all(offensive, green_risk_api, 'offensive')
evaluation_ece(pred_score, gold)

100%|██████████| 5304/5304 [00:48<00:00, 109.93it/s]

              precision    recall  f1-score   support

          冒犯     0.7169    0.8803    0.7903      2106
          正常     0.9073    0.7711    0.8337      3198

    accuracy                         0.8145      5304
   macro avg     0.8121    0.8257    0.8120      5304
weighted avg     0.8317    0.8145    0.8164      5304

tensor([0.0971]) ==ece==





In [307]:
risk_query = []
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/offensive_select_labeled.txt.paraphrase') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        # for d in content['paraphrase']:
        #     p = {
        #         'text':d[0],
        #         'label':content['label']
        #     }
        #     if d:
        #         risk_query.append(p)
        risk_query.append(json.loads(line.strip()))
pred, gold, pred_score = eval_all(risk_query, green_risk_api, 'query_risk')
evaluation_ece(pred_score, gold)

 31%|███       | 6362/20641 [01:44<03:55, 60.62it/s] 


KeyboardInterrupt: 

In [None]:
an = []
for pred_, gold_, text in zip(pred, gold, risk_query):
    if pred_ != gold_:
        an.append((pred_, gold_, text))

In [44]:
risk_query = []
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/offensive_select_labeled.txt.paraphrase') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        # for d in content['paraphrase']:
        #     p = {
        #         'text':d[0],
        #         'label':content['label']
        #     }
        #     if d:
        #         risk_query.append(p)
        risk_query.append(json.loads(line.strip()))
pred, gold, pred_score = eval_all(risk_query, green_risk_api, 'query_risk')
evaluation_ece(pred_score, gold)

100%|██████████| 20641/20641 [03:07<00:00, 110.37it/s]


              precision    recall  f1-score   support

          正常     0.6708    0.6396    0.6548      5514
          风险     0.8708    0.8856    0.8781     15127

    accuracy                         0.8199     20641
   macro avg     0.7708    0.7626    0.7665     20641
weighted avg     0.8174    0.8199    0.8185     20641

tensor([0.0514]) ==ece==


In [7]:
from tqdm import tqdm
import numpy as np
import json, re

def risk_predict_batch(risk_api, text):
    if isinstance(text, list):
        text_list = text
    else:
        text_list = [text]
    result_list = risk_api.predict_batch(text_list)
    return result_list

In [10]:
def predict_data(risk_api, input_path, output_path):
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    with open(output_path, 'w') as fwobj:
        with open(input_path) as frobj:
            for idx, line in tqdm(enumerate(frobj)):
                if idx == 0:
                    continue
                content = json.loads(line.strip())
                text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
                # if content['label'] not in ['black']:
                #     continue
                if len(text) >= 164:
                    text = text[:164]
                queue.append(text)
                t.append(content)
                if np.mod(len(queue), 128) == 0 and queue:
                    probs = risk_predict_batch(risk_api, queue)
                    for prob_dict, text, tt in zip(probs, queue, t):
                        tt['score_list'] = prob_dict
                        fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                    queue = []
                    t = []
            if queue:
                probs = risk_predict_batch(risk_api, queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['score_list'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

In [12]:
input_path = '/data/albert.xht/pretrained_model_risk/corpus/efaqa-corpus-zh/efaqa-corpus-zh.utf8.other'
output_path = '/data/albert.xht/pretrained_model_risk/corpus/efaqa-corpus-zh/efaqa-corpus-zh.utf8.other.green'


predict_data(green_risk_api, input_path, output_path)

19655it [00:40, 488.10it/s]


In [101]:

input_path = '/data/albert.xht/sentiment/green_teenager.json'
output_path = '/data/albert.xht/sentiment/green_teenager.json.offensive_query_risk.green'


predict_data(green_risk_api, input_path, output_path)

242576it [06:15, 646.82it/s]


In [None]:
input_path = '/data/albert.xht/sentiment/green_teenager.json'
output_path = '/data/albert.xht/sentiment/green_teenager.json.offensive_query_risk.green'


predict_data(green_risk_api, input_path, output_path)

In [133]:

input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.keyword'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.keyword.green'


predict_data(green_risk_api, input_path, output_path)

44565it [00:52, 845.15it/s]


In [325]:
input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v18'


predict_data(green_risk_api, input_path, output_path)


2071401it [40:15, 857.44it/s]


In [10]:
with open('/data/albert.xht/raw_chat_corpus/baike2018qa/baike_qa_train.json') as frobj:
    data_list = []
    for line in frobj:
        content = json.loads(line.strip())
        if content['category'] in ['娱乐-博彩']:
            data_list.append(content)

46124

In [31]:


green_risk_api.predict('吃冰毒', allowed_schema_type_ids={
    4:'',6:'',9:'', 10:''
})

tensor([[5.4775e-05, 3.7232e-02, 9.6271e-01]], device='cuda:2') {4: '', 6: '', 9: '', 10: ''}
tensor([[9.6828e-01, 3.5527e-06, 3.1714e-02]], device='cuda:2') {4: '', 6: '', 9: '', 10: ''}
tensor([[0.0053, 0.9947]], device='cuda:2') {4: '', 6: '', 9: '', 10: ''}
tensor([[0.0067, 0.9933]], device='cuda:2') {4: '', 6: '', 9: '', 10: ''}


{}

In [326]:
input_path = '/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json.disu'
output_path = '/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json.disu.v18'


predict_data(green_risk_api, input_path, output_path)

728266it [19:20, 627.32it/s]


In [None]:
output_path = '/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json.disu.v18'
with open(output_path) as frobj:
    for line in tqdm(frobj):
        content = json.loads(line.strip())
        

In [334]:
input_path = '/data/albert.xht/sentiment/offensive_cold.json'
output_path = '/data/albert.xht/sentiment/offensive_cold.json.v18'


predict_data(green_risk_api, input_path, output_path)

25663it [00:38, 659.46it/s]


In [339]:
input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter.offensive_query_risk'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter.offensive_query_risk.v18'


predict_data(green_risk_api, input_path, output_path)




42195it [00:49, 854.22it/s]


In [376]:
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v18'
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v18.white', 'w') as fwobj:
    with open(output_path) as frobj:
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            if content['score_list']['query_risk'][0][1] < 0.3 and  (content['score_list']['senti_query'][0][1] < 0.3 or content['score_list']['senti'][0][1] < 0.3)\
            and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
            and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                and content['score_list']['teenager'][0][1] < 0.5):
                text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
                content['keywords'] = keyword_api.extract_keywords(text)
                if content['keywords']:
                    continue
                d = {
                    'text':content['text'],
                    'label':['正常'],
                    'source':'topic_query_risk_gren_v18',
                    'topic':content['label'],
                    'score_list':content['score_list']
                }
                fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
            elif content['score_list']['query_risk'][0][1] < 0.3 and  (content['score_list']['senti_query'][0][1] > 0.5 or content['score_list']['senti'][0][1] > 0.5)\
                and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
                and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                    and content['score_list']['teenager'][0][1] < 0.5):
                text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
                content['keywords'] = keyword_api.extract_keywords(text)
                if content['keywords']:
                    continue
                d = {
                    'text':content['text'],
                    'label':['正常'],
                    'source':'topic_query_risk_gren_v18',
                    'topic':content['label'],
                    'score_list':content['score_list']
                }
                fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

2071400it [03:54, 8827.78it/s]


In [26]:
import time
start = time.time()
print(green_risk_api.predict('我想拉屎', {
    4:'', 6:'', 9:'', 10:''
}), time.time()-start)


{} 0.008749961853027344


In [359]:
from keyword_processor import KeywordProcesser
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/risk_event.txt') as frobj:
    for line in frobj:
        content = line.strip()
    keyword_list = content.split('||')

keyword_api = KeywordProcesser(keywords=keyword_list)
keyword_api.add_keyword('性关系')
keyword_api.add_keyword('手淫')
keyword_api.add_keyword('性生活')
keyword_api.add_keyword('性能力')

In [357]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter.offensive_query_risk.v18.white', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter.offensive_query_risk.v18') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            if content['label'][0] in ['正常']:
                if content['score_list']['query_risk'][0][1] < 0.5 and  (content['score_list']['senti_query'][0][1] < 0.3 or content['score_list']['senti'][0][1] < 0.3)\
            and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
            and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                and content['score_list']['teenager'][0][1] < 0.5):
                    text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
                    content['keywords'] = keyword_api.extract_keywords(text)
                    if content['keywords']:
                        continue
                    d = {
                        'text':content['text'],
                        'label':['正常'],
                        'source':'offensive_query_risk_gren_v18'
                    }
                    fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
                elif content['score_list']['query_risk'][0][1] < 0.5 and  (content['score_list']['senti_query'][0][1] > 0.5 or content['score_list']['senti'][0][1] > 0.5)\
                and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
                and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                    and content['score_list']['teenager'][0][1] < 0.5):
                    text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
                    content['keywords'] = keyword_api.extract_keywords(text)
                    if content['keywords']:
                        continue
                    d = {
                        'text':content['text'],
                        'label':['正常'],
                        'source':'offensive_query_risk_gren_v18'
                    }
                    fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
            

In [338]:
with open('/data/albert.xht/sentiment/offensive_cold.json.v18.risk', 'w') as fwobj:
    with open('/data/albert.xht/sentiment/offensive_cold.json.v18') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            if content['label'][0] in ['正常']:
                if content['score_list']['query_risk'][0][1] < 0.5 and  (content['score_list']['senti_query'][0][1] < 0.3 or content['score_list']['senti'][0][1] < 0.3)\
            and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
            and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                and content['score_list']['teenager'][0][1] < 0.5):
                    d = {
                        'text':content['text'],
                        'label':['正常'],
                        'source':'offensive_query_risk_gren_v18'
                    }
                    fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
                elif content['score_list']['query_risk'][0][1] < 0.5 and  (content['score_list']['senti_query'][0][1] > 0.5 or content['score_list']['senti'][0][1] > 0.5)\
                and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
                and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5\
                    and content['score_list']['teenager'][0][1] < 0.5):
                    d = {
                        'text':content['text'],
                        'label':['正常'],
                        'source':'offensive_query_risk_gren_v18'
                    }
                    fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
            

In [53]:
input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v17'


predict_data(green_risk_api, input_path, output_path)

2071401it [39:16, 879.17it/s]


In [None]:
input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v17'


predict_data(green_risk_api, input_path, output_path)

In [332]:
white = []
black = []
query_risk_black = []
query_risk_black_v1 = []
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.green.v18'
with open(output_path) as frobj:
    for line in tqdm(frobj):
        content = json.loads(line.strip())
        if content['score_list']['bias'][0][1] > 0.5 and (content['score_list']['senti_query'][0][1] < 0.1 or content['score_list']['senti'][0][1] < 0.1) \
        and (content['score_list']['ciron'][0][1] < 0.5 and content['score_list']['offensive'][0][1] < 0.5) \
        and (content['score_list']['porn'][0][1] < 0.5):
            white.append(content)
        elif  content['score_list']['bias'][0][1] > 0.8 and (content['score_list']['senti_query'][0][1] > 0.8 or content['score_list']['senti'][0][1] > 0.8):
            black.append(content)
        if content['score_list']['query_risk'][0][1] > 0.5 and  (content['score_list']['senti_query'][0][1] < 0.3 or content['score_list']['senti'][0][1] < 0.3)\
        and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
        and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5):
            query_risk_black.append(content)
        if content['score_list']['query_risk'][0][1] > 0.5 and  (content['score_list']['senti_query'][0][1] > 0.5 or content['score_list']['senti'][0][1] > 0.5)\
        and (content['score_list']['porn'][0][1] < 0.5 and content['score_list']['abusive'][0][1] < 0.5)\
        and (content['score_list']['offensive'][0][1] < 0.5 and content['score_list']['politics'][0][1] < 0.5):
            query_risk_black_v1.append(content)
            

2071400it [01:05, 31510.55it/s]


In [333]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.green.v18', 'w') as fwobj:
    for d in query_risk_black+query_risk_black_v1:
        # if d['label'][0] in ['恐怖主义', '灵异灵修', '死亡', 'LGBT', '战争', 'BDSM', '性生活', '心理健康', '健康', '国家', '历史', '法律', 'LGBT', '军事', '时事政治', '社会', '宗教', '毒品', '博彩', '色情']:
        #     continue
        # # if '杀死' in d['text'] or '生不如死' in d['text'] or '色情' in d['text'] or '一夜情' in d['text'] or '出轨' in d['text'] or '代孕' in d['text'] or '敲诈' in d['text'] or '勒索' in d['text'] or '邪教' in d['text'] or '找小姐' in d['text'] or '赌博' in d['text'] or '约炮' in d['text'] or '强奸' in d['text'] or '暴力' in d['text'] or '嫖娼' in d['text'] or '淫秽' in d['text'] or '口交' in d['text'] or '安眠药' in d['text'] or '做爱' in d['text'] or '报复' in d['text'] or '侵略' in d['text'] or '攻打' in d['text'] or '砍刀' in d['text'] or '原子弹' in d['text'] or '闯红灯' in d['text'] or '跳楼' in d['text'] or '轻生' in d['text'] or '自杀' in d['text']:
        # #     continue
        # keyword_resp = keyword.extract_keywords(d['text'])
        # flag = False
        # if keyword_resp and d['label'][0] not in ['游戏', '动漫', '电脑/网络', '电子数码', '体育/运动']:
        #     continue
        # if '如何评价' in d['text']:
        #     continue
        content = {
            'text':d['text'],
            'topic':d['label'],
            'label':['风险']
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n') 

In [71]:
keyword = [
    '找小姐',
    '赌博',
    '约炮',
    '强奸',
    '暴力',
    '嫖娼',
    '淫秽',
    '口交',
    '安眠药',
    '做爱',
    '报复',
    '报仇',
    '侵略',
    '原子弹',
    '跳楼',
    '闯红灯',
    '自杀',
    '轻生',
    '大麻',
    '冰毒',
    '邪教',
    '敲诈',
    '勒索',
    '出轨',
    '代孕',
    '色情',
    '一夜情',
    '生不如死',
    '杀死',
    '诈骗',
    '骗钱',
    '被骗',
    '砍刀',
    '伊斯兰教',
    '屠杀',
    '蹂躏',
    '强暴',
    '虐待',
    '家暴',
    '炸弹',
    '岛国片',
    '战争',
    '暴打',
    '残忍',
    '处死',
    '911',
    '体罚',
    '炮友',
    '核武',
    '吴亦凡',
    '毁灭',
    '打架',
    '小三',
    '犯罪',
    '性侵',
    '强奸',
    '歧视',
    '羞辱',
    '淫妻癖',
    '嫖妓',
    '敲诈',
    '毒品',
    '烧死',
    '爱国',
    '暴徒',
    '打伤',
    '打死',
    '歧视',
    '杀害',
    '奥巴马',
    'AV',
    '黑人',
    '诱奸',
    '迷奸',
    '病态',
    '同性恋',
    '打到跪地求饶',
    '陪睡',
    '血腥',
    '杀人',
    '行窃',
    '黑社会',
    '暴力执法',
    '残杀',
    '不要脸',
    '羞辱',
    '愚民政策',
    '性骚扰',
    '烧死',
    '玩弄女性',
    '拍裸照',
    '威胁',
    '校园暴力',
    '迫害',
    '袭警',
    '腐败',
    'A片',
    '侵略',
    '核战争',
    '恶毒',
    '挂卖',
    '鸦片',
    '猥亵',
    '打一架',
    '打架',
    '斗殴',
    '械斗',
    '殉情',
    '抽人',
    '砸玻璃',
    '砸东西',
    '最恶心',
    '堕胎',
    '虐杀',
    '怒骂',
    '虐狗',
    '虐猫',
    '虐待',
    '结束生命',
    '偷欢',
    '穿刺',
    'av',
    '想杀'
]

from keyword_processor import KeywordProcesser
keyword_api = KeywordProcesser(keywords=keyword)


with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.green', 'w') as fwobj:
    for d in query_risk_black+query_risk_black_v1:
        if d['label'][0] in ['恐怖主义', '灵异灵修', '死亡', 'LGBT', '战争', 'BDSM', '性生活', '心理健康', '健康', '国家', '历史', '法律', 'LGBT', '军事', '时事政治', '社会', '宗教', '毒品', '博彩', '色情']:
            continue
        # if '杀死' in d['text'] or '生不如死' in d['text'] or '色情' in d['text'] or '一夜情' in d['text'] or '出轨' in d['text'] or '代孕' in d['text'] or '敲诈' in d['text'] or '勒索' in d['text'] or '邪教' in d['text'] or '找小姐' in d['text'] or '赌博' in d['text'] or '约炮' in d['text'] or '强奸' in d['text'] or '暴力' in d['text'] or '嫖娼' in d['text'] or '淫秽' in d['text'] or '口交' in d['text'] or '安眠药' in d['text'] or '做爱' in d['text'] or '报复' in d['text'] or '侵略' in d['text'] or '攻打' in d['text'] or '砍刀' in d['text'] or '原子弹' in d['text'] or '闯红灯' in d['text'] or '跳楼' in d['text'] or '轻生' in d['text'] or '自杀' in d['text']:
        #     continue
        keyword_resp = keyword_api.extract_keywords(d['text'])
        flag = False
        if keyword_resp and d['label'][0] not in ['游戏', '动漫', '电脑/网络', '电子数码', '体育/运动']:
            continue
        if '如何评价' in d['text']:
            continue
        content = {
            'text':d['text'],
            'topic':d['label'],
            'label':['正常']
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        

In [90]:
import json, jieba_fast
termdict = {}
with open('/data/albert.xht/TermTree.V1-1.0.json', 'w') as fwobj:
    with open('/data/albert.xht/TermTree.V1-1.0', 'r') as frobj:
        for line in frobj:
            content = json.loads(line)
            content['text'] = content['term']
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        


In [119]:
from keyword_processor import KeywordProcesser
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/risk_event.txt') as frobj:
    for line in frobj:
        content = line.strip()
    keyword_list = content.split('||')

keyword_api = KeywordProcesser(keywords=keyword_list)


In [122]:
import jieba_fast as jieba
for word in keyword_list:
    jieba.add_word(word)

with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final.keyword', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.knn.final') as frobj:
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
            content['keywords'] = keyword_api.extract_keywords(text)
            words_list = list(jieba.cut(text))
            words_set = {}
            for word in words_list:
                words_set[word] = ''
            if content['keywords']:
                keyword_list = []
                for keyword_ in content['keywords']:
                    if keyword_[-1] in words_set:
                        keyword_list.append(keyword_)
                if keyword_list:
                    if content['label'][0] not in ['游戏', '电脑/网络', '小说', '网络安全', '动漫']:
                        content['topic'] = content['label']
                        content['label'] = ['风险']
                        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
                    elif content['label'][0] in ['游戏', '电脑/网络', '小说', '网络安全', '动漫']:
                        content['topic'] = content['label']
                        content['label'] = ['正常']
                        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')

2071401it [03:19, 10407.04it/s]


In [134]:
 keyword_api.extract_keywords('我讨厌你')

[]

In [91]:
input_path = '/data/albert.xht/TermTree.V1-1.0.json'
output_path = '/data/albert.xht/TermTree.V1-1.0.json.green.v17'


predict_data(green_risk_api, input_path, output_path)

1019179it [17:41, 960.55it/s] 


In [317]:
green_risk_api_base.predict('我承认我馋妹妹身子了 我的天哪')

{'senti_query': [['负向', 0.7054874300956726],
  ['中性', 0.23167049884796143],
  ['正向', 0.06284210085868835]],
 'senti': [['负向', 0.4684368669986725], ['正向', 0.5315631628036499]],
 'bias': [['偏见', 0.10638052970170975], ['正常', 0.8936194777488708]],
 'ciron': [['讽刺', 0.0028106968384236097], ['正常', 0.9971893429756165]],
 'intent': [['主观评价/比较/判断', 0.6584299206733704],
  ['寻求建议/帮助', 0.02026059664785862],
  ['其它', 0.32130950689315796]],
 'offensive': [['冒犯', 0.0009839484700933099], ['正常', 0.9990160465240479]],
 'query_risk': [['风险', 0.0022602395620197058],
  ['个人信息', 1.4249596347326587e-07],
  ['正常', 0.9977396726608276]],
 'teenager': [['不良', 0.09984611719846725], ['正常', 0.9001538753509521]],
 'politics': [['涉政', 0.0002020606625592336], ['正常', 0.9997979998588562]],
 'porn': [['色情', 0.0021054197568446398], ['正常', 0.9978945851325989]],
 'abusive': [['辱骂', 0.00041765838977880776], ['正常', 0.9995823502540588]]}

In [5]:
green_risk_api.predict(""""手腕上有18道伤口，今晚过后又会多两道，我在等，等熬不下去的那一天逃离这个世界""")

NameError: name 'green_risk_api' is not defined

In [4]:
881/1119, 333/1119, 236/333, 266/1119

(0.7873100983020554,
 0.2975871313672922,
 0.7087087087087087,
 0.23771224307417338)

In [75]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.green.manual', 'w') as fwobj:
    for word in keyword:
        d = {
            'text':'如何'+word,
            'label':['风险']
        }
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
        d = {
            'text':'想'+word,
            'label':['风险']
        }
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
        d = {
            'text':'可以'+word,
            'label':['风险']
        }
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [284]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json.topic.green.v1', 'w') as fwobj:
    for d in query_risk_black_v1:
        if d['label'][0] in ['BDSM', '性生活', '心理健康', '健康', '国家', '历史', '法律', 'LGBT', '军事', '时事政治', '社会', '宗教', '毒品', '博彩', '色情']:
            continue
        if '原子弹' in d['text'] or '闯红灯' in d['text'] or '跳楼' in d['text'] or '轻生' in d['text'] or '自杀' in d['text']:
            continue
        content = {
            'text':d['text'],
            'topic':d['label'],
            'label':['正常']
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')

In [152]:
green_risk_api_base.predict('你是不是一个鬼')

{'senti_query': [['负向', 0.017739232629537582],
  ['中性', 0.9809319376945496],
  ['正向', 0.0013288870686665177]],
 'senti': [['负向', 0.9829708337783813], ['正向', 0.01702919416129589]],
 'bias': [['偏见', 0.017038287594914436], ['正常', 0.9829617142677307]],
 'ciron': [['讽刺', 0.002870894968509674], ['正常', 0.9971291422843933]],
 'intent': [['主观评价/比较/判断', 0.9999998807907104],
  ['寻求建议/帮助', 2.48466243135681e-08],
  ['其它', 1.3022548728258698e-07]],
 'offensive': [['冒犯', 0.24652498960494995], ['正常', 0.7534750699996948]],
 'query_risk': [['风险', 0.015497354790568352],
  ['个人信息', 2.2719566459272755e-06],
  ['正常', 0.9845004081726074]],
 'teenager': [['不良', 0.004878206178545952], ['正常', 0.9951217770576477]],
 'politics': [['涉政', 0.0005057504749856889], ['正常', 0.999494194984436]],
 'porn': [['色情', 4.695681855082512e-05], ['正常', 0.999953031539917]],
 'abusive': [['辱骂', 0.00043745216680690646], ['正常', 0.9995625615119934]]}

In [None]:
input_path = '/data/albert.xht/sentiment/green_teenager.json'
output_path = '/data/albert.xht/sentiment/green_teenager.json.offensive_query_risk.green'


predict_data(green_risk_api, input_path, output_path)

In [102]:
with open('/data/albert.xht/sentiment/green_teenager.json.offensive_query_risk.filter.green', 'w') as fwobj:
    with open('/data/albert.xht/sentiment/green_teenager.json.offensive_query_risk.green') as frobj:
        white = []
        black = []
        for line in frobj:
            content = json.loads(line.strip())
            if content['score_list']['query_risk'][0][1]< 0.2 and content['score_list']['offensive'][0][1] < 0.3 and  content['label'][0] in ['正常']:
                white.append(content)
            elif content['score_list']['query_risk'][0][1]> 0.9 and content['score_list']['offensive'][0][1] > 0.9:
                black.append(content)
            elif content['score_list']['query_risk'][0][1]> 0.8 and content['score_list']['offensive'][0][1] > 0.8 and content['label'][0] in ['不良']:
                black.append(content)
            elif content['score_list']['query_risk'][0][1] > 0.6 and content['score_list']['offensive'][0][1] > 0.6 and  content['label'][0] in ['不良']:                                                                                                              
                black.append(content)
            elif content['score_list']['porn'][0][1] > 0.8:
                black.append(content)
            elif content['score_list']['politics'][0][1] > 0.8:
                black.append(content)
            elif content['score_list']['abusive'][0][1] > 0.8:
                black.append(content)
            elif content['score_list']['offensive'][0][1] > 0.9:
                black.append(content)
            elif content['ori_label'] in ['博彩相关内容', '博彩广告、宣传']:
                black.append(content)
    for d in white:
        tmp = {
            'text':d['text'],
            'label':['正常'],
            'source':'green_teenager_filter'
        }
        fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')
    for d in black:
        tmp = {
            'text':d['text'],
            'label':['风险'],
            'source':'green_teenager_filter'
        }
        fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')
    
            

In [None]:

input_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter'
output_path = '/data/albert.xht/raw_chat_corpus/topic_classification_v4/smal_white_positive.json.filter.offensive_query_risk'


predict_data(green_risk_api, input_path, output_path)

In [98]:
input_path = '/data/albert.xht/xiaoda/sentiment/senti/senti_ocemotion.json'
output_path = '/data/albert.xht/xiaoda/sentiment/senti/senti_ocemotion.json.green'

predict_data(green_risk_api, input_path, output_path)

35694it [00:57, 624.96it/s]


In [100]:
input_path = '/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.topic'
output_path = '/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.topic.green'

predict_data(green_risk_api, input_path, output_path)

113317it [02:12, 853.19it/s]


In [125]:
p = []
from collections import Counter
k = {}
with open(output_path) as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        if content['score_list']['politics'][0][1] > 0.5:
            if content['topic'] not in k:
                k[content['topic']] = []
            k[content['topic']].append(content)
# for key in k:
#     print(key, len(k[key]))

In [28]:
violence_query = []
with open('/data/albert.xht/raw_chat_corpus/Gender-Base-Violence-main/GBV/gbv/Train.csv.json.translate') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        text = ''.join(json.loads(line.strip())['zh'])
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", text)   # 合并正文中过多的空格
        violence_query.append(text)
violence_result = []
for d in tqdm(violence_query):
    result = green_risk_api.predict(d)
    violence_result.append((d, result))

100%|██████████| 39650/39650 [06:05<00:00, 108.52it/s]


In [31]:
dddd = []
dddd_1 = []
for p in violence_result:
    if p[1]['query_risk'][0][1] > 0.5:
        dddd.append(p)
    else:
        dddd_1.append(p)

In [370]:
import pandas as pd
df = pd.read_excel('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/itag_labl_data_deal_20230112.xlsx')
gold, pred = [], []
result_list = []
data_dict = {}
for idx in tqdm(range(df.shape[0])):
    content = df.loc[idx]
    
    score_list = green_risk_api_base.predict(content['query'])
    if content['query'] not in data_dict:
        data_dict[content['query']] = ''
    else:
        continue
    if content['human_risk'] in [-1]:
        continue
    result_list.append((content, score_list))
    if score_list['query_risk'][0][1] > 0.5 :
        pred.append(1)
    else:
        pred.append(0)
    
    gold.append(int(content['human_risk']))

100%|██████████| 5128/5128 [01:19<00:00, 64.15it/s]


In [132]:
left = []
for p1, p2, p3 in zip(pred, gold, result_list):
    if p1 != p2:
        left.append((p1, p2, p3))

In [371]:
print(classification_report(gold, pred, digits=4))

              precision    recall  f1-score   support

           0     0.9710    0.9253    0.9476      4378
           1     0.6002    0.8023    0.6867       612

    accuracy                         0.9102      4990
   macro avg     0.7856    0.8638    0.8172      4990
weighted avg     0.9255    0.9102    0.9156      4990



In [163]:
green_risk_api.predict('饭前打孩子一顿还是饭后呢')

{'senti_query': [['负向', 0.8237579464912415],
  ['中性', 0.12852226197719574],
  ['正向', 0.04771984741091728]],
 'senti': [['负向', 0.9493701457977295], ['正向', 0.05062990263104439]],
 'bias': [['偏见', 0.08553604781627655], ['正常', 0.9144639372825623]],
 'ciron': [['讽刺', 0.06932748854160309], ['正常', 0.9306724667549133]],
 'intent': [['主观评价/比较/判断', 0.9464393258094788],
  ['寻求建议/帮助', 0.00016714385128580034],
  ['其它', 0.05339352786540985]],
 'offensive': [['冒犯', 0.5559340119361877], ['正常', 0.44406598806381226]],
 'query_risk': [['风险', 0.6405723094940186],
  ['个人信息', 3.481139856376103e-07],
  ['正常', 0.3594273328781128]],
 'teenager': [['不良', 0.12507572770118713], ['正常', 0.8749243021011353]],
 'politics': [['涉政', 0.002126638311892748], ['正常', 0.9978733062744141]],
 'porn': [['色情', 0.0002676665026228875], ['正常', 0.9997323155403137]],
 'abusive': [['辱骂', 0.007041711825877428], ['正常', 0.9929583072662354]]}

In [37]:

offensive = []
with open('/data/albert.xht/sentiment/dev/offensive_cold.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        offensive.append(content)
        
offensive_test = []
with open('/data/albert.xht/sentiment/test/offensive_cold.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        offensive_test.append(content)

        
cdia_bias = []
with open('/data/albert.xht/sentiment/dev/cdial_bias.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        cdia_bias.append(content)
        
senti_copr = []
with open('/data/albert.xht/sentiment/dev/senti_copr.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        senti_copr.append(content)
        
ciron = []
with open('/data/albert.xht/sentiment/dev/chinese_ciron.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        ciron.append(content)

senti_smp = []
with open('/data/albert.xht/sentiment/dev/senti_smp_usual.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        senti_smp.append(content)
        
senti_smpecisa = []
with open('/data/albert.xht/sentiment/dev/senti_smpecisa.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        senti_smpecisa.append(content)

        
senti_query = []
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_valid.json.filter.0.7') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        senti_query.append(content)

def evaluation(risk_api, model_path):
    risk_api.reload(model_path)
    print('===offensive===')
    pred, gold, pred_score = eval_all(offensive_test, risk_api, 'offensive')
    evaluation_ece(pred_score, gold)
    print('===cdia-bias===')
    pred, gold, pred_score = eval_all(cdia_bias, risk_api, 'bias')
    evaluation_ece(pred_score, gold)
    print('===ciron===')
    pred, gold, pred_score = eval_all(ciron, risk_api, 'ciron')
    evaluation_ece(pred_score, gold)
    print('===chsenti===')
    pred, gold, pred_score = eval_all(senti_copr, risk_api, 'senti')
    evaluation_ece(pred_score, gold)
    print('===senti_smpecisa===')
    pred, gold, pred_score = eval_all(senti_smpecisa, risk_api, 'senti')
    evaluation_ece(pred_score, gold)
    print('===senti_smp===')
    pred, gold, pred_score = eval_all(senti_smp, risk_api, 'senti')
    evaluation_ece(pred_score, gold)
    print('===senti_query===')
    pred, gold, pred_score = eval_all(senti_query, risk_api, 'senti')
    evaluation_ece(pred_score, gold)
    

In [61]:
# evaluation(green_risk_api,
#            '/data/albert.xht/xiaodao/risk_classification/multitask_raw_filter_senti_query_risk_v12_intent_v2-1_10_no_symbol_senti_query_senta_green_mtdnn_v11/multitask_cls.pth.4'
#           )

In [None]:
politics_abusive = []
# risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v2/multitask_cls.pth.9')

with open('/data/albert.xht/xiaoda/sentiment/senti/', 'r') as frobj:
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    for idx, line in tqdm(enumerate(frobj)):
        if idx == 0:
            continue
        content = json.loads(line.strip())
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
        # if content['label'] not in ['black']:
        #     continue
        if len(text) >= 164:
            text = text[:164]
        queue.append(text)
        t.append(content)
        if np.mod(len(queue), 128) == 0 and queue:
            probs = risk_predict_batch(green_risk_api, queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                content = {
                    'text':text,
                    'topic':tt['label'],
                    'score_list':prob_dict
                }
                politics_abusive.append(content)
            queue = []
            t = []
    if queue:
        probs = risk_predict_batch(green_risk_api, queue)
        for prob_dict, text, tt in zip(probs, queue, t):
            content = {
                'text':text,
                'topic':tt['label'],
                'score_list':prob_dict
            }
            politics_abusive.append(content)

In [11]:
politics_abusive = []
# risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v2/multitask_cls.pth.9')

with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json', 'r') as frobj:
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    for idx, line in tqdm(enumerate(frobj)):
        if idx == 0:
            continue
        content = json.loads(line.strip())
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
        # if content['label'] not in ['black']:
        #     continue
        if len(text) >= 164:
            text = text[:164]
        queue.append(text)
        t.append(content)
        if np.mod(len(queue), 128) == 0 and queue:
            probs = risk_predict_batch(green_risk_api, queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                content = {
                    'text':text,
                    'topic':tt['label'],
                    'score_list':prob_dict
                }
                politics_abusive.append(content)
            queue = []
            t = []
    if queue:
        probs = risk_predict_batch(green_risk_api, queue)
        for prob_dict, text, tt in zip(probs, queue, t):
            content = {
                'text':text,
                'topic':tt['label'],
                'score_list':prob_dict
            }
            politics_abusive.append(content)

451726it [12:53, 583.69it/s]


In [None]:
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_poltics.json.green', 'w') as fwobj:
    for d in politics_abusive:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [12]:
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_poltics.json.green', 'w') as fwobj:
    for d in politics_abusive:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [9]:
abusive_list = []
import re
# risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v2/multitask_cls.pth.9')

with open('/data/albert.xht/xiaoda/sentiment/green_abusive/green_abusive.json', 'r') as frobj:
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    for idx, line in tqdm(enumerate(frobj)):
        if idx == 0:
            continue
        content = json.loads(line.strip())
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
        # if content['label'] not in ['black']:
        #     continue
        if len(text) >= 164:
            text = text[:164]
        queue.append(text)
        t.append(content)
        if np.mod(len(queue), 128) == 0 and queue:
            probs = risk_predict_batch(green_risk_api, queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                content = {
                    'text':text,
                    'topic':tt['label'],
                    'score_list':prob_dict
                }
                abusive_list.append(content)
            queue = []
            t = []
    if queue:
        probs = risk_predict_batch(green_risk_api, queue)
        for prob_dict, text, tt in zip(probs, queue, t):
            content = {
                'text':text,
                'topic':tt['label'],
                'score_list':prob_dict
            }
            abusive_list.append(content)
            

1105924it [29:34, 623.16it/s]


In [10]:
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/green_abusive.json.green', 'w') as fwobj:
    for d in abusive_list:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [10]:
porn_list = []
# risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v2/multitask_cls.pth.9')

with open('/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json', 'r') as frobj:
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    for idx, line in tqdm(enumerate(frobj)):
        if idx == 0:
            continue
        content = json.loads(line.strip())
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
        # if content['label'] not in ['black']:
        #     continue
        if len(text) >= 164:
            text = text[:164]
        queue.append(text)
        t.append(content)
        if np.mod(len(queue), 128) == 0 and queue:
            probs = risk_predict_batch(green_risk_api, queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                content = {
                    'text':text,
                    'topic':tt['label'],
                    'score_list':prob_dict
                }
                porn_list.append(content)
            queue = []
            t = []
    if queue:
        probs = risk_predict_batch(green_risk_api, queue)
        for prob_dict, text, tt in zip(probs, queue, t):
            content = {
                'text':text,
                'topic':tt['label'],
                'score_list':prob_dict
            }
            porn_list.append(content)
            

2758518it [1:13:02, 629.39it/s]


In [14]:
with open('/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json.green', 'w') as fwobj:
    for d in porn_list:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [23]:
porn_list = []
with open('/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json.green', 'r') as frobj:
    for line in frobj:
        porn_list.append(json.loads(line.strip()))

In [325]:
import os, sys

sys.path.extend(['/root/deepIE'])

from utils.keyword_processor import KeywordProcesser

In [56]:
from collections import namedtuple
_DocSpan = namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])

def slide_window(all_doc_tokens, max_length, doc_stride, offset=32):
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_length - offset:
            length = max_length - offset
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, doc_stride)
    return doc_spans

In [62]:
poltics_new = []
white = []
for d in politics_abusive:
    # if (d['score_list']['abusive'][0][1] < 0.5 and d['score_list']['porn'][0][1] < 0.5) and d['topic'][0] in ['涉政']:
    if  d['topic'][0] in ['涉政']:
        poltics_new.append(d)
for d in politics_abusive:
    if d['topic'][0] in ['正常']:
        poltics_new.append(d)
        white.append(d)
        
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.politics', 'w') as fwobj:
    for d in poltics_new:
        content = {
            'text':d['text'],
            'label':d['topic'],
            'topic':d['topic']
        }
        if d['topic'][0] in ['正常']:
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
            text = d['text']
            spans = slide_window(text[16:], 16, 8, 0)
            for span in spans:
                span_text = text[:16]+text[span.start+16:span.start+span.length+16]
                content = {
                    'text':span_text,
                    'label':d['topic'],
                    'topic':d['topic']
                }
                fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        else:
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        

In [68]:
porn_politics = []
for d in porn_list:
    # if (d['score_list']['abusive'][0][1] < 0.5 and d['score_list']['porn'][0][1] < 0.5) and d['score_list']['politics'][0][1] > 0.9:
    # if (d['score_list']['abusive'][0][1] > 0.9 or d['score_list']['porn'][0][1] > 0.9) and d['score_list']['politics'][0][1] > 0.5 and d['score_list']['politics'][0][1] < 0.8:
    if (d['score_list']['abusive'][0][1] > 0.9 or d['score_list']['porn'][0][1] > 0.9) and d['score_list']['politics'][0][1] > 0.3 and d['score_list']['politics'][0][1] < 0.8:
        porn_politics.append(d)
    if (d['score_list']['abusive'][0][1] < 0.3 and d['score_list']['porn'][0][1] < 0.3) and d['topic'][0] in ['色情']:
        porn_politics.append(d)
    # if (d['score_list']['abusive'][0][1] < 0.3 and d['score_list']['porn'][0][1] < 0.3) and d['score_list']['politics'][0][1] < 0.3:
    #     porn_politics.append(d)

print(len(porn_politics))

with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.porn', 'w') as fwobj:
    for d in porn_politics:
        content = {
            'text':d['text'],
            'label':['正常'],
            'topic':d['topic']
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')

221325


In [82]:
porn_abusive = []
for d in porn_list:
    # if (d['score_list']['abusive'][0][1] > 0.9 and d['score_list']['porn'][0][1] > 0.5) :
    #     porn_abusive.append(d)
    # if (d['score_list']['abusive'][0][1] < 0.5 and d['score_list']['porn'][0][1] > 0.5) :
    #     porn_abusive.append(d)
    # if (d['score_list']['abusive'][0][1] > 0.5 and d['score_list']['porn'][0][1] < 0.5) :
    #     porn_abusive.append(d)
    # if (d['topic'][0] in ['色情'] and d['score_list']['porn'][0][1] < 0.5) :
    #     porn_abusive.append(d)
    # if (d['score_list']['abusive'][0][1] < 0.5 and d['score_list']['porn'][0][1] < 0.5) :
    #     porn_abusive.append(d)

In [67]:
abusive_politics = []
for d in abusive_list:
    # if (d['score_list']['abusive'][0][1] < 0.5 and d['score_list']['porn'][0][1] < 0.5) and d['score_list']['politics'][0][1] > 0.9:
    # if (d['score_list']['abusive'][0][1] > 0.9 or d['score_list']['porn'][0][1] > 0.9) and d['score_list']['politics'][0][1] > 0.5 and d['score_list']['politics'][0][1] < 0.8:
    if (d['score_list']['abusive'][0][1] > 0.9 or d['score_list']['porn'][0][1] > 0.9) and d['score_list']['politics'][0][1] > 0.3 and d['score_list']['politics'][0][1] < 0.8:
        abusive_politics.append(d)
    if (d['score_list']['abusive'][0][1] < 0.3 and d['score_list']['porn'][0][1] < 0.3) and d['topic'][0] in ['辱骂']:
        abusive_politics.append(d)

print(len(abusive_politics))

with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.abusive', 'w') as fwobj:
    for d in abusive_politics:
        content = {
            'text':d['text'],
            'label':['正常'],
            'topic':d['topic']
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')

91527


In [85]:
from tqdm import tqdm
import numpy as np
import json, re

def risk_predict_batch(risk_api, text):
    if isinstance(text, list):
        text_list = text
    else:
        text_list = [text]
    result_list = risk_api.predict_batch(text_list)
    return result_list

In [6]:
abusive = []
risk_api.reload('/data/albert.xht/xiaoda/risk_classification/multitask_raw_filter_senti_query_risk_v11_offensive_v2/multitask_cls.pth.9')

with open('/data/albert.xht/xiaoda/sentiment/green_abusive/short_text_benchmark_for_cro_albert_ruma.json.txt', 'r') as frobj:
    queue = []
    t = []
    from collections import Counter
    pppp = Counter()
    for idx, line in tqdm(enumerate(frobj)):
        if idx == 0:
            continue
        content = json.loads(line.strip())
        text = re.sub(r"([，\_《。》、？；：‘’＂“”【「】」·！@￥…（）—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+])+", "", content['text'])   # 合并正文中过多的空格
        # if content['label'] not in ['black']:
        #     continue
        if len(text) >= 164:
            text = text[:164]
        pppp[content['label']] += 1
        queue.append(text)
        t.append(content)
        if np.mod(len(queue), 128) == 0 and queue:
            probs = risk_predict_batch(risk_api, queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                content = {
                    'text':text,
                    'topic':tt['label'],
                    'score_list':prob_dict
                }
                abusive.append(content)
            queue = []
            t = []
    if queue:
        probs = risk_predict_batch(risk_api, queue)
        for prob_dict, text, tt in zip(probs, queue, t):
            content = {
                'text':text,
                'topic':tt['label'],
                'score_list':prob_dict
            }
            abusive.append(content)

1128122it [17:42, 1062.09it/s]


In [7]:
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/short_text_benchmark_for_cro_albert_ruma.json.txt.offensive', 'w') as fwobj:
    for d in tqdm(abusive):
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
        

100%|██████████| 1128121/1128121 [00:17<00:00, 65633.13it/s]


In [62]:
abusive = []
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/short_text_benchmark_for_cro_albert_ruma.json.txt.offensive') as frobj:
    for line in tqdm(frobj):
        abusive.append(json.loads(line.strip()))


1128121it [00:25, 44229.65it/s]


In [147]:


mapping = {
    '辱骂':'辱骂',
    '辱骂-白样本':'正常'
}

import random
random.shuffle(abusive)

from tqdm import tqdm

with open('/data/albert.xht/xiaoda/sentiment/green_abusive/green_abusive.json', 'w') as fwobj:
    black = []
    left = []
    count1 = 0
    count2 = 0
    count3 = 0
    count4 = 0
    data_dict = {}
    for d in tqdm(abusive):
        p = {
                'text':d['text'],
                'label':[mapping[d['topic']]]
            }
        if d['text'] not in data_dict:
            data_dict[d['text']] = set()
        data_dict[d['text']].add(p['label'][0])
    for key in tqdm(data_dict):
        label = data_dict[key]
        if len(label) == 1:
            p = {
                'text':key,
                'label':list(label)
            }
        fwobj.write(json.dumps(p, ensure_ascii=False)+'\n')
    #     if d['score_list']['offensive'][0][1] > 0.9 or d['score_list']['query_risk'][0][1] > 0.9:
    #         if d['topic'] in ['辱骂']:
    #             p = {
    #                 'text':d['text'],
    #                 'label':[mapping[d['topic']]],
    #                 'source':'risk-offensive'
    #             }
    #             fwobj.write(json.dumps(p, ensure_ascii=False)+'\n')
    #             count1 += 1
    #         elif d['topic'] in ['辱骂-白样本', '辱骂-白样本']:
    #             p = {
    #                 'text':d['text'],
    #                 'label':[mapping[d['topic']]],
    #                 'source':'risk-offensive'
    #             }
    #             fwobj.write(json.dumps(p, ensure_ascii=False)+'\n')
    #             count2 += 1
    #         if count1+count2 >= 200000:
    #             break
    # count = 0
    # for d in tqdm(abusive):
    #     if d['score_list']['offensive'][0][1] < 0.1 and d['score_list']['query_risk'][0][1] < 0.1:
    #         if d['topic'] in ['辱骂']:
    #             p = {
    #                 'text':d['text'],
    #                 'label':[mapping[d['topic']]],
    #                 'source':'abusive'
    #             }
    #             fwobj.write(json.dumps(p, ensure_ascii=False)+'\n')
    #             count3 += 1
    #         elif d['topic'] in ['辱骂-白样本', '辱骂-白样本']:
    #             p = {
    #                 'text':d['text'],
    #                 'label':[mapping[d['topic']]],
    #                 'source':'abusive'
    #             }
    #             fwobj.write(json.dumps(p, ensure_ascii=False)+'\n')
    #             count4 += 1
    #         if count3+count4 >= 200000:
    #             break



100%|██████████| 1128121/1128121 [00:03<00:00, 370510.78it/s]
100%|██████████| 1105924/1105924 [00:07<00:00, 139382.76it/s]


In [108]:
from collections import namedtuple
_DocSpan = namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])

def slide_window(all_doc_tokens, max_length, doc_stride, offset=0):
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_length - offset:
            length = max_length - offset
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, doc_stride)
    return doc_spans

In [112]:
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json.add', 'w') as fwobj:
    with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    import random
    random.shuffle(white)
    for d in white[:50000]:
        d['label'] = ['正常']
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
        

In [132]:
with open('/data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_final.json.merge.add', 'w') as fwobj:
    with open('/data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_final.json.merge', 'r') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    for d in politics+porn+abusive_list:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
    
    import random
    random.shuffle(politics_black)
    random.shuffle(porn_black)
    random.shuffle(abusive_black)
    for d in politics_black[:int(len(politics_black)*0.5)]:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
    for d in porn_black[:int(len(porn_black)*0.5)]:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
    for d in abusive_black[:int(len(abusive_black)*0.5)]:
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [131]:
len(politics_black)

39431

In [130]:

politics = []
politics_black = []
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics.json') as frobj:
    for line in frobj:
        d = json.loads(line.strip())
        if d['label'][0] in ['涉政'] and d['source'] in ['politics']:
            d['label'] = ['风险']
            politics.append(d)
        if d['label'][0] in ['涉政'] and d['source'] in ['risk-offensive']:
            d['label'] = ['风险']
            politics_black.append(d)
        if d['label'][0] in ['正常'] and d['source'] in ['politics']:
            d['label'] = ['正常']
            politics.append(d)
print(len(politics))


50000


In [129]:
porn = []
white = []
porn_black = []
with open('/data/albert.xht/xiaoda/sentiment/green_porn/green_porn.json') as frobj:
    for line in frobj:
        d = json.loads(line.strip())
        if d['label'][0] in ['色情'] and d['source'] in ['porn']:
            d['label'] = ['风险']
            porn.append(d)
        elif d['label'][0] in ['色情'] and d['source'] in ['risk-offensive']:
            d['label'] = ['风险']
            porn_black.append(d)
        elif d['label'][0] in ['正常'] and d['source'] in ['porn']:
            d['label'] = ['正常']
            porn.append(d)
            white.append(d)
print(len(porn), len(white))

200000 150969


In [128]:
abusive_list = []
abusive_black = []
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/green_abusive.json') as frobj:
    for line in frobj:
        d = json.loads(line.strip())
        if d['label'][0] in ['辱骂'] and d['source'] in ['abusive']:
            d['label'] = ['风险']
            abusive_list.append(d)
        if d['label'][0] in ['辱骂'] and d['source'] in ['risk-offensive']:
            d['label'] = ['风险']
            abusive_black.append(d)
        if d['label'][0] in ['辱骂'] and d['source'] in ['abusive']:
            d['label'] = ['正常']
            abusive_list.append(d)
            white.append(d)
print(len(abusive_list))

29701


In [77]:
len(white)

150786

In [70]:
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/green_abusive_label.txt', 'w') as fwobj:
    for label in ['辱骂', '正常']:
        fwobj.write(label+'\n')
        
with open('/data/albert.xht/xiaoda/sentiment/green_porn/green_porn_label.txt', 'w') as fwobj:
    for label in ['色情', '正常']:
        fwobj.write(label+'\n')
        
with open('/data/albert.xht/xiaoda/sentiment/green_politics/green_politics_label.txt', 'w') as fwobj:
    for label in ['涉政', '正常']:
        fwobj.write(label+'\n')

In [135]:
abusive = []
with open('/data/albert.xht/xiaoda/sentiment/green_abusive/short_text_benchmark_for_cro_albert_ruma.json.txt.offensive') as frobj:
    for line in tqdm(frobj):
        abusive.append(json.loads(line.strip()))
        

1128121it [00:13, 85335.15it/s]


In [174]:
porn = []
with open('/data/albert.xht/xiaoda/sentiment/green_porn/short_text_benchmark_for_cro_albert_seqing.json.txt.offensive') as frobj:
    for line in tqdm(frobj):
        porn.append(json.loads(line.strip()))

2805958it [01:11, 39464.73it/s]


In [175]:
politics = []
with open('/data/albert.xht/xiaoda/sentiment/green_politics/short_text_benchmark_for_cro_albert_politics.json.txt.offensive') as frobj:
    for line in tqdm(frobj):
        politics.append(json.loads(line.strip()))

456219it [00:05, 79169.32it/s]


In [182]:
black = {
    'abusive':{1:[], 2:[], 3:[], 4:[]},
    'porn':{1:[], 2:[], 3:[], 4:[]},
    'politics':{1:[], 2:[], 3:[], 4:[]}
}
white = {
    'abusive':{1:[], 2:[], 3:[], 4:[]},
    'porn':{1:[], 2:[], 3:[], 4:[]},
    'politics':{1:[], 2:[], 3:[], 4:[]}
}
abusive_mapping = {
    '辱骂':'辱骂',
    '辱骂-白样本':'正常'
}

porn_mapping = {
    '色情':'色情',
    '色情-白样本':'正常',
    '色情-正常':'正常'
}

politics_mapping = {
    'black':'涉政',
    'white':'正常'
}

def filter_fn(d, stage):
    if stage == 1:
        cond1 = d['score_list']['offensive'][0][1] < 0.5 and d['score_list']['offensive'][0][1] > 0.3
        cond2 = d['score_list']['query_risk'][0][1] < 0.5 and d['score_list']['query_risk'][0][1] > 0.3
        if cond1 and cond2:
            return True
        else:
            return False
    if stage == 2:
        cond1 = d['score_list']['offensive'][0][1] > 0.5 and d['score_list']['offensive'][0][1] < 0.9
        cond2 = d['score_list']['query_risk'][0][1] > 0.5 and d['score_list']['query_risk'][0][1] < 0.9
        if cond1 and cond2:
            return True
        else:
            return False
    if stage ==3:
        cond1 = d['score_list']['offensive'][0][1] > 0.9
        cond2 = d['score_list']['query_risk'][0][1] > 0.9
        if cond1 and cond2:
            return True
        else:
            return False
    if stage == 4:
        cond1 = d['score_list']['offensive'][0][1] < 0.2
        cond2 = d['score_list']['query_risk'][0][1] < 0.2
        if cond1 and cond2:
            return True
        else:
            return False

for d in abusive:
    if abusive_mapping[d['topic']] in ['辱骂']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):
                d['label'] = abusive_mapping[d['topic']]
                black['abusive'][stage].append(d)
    if abusive_mapping[d['topic']] in ['正常']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):
                d['label'] = abusive_mapping[d['topic']]
                white['abusive'][stage].append(d)
                
for d in porn:
    if porn_mapping[d['topic']] in ['色情']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):
                d['label'] = porn_mapping[d['topic']]
                black['porn'][stage].append(d)
    if porn_mapping[d['topic']] in ['正常']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):
                d['label'] = porn_mapping[d['topic']]
                white['porn'][stage].append(d)
                
for d in politics:
    if politics_mapping[d['topic']] in ['涉政']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):                    
                d['label'] = politics_mapping[d['topic']]
                black['politics'][stage].append(d)
    if politics_mapping[d['topic']] in ['正常']:
        for stage in [1, 2, 3, 4]:
            if filter_fn(d, stage):
                d['label'] = politics_mapping[d['topic']]
                white['politics'][stage].append(d)

In [183]:
with open('/data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_final.json.merge.add', 'w') as fwobj:
    with open('/data/albert.xht/xiaoda/sentiment/query_risk_v12/query_risk_final.json.merge', 'r') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    
    for d in white['porn'][1]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    
    random.shuffle(white['porn'][4])
    for d in white['porn'][4][:500000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    
    random.shuffle(black['porn'][4])
    for d in black['porn'][4][:100000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        
    random.shuffle(black['porn'][2])
    for d in black['porn'][2][:100000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    
    random.shuffle(black['porn'][2])
    for d in black['porn'][2][:100000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        
    random.shuffle(black['politics'][4])
    for d in black['politics'][4][:100000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        
    random.shuffle(black['politics'][4])
    for d in black['politics'][4][:100000]:
        content = {
            'text':d['text'],
            'label':[d['label']]
        }
        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        
    

[{'text': '对的给你点赞我还没购买呢就问它龟儿子一句三通一达都上班了咋还发不了货呢龟儿子怼老子去找三通一达买日它妈就跟只有它一家在卖一样就这副德行还想做生意拉黑老子一样骂它看到它商品有评论老子就来骂',
  'topic': '色情-白样本',
  'score_list': {'offensive': [['冒犯', 0.9548986554145813],
    ['正常', 0.04510140046477318]],
   'query_risk': [['风险', 0.9991697072982788],
    ['个人信息', 5.0525082770036533e-05],
    ['正常', 0.0007798463338986039]]},
  'label': '正常'},
 {'text': '2019年刷单不给钱死骗子',
  'topic': '色情-白样本',
  'score_list': {'offensive': [['冒犯', 0.9943484663963318],
    ['正常', 0.00565155828371644]],
   'query_risk': [['风险', 0.9631624221801758],
    ['个人信息', 5.80549931328278e-06],
    ['正常', 0.03683171793818474]]},
  'label': '正常'},
 {'text': 'sb关你屌事',
  'topic': '色情-白样本',
  'score_list': {'offensive': [['冒犯', 0.991685688495636],
    ['正常', 0.00831428449600935]],
   'query_risk': [['风险', 0.9979343414306641],
    ['个人信息', 1.1481271940283477e-05],
    ['正常', 0.0020541255362331867]]},
  'label': '正常'},
 {'text': 'sb客服死全家',
  'topic': '色情-白样本',
  'score_list': {'offensive': [['冒犯', 0.9836064577

In [195]:
len(black['porn'][1])

5317

In [203]:
black['porn'][3][20:30]

[{'text': '小骚货搞死你哦',
  'topic': '色情',
  'score_list': {'offensive': [['冒犯', 0.9431781768798828],
    ['正常', 0.05682184919714928]],
   'query_risk': [['风险', 0.9965651631355286],
    ['个人信息', 1.6847050574142486e-05],
    ['正常', 0.003417979460209608]]},
  'label': '色情'},
 {'text': '就你那个大几巴怕是早邪哦你怕是森窥哦还大几巴你行不行就是一个问题',
  'topic': '色情',
  'score_list': {'offensive': [['冒犯', 0.9350871443748474],
    ['正常', 0.06491286307573318]],
   'query_risk': [['风险', 0.9823386073112488],
    ['个人信息', 3.4572629374451935e-05],
    ['正常', 0.017626861110329628]]},
  'label': '色情'},
 {'text': '干爹干爹',
  'topic': '色情',
  'score_list': {'offensive': [['冒犯', 0.933490514755249],
    ['正常', 0.06650949269533157]],
   'query_risk': [['风险', 0.9553155899047852],
    ['个人信息', 1.4558536349795759e-05],
    ['正常', 0.04466981813311577]]},
  'label': '色情'},
 {'text': '并且多数人还配合其强奸操难怪当年中国几乎亡于日本',
  'topic': '色情',
  'score_list': {'offensive': [['冒犯', 0.9945043325424194],
    ['正常', 0.005495647434145212]],
   'query_risk': [['风险',

[{'text': '用来打你玛',
  'topic': '辱骂',
  'score_list': {'offensive': [['冒犯', 0.7227365970611572],
    ['正常', 0.2772634029388428]],
   'query_risk': [['风险', 0.7774815559387207],
    ['个人信息', 2.7271302315057255e-05],
    ['正常', 0.22249111533164978]]}},
 {'text': '傻逼会的',
  'topic': '辱骂',
  'score_list': {'offensive': [['冒犯', 0.9970690608024597],
    ['正常', 0.0029309105593711138]],
   'query_risk': [['风险', 0.9801044464111328],
    ['个人信息', 1.1893325790879317e-05],
    ['正常', 0.019883625209331512]]}},
 {'text': '上脸了太刺激我感觉我都要裂了这纯属就是骗人的代购的手段我还真的没信了呵呵太欺负人\\n这要是烂脸了谁负责',
  'topic': '辱骂',
  'score_list': {'offensive': [['冒犯', 0.979736864566803],
    ['正常', 0.020263150334358215]],
   'query_risk': [['风险', 0.987972617149353],
    ['个人信息', 5.65334630664438e-05],
    ['正常', 0.011970845982432365]]}},
 {'text': '一看你就是踏马的老板本人狗你麻痹',
  'topic': '辱骂',
  'score_list': {'offensive': [['冒犯', 0.9497252702713013],
    ['正常', 0.05027477815747261]],
   'query_risk': [['风险', 0.9998016953468323],
    ['个人信息', 5.620441