In [1]:
import json
import sys,os
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys

sys.path.extend(['/root/xiaoda/query_topic/'])

In [3]:
import torch
from torch.nn import functional as F
import numpy as np
import random
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

"""
https://github.com/ondrejbohdal/meta-calibration/blob/main/Metrics/metrics.py
"""

class ECE(nn.Module):
    
    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super(ECE, self).__init__()
        bin_boundaries = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = bin_boundaries[:-1]
        self.bin_uppers = bin_boundaries[1:]

    def forward(self, logits, labels, mode='logits'):
        if mode == 'logits':
            softmaxes = F.softmax(logits, dim=1)
        else:
            softmaxes = logits
        # softmaxes = F.softmax(logits, dim=1)
        confidences, predictions = torch.max(softmaxes, 1)
        accuracies = predictions.eq(labels)
        
        ece = torch.zeros(1, device=logits.device)
        for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
            # Calculated |confidence - accuracy| in each bin
            in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
            prop_in_bin = in_bin.float().mean()
            if prop_in_bin.item() > 0:
                accuracy_in_bin = accuracies[in_bin].float().mean()
                avg_confidence_in_bin = confidences[in_bin].mean()
                ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

In [4]:
import torch
import json
import sys
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizerFast
import transformers
from datetime import timedelta

import os, sys
cur_dir_path = '/root/xiaoda/query_topic/'

sys.path.extend([cur_dir_path])

from nets.them_classifier import MyBaseModel, RobertaClassifier

import configparser
from tqdm import tqdm

class TopicInfer(object):
    def __init__(self, config_path):

        import torch, os, sys

        con = configparser.ConfigParser()
        con_path = os.path.join(cur_dir_path, config_path)
        con.read(con_path, encoding='utf8')

        args_path = dict(dict(con.items('paths')), **dict(con.items("para")))
        self.tokenizer = BertTokenizerFast.from_pretrained(args_path["model_path"], do_lower_case=True)

        label_list = []
        label_path = os.path.join(cur_dir_path, args_path['label_path'])
        with open(label_path, 'r') as frobj:
            for line in frobj:
                label_list.append(line.strip())
        n_classes = len(label_list)

        self.label2id, self.id2label = {}, {}
        for idx, label in enumerate(label_list):
            self.label2id[label] = idx
            self.id2label[idx] = label
            
        print(self.label2id, '===', self.id2label)
        
        output_path = os.path.join(cur_dir_path, args_path['output_path'])

        from roformer import RoFormerModel, RoFormerConfig

        config = RoFormerConfig.from_pretrained(args_path["model_path"])
        encoder = RoFormerModel(config=config)
        
        encoder_net = MyBaseModel(encoder, config)

        classify_net = RobertaClassifier(
            hidden_size=config.hidden_size, 
            dropout_prob=con.getfloat('para', 'out_dropout_rate'),
            num_labels=n_classes, 
            dropout_type=con.get('para', 'dropout_type'))

        self.device = "cuda:1" if torch.cuda.is_available() else "cpu"

        class TopicClassifier(nn.Module):
            def __init__(self, transformer, classifier):
                super().__init__()

                self.transformer = transformer
                self.classifier = classifier

            def forward(self, input_ids, input_mask, 
                        segment_ids=None, transformer_mode='mean_pooling'):
                hidden_states = self.transformer(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              return_mode=transformer_mode)
                ce_logits = self.classifier(hidden_states)
                return ce_logits, hidden_states

        import os
        self.net = TopicClassifier(encoder_net, classify_net).to(self.device)
        # eo = 9
        # ckpt = torch.load(os.path.join(output_path, 'cls.pth.{}'.format(eo)), map_location=self.device)
        # self.topic_net.load_state_dict(ckpt)
        # self.topic_net.eval()
        
    def reload(self, model_path):
        ckpt = torch.load(model_path, map_location=self.device)
        self.net.load_state_dict(ckpt)
        self.net.eval() 

    def predict(self, text, top_n=5):

        """抽取输入text所包含的类型
        """
        token2char_span_mapping = self.tokenizer(text, return_offsets_mapping=True, max_length=256)["offset_mapping"]
        encoder_txt = self.tokenizer.encode_plus(text, max_length=256)
        input_ids = torch.tensor(encoder_txt["input_ids"]).long().unsqueeze(0).to(self.device)
        token_type_ids = torch.tensor(encoder_txt["token_type_ids"]).unsqueeze(0).to(self.device)
        attention_mask = torch.tensor(encoder_txt["attention_mask"]).unsqueeze(0).to(self.device)
        
        with torch.no_grad():
            scores, hidden_states = self.net(input_ids, attention_mask, token_type_ids, 
                         transformer_mode='cls'
                         )
            scores = torch.nn.Softmax(dim=1)(scores)[0].data.cpu().numpy()
        
        schema_types = []
        for index, score in enumerate(scores):
             schema_types.append([self.id2label[index], float(score)])
        schema_types = sorted(schema_types, key=lambda item:item[1], reverse=True)
        return schema_types[0:5]
    
    def predict_batch(self, text):
        if isinstance(text, list):
            text_list = text
        else:
            text_list = [text]
        model_input = self.tokenizer(text_list, return_tensors="pt",padding=True, max_length=256)
        for key in model_input:
            model_input[key] = model_input[key].to(self.device)
        with torch.no_grad():
            scores, hidden_states = self.net(model_input['input_ids'], 
                        model_input['attention_mask'], 
                        model_input['token_type_ids'], 
                         transformer_mode='cls'
                         )
            scores = torch.nn.Softmax(dim=1)(scores).data.cpu().numpy()
        schema_types_list = []
        for score_, text in zip(scores, text_list):
            schema_types = []
            for index, score in enumerate(score_):
                 schema_types.append([self.id2label[index], float(score)])
            schema_types = sorted(schema_types, key=lambda item:item[1], reverse=True)
            schema_types_list.append(schema_types[0:5])
        return schema_types_list
    
    def infer_batch(self, text):
        if isinstance(text, list):
            text_list = text
        else:
            text_list = [text]
        model_input = self.tokenizer(text_list, return_tensors="pt",padding=True, max_length=256)
        for key in model_input:
            model_input[key] = model_input[key].to(self.device)
        with torch.no_grad():
            scores, hidden_states = self.net(model_input['input_ids'], 
                        model_input['attention_mask'], 
                        model_input['token_type_ids'], 
                         transformer_mode='cls'
                         )
            scores = torch.nn.Softmax(dim=1)(scores).data.cpu().numpy()
        return scores
                

topic_api = TopicInfer('./topic_data_v4/config.ini')

# text = '王二今天打车去了哪里，从哪里出发，到哪里了'
# print(topic_api.predict(text), text)


{'基金': 0, '汽车': 1, '网络安全': 2, '情商': 3, '财务税务': 4, '旅游': 5, '建筑': 6, '炫富': 7, '家电': 8, '商业': 9, '性生活': 10, '风水': 11, '人物': 12, 'VPN': 13, '吸烟': 14, '市场营销': 15, '游戏': 16, '算命': 17, '编程': 18, '死亡': 19, '公司': 20, '国家': 21, '美食/烹饪': 22, '明星': 23, '城市': 24, '银行': 25, '期货': 26, '宗教': 27, '学习': 28, '电子数码': 29, '网络暴力': 30, 'LGBT': 31, '其他': 32, '故事': 33, '社会': 34, '二手': 35, '动漫': 36, '歧视': 37, '常识': 38, '星座': 39, '冷知识': 40, '职场职业': 41, '食品': 42, '心理健康': 43, '电子商务': 44, '道德伦理': 45, '商业/理财': 46, '赚钱': 47, '神话': 48, '校园生活': 49, '色情': 50, '婚姻': 51, '家居装修': 52, '生活': 53, '灵异灵修': 54, '股票': 55, '娱乐': 56, '女性': 57, '体验': 58, '广告': 59, '天气': 60, '女权': 61, '潜规则': 62, '人类': 63, '马克思主义': 64, '历史': 65, '音乐': 66, '毒品': 67, '摄影': 68, '金融': 69, '影视': 70, '语言': 71, '环境': 72, '高铁': 73, '人际交往': 74, '夜店': 75, '价值观': 76, '恋爱': 77, '相貌': 78, 'BDSM': 79, '恐怖主义': 80, '中医': 81, '性侵犯': 82, '阅读': 83, '时尚': 84, '体育/运动': 85, '资本主义': 86, '灾害意外': 87, '博彩': 88, '成长': 89, '校园暴力': 90, '移民': 91, '美容/塑身': 92, '经济': 93, '睡眠': 94, 

05/31/2023 23:03:37 - INFO - nets.them_classifier - ++RobertaClassifier++ apply stable dropout++


In [6]:
# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v4_label_smoothing/them/cls.pth.9')
# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v4/them/cls.pth.9')
# topic_api.reload('//data/albert.xht/xiaodao/topic_classification_v4/them_20epoch/cls.pth.19')


topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v4/them_knn_20epoch/cls.pth.19')
# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v5/them_knn_10epoch//cls.pth.9')

# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v5/normal_knn_20epoch/cls.pth.19')
# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v5/normal_knn_20epoch_logitclip_1.2/cls.pth.19')

# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v5/normal_knn_40epoch/cls.pth.30')



# topic_api.reload('/data/albert.xht/xiaodao/topic_classification_v4/them_3epoch/cls.pth.2')

In [14]:
with open('/data/albert.xht/xiaodao/query_risk_v10/biake_qa_web_text_zh_train.json.offensive.all.negative', 'w') as fwobj:
    with open('/data/albert.xht/xiaodao/query_risk_v10/biake_qa_web_text_zh_train.json.offensive.all') as frobj:
        for line in frobj:
            content = json.loads(line.strip())
            topic = topic_api.predict_batch(content['text'])[0][0]
            if content['label'][0] in ['风险']:
                content['topic'] = [topic]
                fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
    


In [9]:
text_list = ['你能不能准时一点',
'你太严肃了我不喜欢你',
'学神是什么',
'是不是网络断开了',
'世界上有没有鬼呀',
'我老婆不开心了怎么办',
'你是个傻瓜']
topic_api.predict_batch(['"女孩子究竟是怎么由丑变美的'])[0]

[['时尚', 0.44743233919143677],
 ['女性', 0.38462936878204346],
 ['男性', 0.04517074674367905],
 ['心理健康', 0.029388289898633957],
 ['恋爱', 0.01591077446937561]]

In [12]:
from tqdm import tqdm

with open('/data/albert.xht/sentiment/BelleGroup_train_2M_CN_v37/BelleGroup_train_2M_CN_tnews_latest_final', 'w') as fwobj:
    with open('/data/albert.xht/sentiment/BelleGroup_train_2M_CN_v37/BelleGroup_train_2M_CN_tnews_latest', 'r') as frobj:
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            result = topic_api.predict_batch([content['text'][0:256]])[0]
            if result[0][0] not in ['时事政治'] and result[1][0] not in ['时事政治']:
                fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
        
with open('/data/albert.xht/sentiment/BelleGroup_train_2M_CN_v37/BelleGroup_train_2M_CN_title2event_latest_final', 'w') as fwobj:
    with open('/data/albert.xht/sentiment/BelleGroup_train_2M_CN_v37/BelleGroup_train_2M_CN_title2event_latest', 'r') as frobj:
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            result = topic_api.predict_batch([content['text'][0:256]])[0]
            if result[0][0] not in ['时事政治'] and result[1][0] not in ['时事政治']:
                fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
                


218311it [35:57, 101.19it/s]
5592it [00:52, 106.91it/s]


In [13]:
result【0】
a = torch.tensor(np.random.random((10)))
b = torch.tensor([0,1,2,3])
a[b]


SyntaxError: invalid character '【' (U+3010) (1127155099.py, line 1)

In [14]:
result[0]

['基金', 0.9534446597099304]

In [16]:
content['text']

'南极洲科学家在极地探险活动中发现了一种非常独特的生物。据报道，这种生物被称为“极地美人鱼”，因其美丽的外表而得名。这种生物在南极冰海中深度达到3000米的一个区域被发现。\n科学家们描述了这种生物的外观和习性，他们说这种“美人鱼”长约30厘米，身体呈透明色，看起来像一颗水晶球。这种生物没有脊椎骨，而是由一些僵硬的角质物质组成，这些物质覆盖在身体的表面上。\n据科学家们表示，这种“美人鱼”是几十年来发现的首个属于此类的生物。习性十分独特，这种生物似乎能够在极寒的水下生存四百年之久，而且并不患病或变得晚年。科学家们对这种生物的研究显示“美人鱼”拥有一种特殊的代谢机制，可以将营养物质在应用微生物进行新陈代谢时持续不断地重新利用。\n科学家们还表示，他们将进一步研究这种活了几百年的生物所拥有的代谢机制，并将寻求与“美人鱼”相关的抗老化疗法来延长人类寿命。这项发现对于人类未来的医学和科学研究来说有着重大的影响。\n据悉，目前科学家们正在南极洲进行长达数月的探险。他们希望能够发现更多的特殊生物并为我们了解极地生态环境带来新的认识。\n值得注意的是，随着气候变化的不断进行，科学家们担心南极洲这些地区的生态环境可能会受到极大的影响，这使得探险活动的重要性更加突出。科学家们希望这项发现能够引起人们对极地’环境保护的关注，让世界更好地了解这个神秘而美丽的地方。'

In [73]:
v4_valid = []
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_valid.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        v4_valid.append(content)
        
v4_knn_valid = []
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_valid.json.topic.knn.final') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        v4_knn_valid.append(content)


In [12]:
v5_valid = []
v5_text = []
v5_label = []
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v5/biake_qa_web_text_zh_valid.json') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        v5_valid.append(content)
        v5_text.append(content['text'])
        v5_label.append(topic_api.label2id[content['label'][0]])



In [59]:
def predict_data(data, api):
    queue = []
    pred_score = []
    tt = []
    for item in tqdm(data):
        if isinstance(item, list):
            text = "\n".join(item)
        else:
            text = item
        queue.append(text)
        tt.append(item)
        if np.mod(len(queue), 128) == 0:
            result_list = api.infer_batch(queue)
            for result, text, t in zip(result_list, queue, tt):
                pred_score.append(result)
            queue = []
            tt = []
    if queue:
        result_list = api.infer_batch(queue)
        for result, text, t in zip(result_list, queue, tt):
            pred_score.append(result)
    return pred_score

pred_score = predict_data(v5_text, topic_api)


100%|██████████| 83069/83069 [00:22<00:00, 3750.61it/s]


In [60]:
v5_pred_score_np = np.array(pred_score)
v5_label_np = np.array(v5_label)

In [45]:
# Problem setup
n = 70000 # number of calibration points
alpha = 0.1 # 1-alpha is the desired coverage

# Split the softmax scores into calibration and validation sets (save the shuffling)
idx = np.array([1] * n + [0] * (v5_pred_score_np.shape[0]-n)) > 0
np.random.shuffle(idx)
cal_smx, val_smx = v5_pred_score_np[idx,:], v5_pred_score_np[~idx,:]
cal_labels, val_labels = v5_label_np[idx], v5_label_np[~idx]

In [46]:
# Get scores. calib_X.shape[0] == calib_Y.shape[0] == n
cal_pi = cal_smx.argsort(1)[:,::-1]; cal_srt = np.take_along_axis(cal_smx,cal_pi,axis=1).cumsum(axis=1)
cal_scores = np.take_along_axis(cal_srt,cal_pi.argsort(axis=1),axis=1)[range(n),cal_labels]
# Get the score quantile
qhat = np.quantile(cal_scores, np.ceil((n+1)*(1-alpha))/n, interpolation='higher')
# Deploy (output=list of length n, each element is tensor of classes)
val_pi = val_smx.argsort(1)[:,::-1]; val_srt = np.take_along_axis(val_smx,val_pi,axis=1).cumsum(axis=1)
prediction_sets = np.take_along_axis(val_srt <= qhat,val_pi.argsort(axis=1),axis=1)

# Calculate empirical coverage
empirical_coverage = prediction_sets[np.arange(prediction_sets.shape[0]),val_labels].mean()
print(f"The empirical coverage is: {empirical_coverage}")

The empirical coverage is: 0.999158313566455


Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  qhat = np.quantile(cal_scores, np.ceil((n+1)*(1-alpha))/n, interpolation='higher')


In [47]:
qhat

0.9999999

In [72]:
from sklearn.metrics import classification_report
import numpy as np

def evaluation_ece(pred_score, gold, mapping_dict):
    pass

def eval_all(data, model, top_n=5):
    pred = []
    gold = []
    pred_score = []
    pred = 0
    queue = []
    tt = []
    total_pred = []
    for item in tqdm(data):
        gold.append(item['label'][0])
        if isinstance(item['text'], list):
            text = "\n".join(item['text'])
        else:
            text = item['text']
        queue.append(text)
        tt.append(item)
        if np.mod(len(queue), 128) == 0:
            result_list = model.predict_batch(queue)
            for result, text, t in zip(result_list, queue, tt):
                score = sorted(result, key=lambda u:u[1], reverse=True)
                pred_set = set([p[0] for p in score[:top_n]])
                total_pred.append(score[0][0])
                if set(t['label']) & pred_set:
                    pred += 1
                pred_score.append(result)
            queue = []
            tt = []
    if queue:
        result_list = model.predict_batch(queue)
        for result, text, t in zip(result_list, queue, tt):
            score = sorted(result, key=lambda u:u[1], reverse=True)
            pred_set = set([p[0] for p in score[:top_n]])
            total_pred.append(score[0][0])
            if set(t['label']) & pred_set:
                pred += 1
            pred_score.append(result)
        # break
    print(classification_report(gold, total_pred, digits=4), '===', top_n)
    print(pred/len(pred_score))
    return pred_score, total_pred, gold

In [55]:
pred_score[0]

[['城市', 0.9653037190437317],
 ['文化/艺术', 0.0006875364342704415],
 ['教育/科学', 0.0003225688706152141],
 ['社会', 0.00032179488334804773],
 ['宠物', 0.00030149819212965667]]

In [140]:
target = []
for item in v4_valid:
    if item['label'][0] in ['道德伦理']:
        target.append(item)

In [157]:
target_result = []
for item in target:
    result = topic_api.predict(item['text'])
    if result[0][0] not in item['label']:
        target_result.append((result, item))

In [74]:
pred_score, total_pred, gold = eval_all(v5_valid, topic_api, top_n=2)

100%|██████████| 83069/83069 [00:28<00:00, 2945.79it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        BDSM     0.5000    0.6667    0.5714        15
        LGBT     0.6891    0.7413    0.7143       317
         VPN     1.0000    0.2000    0.3333         5
          两性     0.3383    0.1069    0.1625       636
          中医     0.7398    0.6233    0.6766       146
          二手     0.5000    0.1053    0.1739        19
          交友     0.0000    0.0000    0.0000        25
        交通出行     0.7128    0.7339    0.7232       372
          人物     0.5482    0.4954    0.5205       218
          人类     0.5000    0.0690    0.1212        29
        人际交往     0.4632    0.3288    0.3846       669
         价值观     0.3721    0.2112    0.2695       303
       体育/运动     0.7917    0.8538    0.8216      2039
          体验     0.4211    0.0656    0.1135       122
          保险     0.7360    0.7931    0.7635       116
          健康     0.7653    0.8045    0.7844      5754
          公司     0.6408    0.5872    0.6128       872
          养生     0.4286    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
evaluation_ece(v5_pred_score_np, gold, topic_api.label2id)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [66]:
pred_score_l = []
pred_score_l = torch.tensor(v5_pred_score_np)
gold_l = torch.tensor([topic_api.label2id[item] for item in gold])

ece_fn = ECE(n_bins=15)
print(ece_fn(pred_score_l, gold_l, mode='probs'), '==ece==')

tensor([0.2866]) ==ece==


In [10]:
# pred_score, gold = eval_all(v4_knn_valid, topic_api, top_n=5)

In [11]:
# pred_score, gold = eval_all(v4_valid, topic_api, top_n=2)

In [14]:
with open('/data/albert.xht/xiaodao/topic_classification_v7/biake_qa_web_text_zh_train.json.positive.topic', 'w') as fwobj:
    with open('/data/albert.xht/xiaodao/topic_classification_v7/biake_qa_web_text_zh_train.json.positive', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            queue.append(content['text'])
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['label'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tt['label'] = prob_dict
                fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

25507it [00:11, 2218.33it/s]


In [15]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/embed_linear_small_white.json.topic', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/embed_linear_small_white.json', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            queue.append(content['text'])
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['label'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tt['label'] = prob_dict
                fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

23116it [00:10, 2115.69it/s]


In [69]:
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/query_risk_corpus.json.risk_topic', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/query_risk_corpus.json.risk', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            queue.append(content['text'])
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['topic'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tt['topic'] = prob_dict
                fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

24721it [00:13, 1896.10it/s]


In [19]:
with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json,new_topic', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/topic_classification_v4/biake_qa_web_text_zh_train.json', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            queue.append(content['text'])
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['new_topic'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tt['new_topic'] = prob_dict
                fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

2071401it [16:00, 2157.32it/s]


In [8]:
with open('/data/albert.xht/sentiment/green_teenager.json.all.detail.topic', 'w') as fwobj:
    with open('/data/albert.xht/sentiment/green_teenager.json.all.detail', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            if len(content['text']) > 192:
                continue
            queue.append(content['text'])
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tt['topic'] = prob_dict
                    fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tt['topic'] = prob_dict
                fwobj.write(json.dumps(tt, ensure_ascii=False)+'\n')

241035it [02:58, 1348.68it/s]


In [12]:

import re
with open('/data/albert.xht/pretrained_model_risk/corpus/efaqa-corpus-zh/efaqa-corpus-zh.utf8.topic', 'w') as fwobj:
    with open('/data/albert.xht/pretrained_model_risk/corpus/efaqa-corpus-zh/efaqa-corpus-zh.utf8', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            title = content['title'] #''.join(re.split('[\s,]', content['title'])[1:])
            if len(title) > 192:
                continue
            queue.append(title)
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tmp = {
                        'text':text,
                        'topic':prob_dict,
                        'source':'efaqa'
                    }
                    fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tmp = {
                        'text':text,
                        'topic':prob_dict,
                        'source':'efaqa'
                    }
                fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')

20000it [00:11, 1762.52it/s]


In [14]:

import re
with open('/data/albert.xht/xiaodao/query_risk_v11/offensive_select_labeled.txt.topic', 'w') as fwobj:
    with open('/data/albert.xht/xiaodao/query_risk_v11/offensive_select_labeled.txt', 'r') as frobj:
        queue = []
        t = []
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            # content['text'] = re.sub('请问', '', content['text'])
            title = content['text'] #''.join(re.split('[\s,]', content['title'])[1:])
            if len(title) > 192:
                continue
            queue.append(title)
            t.append(content)
            if np.mod(len(queue), 32) == 0:
                probs = topic_api.predict_batch(queue)
                for prob_dict, text, tt in zip(probs, queue, t):
                    tmp = {
                        'text':text,
                        'topic':prob_dict,
                        'label':tt['label'],
                        'source':'efaqa'
                    }
                    fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')
                queue = []
                t = []
        if queue:
            probs = topic_api.predict_batch(queue)
            for prob_dict, text, tt in zip(probs, queue, t):
                tmp = {
                        'text':text,
                        'topic':prob_dict,
                        'label':tt['label'],
                        'source':'efaqa'
                    }
                fwobj.write(json.dumps(tmp, ensure_ascii=False)+'\n')

20641it [00:10, 1970.45it/s]


In [114]:

with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/疑似有风险query_from对话预训练数据-20221130.txt') as frobj:
    for line in tqdm(frobj):
        text = line.strip()
        result_list.append((text, result))

topic_filter = []
topic_white = []
left = []
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/query_risk_corpus.json.risk_topic') as frobj:
    for line in frobj:
        content = json.loads(line.strip())
        if content['score_list']['senti'][1][1] > 0.7:
            if content['topic'][0][0] in ['经济', '历史', 'LGBT', 'BDSM', '法律', '灵异灵修', '国家', '社会', '军事', '心理健康', '时事政治', '死亡', '毒品', '恐怖主义', '战争', '灵异事件']:
                topic_filter.append(content)
            else:
                if '艾滋病' in content['text'] or '抑郁' in content['text']:
                    topic_filter.append(content)
                else:
                    topic_white.append(content)
        else:
            topic_filter.append(content)
        # if content['topic'][0][0] in ['文化/艺术', '动漫', '音乐', '娱乐', '体育/运动', '小说', '星座', '时尚', '健康', '明星', '游戏'] and content['topic'][0][1] > 0.2:
        #     if content['score_list']['senti'][0][1] < 0.3:
        #         topic_white.append(content)
        #     elif content['score_list']['senti'][0][1] > 0.9:
        #         topic_filter.append(content)
        # elif content['topic'][0][0] in ['LGBT', 'BDSM', '法律', '灵异灵修', '国家', '社会', '军事', '心理健康', '时事政治', '死亡', '毒品', '恐怖主义', '战争', '灵异事件']:
        #     topic_filter.append(content)
        # elif content['topic'][1][0] in ['LGBT', 'BDSM', '法律', '灵异灵修', '国家', '社会', '军事', '心理健康', '时事政治', '死亡', '毒品', '恐怖主义', '战争', '灵异事件']:
        #     topic_filter.append(content)
        # else:
        #     left.append(content)
                

In [118]:
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/query_risk_corpus.json.filter', 'w') as fwobj:
    for d in topic_filter:
        d['label'] = ['风险']
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')
with open('/data/albert.xht/raw_chat_corpus/model_risk_xiaoda/query_risk_corpus.json.white', 'w') as fwobj:
    for d in topic_white:
        d['label'] = ['正常']
        fwobj.write(json.dumps(d, ensure_ascii=False)+'\n')

In [1]:
import os, sys

sys.path.extend(['/root/xiaoda/query_topic/'])

In [21]:
from keyword_processor import KeywordProcesser
with open('/data/albert.xht/xiaoda/sentiment/query_risk_v14//risk_event.txt') as frobj:
    for line in frobj:
        content = line.strip()
    keyword_list = content.split('||')

keyword_api_v1 = KeywordProcesser(keywords=keyword_list)

In [22]:
import jieba_fast as jieba
for word in keyword_list:
    jieba.add_word(word)
    
from tqdm import tqdm
import json, re

with open('/data/albert.xht/raw_chat_corpus/topic_classification_v5/biake_qa_web_text_zh_train.json.knn.final.keyword', 'w') as fwobj:
    with open('/data/albert.xht/raw_chat_corpus/topic_classification_v5/biake_qa_web_text_zh_train.json.knn.final') as frobj:
        for line in tqdm(frobj):
            content = json.loads(line.strip())
            text = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', content['text'].lower())
            content['keywords'] = keyword_api_v1.extract_keywords(text)
            words_list = list(jieba.cut(text))
            words_set = {}
            for word in words_list:
                words_set[word] = ''
            if content['keywords']:
                keyword_list = []
                for keyword_ in content['keywords']:
                    if keyword_[-1] in words_set:
                        keyword_list.append(keyword_)
                if keyword_list:
                    if content['label'][0] not in ['游戏', '小说', '网络安全', '动漫', '幽默滑稽', '电子数码']:
                        content['topic'] = content['label']
                        content['label'] = ['风险']
                        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')
                    elif content['label'][0] in ['游戏', '小说', '网络安全', '动漫', '电子数码']:
                        content['topic'] = content['label']
                        content['label'] = ['正常']
                        fwobj.write(json.dumps(content, ensure_ascii=False)+'\n')

1973027it [03:03, 10726.40it/s]


In [7]:
list(jieba.cut('javaweb程序员是否适合用MacBook'))

['j', 'av', 'aweb', '程序员', '是否', '适合', '用', 'MacBook']