General Dataset and Model Settings

In [46]:
from transformers import pipeline
# Build the text-classification pipeline used by the rest of the notebook.
# Run on the first CUDA GPU when one is available and fall back to the CPU
# (device=-1) otherwise, so the notebook does not crash on GPU-less machines.
import torch

classifier = pipeline(
    "text-classification",
    model="Hate-speech-CNERG/bert-base-uncased-hatexplain",
    device=0 if torch.cuda.is_available() else -1,
)

In [47]:
import json

# Path of the dataset JSON file (relative to the current working directory).
json_file = './data/dataset.json'

def extract_data(file):
    """Load annotated posts from a JSON file and derive a binary label for each.

    The file must hold a JSON object mapping entry ids to entry dicts. An entry
    is expected to provide ``'post_tokens'`` (a list of token strings) and
    ``'annotators'`` (a list of dicts, each carrying a ``'label'`` string).

    Label encoding:
        0 -- abusive: at least two annotators chose a label other than "normal"
        1 -- normal: every other case, including entries without annotations

    Args:
        file: Path of the JSON dataset file (read as UTF-8).

    Returns:
        A ``(sentences, abuse_flags)`` tuple of two equally long lists in the
        file's entry order. ``sentences`` holds the space-joined tokens of each
        post (a single space when ``'post_tokens'`` is missing); ``abuse_flags``
        holds the matching 0/1 label.
    """
    ABUSIVE_LABEL = 0
    NORMAL_LABEL = 1

    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    sentences = []
    abuse_flags = []

    for key, entry in data.items():
        if 'post_tokens' in entry:
            sentence = " ".join(entry['post_tokens'])
        else:
            # Keep a placeholder so both output lists stay aligned with the entries.
            sentence = " "
            print(f"Warning: Entry {key} is missing 'post_tokens' key")

        if 'annotators' in entry:
            non_normal_votes = sum(
                annotator['label'] != "normal" for annotator in entry['annotators']
            )
            # Majority-style rule: two or more non-"normal" votes mark the post abusive.
            abuse_label = ABUSIVE_LABEL if non_normal_votes >= 2 else NORMAL_LABEL
        else:
            # Without any annotation, default to "normal". This branch used to
            # assign 0, i.e. the abusive code, contradicting the stated intent.
            abuse_label = NORMAL_LABEL
            print(f"Warning: Entry {key} is missing 'annotators' key")

        sentences.append(sentence)
        abuse_flags.append(abuse_label)

    return sentences, abuse_flags

# Parse the dataset once; the two results are parallel lists (one sentence and
# one 0/1 label per entry). The author recorded a length of 20148 for this file.
original_sentences, annotated_labels = extract_data(json_file)

In [48]:
# print(original_sentences[:5])

LIME

In [49]:
import numpy as np

def predictor(texts, class_names=('hate speech', 'normal', 'offensive')):
    """Return per-class scores for ``texts`` as a 2-D array, one row per text.

    Calls the module-level ``classifier`` pipeline and arranges the scores by
    label name, so the column order is fixed by ``class_names`` instead of
    depending on the order in which the pipeline happens to list the labels.

    Args:
        texts: A single string or a sequence of strings to classify.
        class_names: Labels defining the column order of the result. The
            default matches the ``class_names`` given to ``LimeTextExplainer``
            in this notebook.

    Returns:
        ``numpy.ndarray`` of shape ``(number of texts, len(class_names))`` where
        column ``j`` holds the score reported for ``class_names[j]``.

    Raises:
        KeyError: If the classifier reports no score for one of ``class_names``.
    """
    # Accept a bare string as a one-element batch so the result is always 2-D.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return np.empty((0, len(class_names)))

    # top_k=None requests the scores of every label; it replaces the deprecated
    # return_all_scores=True. The entries may come back sorted by score, which
    # is why they are looked up by label below rather than by position.
    predictions = classifier(texts, top_k=None)

    rows = []
    for entries in predictions:
        score_by_label = {entry['label']: entry['score'] for entry in entries}
        try:
            rows.append([score_by_label[name] for name in class_names])
        except KeyError as exc:
            raise KeyError(
                f"classifier returned no score for label {exc.args[0]!r}; "
                f"available labels: {sorted(score_by_label)}"
            ) from exc

    return np.array(rows)

In [50]:
# Sanity check on the first five posts: print the score matrix from the model,
# then the posts themselves on the following line.
print(predictor(original_sentences[:5]), original_sentences[:5], sep="\n")



[[0.03392262 0.7708711  0.19520631]
 [0.07430059 0.63709313 0.28860623]
 [0.66122037 0.16300514 0.17577441]
 [0.62429971 0.16513535 0.21056499]
 [0.07229818 0.26018623 0.66751558]]
['i dont think im getting my baby them white 9 he has two white j and nikes not even touched', 'we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum', 'nawt yall niggers ignoring me', '<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔', 'this bitch in whataburger eating a burger with no top bun and holding it palm up i hate white bitches']


In [None]:
from lime.lime_text import LimeTextExplainer
# class_names is positional: entry i names column i of the score array returned
# by the classifier function that is passed to explain_instance.
# random_state seeds LIME's random perturbation sampling so that re-running the
# notebook reproduces the same explanations.
explainer = LimeTextExplainer(
    class_names=['hate speech', 'normal', 'offensive'],
    random_state=42,
)

In [None]:
# Explain the model's predictions for the first five posts with LIME.
# For each post: generate an explanation limited to 6 features and covering the
# top 3 labels, render it inline, and save it as explanation_<i>.html.
for i, sentence in enumerate(original_sentences[:5]):
    explanation = explainer.explain_instance(sentence, predictor, num_features=6, top_labels=3)
    # Display the explanation result in the notebook.
    explanation.show_in_notebook(text=True)
    explanation.save_to_file(f'explanation_{i}.html')