In [1]:
import json
import pandas as pd
import os
import re
from tqdm import tqdm
from bart_score import BARTScorer
from sklearn.metrics import auc,roc_curve,roc_auc_score
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# 初始化 BARTScorer
print("----------------------------------- 开始初始化 bart 模型 ------------------------------------------")
bartscorer = BARTScorer(device='cuda:0',checkpoint="facebook/bart-large-cnn")
print("----------------------------------- 结束初始化 bart 模型 ------------------------------------------")


----------------------------------- 开始初始化 bart 模型 ------------------------------------------
----------------------------------- 结束初始化 bart 模型 ------------------------------------------


# 1、读取数据

In [5]:
# 读取 jsonl 文件
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # 将每一行的 JSON 字符串转换为字典
            data.append(json.loads(line.strip()))
    return data


# 读取 finance
file_path = 'finance_samples_gpt4i-mini.jsonl'  
finance_samples = read_jsonl(file_path)

# 2、数据处理

In [6]:
def remove_chinese(text):
    # 使用正则表达式匹配所有中文字符并替换为空字符串
    pattern = re.compile(r'[\u4e00-\u9fff]')
    return pattern.sub('', text)

# 不同的数据合并
question_full = []
human_answers_full = []
chatgpt_answers_full = []
human_answers_masked_full = []
chatgpt_answers_masked_full = []
human_answers_masked_fill_full = []
chatgpt_answers_masked_fill_full = []
human_answers_revised_full = []
chatgpt_answers_revised_full = []



for i, sample in tqdm(enumerate(finance_samples)):
    question = sample['question']
    human_answers = sample['human_answers']
    chatgpt_answers = sample['chatgpt_answers']
    human_answers_masked = sample['human_answers_masked']
    chatgpt_answers_masked = sample['chatgpt_answers_masked']
    human_answers_masked_fill = sample['human_answers_masked_fill']
    chatgpt_answers_masked_fill = sample['chatgpt_answers_masked_fill']
    human_answers_revised = sample['human_answers_revised']
    chatgpt_answers_revised = sample['chatgpt_answers_revised']

    # 去除中文
    human_answers_masked = remove_chinese(human_answers_masked)
    chatgpt_answers_masked = remove_chinese(chatgpt_answers_masked)
    human_answers_masked_fill = remove_chinese(human_answers_masked_fill)
    chatgpt_answers_masked_fill = remove_chinese(chatgpt_answers_masked_fill)

    # 存储
    question_full.append(question)
    human_answers_full.append(human_answers)
    chatgpt_answers_full.append(chatgpt_answers)
    human_answers_masked_full.append(human_answers_masked)
    chatgpt_answers_masked_full.append(chatgpt_answers_masked)
    human_answers_masked_fill_full.append(human_answers_masked_fill)
    chatgpt_answers_masked_fill_full.append(chatgpt_answers_masked_fill)
    human_answers_revised_full.append(human_answers_revised)
    chatgpt_answers_revised_full.append(chatgpt_answers_revised)


3933it [00:00, 55509.11it/s]


# 3、计算各种相似度

# 3.1 计算修改前后的相似度

In [7]:
print("----------------------------------- 开始计算相似度 ------------------------------------------")
chatgpt_revise_score = bartscorer.score(chatgpt_answers_revised_full, chatgpt_answers_full)
human_revise_score = bartscorer.score(human_answers_revised_full, human_answers_full)

----------------------------------- 开始计算相似度 ------------------------------------------


In [8]:
y_true = []
y_score = []
for i in range(0,len(chatgpt_revise_score)):
    y_true.append(1)
    y_score.append(chatgpt_revise_score[i])
    
for i in range(0,len(human_revise_score)):
    y_true.append(0)
    y_score.append(human_revise_score[i])

# 计算评估指标
auroc_score = roc_auc_score(y_true, y_score)
print("the auroc is:",auroc_score)

the auroc is: 0.9739491685322336


In [9]:
# 反过来
print("----------------------------------- 开始计算相似度 ------------------------------------------")
chatgpt_revise_score = bartscorer.score(chatgpt_answers_full, chatgpt_answers_revised_full)
human_revise_score = bartscorer.score(human_answers_full, human_answers_revised_full)

----------------------------------- 开始计算相似度 ------------------------------------------


In [10]:
y_true = []
y_score = []
for i in range(0,len(chatgpt_revise_score)):
    y_true.append(1)
    y_score.append(chatgpt_revise_score[i])
    
for i in range(0,len(human_revise_score)):
    y_true.append(0)
    y_score.append(human_revise_score[i])

# 计算评估指标
auroc_score = roc_auc_score(y_true, y_score)
print("the auroc is:",auroc_score)

the auroc is: 0.7912387240925731


## 3.2 计算 mask 前后的相似度

In [6]:
print("----------------------------------- 开始计算相似度 ------------------------------------------")
chatgpt_mask_score = bartscorer.score(chatgpt_answers_masked_fill_full, chatgpt_answers_full)
human_mask_score = bartscorer.score(human_answers_masked_fill_full, human_answers_full)



----------------------------------- 开始计算相似度 ------------------------------------------


# 4、各种相似度

## 4.1 语义相似度

In [4]:
""" 加载模型 """
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# from modelscope import snapshot_download
# model_dir = snapshot_download("AI-ModelScope/bge-large-en-v1.5", cache_dir='pretrain_models', revision='master') # 加载模型

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("/root/autodl-tmp/ai_generated/LLM-generated-text-detection/pretrain_models/AI-ModelScope/bge-large-en-v1.5")


def embedding_similarity(text1, text2):
    # 将文本转换为嵌入向量
    text1_embedding = model.encode(text1, normalize_embeddings=True, device='cuda:0')
    text2_embedding = model.encode(text2, normalize_embeddings=True, device='cuda:0')
    
    # 将嵌入向量转换为PyTorch张量
    text1_embedding = torch.tensor(text1_embedding).to('cuda:0')
    text2_embedding = torch.tensor(text2_embedding).to('cuda:0')
    
    # 计算余弦相似度
    cosine_similarity = torch.nn.functional.cosine_similarity(text1_embedding, text2_embedding, dim=0)
    
    return cosine_similarity.item()

# 示例使用
text1 = "This is a sample sentence."
text2 = "This is another example sentence."
similarity = embedding_similarity(text1, text2)
print(f"Cosine Similarity: {similarity}")


Cosine Similarity: 0.8590004444122314


## 4.2 语法一致性

In [5]:

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")


def syntactic_resonance_analysis(text_raw, text_transformed):
    """
    Analyzes the syntactic similarity between two English sentences.

    Args:
        text_raw: The original text (str).
        text_transformed: The transformed text (str).

    Returns:
        score_syn: The syntactic similarity score (float).
    """

    doc_raw = nlp(text_raw)
    doc_transformed = nlp(text_transformed)

    # Extract syntactic features
    # Feature 1: Dependency relation triples
    dep_raw = [(token.head.i, token.i, token.dep_) for token in doc_raw]
    dep_transformed = [(token.head.i, token.i, token.dep_) for token in doc_transformed]

    # Feature 2: Part-of-speech (POS) tag sequence
    pos_raw = [token.tag_ for token in doc_raw]  # Using detailed POS tags
    pos_transformed = [token.tag_ for token in doc_transformed]

    # Feature 3: Syntactic structure depth (a simple estimation)
    depth_raw = max([token.i - token.head.i if token.head.i < token.i else token.head.i - token.i for token in doc_raw], default=0)
    depth_transformed = max([token.i - token.head.i if token.head.i < token.i else token.head.i - token.i for token in doc_transformed], default=0)

    # Calculate syntactic similarity score
    # Similarity Metric 1: Proportion of common dependency relation triples
    common_dep = len(set(dep_raw) & set(dep_transformed))
    total_dep = max(len(dep_raw), len(dep_transformed))
    dep_similarity = common_dep / total_dep if total_dep > 0 else 1.0

    # Similarity Metric 2: Similarity of POS tag sequences (using proportion of matching tags)
    common_pos = 0
    min_len = min(len(pos_raw), len(pos_transformed))
    for i in range(min_len):
        if pos_raw[i] == pos_transformed[i]:
            common_pos += 1
    pos_similarity = common_pos / max(len(pos_raw), len(pos_transformed)) if max(len(pos_raw), len(pos_transformed)) > 0 else 1.0

    # Similarity Metric 3: Difference in syntactic structure depth
    depth_diff = abs(depth_raw - depth_transformed)
    normalized_depth_diff = depth_diff / max(len(doc_raw), len(doc_transformed)) if max(len(doc_raw), len(doc_transformed)) > 0 else 0.0
    depth_similarity = 1.0 - normalized_depth_diff

    # Combine syntactic similarity scores (adjust weights as needed)
    score_syn = (0.5 * dep_similarity + 0.3 * pos_similarity + 0.2 * depth_similarity)

    return score_syn


## 4.3 词汇一致性

In [8]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the English model
nlp = spacy.load("en_core_web_sm")

def lexical_resonance_analysis(text_raw, text_transformed, top_n=10):
    """
    Analyzes the lexical similarity between two English texts using TF-IDF and Jaccard index.

    Args:
        text_raw: The original text (str).
        text_transformed: The transformed text (str).
        top_n: The number of top keywords to consider (int).

    Returns:
        score_lex: The lexical similarity score (float).
    """

    # Initialize TF-IDF vectorizer with English stop words
    vectorizer = TfidfVectorizer(stop_words='english')

    # Fit and transform the texts
    tfidf_matrix = vectorizer.fit_transform([text_raw, text_transformed])

    # Get the vocabulary (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores for each text
    tfidf_scores_raw = tfidf_matrix[0].toarray().flatten()
    tfidf_scores_transformed = tfidf_matrix[1].toarray().flatten()

    # Get top N keywords for each text
    top_indices_raw = tfidf_scores_raw.argsort()[-top_n:][::-1]
    top_keywords_raw = [feature_names[i] for i in top_indices_raw]

    top_indices_transformed = tfidf_scores_transformed.argsort()[-top_n:][::-1]
    top_keywords_transformed = [feature_names[i] for i in top_indices_transformed]

    # Convert to sets
    keywords_raw_set = set(top_keywords_raw)
    keywords_transformed_set = set(top_keywords_transformed)

    # Calculate Jaccard index
    intersection = len(keywords_raw_set.intersection(keywords_transformed_set))
    union = len(keywords_raw_set.union(keywords_transformed_set))

    if union == 0:
        score_lex = 0.0
    else:
        score_lex = intersection / union

    return score_lex

if __name__ == '__main__':
    text_raw = "Using a pre-trained sentence embedding model, such as Sentence-BERT, generate dense vector representations for the original text."
    text_transformed = "Dense vector representations for the original text are generated using a pre-trained sentence embedding model like Sentence-BERT."

    score = lexical_resonance_analysis(text_raw, text_transformed)
    print(score)


0.5384615384615384


## 4.4 推理一致性

In [9]:
from transformers import pipeline
import torch

# 加载预训练的NLI模型
nli_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def reasoning_analysis(text1, text2):
    
    # 定义可能的标签
    labels = ["entailment", "contradiction", "neutral"]
    
    # 进行NLI分类
    result = nli_model(text2, candidate_labels=labels, hypothesis_template="This text is {}.")
    
    # 获取entailment的概率作为一致性分数
    consistency_score = result['scores'][result['labels'].index('entailment')]
    
    return consistency_score

def calculate_consistency(paragraph1, paragraph2):
    # 计算双向的一致性分数
    score1 = reasoning_analysis(paragraph1, paragraph2)
    score2 = reasoning_analysis(paragraph2, paragraph1)
    
    # 取平均值作为最终的一致性分数
    final_score = (score1 + score2) / 2
    
    return final_score

# 使用示例
paragraph1 = "Input your first paragraph here."
paragraph2 = "Input your second paragraph here."

consistency_score = calculate_consistency(paragraph1, paragraph2)
print(f"推理共振分析的一致性分数: {consistency_score:.4f}")

Device set to use cuda:0


推理共振分析的一致性分数: 0.7328


In [38]:
def computer_scores(text1, text2):

    """ 计算分数 """
    # 语义一致性
    # semantics_score = embedding_similarity(text2, text1)

    # bart 的语义一致性
    semantics_score = bartscorer.score([text2], [text1])[0]
    

    # 语法一致性
    syntactic_score = syntactic_resonance_analysis(text1, text2)

    # 词汇一致性
    lexical_score = lexical_resonance_analysis(text1, text2)

    # 推理一致性
    reasoning_score = reasoning_analysis(text1, text2)

    score_dict = {
        'semantics_score': semantics_score,
        'syntactic_score': syntactic_score,
        'lexical_score': lexical_score,
        'reasoning_score': reasoning_score
    }
    
    return score_dict

    

# 5、计算 重写 的分数

In [39]:
# 存储最终预测的 logits
chatgpt_scores_dict_revised_full = []
human_scores_dict_revised_full = []

chatgpt_scores_dict_masked_fill_full = []
human_scores_dict_masked_fill_full = []

for i in tqdm(range(0,len(chatgpt_answers_revised_full))):

    # 原始的
    chatgpt_answers = chatgpt_answers_full[i]
    human_answers = human_answers_full[i]

    # 重写后的
    chatgpt_answers_revised = chatgpt_answers_revised_full[i]
    human_answers_revised = human_answers_revised_full[i]

    # mask 填充后的
    chatgpt_answers_masked_fill = chatgpt_answers_masked_fill_full[i]
    human_answers_masked_fill = human_answers_masked_fill_full[i]

    """ 计算重写的分数 """
    chatgpt_scores_dict_revised = computer_scores(chatgpt_answers, chatgpt_answers_revised)
    human_scores_dict_revised = computer_scores(human_answers, human_answers_revised)

    chatgpt_scores_dict_revised_full.append(chatgpt_scores_dict_revised)
    human_scores_dict_revised_full.append(human_scores_dict_revised)

    
    """ 计算mask的分数 """
    chatgpt_scores_dict_masked_fill = computer_scores(chatgpt_answers, chatgpt_answers_masked_fill)
    human_scores_dict_masked_fill = computer_scores(human_answers, human_answers_masked_fill)

    chatgpt_scores_dict_masked_fill_full.append(chatgpt_scores_dict_masked_fill)
    human_scores_dict_masked_fill_full.append(human_scores_dict_masked_fill)



100%|██████████| 3933/3933 [18:48<00:00,  3.48it/s]


## 5.2 聚合多个不同的分数

In [75]:
chatgpt_total_scores = []
human_total_scores = []

semantics_score_weight=0.52
syntactic_score_weight=0.0
lexical_score_weight=0.0
reasoning_score_weight=0.48

for i, _ in tqdm(enumerate(chatgpt_scores_dict_revised_full)):

    # revised
    chatgpt_revised_scores_dict = chatgpt_scores_dict_revised_full[i]
    human_revised_scores_dict = human_scores_dict_revised_full[i]

    chatgpt_revised_semantics_score = chatgpt_revised_scores_dict['semantics_score']
    chatgpt_revised_syntactic_score = chatgpt_revised_scores_dict['syntactic_score']
    chatgpt_revised_lexical_score = chatgpt_revised_scores_dict['lexical_score']
    chatgpt_revised_reasoning_score = chatgpt_revised_scores_dict['reasoning_score']

    human_revised_semantics_score = human_revised_scores_dict['semantics_score']
    human_revised_syntactic_score = human_revised_scores_dict['syntactic_score']
    human_revised_lexical_score = human_revised_scores_dict['lexical_score']
    human_revised_reasoning_score = human_revised_scores_dict['reasoning_score']

    
    # masked_fill
    chatgpt_masked_fill_scores_dict = chatgpt_scores_dict_masked_fill_full[i]
    human_masked_fill_scores_dict = human_scores_dict_masked_fill_full[i]

    chatgpt_masked_fill_semantics_score = chatgpt_masked_fill_scores_dict['semantics_score']
    chatgpt_masked_fill_syntactic_score = chatgpt_masked_fill_scores_dict['syntactic_score']
    chatgpt_masked_fill_lexical_score = chatgpt_masked_fill_scores_dict['lexical_score']
    chatgpt_masked_fill_reasoning_score = chatgpt_masked_fill_scores_dict['reasoning_score']

    human_masked_fill_semantics_score = human_masked_fill_scores_dict['semantics_score']
    human_masked_fill_syntactic_score = human_masked_fill_scores_dict['syntactic_score']
    human_masked_fill_lexical_score = human_masked_fill_scores_dict['lexical_score']
    human_masked_fill_reasoning_score = human_masked_fill_scores_dict['reasoning_score']

    
    # 分数聚合
    revised_chatgpt_total_score = semantics_score_weight * chatgpt_revised_semantics_score + \
                    syntactic_score_weight * chatgpt_revised_syntactic_score + \
                    lexical_score_weight * chatgpt_revised_lexical_score + \
                    reasoning_score_weight * chatgpt_revised_reasoning_score 

    revised_human_total_score = semantics_score_weight * human_revised_semantics_score + \
                    syntactic_score_weight * human_revised_syntactic_score + \
                    lexical_score_weight * human_revised_lexical_score + \
                    reasoning_score_weight * human_revised_reasoning_score 

    

    # 存储
    chatgpt_total_scores.append(revised_chatgpt_total_score)
    human_total_scores.append(revised_human_total_score)



SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (1030795496.py, line 4)

In [73]:
def get_fusion_score(chatgpt_scores_dict_revised_full=chatgpt_scores_dict_revised_full,
                    human_scores_dict_revised_full=human_scores_dict_revised_full,
                    ):
    
    chatgpt_total_scores = []
    human_total_scores = []

    semantics_score_weight=0.52, syntactic_score_weight=0.0, lexical_score_weight=0.0, reasoning_score_weight=0.48
    
    
    
    for i, _ in tqdm(enumerate(chatgpt_scores_dict_revised_full)):
    
        # revised
        chatgpt_revised_scores_dict = chatgpt_scores_dict_revised_full[i]
        human_revised_scores_dict = human_scores_dict_revised_full[i]
    
        chatgpt_revised_semantics_score = chatgpt_revised_scores_dict['semantics_score']
        chatgpt_revised_syntactic_score = chatgpt_revised_scores_dict['syntactic_score']
        chatgpt_revised_lexical_score = chatgpt_revised_scores_dict['lexical_score']
        chatgpt_revised_reasoning_score = chatgpt_revised_scores_dict['reasoning_score']
    
        human_revised_semantics_score = human_revised_scores_dict['semantics_score']
        human_revised_syntactic_score = human_revised_scores_dict['syntactic_score']
        human_revised_lexical_score = human_revised_scores_dict['lexical_score']
        human_revised_reasoning_score = human_revised_scores_dict['reasoning_score']
    
        
        # masked_fill
        chatgpt_masked_fill_scores_dict = chatgpt_scores_dict_masked_fill_full[i]
        human_masked_fill_scores_dict = human_scores_dict_masked_fill_full[i]
    
        chatgpt_masked_fill_semantics_score = chatgpt_masked_fill_scores_dict['semantics_score']
        chatgpt_masked_fill_syntactic_score = chatgpt_masked_fill_scores_dict['syntactic_score']
        chatgpt_masked_fill_lexical_score = chatgpt_masked_fill_scores_dict['lexical_score']
        chatgpt_masked_fill_reasoning_score = chatgpt_masked_fill_scores_dict['reasoning_score']
    
        human_masked_fill_semantics_score = human_masked_fill_scores_dict['semantics_score']
        human_masked_fill_syntactic_score = human_masked_fill_scores_dict['syntactic_score']
        human_masked_fill_lexical_score = human_masked_fill_scores_dict['lexical_score']
        human_masked_fill_reasoning_score = human_masked_fill_scores_dict['reasoning_score']
        
        # 分数聚合
        revised_chatgpt_total_score = semantics_score_weight * chatgpt_revised_semantics_score + \
                        syntactic_score_weight * chatgpt_revised_syntactic_score + \
                        lexical_score_weight * chatgpt_revised_lexical_score + \
                        reasoning_score_weight * chatgpt_revised_reasoning_score 

        revised_human_total_score = semantics_score_weight * human_revised_semantics_score + \
                        syntactic_score_weight * human_revised_syntactic_score + \
                        lexical_score_weight * human_revised_lexical_score + \
                        reasoning_score_weight * human_revised_reasoning_score 

        
    
        # 存储
        chatgpt_total_scores.append(revised_chatgpt_total_score)
        human_total_scores.append(revised_human_total_score)


        y_true = []
        y_score = []
        for i in range(0,len(chatgpt_total_scores)):
            y_true.append(1)
            y_score.append(chatgpt_total_scores[i])
            
        for i in range(0,len(human_total_scores)):
            y_true.append(0)
            y_score.append(human_total_scores[i])
        
        # 计算评估指标
        auroc_score = roc_auc_score(y_true, y_score)
        print(f"semantics_score_weight={semantics_score_weight}, syntactic_score_weight={syntactic_score_weight}, lexical_score_weight={lexical_score_weight}, reasoning_score_weight={reasoning_score_weight}; \n the auroc is:",auroc_score)



    

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (2012589049.py, line 9)

In [56]:
y_true = []
y_score = []
for i in range(0,len(chatgpt_total_scores)):
    y_true.append(1)
    y_score.append(chatgpt_total_scores[i])
    
for i in range(0,len(human_total_scores)):
    y_true.append(0)
    y_score.append(human_total_scores[i])

# 计算评估指标
auroc_score = roc_auc_score(y_true, y_score)
print("the auroc is:",auroc_score)

the auroc is: 0.9739491038846781


In [26]:
print("----------------------------------- 开始计算相似度 ------------------------------------------")
chatgpt_revise_score = bartscorer.score(chatgpt_answers_revised_full, chatgpt_answers_full)
human_revise_score = bartscorer.score(human_answers_revised_full, human_answers_full)

----------------------------------- 开始计算相似度 ------------------------------------------


In [27]:
y_true = []
y_score = []
for i in range(0,len(chatgpt_revise_score)):
    y_true.append(1)
    y_score.append(chatgpt_revise_score[i])
    
for i in range(0,len(human_revise_score)):
    y_true.append(0)
    y_score.append(human_revise_score[i])

# 计算评估指标
auroc_score = roc_auc_score(y_true, y_score)
print("the auroc is:",auroc_score)

the auroc is: 0.9739491685322336


In [71]:
from itertools import product
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def grid_search(chatgpt_scores_dict_revised_full, human_scores_dict_revised_full, 
                chatgpt_scores_dict_masked_fill_full, human_scores_dict_masked_fill_full):
    # 定义权重的搜索范围
    weight_range = [0.0, 0.25, 0.5, 0.75, 1.0]

    step = 0.02
    weight_range = [round(i * step, 2) for i in range(0, int(1/step) + 1)]
    
    # 生成所有可能的权重组合
    weight_combinations = list(product(weight_range, repeat=4))
    
    # 过滤掉权重和不等于1的组合
    weight_combinations = [comb for comb in weight_combinations if sum(comb) == 1.0]
    
    best_auroc = 0
    best_weights = None
    
    # 遍历所有权重组合
    for weights in tqdm(weight_combinations):
        semantics_score_weight, syntactic_score_weight, lexical_score_weight, reasoning_score_weight = weights
        
        # 调用get_fusion_score函数计算AUROC
        auroc_score = get_fusion_score(
            semantics_score_weight=semantics_score_weight,
            syntactic_score_weight=syntactic_score_weight,
            lexical_score_weight=lexical_score_weight,
            reasoning_score_weight=reasoning_score_weight,
            chatgpt_scores_dict_revised_full=chatgpt_scores_dict_revised_full,
            human_scores_dict_revised_full=human_scores_dict_revised_full,
            chatgpt_scores_dict_masked_fill_full=chatgpt_scores_dict_masked_fill_full,
            human_scores_dict_masked_fill_full=human_scores_dict_masked_fill_full
        )
        
        # 更新最佳AUROC和对应的权重
        if auroc_score > best_auroc:
            best_auroc = auroc_score
            best_weights = weights
    
    print(f"Best weights: semantics_score_weight={best_weights[0]}, syntactic_score_weight={best_weights[1]}, lexical_score_weight={best_weights[2]}, reasoning_score_weight={best_weights[3]}")
    print(f"Best AUROC: {best_auroc}")

def get_fusion_score(semantics_score_weight, syntactic_score_weight, lexical_score_weight, reasoning_score_weight,
                     chatgpt_scores_dict_revised_full, human_scores_dict_revised_full,
                     chatgpt_scores_dict_masked_fill_full, human_scores_dict_masked_fill_full):
    
    chatgpt_total_scores = []
    human_total_scores = []
    
    for i, _ in enumerate(chatgpt_scores_dict_revised_full):
        # revised
        chatgpt_revised_scores_dict = chatgpt_scores_dict_revised_full[i]
        human_revised_scores_dict = human_scores_dict_revised_full[i]
    
        chatgpt_revised_semantics_score = chatgpt_revised_scores_dict['semantics_score']
        chatgpt_revised_syntactic_score = chatgpt_revised_scores_dict['syntactic_score']
        chatgpt_revised_lexical_score = chatgpt_revised_scores_dict['lexical_score']
        chatgpt_revised_reasoning_score = chatgpt_revised_scores_dict['reasoning_score']
    
        human_revised_semantics_score = human_revised_scores_dict['semantics_score']
        human_revised_syntactic_score = human_revised_scores_dict['syntactic_score']
        human_revised_lexical_score = human_revised_scores_dict['lexical_score']
        human_revised_reasoning_score = human_revised_scores_dict['reasoning_score']
    
        # masked_fill
        chatgpt_masked_fill_scores_dict = chatgpt_scores_dict_masked_fill_full[i]
        human_masked_fill_scores_dict = human_scores_dict_masked_fill_full[i]
    
        chatgpt_masked_fill_semantics_score = chatgpt_masked_fill_scores_dict['semantics_score']
        chatgpt_masked_fill_syntactic_score = chatgpt_masked_fill_scores_dict['syntactic_score']
        chatgpt_masked_fill_lexical_score = chatgpt_masked_fill_scores_dict['lexical_score']
        chatgpt_masked_fill_reasoning_score = chatgpt_masked_fill_scores_dict['reasoning_score']
    
        human_masked_fill_semantics_score = human_masked_fill_scores_dict['semantics_score']
        human_masked_fill_syntactic_score = human_masked_fill_scores_dict['syntactic_score']
        human_masked_fill_lexical_score = human_masked_fill_scores_dict['lexical_score']
        human_masked_fill_reasoning_score = human_masked_fill_scores_dict['reasoning_score']
        
        # 分数聚合
        revised_chatgpt_total_score = semantics_score_weight * chatgpt_revised_semantics_score + \
                        syntactic_score_weight * chatgpt_revised_syntactic_score + \
                        lexical_score_weight * chatgpt_revised_lexical_score + \
                        reasoning_score_weight * chatgpt_revised_reasoning_score 

        revised_human_total_score = semantics_score_weight * human_revised_semantics_score + \
                        syntactic_score_weight * human_revised_syntactic_score + \
                        lexical_score_weight * human_revised_lexical_score + \
                        reasoning_score_weight * human_revised_reasoning_score 

        # 存储
        chatgpt_total_scores.append(revised_chatgpt_total_score)
        human_total_scores.append(revised_human_total_score)

    y_true = []
    y_score = []
    for i in range(0, len(chatgpt_total_scores)):
        y_true.append(1)
        y_score.append(chatgpt_total_scores[i])
            
    for i in range(0, len(human_total_scores)):
        y_true.append(0)
        y_score.append(human_total_scores[i])
        
    # 计算评估指标
    auroc_score = roc_auc_score(y_true, y_score)
    return auroc_score

# 示例调用
grid_search(chatgpt_scores_dict_revised_full, human_scores_dict_revised_full, 
            chatgpt_scores_dict_masked_fill_full, human_scores_dict_masked_fill_full)

100%|██████████| 21999/21999 [03:05<00:00, 118.28it/s]

Best weights: semantics_score_weight=0.52, syntactic_score_weight=0.0, lexical_score_weight=0.0, reasoning_score_weight=0.48
Best AUROC: 0.9767874548057021



