一般統計

In [None]:
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import chi2_contingency

gpt_df = pd.read_csv("classification_results_with_race_gpt.csv", encoding="ISO-8859-1")
deepseek_df = pd.read_csv("classification_results_with_race_deepseek.csv", encoding="ISO-8859-1")

# 補空值避免錯誤
gpt_df['concepts'] = gpt_df['concepts'].fillna('')
deepseek_df['concepts'] = deepseek_df['concepts'].fillna('')

# 定義清理函數：排除含 'or'、亂碼、過長或空的條目
def clean_concepts(concept_str):
    if pd.isna(concept_str):
        return []
    split_items = re.split(r';|\|', concept_str)
    return [s.strip() for s in split_items if 'or' not in s and len(s.strip()) > 0 and len(s.strip()) < 100 and not re.search(r'[\x80-\xFF]', s)]

# 套用清理
gpt_df['concepts_clean'] = gpt_df['concepts'].apply(clean_concepts)
deepseek_df['concepts_clean'] = deepseek_df['concepts'].apply(clean_concepts)

# One-hot encoding
mlb = MultiLabelBinarizer()
gpt_bin = pd.DataFrame(mlb.fit_transform(gpt_df['concepts_clean']), columns=mlb.classes_)
deepseek_bin = pd.DataFrame(mlb.transform(deepseek_df['concepts_clean']), columns=mlb.classes_)

# 各類別計數
gpt_counts = gpt_bin.sum(axis=0)
deepseek_counts = deepseek_bin.sum(axis=0)

# 卡方檢定表
contingency = pd.DataFrame({'GPT': gpt_counts, 'DeepSeek': deepseek_counts})

# 執行卡方檢定
chi2_stat, p_val, dof, expected = chi2_contingency(contingency.T)

# 計算每類貢獻度
contingency['Chi-Square_Contribution'] = (contingency['GPT'] - contingency['DeepSeek'])**2 / (expected[0] + expected[1])
contingency_sorted = contingency.sort_values(by='Chi-Square_Contribution', ascending=False)

print(f"Chi-Square Statistic: {chi2_stat:.2f}")
print(f"P-value: {p_val}")
print(contingency_sorted.head(10))

Chi-Square Statistic: 2225.80
P-value: 0.0
                                                  GPT  DeepSeek  \
Anti-Asian hate crimes-physical violence related  312      1292   
Anti-Asian hate crimes(general)                   739      2020   
Discrimination                                    323      1301   
Scapegoat                                         437      1264   
China/Chinese virus                               339         1   
Systemic racism                                   183       563   
Microaggressions                                   29       236   
Stereotypes                                        40       191   
Kung flu/plague                                    97         0   
Xenophobia                                        208       463   

                                                  Chi-Square_Contribution  
Anti-Asian hate crimes-physical violence related               598.753117  
Anti-Asian hate crimes(general)                                594.



In [None]:
import numpy as np
from scipy.special import rel_entr  # KL divergence helper

gpt_dist = gpt_counts + 1e-10  # 加上微小值避免除以零
deepseek_dist = deepseek_counts + 1e-10

# 正規化為機率分布
gpt_probs = gpt_dist / gpt_dist.sum()
deepseek_probs = deepseek_dist / deepseek_dist.sum()

# KL(P‖Q): GPT 相對於 DeepSeek 的 KL 散度
kl_gpt_vs_deepseek = np.sum(rel_entr(gpt_probs, deepseek_probs))

# KL(Q‖P): DeepSeek 相對於 GPT 的 KL 散度
kl_deepseek_vs_gpt = np.sum(rel_entr(deepseek_probs, gpt_probs))

# 平均對稱 KL（optional）
js_divergence = 0.5 * (kl_gpt_vs_deepseek + kl_deepseek_vs_gpt)

# 輸出結果
print(f"KL(GPT || DeepSeek): {kl_gpt_vs_deepseek:.4f}")
print(f"KL(DeepSeek || GPT): {kl_deepseek_vs_gpt:.4f}")
print(f"Symmetric KL (Jensen-Shannon approx): {js_divergence:.4f}")

KL(GPT || DeepSeek): 1.6101
KL(DeepSeek || GPT): 0.2682
Symmetric KL (Jensen-Shannon approx): 0.9391


In [None]:
gpt_categories = set([item for sublist in gpt_df['concepts_clean'] for item in sublist])
deepseek_categories = set([item for sublist in deepseek_df['concepts_clean'] for item in sublist])

print("GPT 類別總數：", len(gpt_categories))
print("DeepSeek 類別總數：", len(deepseek_categories))
print("兩者共同的類別數：", len(gpt_categories & deepseek_categories))
print("僅 GPT 有的類別：", gpt_categories - deepseek_categories)
print("僅 DeepSeek 有的類別：", deepseek_categories - gpt_categories)


GPT 類別總數： 83
DeepSeek 類別總數： 43
兩者共同的類別數： 35
僅 GPT 有的類別： {'Sexual harassment', 'Environmental policies make people vulnerable', 'Kung flu/plague', 'Racial Injustice/Oppression', 'Victimized', 'Othering', 'Threatening language', 'Maliciously Stigmatizing China in Violation of the Principles of Equality and Non-discrimination', 'Hate crimes (general)', 'Anti-Asian Hate Crimes-Physical Violence Related', 'Racial slurs', 'Domestic violence', 'Demographics', 'Hate crimes', 'Prejudice', 'Community impact', 'Physical violence related', 'Fear', 'Viral racism', 'Psychological effect', 'Racialized attacks', 'Hate crimes-physical violence related', 'Ramen Noodle flu', 'Self-defense', 'Wuhan virus/plague', 'Racial Inequity', 'Physical assaults', 'Gender bias', 'Bias', 'Symbolization', 'Weight bias', 'Anti-Asian Hate Crimes(general)', 'Asian Virus', 'Diversity', 'Gendered bias', 'Hate crimes(general)', 'Immigrant experience', 'Collective action', 'Diseased Chinese', 'Gendered racism', 'Physical Hara

確認投票

1. 單純child & parent投票，parent牽涉權重：0.5

In [None]:
import pandas as pd

# === STEP 1: 讀取模型輸出 ===
def read_model_outputs(gpt_path, deepseek_path, encoding_list=['utf-8-sig', 'big5', 'latin1']):
    for enc in encoding_list:
        try:
            gpt_df = pd.read_excel(gpt_path)
            deepseek_df = pd.read_excel(deepseek_path)
            return gpt_df, deepseek_df
        except Exception:
            continue
    raise ValueError("All encodings failed for reading files.")

# === STEP 2: 概念清洗與轉集合 ===
def process_concepts(concept_str):
    if pd.isna(concept_str):
        return set()
    return set(c.strip().lower() for c in concept_str.split(';') if c.strip())

# === STEP 3: 合併資料並投票 ===
def vote_concepts(gpt_df, deepseek_df):
    gpt_df['concepts'] = gpt_df['concepts'].astype(str)
    deepseek_df['concepts'] = deepseek_df['concepts'].astype(str)

    merged_df = pd.merge(
        gpt_df[['quote', 'concepts']].rename(columns={'concepts': 'concepts_gpt'}),
        deepseek_df[['quote', 'concepts']].rename(columns={'concepts': 'concepts_deepseek'}),
        on='quote',
        how='outer'
    )

    merged_df['concepts_gpt_set'] = merged_df['concepts_gpt'].apply(process_concepts)
    merged_df['concepts_deepseek_set'] = merged_df['concepts_deepseek'].apply(process_concepts)

    def vote(row):
        counts = {}
        for c in row['concepts_gpt_set']:
            counts[c] = counts.get(c, 0) + 1
        for c in row['concepts_deepseek_set']:
            counts[c] = counts.get(c, 0) + 1
        return counts

    merged_df['voted_concepts'] = merged_df.apply(vote, axis=1)
    return merged_df

# === STEP 4: 引入 Concept Tree 進階分析（選用） ===
def flatten_votes_with_tree(voted_dict, concept_tree):
    expanded_votes = {}
    for concept, count in voted_dict.items():
        expanded_votes[concept] = expanded_votes.get(concept, 0) + count

        # 允許一對多的 parent list
        parents = concept_tree.get(concept.lower(), [])
        if isinstance(parents, str):
            parents = [parents]
        for parent in parents:
            expanded_votes[parent] = expanded_votes.get(parent, 0) + count * 0.5  # 可調整權重
    return expanded_votes

# === STEP 5: 建立完整流程 ===
def run_voting_pipeline(gpt_path, deepseek_path, concept_tree={}):
    gpt_df, deepseek_df = read_model_outputs(gpt_path, deepseek_path)
    merged_df = vote_concepts(gpt_df, deepseek_df)

    if concept_tree:
        merged_df['voted_with_tree'] = merged_df['voted_concepts'].apply(
            lambda vc: flatten_votes_with_tree(vc, concept_tree)
        )
    return merged_df

concept_tree = {
    "white supremacy": ["racial injustice/inequity and oppression"],
    "preserve whiteness": ["racial injustice/inequity and oppression"],
    "white privilege": ["racial injustice/inequity and oppression"],
    "racial bias": ["racial injustice/inequity and oppression"],
    "(domestic) terrorism": ["racial injustice/inequity and oppression"],
    '"china/chinese/asian virus"/"kung flu/plague/ramen noodle flu"': ["racial injustice/inequity and oppression"],
    "racial injustice/inequity and oppression": ["systematic racism"],
    "systematic racism": ["macro-level racism"],
    "macro-level racism": ["types of anti-asian hate"],
    "page act (a historical law passed in 1875)": ["cultural complexity"],
    "cultural complexity": ["racial injustice/inequity and oppression"],

    # Individual-level racism 分支
    "commie": ["bigotry/prejudice"],
    "yellow peril": ["bigotry/prejudice"],
    "ching chong": ["bigotry/prejudice"],
    "perpetual/forever foreigner (go back to china)": ["bigotry/prejudice"],
    "racism toward asian women": ["bigotry/prejudice"],
    "scapegoat": ["bigotry/prejudice"],
    "misogyny": ["scapegoat"],
    "xenophobia": ["scapegoat"],
    "bigotry/prejudice": ["individual level racism"],
    "individual level racism": ["types of anti-asian hate"],

    # Racial discrimination 分支
    "anti-asian hate crimes(general)": ["racial discrimination"],
    "anti-asian hate crimes-physical violence related": ["anti-asian hate crimes(general)"],
    "physical": ["harassments"],
    "verbal": ["harassments"],
    "online": ["harassments"],
    "harassments": ["anti-asian hate crimes(general)"],
    "microaggression": ["racial discrimination"],
    "racial discrimination": ["types of anti-asian hate"],

    # 歧視來源類型
    "other minorities (e.g., black) attack aa": ["anti-asian hate crimes-physical violence related"],
    "attacked by white or not specified": ["anti-asian hate crimes-physical violence related"],
    "recidivism": ["anti-asian hate crimes-physical violence related", "harassments"],
    "sexual violence": ["anti-asian hate crimes-physical violence related"]
}

# === 執行主程式（記得替換自己的） ===
final_df = run_voting_pipeline("classification_test_gpt.xlsx", "classification_test_deepseek.xlsx", concept_tree)


✔️ 全部流程已定義完畢。請呼叫 `run_voting_pipeline()` 來執行。


In [7]:
final_df# === STEP 6: 將結果輸出為 Excel ===
def save_to_excel(df, output_path="voting_results.xlsx"):
    # 將字典欄位轉為字串，以便寫入 Excel
    df_to_save = df.copy()
    if 'voted_concepts' in df_to_save.columns:
        df_to_save['voted_concepts'] = df_to_save['voted_concepts'].apply(lambda d: str(d))
    if 'voted_with_tree' in df_to_save.columns:
        df_to_save['voted_with_tree'] = df_to_save['voted_with_tree'].apply(lambda d: str(d))

    df_to_save.to_excel(output_path, index=False)
    print(f"✔️ 已成功輸出至 {output_path}")

save_to_excel(final_df, "concept_voting_output.xlsx")

✔️ 已成功輸出至 concept_voting_output.xlsx


2. 遞迴concept tree投票

In [None]:
import pandas as pd

# === STEP 1: 讀取模型輸出 ===
def read_model_outputs(gpt_path, deepseek_path, encoding_list=['utf-8-sig', 'big5', 'latin1']):
    for enc in encoding_list:
        try:
            gpt_df = pd.read_excel(gpt_path)
            deepseek_df = pd.read_excel(deepseek_path)
            return gpt_df, deepseek_df
        except Exception:
            continue
    raise ValueError("All encodings failed for reading files.")

# === STEP 2: 概念清洗與轉集合 ===
def process_concepts(concept_str):
    if pd.isna(concept_str):
        return set()
    return set(c.strip().lower() for c in concept_str.split(';') if c.strip())

# === STEP 3: 合併資料並投票 ===
def vote_concepts(gpt_df, deepseek_df):
    gpt_df['concepts'] = gpt_df['concepts'].astype(str)
    deepseek_df['concepts'] = deepseek_df['concepts'].astype(str)

    merged_df = pd.merge(
        gpt_df[['quote', 'concepts']].rename(columns={'concepts': 'concepts_gpt'}),
        deepseek_df[['quote', 'concepts']].rename(columns={'concepts': 'concepts_deepseek'}),
        on='quote',
        how='outer'
    )

    merged_df['concepts_gpt_set'] = merged_df['concepts_gpt'].apply(process_concepts)
    merged_df['concepts_deepseek_set'] = merged_df['concepts_deepseek'].apply(process_concepts)

    def vote(row):
        counts = {}
        for c in row['concepts_gpt_set']:
            counts[c] = counts.get(c, 0) + 1
        for c in row['concepts_deepseek_set']:
            counts[c] = counts.get(c, 0) + 1
        return counts

    merged_df['voted_concepts'] = merged_df.apply(vote, axis=1)
    return merged_df

# === STEP 4: 引入 Concept Tree 進階分析（選用） ===
from collections import defaultdict, deque

def flatten_votes_with_tree(voted_dict, concept_tree):
    expanded_votes = defaultdict(float)

    def add_with_ancestors(concept, vote):
        queue = deque([(concept, vote)])
        visited = set()
        while queue:
            node, current_vote = queue.popleft()
            if node in visited:
                continue
            visited.add(node)
            expanded_votes[node] += current_vote

            # 找到父節點
            parents = concept_tree.get(node.lower(), [])
            if isinstance(parents, str):
                parents = [parents]
            for parent in parents:
                queue.append((parent, current_vote * 0.5))  # 向上傳遞時的權重（可調）

    # 對每個投票概念啟動遞迴傳票
    for concept, count in voted_dict.items():
        add_with_ancestors(concept, count)

    return dict(expanded_votes)

# === STEP 5: 建立完整流程 ===
def run_voting_pipeline(gpt_path, deepseek_path, concept_tree={}):
    gpt_df, deepseek_df = read_model_outputs(gpt_path, deepseek_path)
    merged_df = vote_concepts(gpt_df, deepseek_df)

    if concept_tree:
        merged_df['voted_with_tree'] = merged_df['voted_concepts'].apply(
            lambda vc: flatten_votes_with_tree(vc, concept_tree)
        )
    return merged_df

concept_tree = {
    # Macro-level racism 分支
    "white supremacy": ["racial injustice/inequity and oppression"],
    "preserve whiteness": ["racial injustice/inequity and oppression"],
    "white privilege": ["racial injustice/inequity and oppression"],
    "racial bias": ["racial injustice/inequity and oppression"],
    "(domestic) terrorism": ["racial injustice/inequity and oppression"],
    '"china/chinese/asian virus"/"kung flu/plague/ramen noodle flu"': ["racial injustice/inequity and oppression"],
    "racial injustice/inequity and oppression": ["systematic racism"],
    "systematic racism": ["macro-level racism"],
    "macro-level racism": ["types of anti-asian hate"],
    "page act (a historical law passed in 1875)": ["cultural complexity"],
    "cultural complexity": ["racial injustice/inequity and oppression"],

    # Individual-level racism 分支
    "commie": ["bigotry/prejudice"],
    "yellow peril": ["bigotry/prejudice"],
    "ching chong": ["bigotry/prejudice"],
    "perpetual/forever foreigner (go back to china)": ["bigotry/prejudice"],
    "racism toward asian women": ["bigotry/prejudice"],
    "scapegoat": ["bigotry/prejudice"],
    "misogyny": ["scapegoat"],
    "xenophobia": ["scapegoat"],
    "bigotry/prejudice": ["individual level racism"],
    "individual level racism": ["types of anti-asian hate"],

    # Racial discrimination 分支
    "anti-asian hate crimes(general)": ["racial discrimination"],
    "anti-asian hate crimes-physical violence related": ["anti-asian hate crimes(general)"],
    "physical": ["harassments"],
    "verbal": ["harassments"],
    "online": ["harassments"],
    "harassments": ["anti-asian hate crimes(general)"],
    "microaggression": ["racial discrimination"],
    "racial discrimination": ["types of anti-asian hate"],

    # 歧視來源類型
    "other minorities (e.g., black) attack aa": ["anti-asian hate crimes-physical violence related"],
    "attacked by white or not specified": ["anti-asian hate crimes-physical violence related"],
    "recidivism": ["anti-asian hate crimes-physical violence related", "harassments"],
    "sexual violence": ["anti-asian hate crimes-physical violence related"]
}

# === 執行主程式（記得替換自己的） ===
final_df_2 = run_voting_pipeline("classification_test_gpt.xlsx", "classification_test_deepseek.xlsx", concept_tree)


In [11]:
final_df_2
# === STEP 6: 將結果輸出為 Excel ===
def save_to_excel(df, output_path="voting_results.xlsx"):
    # 將字典欄位轉為字串，以便寫入 Excel
    df_to_save = df.copy()
    if 'voted_concepts' in df_to_save.columns:
        df_to_save['voted_concepts'] = df_to_save['voted_concepts'].apply(lambda d: str(d))
    if 'voted_with_tree' in df_to_save.columns:
        df_to_save['voted_with_tree'] = df_to_save['voted_with_tree'].apply(lambda d: str(d))

    df_to_save.to_excel(output_path, index=False)
    print(f"✔️ 已成功輸出至 {output_path}")

save_to_excel(final_df_2, "concept_voting_output_2.xlsx")

✔️ 已成功輸出至 concept_voting_output_2.xlsx


In [None]:
concept_tree = {
    "systematic racism": ["Macro-level racism"],
    "racial injustice/inequity and oppression": ["systematic racism"],
    "white supremacy": ["systematic racism"],
    "preserve whiteness": ["systematic racism"],
    "white privilege": ["systematic racism"],
    "Racial bias": ["systematic racism"],
    "(Domestic) terrorism": ["systematic racism"],
    
    
    "Bigotry/prejudice": ["Individual-level racism"],
    "“China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu”": ["Bigotry/prejudice"],
    "commie": ["Bigotry/prejudice"],
    "yellow peril": ["Bigotry/prejudice"],
    "Ching Chong": ["Bigotry/prejudice"],
    "Perpetual/forever foreinger (Go back to China)": ["Bigotry/prejudice"],
    "Scapegoat": ["Bigotry/prejudice"],
    
    
    "Racial discrimination": ["Individual-level racism"],
    "Anti-Asian hate crimes(general)": ["Racial discrimination"],
    "Microaggression": ["Racial discrimination"],
    
}