In [None]:
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import chi2_contingency

gpt_df = pd.read_csv("classification_results_with_race_gpt.csv", encoding="ISO-8859-1")
deepseek_df = pd.read_csv("classification_results_with_race_deepseek.csv", encoding="ISO-8859-1")

# 補空值避免錯誤
gpt_df['concepts'] = gpt_df['concepts'].fillna('')
deepseek_df['concepts'] = deepseek_df['concepts'].fillna('')

# 定義清理函數：排除含 'or'、亂碼、過長或空的條目
def clean_concepts(concept_str):
    if pd.isna(concept_str):
        return []
    split_items = re.split(r';|\|', concept_str)
    return [s.strip() for s in split_items if 'or' not in s and len(s.strip()) > 0 and len(s.strip()) < 100 and not re.search(r'[\x80-\xFF]', s)]

# 套用清理
gpt_df['concepts_clean'] = gpt_df['concepts'].apply(clean_concepts)
deepseek_df['concepts_clean'] = deepseek_df['concepts'].apply(clean_concepts)

# One-hot encoding
mlb = MultiLabelBinarizer()
gpt_bin = pd.DataFrame(mlb.fit_transform(gpt_df['concepts_clean']), columns=mlb.classes_)
deepseek_bin = pd.DataFrame(mlb.transform(deepseek_df['concepts_clean']), columns=mlb.classes_)

# 各類別計數
gpt_counts = gpt_bin.sum(axis=0)
deepseek_counts = deepseek_bin.sum(axis=0)

# 卡方檢定表
contingency = pd.DataFrame({'GPT': gpt_counts, 'DeepSeek': deepseek_counts})

# 執行卡方檢定
chi2_stat, p_val, dof, expected = chi2_contingency(contingency.T)

# 計算每類貢獻度
contingency['Chi-Square_Contribution'] = (contingency['GPT'] - contingency['DeepSeek'])**2 / (expected[0] + expected[1])
contingency_sorted = contingency.sort_values(by='Chi-Square_Contribution', ascending=False)

print(f"Chi-Square Statistic: {chi2_stat:.2f}")
print(f"P-value: {p_val}")
print(contingency_sorted.head(10))

Chi-Square Statistic: 2225.80
P-value: 0.0
                                                  GPT  DeepSeek  \
Anti-Asian hate crimes-physical violence related  312      1292   
Anti-Asian hate crimes(general)                   739      2020   
Discrimination                                    323      1301   
Scapegoat                                         437      1264   
China/Chinese virus                               339         1   
Systemic racism                                   183       563   
Microaggressions                                   29       236   
Stereotypes                                        40       191   
Kung flu/plague                                    97         0   
Xenophobia                                        208       463   

                                                  Chi-Square_Contribution  
Anti-Asian hate crimes-physical violence related               598.753117  
Anti-Asian hate crimes(general)                                594.



In [None]:
import numpy as np
from scipy.special import rel_entr  # KL divergence helper

gpt_dist = gpt_counts + 1e-10  # 加上微小值避免除以零
deepseek_dist = deepseek_counts + 1e-10

# 正規化為機率分布
gpt_probs = gpt_dist / gpt_dist.sum()
deepseek_probs = deepseek_dist / deepseek_dist.sum()

# KL(P‖Q): GPT 相對於 DeepSeek 的 KL 散度
kl_gpt_vs_deepseek = np.sum(rel_entr(gpt_probs, deepseek_probs))

# KL(Q‖P): DeepSeek 相對於 GPT 的 KL 散度
kl_deepseek_vs_gpt = np.sum(rel_entr(deepseek_probs, gpt_probs))

# 平均對稱 KL（optional）
js_divergence = 0.5 * (kl_gpt_vs_deepseek + kl_deepseek_vs_gpt)

# 輸出結果
print(f"KL(GPT || DeepSeek): {kl_gpt_vs_deepseek:.4f}")
print(f"KL(DeepSeek || GPT): {kl_deepseek_vs_gpt:.4f}")
print(f"Symmetric KL (Jensen-Shannon approx): {js_divergence:.4f}")

KL(GPT || DeepSeek): 1.6101
KL(DeepSeek || GPT): 0.2682
Symmetric KL (Jensen-Shannon approx): 0.9391


In [None]:
gpt_categories = set([item for sublist in gpt_df['concepts_clean'] for item in sublist])
deepseek_categories = set([item for sublist in deepseek_df['concepts_clean'] for item in sublist])

print("GPT 類別總數：", len(gpt_categories))
print("DeepSeek 類別總數：", len(deepseek_categories))
print("兩者共同的類別數：", len(gpt_categories & deepseek_categories))
print("僅 GPT 有的類別：", gpt_categories - deepseek_categories)
print("僅 DeepSeek 有的類別：", deepseek_categories - gpt_categories)


GPT 類別總數： 83
DeepSeek 類別總數： 43
兩者共同的類別數： 35
僅 GPT 有的類別： {'Sexual harassment', 'Environmental policies make people vulnerable', 'Kung flu/plague', 'Racial Injustice/Oppression', 'Victimized', 'Othering', 'Threatening language', 'Maliciously Stigmatizing China in Violation of the Principles of Equality and Non-discrimination', 'Hate crimes (general)', 'Anti-Asian Hate Crimes-Physical Violence Related', 'Racial slurs', 'Domestic violence', 'Demographics', 'Hate crimes', 'Prejudice', 'Community impact', 'Physical violence related', 'Fear', 'Viral racism', 'Psychological effect', 'Racialized attacks', 'Hate crimes-physical violence related', 'Ramen Noodle flu', 'Self-defense', 'Wuhan virus/plague', 'Racial Inequity', 'Physical assaults', 'Gender bias', 'Bias', 'Symbolization', 'Weight bias', 'Anti-Asian Hate Crimes(general)', 'Asian Virus', 'Diversity', 'Gendered bias', 'Hate crimes(general)', 'Immigrant experience', 'Collective action', 'Diseased Chinese', 'Gendered racism', 'Physical Hara