In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

# Cutoff date: rows dated before it are "Before", rows on or after it are "After"
split_date = "2021-03-16"


def get_before_after_counts(file_path):
    """Tabulate ``entity_type`` frequencies before and after ``split_date``.

    Args:
        file_path: Passed straight to ``pd.read_csv``; the data must provide
            ``date`` and ``entity_type`` columns.

    Returns:
        A DataFrame indexed by entity type with integer ``Before``, ``After``
        and ``Total`` columns, ordered by ``Total`` from largest to smallest.
    """
    records = pd.read_csv(file_path)
    records['date'] = pd.to_datetime(records['date'])

    # Placeholder labels that do not name a real entity type
    exclude_types = ['Cannot be inferred', 'unknown', 'not applicable']
    usable = records.loc[~records['entity_type'].isin(exclude_types)]

    # Count each period separately
    before_counts = usable.loc[usable['date'] < split_date, 'entity_type'].value_counts()
    after_counts = usable.loc[usable['date'] >= split_date, 'entity_type'].value_counts()

    # A type seen in only one period gets 0 for the other
    comparison = pd.DataFrame({'Before': before_counts, 'After': after_counts})
    comparison = comparison.fillna(0).astype(int)
    comparison['Total'] = comparison['Before'] + comparison['After']

    # Largest categories first
    return comparison.sort_values(by='Total', ascending=False)

def chi_square_with_residuals(comparison_df):
    """Run a chi-square test on a Before/After count table and compute residuals.

    Args:
        comparison_df: DataFrame with ``Before`` and ``After`` count columns,
            one row per category.

    Returns:
        ``(chi2, p, dof, residuals_df)``. The first three values come from
        ``chi2_contingency``; ``residuals_df`` has one row per category and
        holds ``(observed - expected) / sqrt(expected)`` in the
        ``Before_resid`` and ``After_resid`` columns.
    """
    # Rows are the two periods, columns are the categories
    observed = comparison_df[['Before', 'After']].T.values
    chi2, p, dof, expected = chi2_contingency(observed)

    deviation = observed - expected
    residuals = deviation / np.sqrt(expected)

    # One column per period, indexed like the input table
    residuals_df = pd.DataFrame(
        {'Before_resid': residuals[0], 'After_resid': residuals[1]},
        index=comparison_df.index,
    )
    return chi2, p, dof, residuals_df

# Build the before/after tables for both files first, then report on each
comparison_step3 = get_before_after_counts("step3_all_new.csv")
comparison_step4 = get_before_after_counts("step4_all_with_date.csv")

for header, tag, table in (
    ("=== Step3 統計結果 ===", "Step3", comparison_step3),
    ("\n=== Step4 統計結果 ===", "Step4", comparison_step4),
):
    print(header)
    print(table)
    # chi2 / p / dof / residuals_df keep the values of the last table processed
    chi2, p, dof, residuals_df = chi_square_with_residuals(table)
    print(f"\n[{tag}] Chi-square = {chi2:.2f}, df = {dof}, p-value = {p:.4f}")
    print(f"\n[{tag}] 標準化殘差：")
    print(residuals_df)


In [None]:
import pandas as pd

# === 1. Load the data ===
df = pd.read_csv("step3_all_new.csv")

# Print the column names as a quick schema check
print("👉 欄位名稱：", df.columns.tolist())

# === 2. Parse dates (values that cannot be parsed become NaT) ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# === 3. Split at the cutoff date ===
# Rows dated on the cutoff day count as "after"; NaT rows satisfy neither
# comparison and therefore drop out of both periods.
cutoff_date = pd.to_datetime("2021-03-16")

before_df = df[df['date'] < cutoff_date]
after_df = df[df['date'] >= cutoff_date]

# === 4. Drop placeholder reactions ===
# These are the labels to EXCLUDE. The trailing-period variant is included
# because the per-entity_type analysis of this same file filters it out too;
# without it that placeholder is counted as a real reaction.
invalid_reactions = ['Cannot be inferred', 'unknown', 'Cannot be inferred.']
# Old (misleading) name kept as an alias for any notebook code still reading it
valid_reactions = invalid_reactions
before_df_filtered = before_df[~before_df['reaction'].isin(invalid_reactions)]
after_df_filtered = after_df[~after_df['reaction'].isin(invalid_reactions)]

# === 5. Count how often each reaction occurs ===
before_counts = before_df_filtered['reaction'].value_counts()
after_counts = after_df_filtered['reaction'].value_counts()

# === 6. Comparison table: one row per reaction seen in either period ===
all_reactions = sorted(set(before_counts.index).union(set(after_counts.index)))
reaction_comparison = pd.DataFrame(index=all_reactions)
reaction_comparison['Before'] = before_counts
reaction_comparison['After'] = after_counts
# A reaction missing from one period gets a count of 0
reaction_comparison = reaction_comparison.fillna(0).astype(int)

# === 7. Totals and within-period percentages ===
reaction_comparison['Total'] = reaction_comparison['Before'] + reaction_comparison['After']

total_before = reaction_comparison['Before'].sum()
total_after = reaction_comparison['After'].sum()

# Fall back to 0 when a period has no rows, to avoid dividing by zero
reaction_comparison['Before(%)'] = (reaction_comparison['Before'] / total_before * 100).round(2) if total_before > 0 else 0
reaction_comparison['After(%)'] = (reaction_comparison['After'] / total_after * 100).round(2) if total_after > 0 else 0

# === 8. Most frequent reactions first ===
reaction_comparison = reaction_comparison.sort_values(by='Total', ascending=False)

# === 9. Display ===
pd.set_option("display.width", 200)  # wider than the pandas default
pd.set_option("display.max_columns", None)  # show every column
pd.set_option("display.max_colwidth", None)  # do not truncate cell text
print("📊 Reaction 數量與百分比比較：")
print(reaction_comparison)

# Optional: save the table as CSV
# reaction_comparison.to_csv("reaction_comparison.csv", encoding="utf-8-sig")

from scipy.stats import wilcoxon

# === 10. Wilcoxon signed-rank test on the paired percentages ===
before_vals = reaction_comparison["Before(%)"]
after_vals = reaction_comparison["After(%)"]

# Only run the test when at least one pair differs
if not (before_vals != after_vals).any():
    print("\n⚠️ Before% 和 After% 完全相同，無法進行 Wilcoxon 檢定")
else:
    stat, p = wilcoxon(after_vals, before_vals)
    print("\n📊 Wilcoxon 符號等級檢定結果")
    print("Statistic =", stat, "  p-value =", p)
    # Report the outcome at the 0.05 level
    verdict = (
        "➡️ 結論：事件前後整體反應分布有顯著差異"
        if p < 0.05
        else "➡️ 結論：事件前後整體反應分布沒有顯著差異"
    )
    print(verdict)


from scipy.stats import chi2_contingency
import numpy as np

# === 11. Chi-square test (period x reaction) ===
# Rows are the two periods, columns are the reactions
contingency_table = reaction_comparison[['Before', 'After']].T.values

# chi2_contingency raises ValueError for an empty table and for a table whose
# expected frequencies contain a zero, which happens when one whole period has
# no rows. Report that instead of letting the cell crash.
period_totals = contingency_table.sum(axis=1)
if contingency_table.size == 0 or (period_totals == 0).any():
    chi2 = p = dof = expected = None
    residuals_df = pd.DataFrame(columns=['Before_resid', 'After_resid'])
    print("\n⚠️ Before 或 After 沒有任何有效資料，無法進行卡方檢定")
else:
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    print("\n📊 卡方檢定結果")
    print(f"Chi-square = {chi2:.2f}, df = {dof}, p-value = {p:.4f}")
    if p < 0.05:
        print("➡️ 結論：事件前後反應分布有顯著差異")
    else:
        print("➡️ 結論：事件前後反應分布沒有顯著差異")

    # === 12. Residuals ===
    # (observed - expected) / sqrt(expected), i.e. Pearson residuals; no
    # further adjustment is applied.
    residuals = (contingency_table - expected) / np.sqrt(expected)
    residuals_df = pd.DataFrame(
        residuals.T,
        index=reaction_comparison.index,
        columns=['Before_resid', 'After_resid']
    )

    print("\n📊 各反應類別的標準化殘差：")
    print(residuals_df)

# Optional: save counts and residuals together as CSV
# result_with_resid = reaction_comparison.join(residuals_df)
# result_with_resid.to_csv("reaction_comparison_with_resid.csv", encoding="utf-8-sig")


In [None]:
import pandas as pd
from scipy.stats import wilcoxon, chi2_contingency
import numpy as np

# === 1. Load the data ===
df = pd.read_csv("step4_all_with_date.csv")

# Print the column names as a quick schema check
print("👉 欄位名稱：", df.columns.tolist())

# === 2. Parse dates; values that cannot be parsed become NaT ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# === 3. Split at the cutoff (the cutoff day itself belongs to "after") ===
cutoff_date = pd.to_datetime("2021-03-16")
before_df = df.loc[df['date'] < cutoff_date]
after_df = df.loc[df['date'] >= cutoff_date]

# === 4. Drop placeholder emotions ===
# Despite its name, this list holds the labels that are filtered OUT.
# NOTE(review): the match is exact (case-sensitive, whole cell), while other
# cells of this notebook split the emotion column on '|' — confirm that
# whole-cell counting is what is wanted here.
valid_emotions = ['cannot be inferred', 'unknown']
before_df_filtered = before_df.loc[~before_df['emotion'].isin(valid_emotions)]
after_df_filtered = after_df.loc[~after_df['emotion'].isin(valid_emotions)]

# === 5. Count how often each emotion occurs ===
before_counts = before_df_filtered['emotion'].value_counts()
after_counts = after_df_filtered['emotion'].value_counts()

# === 6. Comparison table: one row per emotion seen in either period ===
all_emotions = sorted(set(before_counts.index) | set(after_counts.index))
emotion_comparison = pd.DataFrame(
    {'Before': before_counts, 'After': after_counts},
    index=all_emotions,
)
# An emotion missing from one period gets a count of 0
emotion_comparison = emotion_comparison.fillna(0).astype(int)

# === 7. Totals and within-period percentages ===
emotion_comparison['Total'] = emotion_comparison['Before'] + emotion_comparison['After']

total_before = emotion_comparison['Before'].sum()
total_after = emotion_comparison['After'].sum()

# Fall back to 0 when a period has no rows, to avoid dividing by zero
if total_before > 0:
    emotion_comparison['Before(%)'] = (emotion_comparison['Before'] / total_before * 100).round(2)
else:
    emotion_comparison['Before(%)'] = 0
if total_after > 0:
    emotion_comparison['After(%)'] = (emotion_comparison['After'] / total_after * 100).round(2)
else:
    emotion_comparison['After(%)'] = 0

# === 8. Most frequent emotions first ===
emotion_comparison = emotion_comparison.sort_values(by='Total', ascending=False)

# === 9. Display ===
pd.set_option("display.width", 200)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

print("📊 Emotion 數量與百分比比較：")
print(emotion_comparison)

# === 10. Wilcoxon signed-rank test on the paired percentages ===
before_vals = emotion_comparison["Before(%)"]
after_vals = emotion_comparison["After(%)"]

# Only run the test when at least one pair differs
if not (before_vals != after_vals).any():
    print("\n⚠️ Before% 和 After% 完全相同，無法進行 Wilcoxon 檢定")
else:
    stat, p = wilcoxon(after_vals, before_vals)
    print("\n📊 Wilcoxon 符號等級檢定結果")
    print("Statistic =", stat, "  p-value =", p)
    # Report the outcome at the 0.05 level
    verdict = (
        "➡️ 結論：事件前後整體情緒分布有顯著差異"
        if p < 0.05
        else "➡️ 結論：事件前後整體情緒分布沒有顯著差異"
    )
    print(verdict)

# === 11. Chi-square test (period x emotion) ===
# Rows are the two periods, columns are the emotions
contingency_table = emotion_comparison[['Before', 'After']].T.values

# chi2_contingency raises ValueError for an empty table and for a table whose
# expected frequencies contain a zero, which happens when one whole period has
# no rows. Report that instead of letting the cell crash.
period_totals = contingency_table.sum(axis=1)
if contingency_table.size == 0 or (period_totals == 0).any():
    chi2 = p = dof = expected = None
    residuals_df = pd.DataFrame(
        columns=['Before_resid', 'After_resid', 'Before_sig', 'After_sig']
    )
    print("\n⚠️ Before 或 After 沒有任何有效資料，無法進行卡方檢定")
else:
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    print("\n📊 卡方檢定結果")
    print(f"Chi-square = {chi2:.2f}, df = {dof}, p-value = {p:.4f}")
    if p < 0.05:
        print("➡️ 結論：事件前後情緒分布有顯著差異")
    else:
        print("➡️ 結論：事件前後情緒分布沒有顯著差異")

    # === 12. Residuals ===
    # (observed - expected) / sqrt(expected), i.e. Pearson residuals; no
    # further adjustment is applied.
    residuals = (contingency_table - expected) / np.sqrt(expected)
    residuals_df = pd.DataFrame(
        residuals.T,
        index=emotion_comparison.index,
        columns=['Before_resid', 'After_resid']
    )

    # Flag cells whose residual exceeds 2 in absolute value
    residuals_df['Before_sig'] = residuals_df['Before_resid'].apply(lambda x: "*" if abs(x) > 2 else "")
    residuals_df['After_sig'] = residuals_df['After_resid'].apply(lambda x: "*" if abs(x) > 2 else "")

    print("\n📊 各情緒類別的標準化殘差：")
    print(residuals_df)

# === 13. Optional: save counts and residuals together as CSV ===
# result_with_resid = emotion_comparison.join(residuals_df)
# result_with_resid.to_csv("emotion_comparison_with_resid.csv", encoding="utf-8-sig")


In [None]:
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
import numpy as np

# === 1. Load the CSV ===
df = pd.read_csv("step3_all_new.csv")

# === 2. Parse dates and label each row with its period ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

# Rows whose date could not be parsed (NaT) match neither side and are dropped
before_df = df[df['date'] < cutoff_date].assign(period='before')
after_df = df[df['date'] >= cutoff_date].assign(period='after')

df = pd.concat([before_df, after_df], ignore_index=True)

# === 3. Remove placeholder values ===
# NOTE(review): get_before_after_counts also excludes 'not applicable' as an
# entity_type — confirm whether it should be excluded here as well.
invalid_reactions = ['Cannot be inferred', 'unknown', 'Cannot be inferred.']
invalid_entity_types = ['Cannot be inferred', 'unknown']

df = df[~(df['reaction'].isin(invalid_reactions) | df['entity_type'].isin(invalid_entity_types))]

# === 4. De-duplicate: keep one row per (article, entity, reaction) ===
df_unique = df.drop_duplicates(subset=['article_id', 'entity', 'reaction'], keep='first')

# === 5. Every entity_type present after de-duplication ===
entity_types = sorted(df_unique['entity_type'].dropna().unique())

all_rows = []
for entity in entity_types:
    of_type = df_unique[df_unique['entity_type'] == entity]

    # Reaction frequencies for this entity_type in each period
    be_counts = Counter(of_type.loc[of_type['period'] == 'before', 'reaction'].dropna().astype(str))
    af_counts = Counter(of_type.loc[of_type['period'] == 'after', 'reaction'].dropna().astype(str))

    total_be = sum(be_counts.values())
    total_af = sum(af_counts.values())

    # One row per reaction seen in either period
    for reaction in sorted(set(be_counts) | set(af_counts)):
        be_n = be_counts.get(reaction, 0)
        af_n = af_counts.get(reaction, 0)
        # Share within the period; 0 when the period has no rows
        before_share = round(be_n / total_be * 100, 2) if total_be > 0 else 0
        after_share = round(af_n / total_af * 100, 2) if total_af > 0 else 0
        all_rows.append({
            'entity_type': entity,
            'reaction': reaction,
            'Before': be_n,
            'After': af_n,
            'Total': be_n + af_n,
            'Before(%)': before_share,
            'After(%)': after_share,
        })

# === 6. Assemble the table and keep the 10 largest reactions per entity_type ===
comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_type', 'Total'], ascending=[True, False]
)
top10_df = comparison_df.groupby('entity_type').head(10).reset_index(drop=True)

# === 7. Wilcoxon signed-rank test per entity_type ===
print("\n📊 各 entity_type 的 Wilcoxon 檢定結果")
entity_results = []

for entity in entity_types:
    sub_df = comparison_df[comparison_df['entity_type'] == entity]
    before_vals = sub_df["Before(%)"]
    after_vals = sub_df["After(%)"]

    # Nothing to test when the group is empty or every pair is identical
    if len(sub_df) == 0 or not (before_vals != after_vals).any():
        entity_results.append({
            "entity_type": entity,
            "Statistic": None,
            "p-value": None,
            "Significant": "N/A",
            "Note": "Before% 和 After% 完全相同或為空"
        })
        continue

    try:
        stat, p = wilcoxon(after_vals, before_vals)
    except ValueError as e:
        # Record the failure reason instead of aborting the whole loop
        entity_results.append({
            "entity_type": entity,
            "Statistic": None,
            "p-value": None,
            "Significant": "N/A",
            "Note": str(e)
        })
    else:
        entity_results.append({
            "entity_type": entity,
            "Statistic": stat,
            "p-value": p,
            "Significant": "Yes" if p < 0.05 else "No"
        })

entity_results_df = pd.DataFrame(entity_results)
print(entity_results_df)

from scipy.stats import chi2_contingency

# === Chi-square test per entity_type: reaction x period ===
print("\n📊 各 entity_type 的卡方檢定結果")
chi_results = []
residuals_results = {}

for entity in entity_types:
    sub_df = comparison_df[comparison_df['entity_type'] == entity]

    # Contingency table: one row per reaction, columns Before / After
    contingency = sub_df[['Before', 'After']].to_numpy()

    # The test needs at least two reactions AND at least one observation in
    # each period: chi2_contingency raises ValueError when a period column
    # sums to zero, because its expected frequencies are then zero.
    testable = contingency.shape[0] > 1 and (contingency.sum(axis=0) > 0).all()
    if not testable:
        chi_results.append({
            "entity_type": entity,
            "Chi2": None,
            "df": None,
            "p-value": None,
            "Significant": "N/A"
        })
        continue

    chi2, p, dof, expected = chi2_contingency(contingency)
    chi_results.append({
        "entity_type": entity,
        "Chi2": round(chi2, 3),
        "df": dof,
        "p-value": round(p, 4),
        "Significant": "Yes" if p < 0.05 else "No"
    })

    # Keep residuals only for significant groups; they are printed after the
    # summary table so the test is run once per entity_type.
    if p < 0.05:
        # (observed - expected) / sqrt(expected), rounded for display
        residuals = (contingency - expected) / np.sqrt(expected)
        residuals_results[entity] = pd.DataFrame(
            residuals,
            index=sub_df['reaction'],
            columns=['Before_resid', 'After_resid']
        ).round(2)

chi_results_df = pd.DataFrame(chi_results)
print(chi_results_df)

print("\n📊 顯著的 entity_type 的標準化殘差分析")
for entity, resid_df in residuals_results.items():
    print(f"\n🔎 {entity} 標準化殘差")
    print(resid_df)
from scipy.stats import f_oneway

# === One-way ANOVA per entity_type: Before(%) vs After(%) ===
# NOTE(review): both columns are shares over the same set of reactions and
# each sums to about 100 whenever that period has data, so their means
# coincide and "Trend" then reflects rounding only — confirm this test is
# meaningful for these inputs.
print("\n📊 各 entity_type 的 One-way ANOVA 結果")
anova_results = []

for entity in entity_types:
    sub_df = comparison_df[comparison_df['entity_type'] == entity]

    before_vals = sub_df["Before(%)"].values
    after_vals = sub_df["After(%)"].values

    # Skip groups with no rows
    if len(before_vals) == 0 or len(after_vals) == 0:
        anova_results.append({
            "entity_type": entity,
            "F-stat": None,
            "p-value": None,
            "Significant": "N/A",
            "Trend": "N/A",
            "Note": "Empty values"
        })
        continue

    try:
        f_stat, p_val = f_oneway(before_vals, after_vals)

        # Direction of change, judged by comparing the two means
        if after_vals.mean() > before_vals.mean():
            trend = "Increase"
        else:
            trend = "Decrease"

        anova_results.append({
            "entity_type": entity,
            "F-stat": round(f_stat, 3),
            "p-value": round(p_val, 4),
            "Significant": "Yes" if p_val < 0.05 else "No",
            "Trend": trend
        })
    except Exception as e:
        # Any failure is recorded as a note rather than stopping the loop
        anova_results.append({
            "entity_type": entity,
            "F-stat": None,
            "p-value": None,
            "Significant": "N/A",
            "Trend": "N/A",
            "Note": str(e)
        })

anova_results_df = pd.DataFrame(anova_results)
print(anova_results_df)



# === 8. Show the top 10 reactions per entity_type ===
for option, value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 200),
    ('colheader_justify', 'left'),
):
    pd.set_option(option, value)

print("\n📊 各 entity_type 的前十名 Reaction 數量與百分比比較（事件前後，去重後）")
print(top10_df)


In [None]:
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon

# === 1. Start from the comparison_df built by an earlier cell ===
df = comparison_df.copy()

# === 2. Fold reactions whose Total is below the threshold into one bucket ===
threshold = 10
df['reaction_merged'] = df['reaction'].where(df['Total'] >= threshold, "Other reactions")

# === 3. Re-aggregate the counts per (entity_type, merged reaction) ===
merged = (
    df.groupby(['entity_type', 'reaction_merged'])[['Before', 'After']]
      .sum()
      .reset_index()
)

# Recompute the within-period percentages on the merged counts
results = []
for entity, sub_df in merged.groupby('entity_type'):
    total_be = sub_df['Before'].sum()
    total_af = sub_df['After'].sum()
    for _, row in sub_df.iterrows():
        before_n = row['Before']
        after_n = row['After']
        results.append({
            'entity_type': entity,
            'reaction': row['reaction_merged'],
            'Before': before_n,
            'After': after_n,
            'Total': before_n + after_n,
            # 0 when the period has no rows for this entity_type
            'Before(%)': round(before_n / total_be * 100, 2) if total_be > 0 else 0,
            'After(%)': round(after_n / total_af * 100, 2) if total_af > 0 else 0
        })

merged_df = pd.DataFrame(results)

# === 4. Wilcoxon signed-rank test per entity_type ===
wilcoxon_results = []
for entity, sub_df in merged_df.groupby('entity_type'):
    before_vals = sub_df['Before(%)']
    after_vals = sub_df['After(%)']

    # Groups with a single row, or with identical pairs, are skipped and
    # leave no row in the result table
    if len(sub_df) <= 1 or not (before_vals != after_vals).any():
        continue

    try:
        stat, p = wilcoxon(after_vals, before_vals)
    except ValueError as e:
        # Record the failure reason instead of aborting the whole loop
        wilcoxon_results.append({
            "entity_type": entity,
            "Statistic": None,
            "p-value": None,
            "Significant": "N/A",
            "Note": str(e)
        })
    else:
        wilcoxon_results.append({
            "entity_type": entity,
            "Statistic": stat,
            "p-value": p,
            "Significant": "Yes" if p < 0.05 else "No"
        })

wilcoxon_df = pd.DataFrame(wilcoxon_results)
print("\n📊 各 entity_type 的 Wilcoxon 檢定結果（合併低頻反應後）")
print(wilcoxon_df)

# This cell's own imports do not include chi2_contingency; import it here so
# the cell does not depend on an earlier cell having brought it into scope.
from scipy.stats import chi2_contingency

# === 5. Chi-square test per entity_type on the merged counts ===
print("\n📊 各 entity_type 的卡方檢定結果（合併低頻反應後）")
chi_results = []

for entity, sub_df in merged_df.groupby('entity_type'):
    # Contingency table: one row per (merged) reaction, columns Before / After
    contingency = sub_df[['Before', 'After']].to_numpy()

    # The test needs at least two reactions AND at least one observation in
    # each period: chi2_contingency raises ValueError when a period column
    # sums to zero, because its expected frequencies are then zero.
    testable = contingency.shape[0] > 1 and (contingency.sum(axis=0) > 0).all()
    if testable:
        chi2, p, dof, expected = chi2_contingency(contingency)
        chi_results.append({
            "entity_type": entity,
            "Chi2": round(chi2, 3),
            "df": dof,
            "p-value": round(p, 4),
            "Significant": "Yes" if p < 0.05 else "No"
        })
    else:
        chi_results.append({
            "entity_type": entity,
            "Chi2": None,
            "df": None,
            "p-value": None,
            "Significant": "N/A"
        })

chi_results_df = pd.DataFrame(chi_results)
print(chi_results_df)


# === 6. Show the merged table (top 10 reactions per entity_type) ===
merged_df = merged_df.sort_values(by=['entity_type', 'Total'], ascending=[True, False])
top10_merged = merged_df.groupby('entity_type').head(10).reset_index(drop=True)

for option, value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 200),
    ('colheader_justify', 'left'),
):
    pd.set_option(option, value)

print("\n📊 各 entity_type 的前十名 Reaction 數量與百分比比較（事件前後，合併低頻後）")
print(top10_merged)




In [None]:
from collections import Counter
import pandas as pd

# === 1. Load the CSV ===
df = pd.read_csv("step4_all_with_date.csv")

# === 2. Parse dates and split at the cutoff ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

# The cutoff day itself belongs to "after"; NaT rows land in neither frame
before_df = df.loc[df['date'] < cutoff_date].copy()
after_df = df.loc[df['date'] >= cutoff_date].copy()

# === 3 + 4. Drop rows whose emotion or entity_type is a placeholder ===
invalid_emotions = ['cannot be inferred', 'unknown']
invalid_entity_types = ['Cannot be inferred', 'unknown']

valid_before = before_df[
    ~(before_df['emotion'].isin(invalid_emotions) | before_df['entity_type'].isin(invalid_entity_types))
]
valid_after = after_df[
    ~(after_df['emotion'].isin(invalid_emotions) | after_df['entity_type'].isin(invalid_entity_types))
]

# === 5. Every entity_type that appears in either period ===
entity_types = sorted(
    set(valid_before['entity_type'].dropna()).union(valid_after['entity_type'].dropna())
)

# === 6. Count emotions per entity_type ===
all_rows = []


def extract_emotions(series, invalid=None):
    """Count the individual emotions in a Series of ``|``-separated labels.

    Every label is stripped and lower-cased before it is compared and counted,
    so ``"Unknown"`` is filtered like ``"unknown"`` and ``"Anger"`` is counted
    together with ``"anger"``. This matches the other ``extract_emotions`` in
    this notebook, which lower-cases the same column.

    Args:
        series: pandas Series of label strings; missing values are skipped.
        invalid: Labels to ignore. Defaults to the module-level
            ``invalid_emotions`` list.

    Returns:
        A ``Counter`` mapping each normalised emotion to its number of
        occurrences.
    """
    if invalid is None:
        invalid = invalid_emotions
    ignored = {str(label).strip().lower() for label in invalid}

    counts = Counter()
    for item in series.dropna():
        for part in str(item).split('|'):
            label = part.strip().lower()
            # Skip empty segments (e.g. from a trailing '|') and placeholders
            if label and label not in ignored:
                counts[label] += 1
    return counts

for entity in entity_types:
    # Emotion cells for this entity_type in each period
    be_series = valid_before.loc[valid_before['entity_type'] == entity, 'emotion']
    af_series = valid_after.loc[valid_after['entity_type'] == entity, 'emotion']

    be_counts = extract_emotions(be_series)
    af_counts = extract_emotions(af_series)

    total_be = sum(be_counts.values())
    total_af = sum(af_counts.values())

    # One row per emotion seen in either period
    all_emotions = sorted(set(be_counts) | set(af_counts))
    for emotion in all_emotions:
        be_n = be_counts.get(emotion, 0)
        af_n = af_counts.get(emotion, 0)
        # Share within the period; 0 when the period has no emotions
        before_share = round(be_n / total_be * 100, 2) if total_be > 0 else 0
        after_share = round(af_n / total_af * 100, 2) if total_af > 0 else 0
        all_rows.append({
            'entity_type': entity,
            'emotion': emotion,
            'Before': be_n,
            'After': af_n,
            'Total': be_n + af_n,
            'Before(%)': before_share,
            'After(%)': after_share,
        })

# === 7. Assemble the table and keep the 10 largest emotions per entity_type ===
comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_type', 'Total'], ascending=[True, False]
)
top10_df = comparison_df.groupby('entity_type').head(10).reset_index(drop=True)

# === 8. Display settings ===
for option, value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.colheader_justify', 'left'),
):
    pd.set_option(option, value)

# === 9. Show the result ===
print("📊 各 entity_type 的前十名 Emotion 數量與百分比比較（事件前後）")
print(top10_df)

# Optional: write the table to CSV
# top10_df.to_csv("step4_top10_entity_emotions.csv", index=False, encoding="utf-8-sig")


In [None]:
import pandas as pd
from collections import Counter

# === 1. Load the data ===
df = pd.read_csv("step3_all_with_date.csv")

# === 2. Parse dates and split at the cutoff ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

# The cutoff day itself belongs to "after"; NaT rows land in neither frame
before_df = df.loc[df['date'] < cutoff_date].copy()
after_df = df.loc[df['date'] >= cutoff_date].copy()

# === 3. Map each entity_type to individual / organization / other ===
def classify_entity_group(e):
    """Return 'individual', 'organization' or 'other' for entity type ``e``.

    Any value that is not one of the listed entity types maps to 'other'.
    """
    groups = (
        ('individual', ('victims', 'other_individuals', 'professionals',
                        'politicians', 'perpetrators', 'celebrities')),
        ('organization', ('ngo_or_advocacy_groups', 'law_enforcement_agencies',
                          'community_groups', 'government_bodies',
                          'business_entities')),
    )
    for label, members in groups:
        if e in members:
            return label
    return 'other'

# Tag every row with its entity group
before_df['entity_group'] = before_df['entity_type'].apply(classify_entity_group)
after_df['entity_group'] = after_df['entity_type'].apply(classify_entity_group)

# === 4. Keep rows with a real reaction and a known group ===
# NOTE(review): the per-entity_type reaction analysis in this notebook also
# excludes 'Cannot be inferred.' (trailing period) — confirm whether this
# file needs that variant filtered as well.
invalid_reactions = ['Cannot be inferred', 'unknown']

before_has_reaction = ~before_df['reaction'].isin(invalid_reactions)
before_in_group = before_df['entity_group'].isin(['individual', 'organization'])
valid_before = before_df[before_has_reaction & before_in_group]

after_has_reaction = ~after_df['reaction'].isin(invalid_reactions)
after_in_group = after_df['entity_group'].isin(['individual', 'organization'])
valid_after = after_df[after_has_reaction & after_in_group]

# === 5. Count reactions per entity group ===
all_rows = []

for group in ['individual', 'organization']:
    # Reaction frequencies for this group in each period
    be_counts = Counter(
        valid_before.loc[valid_before['entity_group'] == group, 'reaction'].dropna().astype(str)
    )
    af_counts = Counter(
        valid_after.loc[valid_after['entity_group'] == group, 'reaction'].dropna().astype(str)
    )

    total_be = sum(be_counts.values())
    total_af = sum(af_counts.values())

    # One row per reaction seen in either period
    all_reactions = sorted(set(be_counts) | set(af_counts))
    for reaction in all_reactions:
        be_n = be_counts.get(reaction, 0)
        af_n = af_counts.get(reaction, 0)
        # Share within the period; 0 when the period has no rows
        before_share = round(be_n / total_be * 100, 2) if total_be > 0 else 0
        after_share = round(af_n / total_af * 100, 2) if total_af > 0 else 0
        all_rows.append({
            'entity_group': group,
            'reaction': reaction,
            'Before': be_n,
            'After': af_n,
            'Total': be_n + af_n,
            'Before(%)': before_share,
            'After(%)': after_share,
        })

# === 6. Assemble and sort: by group, then largest Total first ===
reaction_comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_group', 'Total'], ascending=[True, False]
)

# === 7. Keep the 20 largest reactions per group ===
top20_reaction_df = reaction_comparison_df.groupby('entity_group').head(20).reset_index(drop=True)

# === 8. Display ===
for option, value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 150),
    ('display.colheader_justify', 'left'),
    ('display.max_colwidth', 50),
):
    pd.set_option(option, value)

print("📊 個人 vs 組織的 Reaction 前 20 名 數量與百分比比較：")
print(top20_reaction_df)


In [None]:
import pandas as pd
from collections import Counter

# === 1. Load the data ===
df = pd.read_csv("step4_all_with_date.csv")

# === 2. Parse dates and split at the cutoff ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

# The cutoff day itself belongs to "after"; NaT rows land in neither frame
before_df = df.loc[df['date'] < cutoff_date].copy()
after_df = df.loc[df['date'] >= cutoff_date].copy()

# === 3. Map each entity_type to an entity group ===
def classify_entity_group(e):
    """Return the entity group ('individual', 'organization', 'other') for ``e``.

    Values outside the two listed sets of entity types map to 'other'.
    """
    individual_types = (
        'victims', 'other_individuals', 'professionals',
        'politicians', 'perpetrators', 'celebrities',
    )
    organization_types = (
        'ngo_or_advocacy_groups', 'law_enforcement_agencies', 'community_groups',
        'government_bodies', 'business_entities',
    )
    if e in individual_types:
        return 'individual'
    return 'organization' if e in organization_types else 'other'

# Tag every row with its entity group
before_df['entity_group'] = before_df['entity_type'].apply(classify_entity_group)
after_df['entity_group'] = after_df['entity_type'].apply(classify_entity_group)

# === 4. Keep rows in a known group that have an emotion value ===
# Placeholder labels; they are removed per emotion by extract_emotions,
# not by the row filter below.
invalid_emotions = ['cannot be inferred', 'unknown']

before_in_group = before_df['entity_group'].isin(['individual', 'organization'])
valid_before = before_df[before_in_group & before_df['emotion'].notna()]

after_in_group = after_df['entity_group'].isin(['individual', 'organization'])
valid_after = after_df[after_in_group & after_df['emotion'].notna()]

# === 5. Handle cells that hold several '|'-separated emotions ===
def extract_emotions(series, invalid=None):
    """Count the individual emotions in a Series of ``|``-separated labels.

    Every label is stripped and lower-cased before it is compared and counted.
    Empty segments (for example from a trailing ``|`` or from ``"a||b"``) are
    skipped, so they are not counted as an emotion named ``''``.

    Args:
        series: pandas Series of label strings; missing values are skipped.
        invalid: Labels to ignore. Defaults to the module-level
            ``invalid_emotions`` list.

    Returns:
        A ``Counter`` mapping each normalised emotion to its number of
        occurrences.
    """
    if invalid is None:
        invalid = invalid_emotions
    ignored = {str(label).strip().lower() for label in invalid}

    counts = Counter()
    for item in series.dropna():
        for part in str(item).split('|'):
            label = part.strip().lower()
            if label and label not in ignored:
                counts[label] += 1
    return counts

# === 6. Count emotions per entity group ===
all_rows = []

for group in ['individual', 'organization']:
    # Emotion cells for this group in each period
    be_series = valid_before.loc[valid_before['entity_group'] == group, 'emotion']
    af_series = valid_after.loc[valid_after['entity_group'] == group, 'emotion']

    be_counts = extract_emotions(be_series)
    af_counts = extract_emotions(af_series)

    total_be = sum(be_counts.values())
    total_af = sum(af_counts.values())

    # One row per emotion seen in either period
    all_emotions = sorted(set(be_counts) | set(af_counts))
    for emotion in all_emotions:
        be_n = be_counts.get(emotion, 0)
        af_n = af_counts.get(emotion, 0)
        # Share within the period; 0 when the period has no emotions
        before_share = round(be_n / total_be * 100, 2) if total_be > 0 else 0
        after_share = round(af_n / total_af * 100, 2) if total_af > 0 else 0
        all_rows.append({
            'entity_group': group,
            'emotion': emotion,
            'Before': be_n,
            'After': af_n,
            'Total': be_n + af_n,
            'Before(%)': before_share,
            'After(%)': after_share,
        })

# === 7. Assemble and sort: by group, then largest Total first ===
emotion_comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_group', 'Total'], ascending=[True, False]
)

# === 8. Keep the 20 largest emotions per group ===
top20_emotion_df = emotion_comparison_df.groupby('entity_group').head(20).reset_index(drop=True)

# === 9. Display settings ===
for option, value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 150),
    ('display.max_colwidth', 50),
    ('display.colheader_justify', 'left'),
):
    pd.set_option(option, value)

# === 10. Show the result ===
print("📊 個人 vs 組織的 Emotion 前 20 名 數量與百分比比較：")
print(top20_emotion_df)
