In [1]:
import pandas as pd

# === 1. Load the data ===
df = pd.read_csv("step4_all_with_date.csv")

# Print the column names as a quick sanity check.
print("👉 欄位名稱：", df.columns.tolist())

# === 2. Parse dates (values that cannot be parsed become NaT) ===
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# === 3. Split at the cutoff date ===
# A NaT date satisfies neither comparison, so such rows land in neither half.
cutoff_date = pd.to_datetime("2021-03-16")
is_before = df['date'] < cutoff_date
is_after = df['date'] >= cutoff_date
before_df = df.loc[is_before]
after_df = df.loc[is_after]

# === 4. Drop rows whose emotion is only a placeholder ===
# NOTE: despite its name, `valid_emotions` lists the labels to EXCLUDE.
valid_emotions = ['cannot be inferred', 'unknown']
before_df_filtered = before_df.loc[~before_df['emotion'].isin(valid_emotions)]
after_df_filtered = after_df.loc[~after_df['emotion'].isin(valid_emotions)]

# === 5. Count how often each emotion occurs in each period ===
before_counts = before_df_filtered['emotion'].value_counts()
after_counts = after_df_filtered['emotion'].value_counts()

# === 6. Build the comparison table, indexed by every emotion seen ===
all_emotions = sorted(set(before_counts.index) | set(after_counts.index))
emotion_comparison = pd.DataFrame(index=all_emotions)
emotion_comparison['Before'] = before_counts
emotion_comparison['After'] = after_counts
# An emotion missing from one period is NaN after alignment; count it as 0.
emotion_comparison = emotion_comparison.fillna(0).astype(int)

# === 7. Add the total and the per-period percentages ===
emotion_comparison['Total'] = emotion_comparison[['Before', 'After']].sum(axis=1)

total_before = emotion_comparison['Before'].sum()
total_after = emotion_comparison['After'].sum()

# Guard against division by zero when a period has no usable rows.
if total_before > 0:
    emotion_comparison['Before(%)'] = (emotion_comparison['Before'] / total_before * 100).round(2)
else:
    emotion_comparison['Before(%)'] = 0

if total_after > 0:
    emotion_comparison['After(%)'] = (emotion_comparison['After'] / total_after * 100).round(2)
else:
    emotion_comparison['After(%)'] = 0

# === 8. Most frequent emotions first ===
emotion_comparison = emotion_comparison.sort_values('Total', ascending=False)

# === 9. Show the table ===
print("📊 Emotion 數量與百分比比較：")
print(emotion_comparison)

# === 10. Optional: save the table as CSV ===
# emotion_comparison.to_csv("emotion_comparison.csv", encoding="utf-8-sig")

# Look for any row whose emotion mentions "solidarity" (case-insensitive).
# NOTE: the variable is called `compassion_rows` but the search term is "solidarity".
has_solidarity = df["emotion"].str.contains("solidarity", case=False, na=False)
compassion_rows = df[has_solidarity]

# Show the matching rows.
print(compassion_rows)


👉 欄位名稱： ['emotion_id', 'article_id', 'entity', 'entity_type', 'asian_status', 'emotion', 'emotion_reason', 'date']
📊 Emotion 數量與百分比比較：
          Before  After  Total  Before(%)  After(%)
sadness      857   1038   1895      31.37     37.37
anger        925    864   1789      33.86     31.10
fear         709    668   1377      25.95     24.05
joy          125     93    218       4.58      3.35
love          83     85    168       3.04      3.06
surprise      33     30     63       1.21      1.08
Empty DataFrame
Columns: [emotion_id, article_id, entity, entity_type, asian_status, emotion, emotion_reason, date]
Index: []


In [2]:
from collections import Counter
import pandas as pd

# === 1. Load the CSV ===
df = pd.read_csv("step4_all_with_date.csv")

# === 2. Parse dates and split at the cutoff ===
# Unparseable dates become NaT and therefore fall into neither half.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

is_before = df['date'] < cutoff_date
is_after = df['date'] >= cutoff_date
before_df = df[is_before].copy()
after_df = df[is_after].copy()

# === 3 & 4. Drop rows with a placeholder emotion or a placeholder entity_type ===
invalid_emotions = ['cannot be inferred', 'unknown']
invalid_entity_types = ['Cannot be inferred', 'unknown']

keep_before = (
    ~before_df['emotion'].isin(invalid_emotions)
    & ~before_df['entity_type'].isin(invalid_entity_types)
)
keep_after = (
    ~after_df['emotion'].isin(invalid_emotions)
    & ~after_df['entity_type'].isin(invalid_entity_types)
)
valid_before = before_df[keep_before]
valid_after = after_df[keep_after]

# === 5. Collect every entity_type that appears in either period ===
entity_types = sorted(
    set(valid_before['entity_type'].dropna()).union(valid_after['entity_type'].dropna())
)

# === 6. Accumulator for the per-entity_type emotion statistics ===
all_rows = []

def extract_emotions(series, invalid=None):
    """Count individual emotion labels in a Series of '|'-separated strings.

    Every non-null entry is split on '|'. Each piece is stripped and
    lower-cased before counting, so case or whitespace variants of the same
    label ("Anger", " anger") are tallied together and a placeholder such as
    "Unknown" is still recognised as invalid. Empty pieces are ignored.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; missing values are skipped via ``dropna()``.
    invalid : iterable of str, optional
        Labels to exclude (compared case-insensitively). When omitted, the
        notebook-level ``invalid_emotions`` list is used.

    Returns
    -------
    collections.Counter
        Mapping of normalised emotion label -> number of occurrences.
    """
    if invalid is None:
        invalid = invalid_emotions
    # Normalise the exclusion list the same way as the labels themselves.
    excluded = {label.strip().lower() for label in invalid}

    counts = Counter()
    for item in series.dropna():
        for piece in item.split('|'):
            label = piece.strip().lower()
            if label and label not in excluded:
                counts[label] += 1
    return counts

for entity in entity_types:
    # Emotion strings for this entity_type in each period.
    before_mask = valid_before['entity_type'] == entity
    after_mask = valid_after['entity_type'] == entity

    before_tally = extract_emotions(valid_before.loc[before_mask, 'emotion'])
    after_tally = extract_emotions(valid_after.loc[after_mask, 'emotion'])

    # Denominators for the percentage columns.
    before_sum = sum(before_tally.values())
    after_sum = sum(after_tally.values())

    for emotion in sorted(before_tally.keys() | after_tally.keys()):
        # A Counter yields 0 for a label that never occurred.
        n_before = before_tally[emotion]
        n_after = after_tally[emotion]

        # Avoid dividing by zero when a period has no emotions at all.
        if before_sum > 0:
            before_pct = round(n_before / before_sum * 100, 2)
        else:
            before_pct = 0
        if after_sum > 0:
            after_pct = round(n_after / after_sum * 100, 2)
        else:
            after_pct = 0

        all_rows.append({
            'entity_type': entity,
            'emotion': emotion,
            'Before': n_before,
            'After': n_after,
            'Total': n_before + n_after,
            'Before(%)': before_pct,
            'After(%)': after_pct,
        })

# === 7. Build the DataFrame and keep the 10 most frequent emotions per entity_type ===
comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_type', 'Total'], ascending=[True, False]
)
top10_df = comparison_df.groupby('entity_type').head(10).reset_index(drop=True)

# === 8. Display settings ===
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.colheader_justify', 'left'),
):
    pd.set_option(option_name, option_value)

# === 9. Show the result ===
print("📊 各 entity_type 的前十名 Emotion 數量與百分比比較（事件前後）")
print(top10_df)

# Optional: export to CSV
# top10_df.to_csv("step4_top10_entity_emotions.csv", index=False, encoding="utf-8-sig")


📊 各 entity_type 的前十名 Emotion 數量與百分比比較（事件前後）
   entity_type               emotion    Before  After  Total  Before(%)  After(%)
0          business_entities   sadness    9       2     11    60.00      18.18   
1          business_entities     anger    2       6      8    13.33      54.55   
2          business_entities      fear    4       2      6    26.67      18.18   
3          business_entities      love    0       1      1     0.00       9.09   
4                celebrities     anger   38      19     57    39.18      28.79   
5                celebrities   sadness   26      26     52    26.80      39.39   
6                celebrities      fear   19      10     29    19.59      15.15   
7                celebrities       joy    5       6     11     5.15       9.09   
8                celebrities      love    4       4      8     4.12       6.06   
9                celebrities  surprise    5       1      6     5.15       1.52   
10          community_groups     anger    8       8   

In [3]:
import pandas as pd
from collections import Counter

# === 1. Load the data ===
df = pd.read_csv("step4_all_with_date.csv")

# === 2. Parse dates and split at the cutoff ===
# Dates that fail to parse become NaT and match neither comparison below.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

is_before = df['date'] < cutoff_date
is_after = df['date'] >= cutoff_date
before_df = df.loc[is_before].copy()
after_df = df.loc[is_after].copy()

# === 3. 分類 entity_group ===
def classify_entity_group(e):
    """Map an entity_type value to a coarse group.

    Returns 'individual' or 'organization' for the entity types listed
    below, and 'other' for anything else (including missing values).
    """
    individual_types = (
        'victims', 'other_individuals', 'professionals',
        'politicians', 'perpetrators', 'celebrities',
    )
    organization_types = (
        'ngo_or_advocacy_groups', 'law_enforcement_agencies', 'community_groups',
        'government_bodies', 'business_entities',
    )

    if e in individual_types:
        return 'individual'
    if e in organization_types:
        return 'organization'
    return 'other'

# Tag every row of both periods with its coarse entity group.
for period_df in (before_df, after_df):
    period_df['entity_group'] = period_df['entity_type'].apply(classify_entity_group)

# === 4. Keep rows that belong to individual / organization and have an emotion ===
# Placeholder emotion labels are removed later, when the strings are split.
invalid_emotions = ['cannot be inferred', 'unknown']

target_groups = ['individual', 'organization']

keep_before = before_df['entity_group'].isin(target_groups) & before_df['emotion'].notna()
keep_after = after_df['entity_group'].isin(target_groups) & after_df['emotion'].notna()

valid_before = before_df[keep_before]
valid_after = after_df[keep_after]

# === 5. 處理多個 emotion 的欄位 ===
def extract_emotions(series, invalid=None):
    """Count individual emotion labels in a Series of '|'-separated strings.

    Every non-null entry is split on '|'; each piece is stripped and
    lower-cased before counting. Empty pieces (from a trailing '|', a
    doubled '||' or a blank string) are skipped so they are not counted
    as an emotion named ''. Labels listed in ``invalid`` are excluded.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; missing values are skipped via ``dropna()``.
    invalid : iterable of str, optional
        Labels to exclude (compared case-insensitively). When omitted, the
        notebook-level ``invalid_emotions`` list is used.

    Returns
    -------
    collections.Counter
        Mapping of normalised emotion label -> number of occurrences.
    """
    if invalid is None:
        invalid = invalid_emotions
    skip = {label.strip().lower() for label in invalid}

    tally = Counter()
    for entry in series.dropna():
        for token in entry.split('|'):
            name = token.strip().lower()
            # Ignore blanks as well as placeholder labels.
            if not name or name in skip:
                continue
            tally[name] += 1
    return tally

# === 6. Count emotions for each entity_group ===
all_rows = []

for group in ['individual', 'organization']:
    # Emotion strings for this group in each period.
    in_group_before = valid_before['entity_group'] == group
    in_group_after = valid_after['entity_group'] == group

    before_tally = extract_emotions(valid_before.loc[in_group_before, 'emotion'])
    after_tally = extract_emotions(valid_after.loc[in_group_after, 'emotion'])

    # Denominators for the percentage columns.
    before_sum = sum(before_tally.values())
    after_sum = sum(after_tally.values())

    for emotion in sorted(before_tally.keys() | after_tally.keys()):
        # A Counter yields 0 for a label that never occurred.
        n_before = before_tally[emotion]
        n_after = after_tally[emotion]

        # Avoid dividing by zero when a period has no emotions at all.
        if before_sum > 0:
            before_pct = round(n_before / before_sum * 100, 2)
        else:
            before_pct = 0
        if after_sum > 0:
            after_pct = round(n_after / after_sum * 100, 2)
        else:
            after_pct = 0

        all_rows.append({
            'entity_group': group,
            'emotion': emotion,
            'Before': n_before,
            'After': n_after,
            'Total': n_before + n_after,
            'Before(%)': before_pct,
            'After(%)': after_pct,
        })

# === 7. Build the DataFrame, sorted by group and then by descending total ===
emotion_comparison_df = pd.DataFrame(all_rows).sort_values(
    by=['entity_group', 'Total'], ascending=[True, False]
)

# === 8. Keep at most the 20 most frequent emotions per group ===
top20_emotion_df = emotion_comparison_df.groupby('entity_group').head(20).reset_index(drop=True)

# === 9. Display settings ===
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 150),
    ('display.max_colwidth', 50),
    ('display.colheader_justify', 'left'),
):
    pd.set_option(option_name, option_value)

# === 10. Show the result ===
print("📊 個人 vs 組織的 Emotion 前 20 名 數量與百分比比較：")
print(top20_emotion_df)


📊 個人 vs 組織的 Emotion 前 20 名 數量與百分比比較：
   entity_group  emotion    Before  After  Total  Before(%)  After(%)
0     individual   sadness  780     950    1730   31.71      37.85   
1     individual     anger  826     767    1593   33.58      30.56   
2     individual      fear  639     612    1251   25.98      24.38   
3     individual       joy  109      86     195    4.43       3.43   
4     individual      love   77      70     147    3.13       2.79   
5     individual  surprise   29      25      54    1.18       1.00   
6   organization     anger   99      95     194   36.67      36.54   
7   organization   sadness   77      86     163   28.52      33.08   
8   organization      fear   69      53     122   25.56      20.38   
9   organization       joy   15       7      22    5.56       2.69   
10  organization      love    6      14      20    2.22       5.38   
11  organization  surprise    4       5       9    1.48       1.92   


In [5]:
import pandas as pd
from scipy.stats import chi2_contingency

# === 1. Load the data ===
df = pd.read_csv("step4_all_with_date.csv")

# === 2. Parse dates and label each row with its period ===
# errors='coerce' turns unparseable dates into NaT.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
cutoff_date = pd.to_datetime("2021-03-16")

# Any comparison with NaT is False, so labelling with
# `'before' if x < cutoff_date else 'after'` would silently put every row
# with a missing date into 'after' and bias the chi-square table.
# Drop those rows instead; the other cells of this notebook likewise
# exclude them from both periods.
df = df.dropna(subset=['date']).copy()
df['period'] = df['date'].lt(cutoff_date).map({True: 'before', False: 'after'})

# === 3. Split the '|'-separated emotion column into one row per emotion ===
emotion_lists = df['emotion'].str.split(r'\s*\|\s*', regex=True)
df_exp = df.assign(emotion=emotion_lists).explode('emotion')

# === 4. Normalise labels, then drop missing values and placeholder emotions ===
invalid_emotions = ['cannot be inferred', 'unknown']
df_exp['emotion'] = df_exp['emotion'].str.strip().str.lower()
is_usable = df_exp['emotion'].notna() & ~df_exp['emotion'].isin(invalid_emotions)
df_exp = df_exp[is_usable]

# === 5. De-duplicate: one row per (article, entity, emotion) ===
dedupe_keys = ['article_id', 'entity', 'emotion']
df_unique = df_exp.drop_duplicates(subset=dedupe_keys)

# === 6. Contingency table: emotion (rows) x period (columns) ===
emotion_table = pd.crosstab(df_unique['emotion'], df_unique['period'])

# === 7. Chi-square test of independence ===
chi2, p, dof, expected = chi2_contingency(emotion_table)

# === 8. Assemble observed / expected / difference into one table ===
expected_df = pd.DataFrame(
    expected,
    index=emotion_table.index,
    columns=emotion_table.columns,
)
diff_df = emotion_table - expected_df

result_df = pd.DataFrame({
    'Before_obs': emotion_table['before'],
    'Before_exp': expected_df['before'].round(2),
    'Before_diff': diff_df['before'].round(2),
    'After_obs': emotion_table['after'],
    'After_exp': expected_df['after'].round(2),
    'After_diff': diff_df['after'].round(2),
})

# The significance flag comes from the single overall p-value, so it is the
# same on every row; it is not a per-emotion test.
# === 9. Add the total and flag, then sort by descending total ===
result_df = result_df.assign(
    Total=result_df['Before_obs'] + result_df['After_obs'],
    Significant_diff='Yes' if p < 0.05 else 'No',
).sort_values(by='Total', ascending=False)

# === 10. Show the result ===
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1200),
):
    pd.set_option(option_name, option_value)

print("📊 Emotion 前後比較表格（含觀察值、期望值、差異與總數）：")
print(result_df)

print("\nChi-square statistic:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)


📊 Emotion 前後比較表格（含觀察值、期望值、差異與總數）：
          Before_obs  Before_exp  Before_diff  After_obs  After_exp  After_diff  Total Significant_diff
emotion                                                                                                
sadness   773         827.75     -54.75        890        835.25     54.75       1663   Yes            
anger     788         771.01      16.99        761        777.99    -16.99       1549   Yes            
fear      649         621.19      27.81        599        626.81    -27.81       1248   Yes            
joy       113         101.04      11.96         90        101.96    -11.96        203   Yes            
love       74          78.15      -4.15         83         78.85      4.15        157   Yes            
surprise   31          28.87       2.13         27         29.13     -2.13         58   Yes            

Chi-square statistic: 14.004101483373983
p-value: 0.015583383916842752
Degrees of freedom: 5
