In [3]:
import os
import json
import math
import pandas as pd
from collections import Counter
from typing import Dict, Any
from openai import OpenAI

# ==============================
# OpenAI 設定與呼叫
# ==============================
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_response(prompt: str, temperature: float = 0.0) -> str:
    """呼叫 OpenAI API，回傳模型輸出（應該是 JSON 字串）"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            stream=False,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"API 調用錯誤: {e}")
        return ""

# ==============================
# 讀取文章（只為了保留順序，不丟進模型）
# ==============================
import pandas as pd

# 讀取 CSV
df = pd.read_csv("./articles_584.csv", encoding="ISO-8859-1")

# 只保留有文章的資料
df = df.dropna(subset=["ARTICLE_TEXT"])

# 確保 id 欄位是整數型態（避免後面拼接出問題）
df["id"] = df["id"].astype(int)

# 建立 dict：用 CSV 裡的 id 當編號
articles = {f"Article_{row['id']}": row["ARTICLE_TEXT"] for _, row in df.iterrows()}

# 列出全部文章編號
all_articles = list(articles.keys())
print("📄 CSV 中的文章編號：")
print(all_articles)  # 先只印前 20 筆
print(f"✅ 總共有 {len(all_articles)} 篇文章")


📄 CSV 中的文章編號：
['Article_1', 'Article_2', 'Article_3', 'Article_4', 'Article_5', 'Article_6', 'Article_8', 'Article_9', 'Article_10', 'Article_11', 'Article_12', 'Article_13', 'Article_14', 'Article_15', 'Article_16', 'Article_17', 'Article_18', 'Article_19', 'Article_20', 'Article_21', 'Article_22', 'Article_23', 'Article_24', 'Article_25', 'Article_26', 'Article_27', 'Article_28', 'Article_30', 'Article_31', 'Article_32', 'Article_33', 'Article_34', 'Article_35', 'Article_36', 'Article_37', 'Article_38', 'Article_39', 'Article_40', 'Article_42', 'Article_43', 'Article_44', 'Article_45', 'Article_46', 'Article_47', 'Article_48', 'Article_49', 'Article_50', 'Article_52', 'Article_53', 'Article_56', 'Article_57', 'Article_58', 'Article_59', 'Article_60', 'Article_61', 'Article_62', 'Article_65', 'Article_66', 'Article_67', 'Article_68', 'Article_69', 'Article_70', 'Article_71', 'Article_72', 'Article_73', 'Article_74', 'Article_75', 'Article_76', 'Article_79', 'Article_80', 'Article_81',

In [7]:
import pandas as pd

# 輸入你的 CSV 檔案名稱
csv_file = "articles_584.csv"

# 假設你知道總共應該有多少篇（例：584）
expected_total = 584

# 讀取 CSV
df = pd.read_csv(csv_file)

# 取得實際出現過的 id（轉成 int）
existing_numbers = set(df["id"].dropna().astype(int))

# 建立完整應有的 ID 集合
# ⚠️ 假設 id 從 1 開始編號
expected_numbers = set(range(1, expected_total + 1))

# 找出缺少的編號
missing = sorted(expected_numbers - existing_numbers)

print(f"✅ 實際文章數量: {len(existing_numbers)} / {expected_total}")
print(f"❌ 缺少的 Article 編號（共 {len(missing)} 篇）:")
print(", ".join(f"Article_{i}" for i in missing))


✅ 實際文章數量: 584 / 584
❌ 缺少的 Article 編號（共 45 篇）:
Article_7, Article_29, Article_41, Article_51, Article_54, Article_55, Article_63, Article_64, Article_77, Article_78, Article_95, Article_106, Article_111, Article_135, Article_138, Article_146, Article_215, Article_229, Article_234, Article_254, Article_261, Article_269, Article_271, Article_287, Article_289, Article_295, Article_353, Article_354, Article_362, Article_372, Article_378, Article_407, Article_408, Article_432, Article_444, Article_447, Article_462, Article_464, Article_472, Article_482, Article_491, Article_497, Article_498, Article_502, Article_509


In [8]:
# ==========================================
# Colab 程式碼區塊 1: 初始化和步驟1
# ==========================================
import json
import os

# Step 1 的提示詞
step1_prompt = """You are a sociology professor with 30 years of experience analyzing Asian racism in the United States.
Your task is to analyze the following article by identifying named entities and classifying them into the appropriate social roles and institutional categories. Focus especially on people or groups reacting to or affected by anti-Asian incidents.

Step 1: Named Entity Recognition and Categorization

1. Identify named entities.
2. Classify them into appropriate social roles or institutional categories.
3. Determine whether each **individual** is **Asian**, **Non-Asian**, or **Cannot be inferred** based on the text.
4. For each entity, include a `"reference"` field that reflects **the exact phrase or wording** used in the article to refer to them.

Please extract all named entities from the text and categorize them according to the following schema:

---

**INDIVIDUALS** (Specific persons or actors representing individual agency)

1. politicians
   - Elected officials acting in an individual capacity.
   - Examples: senators, representatives, mayors, governors

2. professionals
   - Individuals recognized by their expertise or institutional role.
   - Examples: professors, doctors, lawyers, foundation presidents

3. celebrities
   - Public figures in entertainment or sports (e.g., actors, athletes) unless clearly acting in a professional or political role.
   - If overlapping with another role, assign to the more institutionally grounded category.

4. perpetrators
   - Individuals directly identified as committing or responsible for anti-Asian actions.
   - Do not include vague or generalized public unless clearly specified.

5. victims
   - Individuals or racial/ethnic groups explicitly targeted by anti-Asian acts.
   - Examples: “a woman attacked on the subway,” “Japanese Americans during WWII”

6. other_individuals
   - All other named or unnamed individuals who do not fall into the above categories.
   - Includes the general public, community members, business owners, or relatives (e.g., “my mom,” “a neighbor”).

---

**ORGANIZATIONS** (Named institutions or collectives)

1. law_enforcement_agencies
   - Official police or investigative institutions.
   - Examples: Chicago Police Department, FBI, local sheriff’s office

2. government_bodies
   - Government agencies, departments, or offices at any level (local/state/federal).
   - Examples: CDC, Department of Justice, City Council

3. ngo_or_advocacy_groups
   - Civil rights organizations, foundations, or advocacy nonprofits.
   - Examples: Stop AAPI Hate, Robert Wood Johnson Foundation

4. business_entities
   - Named companies, hotels, restaurants, or stores.
   - Examples: Wrap-on Tools, Edgewater Beach Hotel

5. community_groups
   - Named cultural, ethnic, or neighborhood associations.
   - Examples: Chinatown Association, Asian-American Coalition

---

**ETHNICITY INFERENCE RULES:**

- For each **individual**, determine whether they are **Asian**, **Non-Asian**, or **Cannot be inferred**.
- Use contextual clues such as ethnicity indicators, names, or explicit mentions.
- If ethnicity is ambiguous or not stated, return `"Cannot be inferred"`.

---

**ADDITIONAL INSTRUCTIONS:**

- Use `"reference"` to capture how the person/group was referred to in the original article (e.g., `"an 80-year-old woman"`, `"Lee"`, `"the attacker"`).
- Normalize all name variants to a canonical form (e.g., “Dr. Church,” “J. Church,” and “Church” → “Jacqueline Church”).
- If an individual belongs to multiple roles, assign them to the most institutionally specific one (e.g., categorize a lawyer-celebrity as a professional).
- Include only individuals explicitly involved in specific incidents under “victims” and “perpetrators.”
- Do not classify individual police officers or sheriffs as individuals—assign them under law_enforcement_agencies.
- Classify individual owners under “business_actors” and company names under “business_entities.”

---

**Output format (in JSON):**

For all individuals, return an object with "name" and "asian_status" fields.
For all organizations, return an object with "name" and "asian_status": "Not applicable".

```json
{
  "individuals": {
    "politicians": [
      {
        "name": "Tammy Duckworth",
        "reference": "Senator Tammy Duckworth",
        "asian_status": "Asian"
      },
      {
        "name": "Joe Biden",
        "reference": "President Joe Biden",
        "asian_status": "Non-Asian"
      }
    ],
    "professionals": [
      {
        "name": "Julie Morita",
        "reference": "Julie Morita",
        "asian_status": "Asian"
      }
    ],
    "celebrities": [
      {
        "name": "Awkwafina",
        "reference": "Awkwafina",
        "asian_status": "Asian"
      }
    ],
    "perpetrators": [
      {
        "name": "Unknown Attacker",
        "reference": "the attacker",
        "asian_status": "Non-Asian"
      }
    ],
    "victims": [
      {
        "name": "Asian Elderly Woman",
        "reference": "an 80-year-old woman",
        "asian_status": "Asian"
      }
    ],
    "other_individuals": [
      {
        "name": "my mom",
        "reference": "my mom",
        "asian_status": "Asian"
      }
    ]
    
  },
  "organizations": {
    "law_enforcement_agencies": [
      {
        "name": "Chicago Police Department",
        "reference": "Chicago Police Department",
        "asian_status": "Not applicable"
      }
    ],
    "government_bodies": [
      {
        "name": "City Council",
        "reference": "City Council",
        "asian_status": "Not applicable"
      }
    ],
    "ngo_or_advocacy_groups": [
      {
        "name": "Stop AAPI Hate",
        "reference": "Stop AAPI Hate",
        "asian_status": "Not applicable"
      }
    ],
    "business_entities": [
      {
        "name": "Edgewater Beach Hotel",
        "reference": "Edgewater Beach Hotel",
        "asian_status": "Not applicable"
      }
    ],
    "community_groups": [
      {
        "name": "Chinatown Association",
        "reference": "Chinatown Association",
        "asian_status": "Not applicable"
      }
    ]
  }
}


"""

# 呼叫 Step 1

# 建立輸出資料夾
output_dir = "step1_batches"
os.makedirs(output_dir, exist_ok=True)

# 每批文章數量
batch_size = 50
article_items = list(articles.items())
total_articles = len(article_items)
total_batches = (total_articles + batch_size - 1) // batch_size  # 無條件進位

# 逐批處理
for batch_idx in range(0, total_articles, batch_size):
    batch_number = batch_idx // batch_size + 1
    filename = os.path.join(output_dir, f"step1_batch_{batch_number}.json")

    # 如果檔案已經存在，就跳過這批
    if os.path.exists(filename):
        print(f"⏭️ 批次 {batch_number}/{total_batches} 已存在，跳過")
        continue

    print(f"\n🚀 開始處理批次 {batch_number}/{total_batches}...")

    step1_result = {}
    batch = article_items[batch_idx: batch_idx + batch_size]

    for title, content in batch:
        full_prompt = step1_prompt + "\n\nArticle Text:\n" + content
        response = get_response(full_prompt)
        step1_result[title] = response

    # 儲存這一批結果
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(step1_result, f, ensure_ascii=False, indent=2)

    print(f"✅ 批次 {batch_number}/{total_batches} 已完成，儲存至 {filename}")

print("\n🎉 所有批次處理完成！")




🚀 開始處理批次 1/12...
✅ 批次 1/12 已完成，儲存至 step1_batches\step1_batch_1.json

🚀 開始處理批次 2/12...
✅ 批次 2/12 已完成，儲存至 step1_batches\step1_batch_2.json

🚀 開始處理批次 3/12...
✅ 批次 3/12 已完成，儲存至 step1_batches\step1_batch_3.json

🚀 開始處理批次 4/12...
✅ 批次 4/12 已完成，儲存至 step1_batches\step1_batch_4.json

🚀 開始處理批次 5/12...
✅ 批次 5/12 已完成，儲存至 step1_batches\step1_batch_5.json

🚀 開始處理批次 6/12...
✅ 批次 6/12 已完成，儲存至 step1_batches\step1_batch_6.json

🚀 開始處理批次 7/12...
✅ 批次 7/12 已完成，儲存至 step1_batches\step1_batch_7.json

🚀 開始處理批次 8/12...
✅ 批次 8/12 已完成，儲存至 step1_batches\step1_batch_8.json

🚀 開始處理批次 9/12...
✅ 批次 9/12 已完成，儲存至 step1_batches\step1_batch_9.json

🚀 開始處理批次 10/12...
✅ 批次 10/12 已完成，儲存至 step1_batches\step1_batch_10.json

🚀 開始處理批次 11/12...
✅ 批次 11/12 已完成，儲存至 step1_batches\step1_batch_11.json

🚀 開始處理批次 12/12...
✅ 批次 12/12 已完成，儲存至 step1_batches\step1_batch_12.json

🎉 所有批次處理完成！


In [9]:
import os
import glob
import json

step2_prompt = """You are a sociology professor with 30 years of experience analyzing Asian racism in the United States.

**Step 2: Extract Related Sentences**

Based on the named entities identified in **Step 1**, extract **all relevant complete sentences** from the text for each entity.

**For each named entity:**
1. Use the `"reference"` field (the phrase as it appears in the article) to identify relevant sentences.
2. Identify **all complete, verbatim sentences** in the text that mention or describe the entity’s **involvement, action, reaction, statement, or experience** related to anti-Asian hate (directly or indirectly).
3. Do **not paraphrase** or summarize. Use the **exact wording** from the text.
4. If no relevant sentence is found, set `"relevant_sentences": []`.
5. Return results **grouped by entity name**, exactly matching the names used in Step 1.
6. Include the following structured metadata for each entity:
   - `"entity_type"`:  
     - For individuals: the **social role** (e.g., "politician", "celebrity", "victim")  
     - For organizations: the **institutional category** (e.g., "law_enforcement_agency", "ngo_or_advocacy_group")
   - `"asian_status"`:  
     - For individuals: **"Asian"**, **"Non-Asian"**, or **"Cannot be inferred"**  
     - For organizations: Always **"Not applicable"**

**Note:** These sentences will later be used to infer **behavioral reactions** and **emotional responses**, so include **any sentence** that provides context about what the entity did, said, or experienced.

### Output format (JSON):

```json
{
  "Entity Name": {
    "entity_type": "e.g., politician, law_enforcement_agency",
    "asian_status": "Asian / Non-Asian / Cannot be inferred / Not applicable",
    "relevant_sentences": [
      "Sentence 1 from the article.",
      "Sentence 2 from the article."
    ]
  }
}

"""

# 建立輸出資料夾
output_dir = "step2_batches"
os.makedirs(output_dir, exist_ok=True)

# 找出 Step 1 的所有批次檔案
step1_files = sorted(glob.glob("step1_batches/step1_batch_*.json"))

for step1_file in step1_files:
    batch_name = os.path.basename(step1_file).replace("step1_", "step2_")
    output_file = os.path.join(output_dir, batch_name)

    # 如果已經有 Step 2 的結果，就跳過
    if os.path.exists(output_file):
        print(f"⏭️ {batch_name} 已存在，跳過")
        continue

    print(f"\n🚀 開始處理 {step1_file} → {output_file}")

    # 讀取 Step 1 的結果
    with open(step1_file, "r", encoding="utf-8") as f:
        step1_result = json.load(f)

    step2_result = {}

    # 逐篇文章處理
    for title, step1_text in step1_result.items():
        content = articles.get(title, "")  # 從原始文章 dict 拿內容

        full_prompt = (
            step2_prompt +
            f"\n\nStep 1 Results:\n{step1_text}" +
            f"\n\nOriginal Article Text:\n{content}"
        )

        response = get_response(full_prompt)
        step2_result[title] = response
        print(f"✔️ 已處理 {title}")

    # 儲存這一批的 Step 2 結果
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(step2_result, f, ensure_ascii=False, indent=2)

    print(f"✅ {batch_name} 已完成並儲存至 {output_file}")

print("\n🎉 Step 2 全部批次處理完成！")



🚀 開始處理 step1_batches\step1_batch_1.json → step2_batches\step2_batch_1.json
✔️ 已處理 Article_1
✔️ 已處理 Article_2
✔️ 已處理 Article_3
✔️ 已處理 Article_4
✔️ 已處理 Article_5
✔️ 已處理 Article_6
✔️ 已處理 Article_8
✔️ 已處理 Article_9
✔️ 已處理 Article_10
✔️ 已處理 Article_11
✔️ 已處理 Article_12
✔️ 已處理 Article_13
✔️ 已處理 Article_14
✔️ 已處理 Article_15
✔️ 已處理 Article_16
✔️ 已處理 Article_17
✔️ 已處理 Article_18
✔️ 已處理 Article_19
✔️ 已處理 Article_20
✔️ 已處理 Article_21
✔️ 已處理 Article_22
✔️ 已處理 Article_23
✔️ 已處理 Article_24
✔️ 已處理 Article_25
✔️ 已處理 Article_26
✔️ 已處理 Article_27
✔️ 已處理 Article_28
✔️ 已處理 Article_30
✔️ 已處理 Article_31
✔️ 已處理 Article_32
✔️ 已處理 Article_33
✔️ 已處理 Article_34
✔️ 已處理 Article_35
✔️ 已處理 Article_36
✔️ 已處理 Article_37
✔️ 已處理 Article_38
✔️ 已處理 Article_39
✔️ 已處理 Article_40
✔️ 已處理 Article_42
✔️ 已處理 Article_43
✔️ 已處理 Article_44
✔️ 已處理 Article_45
✔️ 已處理 Article_46
✔️ 已處理 Article_47
✔️ 已處理 Article_48
✔️ 已處理 Article_49
✔️ 已處理 Article_50
✔️ 已處理 Article_52
✔️ 已處理 Article_53
✔️ 已處理 Article_56
✅ step2_batch_1.json 已完成並儲存至 ste

In [11]:
import os
import json

def rerun_incomplete_step2(step1_file, step2_file):
    # 讀取 Step1 的結果
    with open(step1_file, "r", encoding="utf-8") as f:
        step1_result = json.load(f)

    # 如果 step2_file 已存在，先讀取；否則新建
    if os.path.exists(step2_file):
        with open(step2_file, "r", encoding="utf-8") as f:
            step2_result = json.load(f)
    else:
        step2_result = {}

    updated = False

    for title, step1_text in step1_result.items():
        # 只處理空的 or 缺失的
        if not step2_result.get(title):
            content = articles.get(title, "")  # 原始文章
            full_prompt = (
                step2_prompt +
                f"\n\nStep 1 Results:\n{step1_text}" +
                f"\n\nOriginal Article Text:\n{content}"
            )
            response = get_response(full_prompt)
            step2_result[title] = response
            print(f"🔄 重新處理 {title}")
            updated = True

    if updated:
        with open(step2_file, "w", encoding="utf-8") as f:
            json.dump(step2_result, f, ensure_ascii=False, indent=2)
        print(f"✅ 已更新 {step2_file}")
    else:
        print("👌 沒有需要補跑的文章，全數完成")

# 使用範例
rerun_incomplete_step2(
    step1_file="step1_batches/step1_batch_9.json",
    step2_file="step2_batches/step2_batch_9.json"
)


🔄 重新處理 Article_485
🔄 重新處理 Article_486
🔄 重新處理 Article_487
🔄 重新處理 Article_488
🔄 重新處理 Article_489
🔄 重新處理 Article_490
✅ 已更新 step2_batches/step2_batch_9.json


In [12]:
# -*- coding: utf-8 -*-
import os
import re
import glob
import json
from collections import Counter
from typing import Dict, Any
from openai import OpenAI


# ==============================
# 工具：JSON 寬鬆解析 + 句子正規化
# ==============================
def strip_code_fences(s: str) -> str:
    if not isinstance(s, str):
        return s
    s = s.strip()
    m = re.search(r"```(?:json)?\s*(.*?)```", s, flags=re.S | re.I)
    if m:
        return m.group(1).strip()
    return s

def parse_json_loose(s: str):
    if not isinstance(s, str):
        return None
    s = strip_code_fences(s).strip()
    if not s:
        return None
    try:
        return json.loads(s)
    except Exception:
        pass
    m = re.search(r"\{.*\}", s, flags=re.S)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return None
    return None

def to_text(x: Any) -> str:
    if x is None:
        return ""
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        return "\n".join(str(item) for item in x)
    if isinstance(x, dict):
        if "relevant_sentences" in x:
            return to_text(x["relevant_sentences"])
        return "\n".join(to_text(v) for v in x.values())
    return str(x)

def parse_model_json(s: str, default: Dict[str, Any]) -> Dict[str, Any]:
    if not isinstance(s, str) or not s.strip():
        return default
    s2 = strip_code_fences(s)
    try:
        return json.loads(s2)
    except Exception:
        m = re.search(r"\{.*\}", s2, flags=re.S)
        if m:
            try:
                return json.loads(m.group(0))
            except Exception:
                return default
        return default

# ==============================
# Step2 結果正規化
# ==============================
def normalize_step2_result(title: str, raw_obj) -> Dict[str, Dict[str, str]]:
    if isinstance(raw_obj, dict):
        if all(isinstance(v, dict) for v in raw_obj.values()):
            out = {}
            for k, v in raw_obj.items():
                out[k] = {
                    "entity_type": v.get("entity_type", ""),
                    "asian_status": v.get("asian_status", ""),
                    "relevant_sentences": to_text(v.get("relevant_sentences"))
                }
            return out
    if isinstance(raw_obj, str):
        parsed = parse_json_loose(raw_obj)
        if parsed:
            return normalize_step2_result(title, parsed)
    return {}

# ==============================
# Prompts
# ==============================
def build_gate_prompt(step2_sentences: str) -> str:
    return f"""You are a sociology professor analyzing reactions to anti-Asian hate.

Task: Decide if the extracted sentences show any OBSERVABLE reaction (action or inaction) to anti-Asian hate by the entity.

Rules:
- Observable = concrete action/inaction or explicit public stance (e.g., speaking up, condemning, organizing, reporting, policy ask, government action, refusing to act).
- Pure emotions/concerns are NOT reactions.
- Pure incident descriptions are NOT reactions.
- Use ONLY the exact `relevant_sentences`.

Output JSON:
{{
  "has_reaction": "yes" | "no",
  "evidence": "Exact sentence(s) from relevant_sentences (or empty if no)."
}}

relevant_sentences:
{step2_sentences}
"""

def build_classifier_prompt(entity_type: str, asian_status: str, step2_sentences: str) -> str:
    return f"""You are a sociology professor with 30 years of experience analyzing anti-Asian racism.

Task: Classify the entity’s REACTION strictly using the Reaction Concept Tree. Use ONLY `relevant_sentences` as evidence. Do NOT infer emotions. Do NOT paraphrase.

Reaction Concept Tree:
- Support Asian Americans
  - Attending marches/rallies
  - Speaking up on social media
  - Calling for being united
  - Educating students
  - Fostering conversations about anti-Asian hate
  - Hiring security guards
  - Providing shopkeepers with air horns
  - Rewarding the public to report the info about the suspects
- Advocacy/take actions for changes
- Politicians initiated anti-Asian hate and racism
- Undermining human rights
- Color blind/minimizing racism
- Youth as not an excuse
- Videotaping/confronting harasser/attacker
- Sex (sexual) addiction
- Religion as a reason
- Feeling hopeless or support AAPI being not enough
- Not confronting attacker/harasser or not reporting
- Useless law enforcement
  - Did not take a report on Anti-Asian hate crime
  - Did not often patrol the streets
- Government takes actions to stop AAPI hate
  - Installing hotlines
  - Launching a hate-crime task force
  - Increasing patrols
  - Organizing a town hall

Strict Rules:
1) Pure concerns/worries ≠ reaction; return "Cannot be inferred".
2) Arrests/charges/prosecutions ⇒ “Government takes actions…”.
3) Explicit condemnation ⇒ “Support Asian Americans”.
4) If no clear reaction, return "Cannot be inferred".
5) Do NOT invent labels.
6) Always choose the most specific subcategory.

entity_type: {entity_type}
asian_status: {asian_status}

relevant_sentences:
{step2_sentences}

Output JSON:
{{
  "reaction": "<one label from the tree or 'Cannot be inferred'>",
  "reaction_reason": "Exact sentence(s) from relevant_sentences"
}}
"""

# ==============================
# 主流程：從 Step2 → Step3
# ==============================
def run_step3_from_step2(step2_dir="step2_batches", step3_dir="step3_batches"):
    os.makedirs(step3_dir, exist_ok=True)
    step2_files = sorted(glob.glob(os.path.join(step2_dir, "step2_batch_*.json")))
    total_batches = len(step2_files)

    for idx, step2_file in enumerate(step2_files, start=1):
        batch_name = os.path.basename(step2_file).replace("step2_", "step3_")
        out_path = os.path.join(step3_dir, batch_name)

        # 斷點續跑
        if os.path.exists(out_path):
            print(f"⏭️ 批次 {idx}/{total_batches} {batch_name} 已存在，跳過")
            continue

        print(f"\n🚀 開始處理批次 {idx}/{total_batches}: {step2_file}")

        with open(step2_file, "r", encoding="utf-8") as f:
            step2_batch_result = json.load(f)

        step3_batch_result = {}
        gate_stats = Counter()
        label_stats = Counter()

        for title, raw in step2_batch_result.items():
            entities = normalize_step2_result(title, raw)
            entity_outputs = {}

            if not entities:
                step3_batch_result[title] = entity_outputs
                continue

            for entity, meta in entities.items():
                entity_type = meta.get("entity_type", "")
                asian_status = meta.get("asian_status", "")
                relevant_sentences = to_text(meta.get("relevant_sentences", "")).strip()

                # --- 3A: Gate ---
                gate_prompt = build_gate_prompt(relevant_sentences)
                gate_resp = get_response(gate_prompt, temperature=0.0)
                gate_json = parse_model_json(gate_resp, default={"has_reaction": "no", "evidence": ""})

                has_reaction = str(gate_json.get("has_reaction", "no")).lower()
                gate_stats[has_reaction] += 1

                if has_reaction != "yes":
                    out = {
                        "entity_type": entity_type,
                        "asian_status": asian_status,
                        "reaction": "Cannot be inferred",
                        "reaction_reason": ""
                    }
                    entity_outputs[entity] = out
                    label_stats["Cannot be inferred"] += 1
                    continue

                # --- 3B: Classifier ---
                cls_prompt = build_classifier_prompt(entity_type, asian_status, relevant_sentences)
                cls_resp = get_response(cls_prompt, temperature=0.0)
                cls_json = parse_model_json(cls_resp, default={"reaction": "Cannot be inferred", "reaction_reason": ""})

                label = cls_json.get("reaction", "Cannot be inferred")
                reason = cls_json.get("reaction_reason", "")

                out = {
                    "entity_type": entity_type,
                    "asian_status": asian_status,
                    "reaction": label,
                    "reaction_reason": reason
                }
                entity_outputs[entity] = out
                label_stats[label] += 1

            step3_batch_result[title] = entity_outputs

        # 儲存
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(step3_batch_result, f, ensure_ascii=False, indent=2)

        print(f"✅ 批次 {idx}/{total_batches} 已完成並儲存至 {out_path}")
        print("   Gate stats:", dict(gate_stats))
        print("   Label stats:", dict(label_stats))

# ==============================
# 執行
# ==============================
if __name__ == "__main__":
    run_step3_from_step2("step2_batches", "step3_batches")



🚀 開始處理批次 1/12: step2_batches\step2_batch_1.json
✅ 批次 1/12 已完成並儲存至 step3_batches\step3_batch_1.json
   Gate stats: {'no': 426, 'yes': 174}
   Label stats: {'Cannot be inferred': 466, 'Support Asian Americans': 86, 'Government takes actions to stop AAPI hate': 30, 'Advocacy/take actions for changes': 4, 'Attending marches/rallies': 1, 'Politicians initiated anti-Asian hate and racism': 4, 'Color blind/minimizing racism': 1, 'Fostering conversations about anti-Asian hate': 2, 'Useless law enforcement': 2, 'Hiring security guards': 2, 'Educating students': 1, 'Videotaping/confronting harasser/attacker': 1}

🚀 開始處理批次 2/12: step2_batches\step2_batch_10.json
✅ 批次 2/12 已完成並儲存至 step3_batches\step3_batch_10.json
   Gate stats: {'no': 285, 'yes': 131}
   Label stats: {'Cannot be inferred': 296, 'Support Asian Americans': 56, 'Government takes actions to stop AAPI hate': 51, 'Attending marches/rallies': 2, 'Advocacy/take actions for changes': 5, 'Politicians initiated anti-Asian hate and racism':

In [5]:
# -*- coding: utf-8 -*-
import os
import re
import glob
import json
from collections import Counter
from typing import Dict, Any
from openai import OpenAI


# ==============================
# 工具：JSON 寬鬆解析 + 句子正規化
# ==============================
def strip_code_fences(s: str) -> str:
    if not isinstance(s, str):
        return s
    s = s.strip()
    m = re.search(r"```(?:json)?\s*(.*?)```", s, flags=re.S | re.I)
    if m:
        return m.group(1).strip()
    return s

def parse_json_loose(s: str):
    if not isinstance(s, str):
        return None
    s = strip_code_fences(s).strip()
    if not s:
        return None
    try:
        return json.loads(s)
    except Exception:
        pass
    m = re.search(r"\{.*\}", s, flags=re.S)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return None
    return None

def to_text(x: Any) -> str:
    if x is None:
        return ""
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        return "\n".join(str(item) for item in x)
    if isinstance(x, dict):
        if "relevant_sentences" in x:
            return to_text(x["relevant_sentences"])
        return "\n".join(to_text(v) for v in x.values())
    return str(x)

def parse_model_json(s: str, default: Dict[str, Any]) -> Dict[str, Any]:
    if not isinstance(s, str) or not s.strip():
        return default
    s2 = strip_code_fences(s)
    try:
        return json.loads(s2)
    except Exception:
        m = re.search(r"\{.*\}", s2, flags=re.S)
        if m:
            try:
                return json.loads(m.group(0))
            except Exception:
                return default
        return default

# ==============================
# Step2 結果正規化
# ==============================
def normalize_step2_result(title: str, raw_obj) -> Dict[str, Dict[str, str]]:
    if isinstance(raw_obj, dict):
        if all(isinstance(v, dict) for v in raw_obj.values()):
            out = {}
            for k, v in raw_obj.items():
                out[k] = {
                    "entity_type": v.get("entity_type", ""),
                    "asian_status": v.get("asian_status", ""),
                    "relevant_sentences": to_text(v.get("relevant_sentences"))
                }
            return out
    if isinstance(raw_obj, str):
        parsed = parse_json_loose(raw_obj)
        if parsed:
            return normalize_step2_result(title, parsed)
    return {}

# ==============================
# Prompts
# ==============================
def build_gate_prompt(step2_sentences: str) -> str:
    return f"""You are a sociology professor analyzing reactions to anti-Asian hate.

Task: Decide if the extracted sentences show any OBSERVABLE reaction (action or inaction) to anti-Asian hate by the entity.

Rules:
- Observable = concrete action/inaction or explicit public stance (e.g., speaking up, condemning, organizing, reporting, policy ask, government action, refusing to act).
- Pure emotions/concerns are NOT reactions.
- Pure incident descriptions are NOT reactions.
- Use ONLY the exact `relevant_sentences`.

Output JSON:
{{
  "has_reaction": "yes" | "no",
  "evidence": "Exact sentence(s) from relevant_sentences (or empty if no)."
}}

relevant_sentences:
{step2_sentences}
"""

def build_classifier_prompt(entity_type: str, asian_status: str, step2_sentences: str) -> str:
    return f"""You are a sociology professor with 30 years of experience analyzing anti-Asian racism.

Task: Classify the entity’s REACTION strictly using the Reaction Concept Tree. Use ONLY `relevant_sentences` as evidence. Do NOT infer emotions. Do NOT paraphrase.

Reaction Concept Tree:
- Support Asian Americans:People or organizations condemned/do not want Anti-Asian incidents to happen, including all forms of crimes, attacks, violence, assaults, physical, verbal, and online harassment. This kind of support is at a conscientious (cognitive) level, not yet taking concrete actions to stop AAPI hate. 
  - Attending marches/rallies:People or organizations actively attended marches/rallies for supporting Asian American communities.
  - Speaking up on social media:People or organizations spoke up in public, such as via social media, to condemn Anti-Asian hate.
  - Calling for being united:Asians/Asian Americans become united to support each other to overcome Anti-Asian hate.
  - Fostering conversations about anti-Asian hate:The public fostered conversations regarding the Anti-Asian hate issues. Some organizations/groups (e.g., Asian American communities) also think it’s important to have conversations to address the root cause behind the Anti-Asian hate incidents so this will help us to make change.
  - Providing shopkeepers with air horns:Some stores or groups provide shopkeepers with air horns. If any anti-Asian hate crimes happened, they can use the air horns to draw everyone’s attention
- Advocacy/take actions for changes:Individuals, groups, or organizations want or advocate for changing the current situation where Asians/Asian Americans suffered from Anti-Asian hate, such as physical, verbal, and online harassment, attacks, violence, assaults, and hate crimes. They want cultural shift, open dialogue and listening sessions regarding incidents, practical change in racial stereotypes
and perceptions, more awareness about Anti-Asian hate, and human rights.
- Politicians initiated anti-Asian hate and racism:Trump and some republican politicians made a lot of comments on the COVID-19 pandemic. One kind of comment is that he used/dubbed Asian-related objects to combine with disease (virus/flu/covid) or directly calling coronavirus/covid (e.g., country’s or region’s names or Kung flu[modified by Kung Fu], or races) to verbally attack Asians/Asian Americans, such as  “China/Chinese virus” or “Kung flu.” Another type of comment is that he blamed China for causing the pandemic. Such kinds of comments initiated racism, Anti-Asian bigotry/hate, and Anti-Asian hate incidents in the US society. Such comments also led some Americans to blame Asians/Asian Americans for causing the pandemic.
- Undermining human rights:People want to dehumanize and to undermine the fundamental rights, dignity and belonging of those they target.
- Color blind/minimizing racism:Some Americans do not believe discrimination, racism, or racist bias/bigotry against Asians/Asian Americans exists in the community. Incidents of anti-Asian hate (including physical, verbal, and online harassment, attacks, violence, assaults, and Anti-Asian crimes) were downplayed, ignored, or perceived as not existing by the public, the law enforcement system (e.g., the police), and governors (e.g., Mr. Donald Trump). When Asians/Asian Americans were attacked, Anti-Asian hate or racism was not perceived as the perpetrators’ motives/motivations by the police or the perpetrators said their motivations were not triggered by Anti-Asian bigotry or racism.
- Youth as not an excuse:Robert Aaron Long murdered eight people in the incident of the 2021 Atlanta Spa Shootings. In news reports, he was called “the 21-year-old.” Some comments advocated stopping calling him “the 21-year-old” as if his youth is an excuse to murder others because of their race, ethnicities, and sex.
- Videotaping/confronting harasser/attacker:Asians/Asian Americans or bystanders videotaped/recorded the incidents of physical or verbal harassment; Anti-Asian attacks, assaults, or violence; and Anti-Asian crimes. Asians/Asian Americans who suffered physical harassment, attacks, assaults, violence attacked back to the harassers or attackers. Bystanders’ behaviors aim to defense those who were attacked. Cell phones and survelliance system can be used for videotaping or recording. Additionally, those who experienced verbal harassment speak out to the harassers to let them know their thoughts were biased, offensive, and unjust and tell them to stop. 
- Sex (sexual) addiction:Excessive sexual thoughts, desires, urges or behaviors that can’t be controlled and cause distress and harm to your relationships, finances and other aspects of life. It is also called hypersexuality or compulsive sexual behavior. It is what the Atlanta shooter claimed as a motivation that led to his senseless killings of the victims.
- Religion as a reason:In the 2021 Atlanta Spa Shootings, Robert Aaron Long was the killer who murdered eight people. He told the police that his motive was religious guilt about his sexuality. He said he had sexual desire so he wanted to eliminate it. That’s why he went to the spa to skill women of Asian descent. Asian advocacy groups mentioned whether the killer’s motive was religious guilt about his sexuality, no one should ignore the broader context of Anti-Asian violence and hate crimes. Asian advocacy groups tend to attribute the killer’s motive stems from racism or xenophobia, misogyny, and gendered racism
- Feeling hopeless or support AAPI being not enough:Asians/Asian Americans felt worried, frustrated, anxious, and afraid that they may experience Anti-Asian hate crimes, attacks, assaults, and violence. But they felt that nothing happened to stop them. Support for Asian American communities is not enough.
- Not confronting attacker/harasser or not reporting:Asians/Asian Americans did not want to confront attackers/harassers/bullies who physically or verbally harassed or attacked them. They thought it is not worthy of reporting the incidents. They did not want to confront because they were afraid of their safety. They just wanted to leave from the incidents soon.
- Useless law enforcement:Police did not take a police report and denied there was an Anti-Asian hate crime for the incidents of physical, verbal, or online harassment, attacks, assaults, violence, and Anti-Asian crimes. Another situation is that police affirmed there was a crime, but the motivation did not come from Anti-Asian hate or bigotry/prejudice or racism. Additionally, Asian Americans thought if police often patrolled the streets, a lot of Anti-Asian hate crimes, attacks, assaults, and violence would not happen. But in reality, policy did not do so. 
  - Did not take a report on Anti-Asian hate crime:police did not take a report on Anti-Asian hate crime, including physical, verbal, or online harassment, attacks, assaults, and violence.
  - Did not often patrol the streets:police affirmed there was a crime, but police did not often patrol the streets so that there were a lot of Anti-Asian hate crimes (e.g., physical, verbal, or online harassment, attacks, assaults, and violence) happened.
- Takes actions to stop AAPI hate:After the incidents of Anti-Asian hate crimes, attacks, assaults, and violence, state or city government or individuals take concrete actions that aim to stop AAPI hate.
  - Installing hotlines:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments) install hotlines for victims or people who witness Anti-Asian incidents to report.
  - Launching a hate-crime task force:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments) launched an Asian hate crime task force to develop approaches to stopping anti-Asian hate crimes.
  - Making an announcement to condemn anti-Asian hate:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments)  made an open announcement to condemn anti-Asian hate.
  - Increasing patrols:Some organizations (e.g., city and state governments) increased patrolling the streets to ensure the safety of Asian Americans. 
  - Organizing a town hall:Some organizations (e.g., city and state governments) organized a town hall meeting to discuss how to stop anti-Asian hate racism.
  - Hiring security guards:Some stores or groups hired security guards to increase safety for Asian Americans and prevent anti-Asian hate crimes or racism.
  - Educating students:Schoolteachers and university faculty took actions to educate students on current social and political issues on Anti-Asian hate. They aim to use education to change the public’s view about Asian Americans/Asians and increase the awareness of respecting Asian Americans/Asians
  - Rewarding the public to report the info about the suspects:Individuals, groups, or organizations provide rewards to the public when they report any information regarding the suspects who may commit anti-Asian hate crimes.


Strict Rules:
1) Pure concerns/worries ≠ reaction; return "Cannot be inferred".
2) Arrests/charges/prosecutions ⇒ “Government takes actions…”.
3) Explicit condemnation ⇒ “Support Asian Americans”.
4) If no clear reaction, return "Cannot be inferred".
5) Do NOT invent labels.
6) Always choose the most specific subcategory.

entity_type: {entity_type}
asian_status: {asian_status}

relevant_sentences:
{step2_sentences}

Output JSON:
{{
  "reaction": "<one label from the tree or 'Cannot be inferred'>",
  "reaction_reason": "Exact sentence(s) from relevant_sentences"
}}
"""

# ==============================
# 主流程：從 Step2 → Step3
# ==============================
def run_step3_from_step2(step2_dir="step2_batches", step3_dir="step3_batches"):
    os.makedirs(step3_dir, exist_ok=True)
    step2_files = sorted(glob.glob(os.path.join(step2_dir, "step2_batch_*.json")))
    total_batches = len(step2_files)

    for idx, step2_file in enumerate(step2_files, start=1):
        batch_name = os.path.basename(step2_file).replace("step2_", "step3_")
        out_path = os.path.join(step3_dir, batch_name)

        # 斷點續跑
        if os.path.exists(out_path):
            print(f"⏭️ 批次 {idx}/{total_batches} {batch_name} 已存在，跳過")
            continue

        print(f"\n🚀 開始處理批次 {idx}/{total_batches}: {step2_file}")

        with open(step2_file, "r", encoding="utf-8") as f:
            step2_batch_result = json.load(f)

        step3_batch_result = {}
        gate_stats = Counter()
        label_stats = Counter()

        for title, raw in step2_batch_result.items():
            entities = normalize_step2_result(title, raw)
            entity_outputs = {}

            if not entities:
                step3_batch_result[title] = entity_outputs
                continue

            for entity, meta in entities.items():
                entity_type = meta.get("entity_type", "")
                asian_status = meta.get("asian_status", "")
                relevant_sentences = to_text(meta.get("relevant_sentences", "")).strip()

                # --- 3A: Gate ---
                gate_prompt = build_gate_prompt(relevant_sentences)
                gate_resp = get_response(gate_prompt, temperature=0.0)
                gate_json = parse_model_json(gate_resp, default={"has_reaction": "no", "evidence": ""})

                has_reaction = str(gate_json.get("has_reaction", "no")).lower()
                gate_stats[has_reaction] += 1

                if has_reaction != "yes":
                    out = {
                        "entity_type": entity_type,
                        "asian_status": asian_status,
                        "reaction": "Cannot be inferred",
                        "reaction_reason": ""
                    }
                    entity_outputs[entity] = out
                    label_stats["Cannot be inferred"] += 1
                    continue

                # --- 3B: Classifier ---
                cls_prompt = build_classifier_prompt(entity_type, asian_status, relevant_sentences)
                cls_resp = get_response(cls_prompt, temperature=0.0)
                cls_json = parse_model_json(cls_resp, default={"reaction": "Cannot be inferred", "reaction_reason": ""})

                label = cls_json.get("reaction", "Cannot be inferred")
                reason = cls_json.get("reaction_reason", "")

                out = {
                    "entity_type": entity_type,
                    "asian_status": asian_status,
                    "reaction": label,
                    "reaction_reason": reason
                }
                entity_outputs[entity] = out
                label_stats[label] += 1

            step3_batch_result[title] = entity_outputs

        # 儲存
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(step3_batch_result, f, ensure_ascii=False, indent=2)

        print(f"✅ 批次 {idx}/{total_batches} 已完成並儲存至 {out_path}")
        print("   Gate stats:", dict(gate_stats))
        print("   Label stats:", dict(label_stats))

# ==============================
# 執行
# ==============================
if __name__ == "__main__":
    run_step3_from_step2("step2_batches", "step3_batches_new")



🚀 開始處理批次 1/12: step2_batches\step2_batch_1.json
✅ 批次 1/12 已完成並儲存至 step3_batches_new\step3_batch_1.json
   Gate stats: {'no': 472, 'yes': 185}
   Label stats: {'Cannot be inferred': 504, 'Support Asian Americans': 77, 'Fostering conversations about anti-Asian hate': 14, 'Speaking up on social media': 4, 'Government takes actions for changes': 2, 'Attending marches/rallies': 8, 'Calling for being united': 7, 'Increasing patrols': 4, 'Takes actions to stop AAPI hate': 12, 'Politicians initiated anti-Asian hate and racism': 5, 'Feeling hopeless or support AAPI being not enough': 11, 'Videotaping/confronting harasser/attacker': 2, 'Advocacy/take actions for changes': 4, 'Government takes actions to stop AAPI hate': 1, 'Government takes actions…': 1, 'Launching a hate-crime task force': 1}

🚀 開始處理批次 2/12: step2_batches\step2_batch_10.json
✅ 批次 2/12 已完成並儲存至 step3_batches_new\step3_batch_10.json
   Gate stats: {'no': 314, 'yes': 136}
   Label stats: {'Cannot be inferred': 332, 'Support Asian 

In [14]:
# -*- coding: utf-8 -*-
import os
import glob
import json
from openai import OpenAI
from collections import Counter


# ==============================
# Step 4 Prompt
# ==============================
step4_prompt = """You are a sociology professor with 30 years of experience analyzing racial dynamics and anti-Asian racism in the United States.

### Step 4: Infer **Emotions and Their Intensity**

Your task is to analyze the **extracted sentences** from **Step 2** and infer each entity's **emotional stance** toward anti-Asian hate.

You will also receive metadata from Step 2, including:
- `entity_type`: for individuals use their social role; for organizations use institutional category.
- `asian_status`: "Asian", "Non-Asian", "Cannot be inferred", or "Not applicable"

Use only the exact `relevant_sentences` from Step 2 as your source — do NOT paraphrase or add your own wording.

---

## Emotion Concept Tree
- Love 
- Joy 
- Anger 
- Sadness 
- Fear 
- Surprise 

---

## Instructions
1. Focus only on emotions — do NOT infer actions.
2. If no emotion is expressed, output `"emotion": "Cannot be inferred".
3. If multiple emotions appear, list multiple objects.
4. Use the exact sentence(s) as `"emotion_reason"`.

---

## Output format
{
  "Entity Name": {
    "entity_type": "...",
    "asian_status": "...",
    "emotions": [
      {
        "emotion": "deepest matched term or Cannot be inferred",
        "emotion_reason": "Exact sentence(s)"
      }
    ]
  }
}
"""

# ==============================
# 主流程 Step2 → Step4
# ==============================
def run_step4_from_step2(step2_dir="step2_batches", step4_dir="step4_batches"):
    os.makedirs(step4_dir, exist_ok=True)
    step2_files = sorted(glob.glob(os.path.join(step2_dir, "step2_batch_*.json")))
    total_batches = len(step2_files)

    for idx, step2_file in enumerate(step2_files, start=1):
        batch_name = os.path.basename(step2_file).replace("step2_", "step4_")
        out_path = os.path.join(step4_dir, batch_name)

        # 斷點續跑
        if os.path.exists(out_path):
            print(f"⏭️ 批次 {idx}/{total_batches} {batch_name} 已存在，跳過")
            continue

        print(f"\n🚀 開始處理批次 {idx}/{total_batches}: {step2_file}")

        with open(step2_file, "r", encoding="utf-8") as f:
            step2_batch_result = json.load(f)

        step4_batch_result = {}
        emo_stats = Counter()

        for title, step2_text in step2_batch_result.items():
            full_prompt = (
                step4_prompt +
                f"\n\nStep 2 Results (Extracted Sentences):\n{step2_text}"
            )

            response = get_response(full_prompt)
            step4_batch_result[title] = response
            emo_stats["done"] += 1

        # 儲存
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(step4_batch_result, f, ensure_ascii=False, indent=2)

        print(f"✅ 批次 {idx}/{total_batches} 已完成並儲存至 {out_path}")
        print("   Stats:", dict(emo_stats))

# ==============================
# 執行
# ==============================
if __name__ == "__main__":
    run_step4_from_step2("step2_batches", "step4_batches")



🚀 開始處理批次 1/12: step2_batches\step2_batch_1.json
✅ 批次 1/12 已完成並儲存至 step4_batches\step4_batch_1.json
   Stats: {'done': 50}

🚀 開始處理批次 2/12: step2_batches\step2_batch_10.json
✅ 批次 2/12 已完成並儲存至 step4_batches\step4_batch_10.json
   Stats: {'done': 50}

🚀 開始處理批次 3/12: step2_batches\step2_batch_11.json
✅ 批次 3/12 已完成並儲存至 step4_batches\step4_batch_11.json
   Stats: {'done': 50}

🚀 開始處理批次 4/12: step2_batches\step2_batch_12.json
✅ 批次 4/12 已完成並儲存至 step4_batches\step4_batch_12.json
   Stats: {'done': 34}

🚀 開始處理批次 5/12: step2_batches\step2_batch_2.json
✅ 批次 5/12 已完成並儲存至 step4_batches\step4_batch_2.json
   Stats: {'done': 50}

🚀 開始處理批次 6/12: step2_batches\step2_batch_3.json
✅ 批次 6/12 已完成並儲存至 step4_batches\step4_batch_3.json
   Stats: {'done': 50}

🚀 開始處理批次 7/12: step2_batches\step2_batch_4.json
✅ 批次 7/12 已完成並儲存至 step4_batches\step4_batch_4.json
   Stats: {'done': 50}

🚀 開始處理批次 8/12: step2_batches\step2_batch_5.json
✅ 批次 8/12 已完成並儲存至 step4_batches\step4_batch_5.json
   Stats: {'done': 50}

🚀 開始處理批次 

In [1]:
import os
import glob
import json
import pandas as pd

def merge_step3_with_id(input_dir="step3_batches", prefix="step3_batch_", 
                        output_json="step3_all.json", output_csv="step3_all.csv"):
    batch_files = sorted(glob.glob(os.path.join(input_dir, f"{prefix}*.json")))

    merged_result = {}
    for file in batch_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            merged_result.update(data)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(merged_result, f, ensure_ascii=False, indent=2)

    rows = []
    idx = 1
    for article_id, entities in merged_result.items():
        for entity, meta in entities.items():
            rows.append({
                "reaction_id": f"reaction_{idx}",
                "article_id": article_id,
                "entity": entity,
                "entity_type": meta.get("entity_type", ""),
                "asian_status": meta.get("asian_status", ""),
                "reaction": meta.get("reaction", ""),
                "reaction_reason": meta.get("reaction_reason", "")
            })
            idx += 1

    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")

    print(f"✅ Step 3 已合併 {len(batch_files)} 個批次檔 → {output_csv}，共 {len(df)} 筆")

if __name__ == "__main__":
    merge_step3_with_id(
        input_dir="step3_batches_new",
        prefix="step3_batch_",
        output_json="step3_all_new.json",
        output_csv="step3_all_new.csv"
    )


✅ Step 3 已合併 12 個批次檔 → step3_all_new.csv，共 6063 筆


In [1]:
import os
import glob
import json
import pandas as pd
import re

def parse_json_loose(s: str):
    """嘗試從字串裡解析 JSON"""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None
    try:
        return json.loads(s)
    except Exception:
        # 嘗試抓第一個 {...}
        m = re.search(r"\{.*\}", s, flags=re.S)
        if m:
            try:
                return json.loads(m.group(0))
            except Exception:
                return None
    return None

def merge_step4_with_id(input_dir="step4_batches", prefix="step4_batch_", 
                        output_json="step4_all.json", output_csv="step4_all.csv"):
    batch_files = sorted(glob.glob(os.path.join(input_dir, f"{prefix}*.json")))
    all_articles = {}

    for file in batch_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # ⚠️ 不用 update，逐篇展開
        for article_id, raw in data.items():
            if isinstance(raw, dict):
                all_articles[article_id] = raw
            elif isinstance(raw, str):
                parsed = parse_json_loose(raw)
                if isinstance(parsed, dict):
                    all_articles[article_id] = parsed

    # 存 JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_articles, f, ensure_ascii=False, indent=2)

    # 轉成 CSV
    rows = []
    idx = 1
    for article_id, entities in all_articles.items():
        for entity, meta in entities.items():
            emotions = meta.get("emotions", [])
            
            # 統一處理格式
            if isinstance(emotions, str):
                emotions = [{
                    "emotion": emotions,
                    "emotion_path": None,
                    "emotion_reason": ""
                }]
            elif not isinstance(emotions, list):
                emotions = []
    
            for emo in emotions:
                if not isinstance(emo, dict):  # 再保險一次
                    emo = {"emotion": str(emo), "emotion_path": None, "emotion_reason": ""}
                rows.append({
                    "emotion_id": f"emotion_{idx}",
                    "article_id": article_id,
                    "entity": entity,
                    "entity_type": meta.get("entity_type", ""),
                    "asian_status": meta.get("asian_status", ""),
                    "emotion": emo.get("emotion", ""),
                    "emotion_path": emo.get("emotion_path", ""),
                    "emotion_reason": emo.get("emotion_reason", "")
                })
                idx += 1

    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")

    print(f"✅ Step 4 已合併 {len(batch_files)} 個批次檔 → {output_csv}，共 {len(df)} 筆")


if __name__ == "__main__":
    merge_step4_with_id(
        input_dir="step4_batches",
        prefix="step4_batch_",
        output_json="step4_all.json",
        output_csv="step4_all.csv"
    )


✅ Step 4 已合併 12 個批次檔 → step4_all.csv，共 8011 筆


# 處理錯誤文章

In [158]:
import os, json, re
from collections import Counter
from typing import Any, Dict
from openai import OpenAI

# ==============================
# OpenAI client (替換成你自己的 key)
# ==============================
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_response(prompt: str, temperature: float = 0.0) -> str:
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return resp.choices[0].message.content
    except Exception as e:
        print("❌ API error:", e)
        return ""

# ==============================
# JSON utilities (寬鬆 parser)
# ==============================
def strip_code_fences(s: str) -> str:
    if not isinstance(s, str):
        return s
    s = s.strip()
    m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.I)
    return m.group(1).strip() if m else s

def _sanitize_json_like(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = strip_code_fences(s).lstrip("\ufeff")
    # 移除不可見控制字元（保留換行/縮排）
    s = "".join(ch for ch in s if ch.isprintable() or ch in "\n\r\t")
    # 嘗試修正常見 mojibake
    if "â€" in s or "Ã" in s:
        try:
            s = s.encode("latin1").decode("utf-8")
        except Exception:
            pass
    # 取最外層 {...} 或 [...]
    m = re.search(r"(\{[\s\S]*\}|\[[\s\S]*\])", s)
    if m:
        s = m.group(1)
    # 移除尾逗號
    s = re.sub(r",(\s*[}\]])", r"\1", s)
    return s.strip()

def parse_json_loose(s: str):
    if not isinstance(s, str):
        return None
    base = _sanitize_json_like(s)
    if not base:
        return None
    try:
        return json.loads(base)
    except Exception:
        pass
    # 單引號 pseudo-JSON
    if base.startswith("{") and base.endswith("}") and '"' not in base and "'" in base:
        try:
            return json.loads(re.sub(r"'", '"', base))
        except Exception:
            pass
    return None

def parse_model_json(s: str, default: Dict[str, Any]) -> Dict[str, Any]:
    if not isinstance(s, str) or not s.strip():
        return default
    cleaned = _sanitize_json_like(s)
    try:
        return json.loads(cleaned)
    except Exception:
        m = re.search(r"\{[\s\S]*\}", cleaned)
        if m:
            try:
                return json.loads(m.group(0))
            except Exception:
                return default
        return default

def to_text(x: Any) -> str:
    if x is None:
        return ""
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        return "\n".join(str(item) for item in x)
    if isinstance(x, dict):
        if "relevant_sentences" in x:
            return to_text(x["relevant_sentences"])
        return "\n".join(to_text(v) for v in x.values())
    return str(x)

def normalize_step2_result(title: str, raw_obj) -> Dict[str, Dict[str, str]]:
    if isinstance(raw_obj, dict) and all(isinstance(v, dict) for v in raw_obj.values()):
        out = {}
        for k, v in raw_obj.items():
            out[k] = {
                "entity_type": v.get("entity_type", ""),
                "asian_status": v.get("asian_status", ""),
                "relevant_sentences": to_text(v.get("relevant_sentences"))
            }
        return out
    if isinstance(raw_obj, str):
        parsed = parse_json_loose(raw_obj)
        if parsed is not None: 
            return normalize_step2_result(title, parsed)
    return {}

def debug_json_failure(s: str, context=2):
    cleaned = _sanitize_json_like(s)
    try:
        json.loads(cleaned)
        print("✅ JSON OK")
        return
    except json.JSONDecodeError as e:
        print(f"❌ JSON error @ line {e.lineno}, col {e.colno}: {e.msg}")
        lines = cleaned.splitlines()
        i = e.lineno - 1
        lo, hi = max(0, i-context), min(len(lines), i+context+1)
        for idx in range(lo, hi):
            mark = ">>" if idx == i else "  "
            print(f"{mark} {idx+1:4d}: {lines[idx]}")

# ==============================
# Prompts
# ==============================
def build_gate_prompt(step2_sentences: str) -> str:
    return f"""Task: Decide if the extracted sentences show any observable reaction.

Rules:
- Observable = action/inaction or explicit stance
- Pure emotions or incident descriptions ≠ reaction

Output JSON:
{{
  "has_reaction": "yes" | "no",
  "evidence": "Exact sentence(s) from relevant_sentences"
}}

relevant_sentences:
{step2_sentences}
"""

def build_classifier_prompt(entity_type: str, asian_status: str, step2_sentences: str) -> str:
    return f"""Task: Classify the entity’s REACTION strictly using the Reaction Concept Tree.

Reaction Concept Tree:
- Support Asian Americans
  - Attending marches/rallies
  - Speaking up on social media
  - Calling for being united
  - Educating students
  - Fostering conversations about anti-Asian hate
  - Hiring security guards
  - Providing shopkeepers with air horns
  - Rewarding the public to report suspects
- Advocacy/take actions for changes
- Politicians initiated anti-Asian hate and racism
- Undermining human rights
- Color blind/minimizing racism
- Youth as not an excuse
- Videotaping/confronting harasser/attacker
- Sex (sexual) addiction
- Religion as a reason
- Feeling hopeless or support AAPI being not enough
- Not confronting attacker/harasser or not reporting
- Useless law enforcement
  - Did not take a report
  - Did not patrol the streets
- Government takes actions to stop AAPI hate
  - Installing hotlines
  - Launching a hate-crime task force
  - Increasing patrols
  - Organizing a town hall

Rules:
- If no clear reaction → "Cannot be inferred"
- Always choose the most specific subcategory

entity_type: {entity_type}
asian_status: {asian_status}

relevant_sentences:
{step2_sentences}

Output JSON:
{{
  "reaction": "<one label or 'Cannot be inferred'>",
  "reaction_reason": "Exact sentence(s)"
}}
"""

# ==============================
# Step3 主流程 + Debug
# ==============================
def run_step3_for_titles(step2_file="step2_batches/step2_batch_1.json",
                         titles=None,
                         out_dir="step3_batches_debug",
                         out_suffix="__subset_debug"):
    os.makedirs(out_dir, exist_ok=True)

    with open(step2_file, "r", encoding="utf-8") as f:
        step2_batch_result = json.load(f)

    if titles is None:
        titles = [next(iter(step2_batch_result.keys()), None)]

    filtered = {t: step2_batch_result[t] for t in titles if t in step2_batch_result}
    if not filtered:
        print("找不到指定的 title")
        return

    gate_stats = Counter()
    label_stats = Counter()
    step3_batch_result = {}

    for title, raw in filtered.items():
        print(f"\n🔎 Debugging {title}")
        entities = normalize_step2_result(title, raw)
        if not entities:
            print("⚠️ 無法解析，呼叫 debug_json_failure")
            debug_json_failure(raw)
            continue

        entity_outputs = {}
        for entity, meta in entities.items():
            entity_type = meta.get("entity_type", "")
            asian_status = meta.get("asian_status", "")
            relevant_sentences = to_text(meta.get("relevant_sentences", "")).strip()

            # --- Gate
            gate_resp = get_response(build_gate_prompt(relevant_sentences))
            gate_json = parse_model_json(gate_resp, default={"has_reaction": "no", "evidence": ""})
            if str(gate_json.get("has_reaction", "no")).lower() != "yes":
                out = {
                    "entity_type": entity_type,
                    "asian_status": asian_status,
                    "reaction": "Cannot be inferred",
                    "reaction_reason": ""
                }
                entity_outputs[entity] = out
                label_stats["Cannot be inferred"] += 1
                continue

            # --- Classifier
            cls_resp = get_response(build_classifier_prompt(entity_type, asian_status, relevant_sentences))
            cls_json = parse_model_json(cls_resp, default={"reaction": "Cannot be inferred", "reaction_reason": ""})
            out = {
                "entity_type": entity_type,
                "asian_status": asian_status,
                "reaction": cls_json.get("reaction", "Cannot be inferred"),
                "reaction_reason": cls_json.get("reaction_reason", "")
            }
            entity_outputs[entity] = out
            label_stats[out["reaction"]] += 1

        step3_batch_result[title] = entity_outputs

    base = os.path.basename(step2_file).replace("step2_", "step3_").replace(".json", f"{out_suffix}.json")
    out_path = os.path.join(out_dir, base)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(step3_batch_result, f, ensure_ascii=False, indent=2)

    print(f"✅ 已完成 {list(step3_batch_result.keys())}，輸出 {out_path}")
    print("   Gate stats:", dict(gate_stats))
    print("   Label stats:", dict(label_stats))


# 或跑多篇
run_step3_for_titles("step2_batches/step2_batch_9.json", titles=["Article_440",
    "Article_465"])


🔎 Debugging Article_440

🔎 Debugging Article_465
✅ 已完成 ['Article_440', 'Article_465']，輸出 step3_batches_debug\step3_batch_9__subset_debug.json
   Gate stats: {}
   Label stats: {'Politicians initiated anti-Asian hate and racism': 1, 'Color blind/minimizing racism': 2, 'Support Asian Americans': 2, 'Feeling hopeless or support AAPI being not enough': 2, 'Advocacy/take actions for changes': 1, 'Cannot be inferred': 3}


In [165]:
# -*- coding: utf-8 -*-
import os, json, re
from collections import Counter
from typing import Any, Dict
from openai import OpenAI

# ==============================
# OpenAI client
# ==============================
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_response(prompt: str, temperature: float = 0.0) -> str:
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return resp.choices[0].message.content
    except Exception as e:
        print("❌ API error:", e)
        return "{}"

# ==============================
# JSON utilities
# ==============================
def strip_code_fences(s: str) -> str:
    if not isinstance(s, str):
        return s
    s = s.strip()
    m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.I)
    return m.group(1).strip() if m else s

def _sanitize_json_like(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = strip_code_fences(s).lstrip("\ufeff")
    s = "".join(ch for ch in s if ch.isprintable() or ch in "\n\r\t")
    m = re.search(r"(\{[\s\S]*\}|\[[\s\S]*\])", s)
    if m:
        s = m.group(1)
    s = re.sub(r",(\s*[}\]])", r"\1", s)
    return s.strip()

def parse_model_json(s: str, default: Dict[str, Any]) -> Dict[str, Any]:
    if not isinstance(s, str) or not s.strip():
        return default
    cleaned = _sanitize_json_like(s)
    try:
        return json.loads(cleaned)
    except Exception:
        return default

def to_text(x: Any) -> str:
    if x is None: return ""
    if isinstance(x, str): return x
    if isinstance(x, list): return "\n".join(str(item) for item in x)
    if isinstance(x, dict):
        if "relevant_sentences" in x:
            return to_text(x["relevant_sentences"])
        return "\n".join(to_text(v) for v in x.values())
    return str(x)

def normalize_step2_result(title: str, raw_obj) -> Dict[str, Dict[str, str]]:
    if isinstance(raw_obj, dict) and all(isinstance(v, dict) for v in raw_obj.values()):
        out = {}
        for k, v in raw_obj.items():
            out[k] = {
                "entity_type": v.get("entity_type", ""),
                "asian_status": v.get("asian_status", ""),
                "relevant_sentences": to_text(v.get("relevant_sentences"))
            }
        return out
    return {}

# ==============================
# Step4 Prompt
# ==============================
step4_prompt = """You are a sociology professor with 30 years of experience analyzing racial dynamics and anti-Asian racism in the United States.

### Step 4: Infer **Emotions and Their Intensity**

Your task is to analyze the **extracted sentences** from **Step 2** and infer each entity's **emotional stance** toward anti-Asian hate.

You will also receive metadata from Step 2, including:
- `entity_type`
- `asian_status`

Use only the exact `relevant_sentences` from Step 2 as your source.

---

## Emotion Concept Tree
- Love 
- Joy 
- Anger 
- Sadness 
- Fear 
- Surprise 

---

## Instructions
1. Focus only on emotions — do NOT infer actions.
2. If no emotion is expressed, output `"emotion": "Cannot be inferred"`.
3. If multiple emotions appear, list multiple objects.
4. Use the exact sentence(s) as `"emotion_reason"`.

---

## Output format
{
  "Entity Name": {
    "entity_type": "...",
    "asian_status": "...",
    "emotions": [
      {
        "emotion": "deepest matched term or Cannot be inferred",
        "emotion_reason": "Exact sentence(s)"
      }
    ]
  }
}
"""

# ==============================
# Step4 主流程 + Debug
# ==============================
def run_step4_for_titles(step2_file="step2_batches/step2_batch_1.json",
                         titles=None,
                         out_dir="step4_batches_debug",
                         out_suffix="__subset_debug"):
    os.makedirs(out_dir, exist_ok=True)

    with open(step2_file, "r", encoding="utf-8") as f:
        step2_batch_result = json.load(f)

    if titles is None:
        titles = [next(iter(step2_batch_result.keys()), None)]

    filtered = {t: step2_batch_result[t] for t in titles if t in step2_batch_result}
    if not filtered:
        print("找不到指定的 title")
        return

    step4_batch_result = {}
    emo_stats = Counter()

    for title, raw in filtered.items():
        print(f"\n🔎 Debugging {title}")
        entities = normalize_step2_result(title, raw)
        if not entities:
            print(f"⚠️ {title} 無法解析 step2 結果")
            continue

        entity_outputs = {}
        for entity, meta in entities.items():
            entity_type = meta.get("entity_type", "")
            asian_status = meta.get("asian_status", "")
            relevant_sentences = to_text(meta.get("relevant_sentences", "")).strip()

            full_prompt = (
                step4_prompt +
                f"\n\nEntity: {entity}\nentity_type: {entity_type}\nasian_status: {asian_status}\nrelevant_sentences:\n{relevant_sentences}"
            )

            resp = get_response(full_prompt)
            parsed = parse_model_json(resp, default={
                entity: {
                    "entity_type": entity_type,
                    "asian_status": asian_status,
                    "emotions": [{"emotion": "Cannot be inferred", "emotion_reason": ""}]
                }
            })
            entity_outputs[entity] = parsed.get(entity, parsed)

        step4_batch_result[title] = entity_outputs
        emo_stats["done"] += 1

    # === 輸出 ===
    base = os.path.basename(step2_file).replace("step2_", "step4_").replace(".json", f"{out_suffix}.json")
    out_path = os.path.join(out_dir, base)

    # 如果檔案已經存在 → 合併舊結果
    if os.path.exists(out_path):
        with open(out_path, "r", encoding="utf-8") as f:
            old_data = json.load(f)
        old_data.update(step4_batch_result)
        step4_batch_result = old_data

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(step4_batch_result, f, ensure_ascii=False, indent=2)

    print(f"✅ 已完成 {list(filtered.keys())}，輸出 {out_path}")
    print("   Stats:", dict(emo_stats))


# ==============================
# 執行範例
# ==============================
run_step4_for_titles("step2_batches/step2_batch_1.json", titles=["Article_40"])
run_step4_for_titles("step2_batches/step2_batch_5.json", titles=["Article_255"])
run_step4_for_titles("step2_batches/step2_batch_7.json", titles=["Article_349", "Article_350", "Article_379"])
run_step4_for_titles("step2_batches/step2_batch_8.json", titles=["Article_429"])
run_step4_for_titles("step2_batches/step2_batch_10.json", titles=["Article_524"])
run_step4_for_titles("step2_batches/step2_batch_11.json", titles=["Article_553"])



🔎 Debugging Article_40
✅ 已完成 ['Article_40']，輸出 step4_batches_debug\step4_batch_1__subset_debug.json
   Stats: {'done': 1}

🔎 Debugging Article_255
✅ 已完成 ['Article_255']，輸出 step4_batches_debug\step4_batch_5__subset_debug.json
   Stats: {'done': 1}

🔎 Debugging Article_349

🔎 Debugging Article_350

🔎 Debugging Article_379
✅ 已完成 ['Article_349', 'Article_350', 'Article_379']，輸出 step4_batches_debug\step4_batch_7__subset_debug.json
   Stats: {'done': 3}

🔎 Debugging Article_429
✅ 已完成 ['Article_429']，輸出 step4_batches_debug\step4_batch_8__subset_debug.json
   Stats: {'done': 1}

🔎 Debugging Article_524
✅ 已完成 ['Article_524']，輸出 step4_batches_debug\step4_batch_10__subset_debug.json
   Stats: {'done': 1}

🔎 Debugging Article_553
✅ 已完成 ['Article_553']，輸出 step4_batches_debug\step4_batch_11__subset_debug.json
   Stats: {'done': 1}


In [159]:
def rerun_step3_to_subset(step2_file, titles, out_dir="step3_batches_debug", out_suffix="__subset_debug"):
    os.makedirs(out_dir, exist_ok=True)

    # 讀 step2
    with open(step2_file, "r", encoding="utf-8") as f:
        step2_batch_result = json.load(f)

    # 過濾出要跑的文章
    filtered = {t: step2_batch_result[t] for t in titles if t in step2_batch_result}
    if not filtered:
        print("⚠️ 找不到指定的 titles")
        return

    step3_batch_result = {}

    for title, raw in filtered.items():
        print(f"\n🔎 Re-running {title}")
        entities = normalize_step2_result(title, raw)
        if not entities:
            print(f"⚠️ {title} step2 無法解析")
            debug_json_failure(raw)
            continue

        entity_outputs = {}
        for entity, meta in entities.items():
            entity_type = meta.get("entity_type", "")
            asian_status = meta.get("asian_status", "")
            relevant_sentences = to_text(meta.get("relevant_sentences", "")).strip()

            # --- Gate
            gate_resp = get_response(build_gate_prompt(relevant_sentences))
            gate_json = parse_model_json(gate_resp, default={"has_reaction": "no", "evidence": ""})
            if str(gate_json.get("has_reaction", "no")).lower() != "yes":
                entity_outputs[entity] = {
                    "entity_type": entity_type,
                    "asian_status": asian_status,
                    "reaction": "Cannot be inferred",
                    "reaction_reason": ""
                }
                continue

            # --- Classifier
            cls_resp = get_response(build_classifier_prompt(entity_type, asian_status, relevant_sentences))
            cls_json = parse_model_json(cls_resp, default={"reaction": "Cannot be inferred", "reaction_reason": ""})
            entity_outputs[entity] = {
                "entity_type": entity_type,
                "asian_status": asian_status,
                "reaction": cls_json.get("reaction", "Cannot be inferred"),
                "reaction_reason": cls_json.get("reaction_reason", "")
            }

        step3_batch_result[title] = entity_outputs
        print(f"✅ {title} 已完成")

    # 輸出到 subset 檔案
    base = os.path.basename(step2_file).replace("step2_", "step3_").replace(".json", f"{out_suffix}.json")
    out_path = os.path.join(out_dir, base)

    # 如果 subset 檔案已存在 → 合併
    if os.path.exists(out_path):
        with open(out_path, "r", encoding="utf-8") as f:
            existing = json.load(f)
    else:
        existing = {}

    existing.update(step3_batch_result)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(existing, f, ensure_ascii=False, indent=2)

    print(f"💾 已寫回 {out_path}，目前共 {len(existing)} 篇")

rerun_step3_to_subset(
    step2_file="step2_batches/step2_batch_5.json",
    titles=["Article_220"],
    out_dir="step3_batches_debug",
    out_suffix="__subset_debug"
)



🔎 Re-running Article_220
✅ Article_220 已完成
💾 已寫回 step3_batches_debug\step3_batch_5__subset_debug.json，目前共 7 篇


In [161]:
# 修復後的結果合併到step3_all

import os
import glob
import json
import pandas as pd

def merge_step3_with_existing(all_json="step3_all.json", all_csv="step3_all.csv",
                              subset_dir="step3_batches_debug",
                              prefix="step3_batch_", suffix="__subset_debug.json"):
    # 先讀舊的 all.json
    if os.path.exists(all_json):
        with open(all_json, "r", encoding="utf-8") as f:
            merged_result = json.load(f)
        print(f"📂 載入既有 {all_json}，已有 {len(merged_result)} 篇文章")
    else:
        merged_result = {}
        print(f"⚠️ 找不到 {all_json}，建立新檔案")

    # 找 subset 檔案
    subset_files = sorted(glob.glob(os.path.join(subset_dir, f"{prefix}*{suffix}")))
    print(f"🔍 偵測到 {len(subset_files)} 個 subset 檔案")

    # 合併 subset → all
    for file in subset_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            before = len(merged_result)
            merged_result.update(data)  # ⚠️ 若有同一篇文章，會覆蓋
            after = len(merged_result)
        print(f"✅ 合併 {os.path.basename(file)}，+{after-before} 篇")

    # 存回 all.json
    with open(all_json, "w", encoding="utf-8") as f:
        json.dump(merged_result, f, ensure_ascii=False, indent=2)

    # 轉成 CSV
    rows = []
    idx = 1
    for article_id, entities in merged_result.items():
        for entity, meta in entities.items():
            rows.append({
                "reaction_id": f"reaction_{idx}",
                "article_id": article_id,
                "entity": entity,
                "entity_type": meta.get("entity_type", ""),
                "asian_status": meta.get("asian_status", ""),
                "reaction": meta.get("reaction", ""),
                "reaction_reason": meta.get("reaction_reason", "")
            })
            idx += 1

    df = pd.DataFrame(rows)
    df.to_csv(all_csv, index=False, encoding="utf-8-sig")

    print(f"💾 已更新 {all_json} 和 {all_csv}，共 {len(df)} 筆 reactions")

# 使用範例
if __name__ == "__main__":
    merge_step3_with_existing(
        all_json="step3_all.json",
        all_csv="step3_all.csv",
        subset_dir="step3_batches_debug",
        prefix="step3_batch_",
        suffix="__subset_debug.json"
    )


📂 載入既有 step3_all.json，已有 584 篇文章
🔍 偵測到 11 個 subset 檔案
✅ 合併 step3_batch_10__subset_debug.json，+0 篇
✅ 合併 step3_batch_11__subset_debug.json，+0 篇
✅ 合併 step3_batch_1__subset_debug.json，+0 篇
✅ 合併 step3_batch_2__subset_debug.json，+0 篇
✅ 合併 step3_batch_3__subset_debug.json，+0 篇
✅ 合併 step3_batch_4__subset_debug.json，+0 篇
✅ 合併 step3_batch_5__subset_debug.json，+0 篇
✅ 合併 step3_batch_6__subset_debug.json，+0 篇
✅ 合併 step3_batch_7__subset_debug.json，+0 篇
✅ 合併 step3_batch_8__subset_debug.json，+0 篇
✅ 合併 step3_batch_9__subset_debug.json，+0 篇
💾 已更新 step3_all.json 和 step3_all.csv，共 6063 筆 reactions


In [167]:
# 修復後的結果合併到step4_all

# -*- coding: utf-8 -*-
import os
import glob
import json
import pandas as pd

def merge_step4_with_existing(all_json="step4_all.json", all_csv="step4_all.csv",
                              subset_dir="step4_batches_debug",
                              prefix="step4_batch_", suffix="__subset_debug.json"):
    # 先讀舊的 all.json
    if os.path.exists(all_json):
        with open(all_json, "r", encoding="utf-8") as f:
            merged_result = json.load(f)
        print(f"📂 載入既有 {all_json}，已有 {len(merged_result)} 篇文章")
    else:
        merged_result = {}
        print(f"⚠️ 找不到 {all_json}，建立新檔案")

    # 找 subset 檔案
    subset_files = sorted(glob.glob(os.path.join(subset_dir, f"{prefix}*{suffix}")))
    print(f"🔍 偵測到 {len(subset_files)} 個 subset 檔案")

    # 合併 subset → all
    for file in subset_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            before = len(merged_result)
            merged_result.update(data)  # ⚠️ 若有同一篇文章，會覆蓋
            after = len(merged_result)
        print(f"✅ 合併 {os.path.basename(file)}，+{after-before} 篇")

    # 存回 all.json
    with open(all_json, "w", encoding="utf-8") as f:
        json.dump(merged_result, f, ensure_ascii=False, indent=2)

    # 轉成 CSV
    rows = []
    idx = 1
    for article_id, entities in merged_result.items():
        for entity, meta in entities.items():
            emotions = meta.get("emotions", [])

            # emotions 可能是字串或 list
            if isinstance(emotions, str):
                rows.append({
                    "emotion_id": f"emotion_{idx}",
                    "article_id": article_id,
                    "entity": entity,
                    "entity_type": meta.get("entity_type", ""),
                    "asian_status": meta.get("asian_status", ""),
                    "emotion": emotions,
                    "emotion_reason": ""
                })
                idx += 1

            elif isinstance(emotions, list):
                if not emotions:  # 空 list
                    rows.append({
                        "emotion_id": f"emotion_{idx}",
                        "article_id": article_id,
                        "entity": entity,
                        "entity_type": meta.get("entity_type", ""),
                        "asian_status": meta.get("asian_status", ""),
                        "emotion": "Cannot be inferred",
                        "emotion_reason": ""
                    })
                    idx += 1
                else:
                    for emo in emotions:
                        if isinstance(emo, dict):
                            rows.append({
                                "emotion_id": f"emotion_{idx}",
                                "article_id": article_id,
                                "entity": entity,
                                "entity_type": meta.get("entity_type", ""),
                                "asian_status": meta.get("asian_status", ""),
                                "emotion": emo.get("emotion", "Cannot be inferred"),
                                "emotion_reason": emo.get("emotion_reason", "")
                            })
                        else:  # list 裡還是字串
                            rows.append({
                                "emotion_id": f"emotion_{idx}",
                                "article_id": article_id,
                                "entity": entity,
                                "entity_type": meta.get("entity_type", ""),
                                "asian_status": meta.get("asian_status", ""),
                                "emotion": str(emo),
                                "emotion_reason": ""
                            })
                        idx += 1

    df = pd.DataFrame(rows)
    df.to_csv(all_csv, index=False, encoding="utf-8-sig")

    print(f"💾 已更新 {all_json} 和 {all_csv}，共 {len(df)} 筆 emotions")


# 使用範例
if __name__ == "__main__":
    merge_step4_with_existing(
        all_json="step4_all.json",
        all_csv="step4_all.csv",
        subset_dir="step4_batches_debug",
        prefix="step4_batch_",
        suffix="__subset_debug.json"
    )


📂 載入既有 step4_all.json，已有 584 篇文章
🔍 偵測到 6 個 subset 檔案
✅ 合併 step4_batch_10__subset_debug.json，+0 篇
✅ 合併 step4_batch_11__subset_debug.json，+0 篇
✅ 合併 step4_batch_1__subset_debug.json，+0 篇
✅ 合併 step4_batch_5__subset_debug.json，+0 篇
✅ 合併 step4_batch_7__subset_debug.json，+0 篇
✅ 合併 step4_batch_8__subset_debug.json，+0 篇
💾 已更新 step4_all.json 和 step4_all.csv，共 8127 筆 emotions


In [164]:
# 自動修復

import json, re

def fix_relevant_sentences_block(txt: str) -> str:
    """
    找出 relevant_sentences 區塊，把裡面的句子逐條修正
    """
    def fix_sentence_block(match):
        block = match.group(0)
        # 抽取句子內容（刪掉 JSON 格式）
        sentences = re.findall(r'"(.*?)"', block, flags=re.S)
        fixed = []
        for s in sentences:
            # 合併報導式引號
            s = re.sub(r',"\s*([A-Z][^"]+?\s+said)', r', \1', s)
            s = re.sub(r'"\s*([A-Z][^"]+?\s+said)', r' \1', s)
            # 刪掉句子中殘留的裸引號
            s = s.replace('\\"', '"')  # 避免重複 escape
            s = re.sub(r'(?<!\\)"', "'", s)  # 把內部裸引號換成單引號
            fixed.append(s.strip())
        # 重建成合法 JSON 陣列
        rebuilt = "[\n      " + ",\n      ".join(json.dumps(s) for s in fixed) + "\n    ]"
        return rebuilt

    # 找 relevant_sentences block
    return re.sub(r'\[\s*(".*?")\s*\]', fix_sentence_block, txt, flags=re.S)


# def fix_unclosed_blocks(txt: str) -> str:
#     """
#     嘗試修復未關閉的 relevant_sentences 區塊
#     """
#     # 如果有 relevant_sentences: [ 但後面沒有 ]
#     if '"relevant_sentences": [' in txt and not re.search(r'\]\s*\}', txt):
#         print("🔧 偵測到 relevant_sentences 未關閉，補上 ] }")
#         # 補上缺失的結尾
#         txt = re.sub(r'("relevant_sentences": \[[^\]]+)$',
#                      r'\1\n    ]\n  }',
#                      txt, flags=re.S)
#     return txt
    
def remove_trailing_commas(txt: str) -> str:
    """
    移除 JSON 中不合法的尾逗號
    """
    # 陣列或物件結尾前的逗號
    txt = re.sub(r",(\s*[\]}])", r"\1", txt)
    return txt

def fix_outer_braces(txt: str) -> str:
    """
    確保 JSON 以 { 開頭，以 } 結尾
    """
    txt = txt.strip()
    if not txt.startswith("{"):
        txt = "{\n" + txt
    if not txt.endswith("}"):
        txt = txt + "\n}"
    return txt

def fix_unclosed_blocks(txt: str) -> str:
    if '"relevant_sentences": [' in txt and not re.search(r'\]\s*[\},]', txt):
        print("🔧 偵測到 relevant_sentences 未關閉，補上 ] }")
        txt = re.sub(r'("relevant_sentences": \[[^\]]+)$',
                     r'\1\n    ]\n  }',
                     txt, flags=re.S)
    return txt

def sanitize_relevant_sentences(txt: str) -> str:
    def fix_block(match):
        block = match.group(0)
        sentences = re.findall(r'"(.*?)"', block, flags=re.S)

        fixed = []
        for s in sentences:
            s = s.replace("\n", " ").strip()   # 清理換行
            fixed.append(s)

        return "[\n      " + ",\n      ".join(json.dumps(s) for s in fixed) + "\n    ]"

    return re.sub(r'\[\s*(".*?")\s*\]', fix_block, txt, flags=re.S)



def fix_single_article(step2_file, article_id):
    with open(step2_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    raw = data[article_id]

    if isinstance(raw, str):
        txt = raw.strip()
        if txt.startswith("```"):
            txt = re.sub(r"^```json", "", txt, flags=re.I).strip()
            txt = re.sub(r"```$", "", txt).strip()

        # 修 relevant_sentences
        txt = fix_relevant_sentences_block(txt)
        txt = sanitize_relevant_sentences(txt)
        txt = fix_unclosed_blocks(txt)
        txt = remove_trailing_commas(txt)
        txt = fix_outer_braces(txt)

        try:
            parsed = json.loads(txt)
        except json.JSONDecodeError as e:
            print(f"❌ 修復後仍無法 parse: {e}")
            print("⚠️ 修復後片段:\n", "\n".join(txt.splitlines()[:20]))
            return

        # ✅ 放回 Article_x
        data[article_id] = parsed

    # ✅ 存回完整 batch
    with open(step2_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"✅ {article_id} 已修復並寫回 {step2_file}")



# 測試修復 batch_2 的幾篇文章
# fix_single_article("step2_batches/step2_batch_2.json", "Article_59")
# fix_single_article("step2_batches/step2_batch_4.json", "Article_197")
fix_single_article("step2_batches/step2_batch_7.json", "Article_349")


✅ Article_349 已修復並寫回 step2_batches/step2_batch_7.json


In [156]:
# 手動修復

import json

def manual_fix_articles(step2_file, fixes: dict):
    """
    fixes: dict，key = Article_x，value = Python dict（修正版 JSON）
    """
    with open(step2_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for article_id, fixed_content in fixes.items():
        if article_id in data:
            data[article_id] = fixed_content
            print(f"✅ {article_id} 已手動修復")
        else:
            print(f"⚠️ {article_id} 不存在於 {step2_file}")

    with open(step2_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"💾 已寫回 {step2_file}")


# =======================
# 直接手動修復三篇文章
# =======================
fixes_batch5 = {
    "Article_220": {
        "Sen. Kevin Thomas (D-Levittown)": {
            "entity_type": "politician",
            "asian_status": "Asian",
            "relevant_sentences": [
                "We are not looking for any different way of treating us, said Sen. Kevin Thomas (D-Levittown), who arrived at the United States at the age of 10.",
                "Just treat us the same as everyone else."
            ]
        }
    }
}

manual_fix_articles("step2_batches/step2_batch_5.json", fixes_batch5)



✅ Article_220 已手動修復
💾 已寫回 step2_batches/step2_batch_5.json


In [8]:
import glob, json

missing_articles = [
    "Article_218", "Article_492", "Article_493", "Article_350", "Article_379", "Article_429", "Article_524", "Article_553"
]

step2_files = sorted(glob.glob("step2_batches/step2_batch_*.json"))
missing_map = {}

for f in step2_files:
    with open(f, "r", encoding="utf-8") as fh:   # ✅ 用 fh
        data = json.load(fh)                    # ✅ json.load(fh) 而不是 f
    for art in missing_articles:
        if art in data:
            missing_map.setdefault(f, []).append(art)

print(json.dumps(missing_map, indent=2, ensure_ascii=False))


{
  "step2_batches\\step2_batch_10.json": [
    "Article_492",
    "Article_493",
    "Article_524"
  ],
  "step2_batches\\step2_batch_11.json": [
    "Article_553"
  ],
  "step2_batches\\step2_batch_5.json": [
    "Article_218"
  ],
  "step2_batches\\step2_batch_7.json": [
    "Article_350",
    "Article_379"
  ],
  "step2_batches\\step2_batch_8.json": [
    "Article_429"
  ]
}


# 重分 entity_type

In [7]:
import pandas as pd

# === 1. 讀取原始資料 ===
df = pd.read_csv("step3_all_new.csv")  # 換成你的檔案名稱

# === 2. 定義標準化對應表 ===
entity_type_mapping = {
    # Individuals
    "victim": "victims",
    "victims": "victims",
    "perpetrator": "perpetrators",
    "perpetrators": "perpetrators",
    "politician": "politicians",
    "politicians": "politicians",
    "professional": "professionals",
    "professionals": "professionals",
    "celebrity": "celebrities",
    "musician": "celebrities",
    "actor": "celebrities",
    "actress": "celebrities",
    "journalist": "professionals",
    "reporter": "professionals",
    "professor": "professionals",
    "student": "professionals",
    "educator": "professionals",
    "scholar": "professionals",
    "adjunct instructor": "professionals",
    "attorney": "professionals",
    "director": "professionals",
    "city_manager": "professionals",
    "sociology professor": "professionals",
    "editor": "professionals",
    "deputy inspector": "law_enforcement_agencies",
    "police_officer": "law_enforcement_agencies",
    "police spokesperson": "law_enforcement_agencies",
    "family_member": "other_individuals",
    "friend": "other_individuals",
    "witness": "other_individuals",
    "individual": "other_individuals",
    "individuals": "other_individuals",
    "general public": "other_individuals",
    "general_public": "other_individuals",
    "social_circle": "other_individuals",
    "community_activist": "other_individuals",
    "community_leader": "other_individuals",
    "organizer": "other_individuals",
    "community organizer": "other_individuals",
    "other individual": "other_individuals",
    "rally organizer": "other_individuals",
    "activist": "other_individuals",
    "supporter": "other_individuals",
    "co-host": "celebrities",
    "artist": "celebrities",
    "former assistant district attorney": "professionals",
    "official": "professionals",
    "non-Asian": "other_individuals",
    "youth coordinator": "professionals",
    "school_board_member": "professionals",
    "Dean": "professionals",
    "community leader": "other_individuals",
    "government body": "government_bodies",

    # Organizations
    "law_enforcement_agency": "law_enforcement_agencies",
    "law_enforcement_agencies": "law_enforcement_agencies",
    "government_body": "government_bodies",
    "government_bodies": "government_bodies",
    "ngo_or_advocacy_group": "ngo_or_advocacy_groups",
    "ngo_or_advocacy_groups": "ngo_or_advocacy_groups",
    "business_entity": "business_entities",
    "business_entities": "business_entities",
    "community_group": "community_groups",
    "community_groups": "community_groups",
    "educational_institution": "government_bodies",  # 假設為正式機構

    # Fallback
    "other": "other_individuals",
    "other_individual": "other_individuals",
    "other_individuals": "other_individuals",
    "group": "unknown",
    # "Cannot be inferred": "unknown",
}

# === 3. 替換 entity_type 欄位（直接覆蓋）===
df["entity_type"] = df["entity_type"].map(entity_type_mapping).fillna(df["entity_type"])

# === 4. 輸出成新檔案 ===
df.to_csv("step3_all_new.csv", index=False)
print("✅ finish")


✅ finish


In [4]:
import pandas as pd
import json
from openai import OpenAI

client = OpenAI()

# ======================
# LLM 分類提示
# ======================
def build_refine_prompt(relevant_sentences: str) -> str:
    return f"""You are a sociology professor re-checking misclassified reactions to anti-Asian hate.

Task: Re-classify the reaction based ONLY on the exact `reaction_reason`.

Reaction Concept Tree:
- Support Asian Americans:People or organizations condemned/do not want Anti-Asian incidents to happen, including all forms of crimes, attacks, violence, assaults, physical, verbal, and online harassment. This kind of support is at a conscientious (cognitive) level, not yet taking concrete actions to stop AAPI hate. 
  - Attending marches/rallies:People or organizations actively attended marches/rallies for supporting Asian American communities.
  - Speaking up on social media:People or organizations spoke up in public, such as via social media, to condemn Anti-Asian hate.
  - Calling for being united:Asians/Asian Americans become united to support each other to overcome Anti-Asian hate.
  - Fostering conversations about anti-Asian hate:The public fostered conversations regarding the Anti-Asian hate issues. Some organizations/groups (e.g., Asian American communities) also think it’s important to have conversations to address the root cause behind the Anti-Asian hate incidents so this will help us to make change.
  - Providing shopkeepers with air horns:Some stores or groups provide shopkeepers with air horns. If any anti-Asian hate crimes happened, they can use the air horns to draw everyone’s attention
- Advocacy/take actions for changes:Individuals, groups, or organizations want or advocate for changing the current situation where Asians/Asian Americans suffered from Anti-Asian hate, such as physical, verbal, and online harassment, attacks, violence, assaults, and hate crimes. They want cultural shift, open dialogue and listening sessions regarding incidents, practical change in racial stereotypes
and perceptions, more awareness about Anti-Asian hate, and human rights.
- Politicians initiated anti-Asian hate and racism:Trump and some republican politicians made a lot of comments on the COVID-19 pandemic. One kind of comment is that he used/dubbed Asian-related objects to combine with disease (virus/flu/covid) or directly calling coronavirus/covid (e.g., country’s or region’s names or Kung flu[modified by Kung Fu], or races) to verbally attack Asians/Asian Americans, such as  “China/Chinese virus” or “Kung flu.” Another type of comment is that he blamed China for causing the pandemic. Such kinds of comments initiated racism, Anti-Asian bigotry/hate, and Anti-Asian hate incidents in the US society. Such comments also led some Americans to blame Asians/Asian Americans for causing the pandemic.
- Undermining human rights:People want to dehumanize and to undermine the fundamental rights, dignity and belonging of those they target.
- Color blind/minimizing racism:Some Americans do not believe discrimination, racism, or racist bias/bigotry against Asians/Asian Americans exists in the community. Incidents of anti-Asian hate (including physical, verbal, and online harassment, attacks, violence, assaults, and Anti-Asian crimes) were downplayed, ignored, or perceived as not existing by the public, the law enforcement system (e.g., the police), and governors (e.g., Mr. Donald Trump). When Asians/Asian Americans were attacked, Anti-Asian hate or racism was not perceived as the perpetrators’ motives/motivations by the police or the perpetrators said their motivations were not triggered by Anti-Asian bigotry or racism.
- Youth as not an excuse:Robert Aaron Long murdered eight people in the incident of the 2021 Atlanta Spa Shootings. In news reports, he was called “the 21-year-old.” Some comments advocated stopping calling him “the 21-year-old” as if his youth is an excuse to murder others because of their race, ethnicities, and sex.
- Videotaping/confronting harasser/attacker:Asians/Asian Americans or bystanders videotaped/recorded the incidents of physical or verbal harassment; Anti-Asian attacks, assaults, or violence; and Anti-Asian crimes. Asians/Asian Americans who suffered physical harassment, attacks, assaults, violence attacked back to the harassers or attackers. Bystanders’ behaviors aim to defense those who were attacked. Cell phones and survelliance system can be used for videotaping or recording. Additionally, those who experienced verbal harassment speak out to the harassers to let them know their thoughts were biased, offensive, and unjust and tell them to stop. 
- Sex (sexual) addiction:Excessive sexual thoughts, desires, urges or behaviors that can’t be controlled and cause distress and harm to your relationships, finances and other aspects of life. It is also called hypersexuality or compulsive sexual behavior. It is what the Atlanta shooter claimed as a motivation that led to his senseless killings of the victims.
- Religion as a reason:In the 2021 Atlanta Spa Shootings, Robert Aaron Long was the killer who murdered eight people. He told the police that his motive was religious guilt about his sexuality. He said he had sexual desire so he wanted to eliminate it. That’s why he went to the spa to skill women of Asian descent. Asian advocacy groups mentioned whether the killer’s motive was religious guilt about his sexuality, no one should ignore the broader context of Anti-Asian violence and hate crimes. Asian advocacy groups tend to attribute the killer’s motive stems from racism or xenophobia, misogyny, and gendered racism
- Feeling hopeless or support AAPI being not enough:Asians/Asian Americans felt worried, frustrated, anxious, and afraid that they may experience Anti-Asian hate crimes, attacks, assaults, and violence. But they felt that nothing happened to stop them. Support for Asian American communities is not enough.
- Not confronting attacker/harasser or not reporting:Asians/Asian Americans did not want to confront attackers/harassers/bullies who physically or verbally harassed or attacked them. They thought it is not worthy of reporting the incidents. They did not want to confront because they were afraid of their safety. They just wanted to leave from the incidents soon.
- Useless law enforcement:Police did not take a police report and denied there was an Anti-Asian hate crime for the incidents of physical, verbal, or online harassment, attacks, assaults, violence, and Anti-Asian crimes. Another situation is that police affirmed there was a crime, but the motivation did not come from Anti-Asian hate or bigotry/prejudice or racism. Additionally, Asian Americans thought if police often patrolled the streets, a lot of Anti-Asian hate crimes, attacks, assaults, and violence would not happen. But in reality, policy did not do so. 
  - Did not take a report on Anti-Asian hate crime:police did not take a report on Anti-Asian hate crime, including physical, verbal, or online harassment, attacks, assaults, and violence.
  - Did not often patrol the streets:police affirmed there was a crime, but police did not often patrol the streets so that there were a lot of Anti-Asian hate crimes (e.g., physical, verbal, or online harassment, attacks, assaults, and violence) happened.
- Takes actions to stop AAPI hate:After the incidents of Anti-Asian hate crimes, attacks, assaults, and violence, state or city government or individuals take concrete actions that aim to stop AAPI hate.
  - Installing hotlines:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments) install hotlines for victims or people who witness Anti-Asian incidents to report.
  - Launching a hate-crime task force:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments) launched an Asian hate crime task force to develop approaches to stopping anti-Asian hate crimes.
  - Making an announcement to condemn anti-Asian hate:This is a type of action to stop AAPI hate. Some organizations (e.g., city and state governments)  made an open announcement to condemn anti-Asian hate.
  - Increasing patrols:Some organizations (e.g., city and state governments) increased patrolling the streets to ensure the safety of Asian Americans. 
  - Organizing a town hall:Some organizations (e.g., city and state governments) organized a town hall meeting to discuss how to stop anti-Asian hate racism.
  - Hiring security guards:Some stores or groups hired security guards to increase safety for Asian Americans and prevent anti-Asian hate crimes or racism.
  - Educating students:Schoolteachers and university faculty took actions to educate students on current social and political issues on Anti-Asian hate. They aim to use education to change the public’s view about Asian Americans/Asians and increase the awareness of respecting Asian Americans/Asians
  - Rewarding the public to report the info about the suspects:Individuals, groups, or organizations provide rewards to the public when they report any information regarding the suspects who may commit anti-Asian hate crimes.


Strict Rules:
- Use ONLY the given reaction_reason, no outside knowledge.
- If no observable reaction, return "Cannot be inferred".
- Always pick the most specific category.

reaction_reason:
{relevant_sentences}

Output JSON:
{{
  "reaction": "<one label from the tree>",
  "reaction_reason": "{relevant_sentences}"
}}
"""

def get_llm_response(prompt: str) -> dict:
    resp = client.chat.completions.create(
        model="gpt-4o-mini",   # 可換成你常用的模型
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
    text = resp.choices[0].message.content.strip()
    try:
        return json.loads(text)
    except:
        return {"reaction": "Cannot be inferred", "reaction_reason": ""}

# ======================
# 主程式：重分 Fostering
# ======================
# def reclassify_fostering(input_csv="step3_all_new.csv", output_csv="step3_all_refined.csv"):
#     df = pd.read_csv(input_csv)

#     # 找出 fostering 的資料
#     mask = df['reaction'] == "Fostering conversations about anti-Asian hate"
#     fostering_df = df[mask].copy()

#     print(f"🔎 找到 {len(fostering_df)} 筆 fostering 需要重分")

#     new_labels = []
#     for _, row in fostering_df.iterrows():
#         prompt = build_refine_prompt(str(row['reaction_reason']))
#         result = get_llm_response(prompt)
#         new_labels.append(result.get("reaction", "Cannot be inferred"))

#     # 更新回去
#     df.loc[mask, "reaction"] = new_labels

#     # 存新檔
#     df.to_csv(output_csv, index=False, encoding="utf-8-sig")
#     print(f"✅ 已完成重分，輸出到 {output_csv}")

# # ======================
# # 執行
# # ======================
# if __name__ == "__main__":
#     reclassify_fostering()

def reclassify_fostering(input_csv="step3_all_refined.csv", output_csv="step3_all_new_refined.csv"):
    df = pd.read_csv(input_csv)

    # 找出 fostering 的資料
    mask = df['reaction'] == "Support Asian Americans"
    fostering_df = df[mask].copy()

    print(f"🔎 找到 {len(fostering_df)} 筆 fostering 需要重分")

    new_labels = []
    for _, row in fostering_df.iterrows():
        prompt = build_refine_prompt(str(row['reaction_reason']))
        result = get_llm_response(prompt)
        new_labels.append(result.get("reaction", "Cannot be inferred"))

    # 更新回去
    df.loc[mask, "reaction"] = new_labels

    # 存新檔
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"✅ 已完成重分，輸出到 {output_csv}")

# ======================
# 執行
# ======================
if __name__ == "__main__":
    reclassify_fostering()

🔎 找到 118 筆 fostering 需要重分
✅ 已完成重分，輸出到 step3_all_refined.csv


# 重分 emotion

In [None]:
import pandas as pd

# 載入 CSV
df = pd.read_csv("step4_all_with_date.csv")

# 定義 emotion 對照表（細分類 → 六大情緒，全小寫）
emotion_map = {
    # love
    "love": "love",
    "support": "love", "solidarity": "love",
    "empathy": "love", "compassion": "love",
    "recognition": "love", "gratitude": "love",
    "appreciation": "love", "encouragement": "love",
    "affection": "love", "lust": "love", "longing": "love",

    # joy
    "joy": "joy",
    "confidence": "joy", "optimism": "joy", "empowerment": "joy",
    "cheerfulness": "joy", "zest": "joy", "contentment": "joy",
    "pride": "joy", "relief": "joy",

    # anger
    "anger": "anger",
    "outrage": "anger", "defiance": "anger", "responsibility": "anger",
    "irritation": "anger", "exasperation": "anger", "rage": "anger",
    "disgust": "anger", "envy": "anger", "determination": "anger",
    "urgency": "anger", "frustration": "anger",

    # sadness
    "sadness": "sadness",
    "worry": "sadness", "resignation": "sadness", "regret": "sadness",
    "mixed emotions": "sadness", "dismay": "sadness",
    "disquiet": "sadness", "disturbance": "sadness",
    "guilt": "sadness",
    "suffering": "sadness", "disappointment": "sadness", "shame": "sadness",
    "neglect": "sadness", "sympathy": "sadness", "heartbreak": "sadness",
    "pain": "sadness", "grief": "sadness", "grieving": "sadness",
    "hurt": "sadness", "loneliness": "sadness", "despondency": "sadness",
    "helplessness": "sadness", "exhaustion": "sadness",

    # fear
    "fear": "fear",
    "terror": "fear", "doubt": "fear",
    "alarm": "fear", "anxiety": "fear", "insecurity": "fear",
    "panic": "fear", "dread": "fear", "overwhelming": "fear",
    "overwhelmed": "fear", "horror": "fear", "shock": "fear",

    # surprise
    "surprise": "surprise",
    "confusion": "surprise", "lightbulb moment": "surprise",
    "amazement": "surprise", "wonder": "surprise"
}

def map_emotions(emotion_str):
    """把情緒收斂成六大基本情緒，其他歸為 cannot be inferred，全小寫"""
    if pd.isna(emotion_str):
        return "cannot be inferred"
    emotions = [e.strip().lower() for e in emotion_str.split("|")]
    mapped = [emotion_map.get(e, "cannot be inferred") for e in emotions]
    mapped = list(dict.fromkeys(mapped))  # 去重但保留順序
    return " | ".join(mapped)

# 建立新的欄位
df["emotion"] = df["emotion"].apply(map_emotions)

# 輸出結果
df.to_csv("step4_all_with_date.csv", index=False)
print("✅ 已完成：emotion 全部轉成小寫 (love, joy, anger, sadness, fear, surprise, cannot be inferred)")


In [168]:
# -*- coding: utf-8 -*-
import os
import glob
import json
import pandas as pd

def merge_step2_to_csv(input_dir="step2_batches", prefix="step2_batch_", 
                       output_json="step2_all.json", output_csv="step2_all.csv"):
    batch_files = sorted(glob.glob(os.path.join(input_dir, f"{prefix}*.json")))
    merged_result = {}

    # 合併所有 batch JSON
    for file in batch_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            merged_result.update(data)
        print(f"✅ 已讀取 {os.path.basename(file)}，目前總文章數：{len(merged_result)}")

    # 存成 step2_all.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(merged_result, f, ensure_ascii=False, indent=2)

    # 轉成 CSV
    rows = []
    idx = 1
    for article_id, entities in merged_result.items():
        if isinstance(entities, dict):  # 正常狀況
            for entity, meta in entities.items():
                rows.append({
                    "id": f"entity_{idx}",  # ← 這裡改成 id
                    "article_id": article_id,
                    "entity": entity,
                    "entity_type": meta.get("entity_type", ""),
                    "asian_status": meta.get("asian_status", ""),
                    "relevant_sentences": "\n".join(meta.get("relevant_sentences", [])) 
                                           if isinstance(meta.get("relevant_sentences", []), list)
                                           else str(meta.get("relevant_sentences", ""))
                })
                idx += 1
        else:
            # 如果 step2 有壞掉的（存成字串），就保留原始
            rows.append({
                "id": f"entity_{idx}",  # ← 同樣改成 id
                "article_id": article_id,
                "entity": "",
                "entity_type": "",
                "asian_status": "",
                "relevant_sentences": str(entities)
            })
            idx += 1

    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")

    print(f"💾 已輸出 {output_json} 和 {output_csv}，共 {len(df)} 筆 entities")

# 執行
if __name__ == "__main__":
    merge_step2_to_csv(
        input_dir="step2_batches",
        prefix="step2_batch_",
        output_json="step2_all.json",
        output_csv="step2_all.csv"
    )


✅ 已讀取 step2_batch_1.json，目前總文章數：50
✅ 已讀取 step2_batch_10.json，目前總文章數：100
✅ 已讀取 step2_batch_11.json，目前總文章數：150
✅ 已讀取 step2_batch_12.json，目前總文章數：184
✅ 已讀取 step2_batch_2.json，目前總文章數：234
✅ 已讀取 step2_batch_3.json，目前總文章數：284
✅ 已讀取 step2_batch_4.json，目前總文章數：334
✅ 已讀取 step2_batch_5.json，目前總文章數：384
✅ 已讀取 step2_batch_6.json，目前總文章數：434
✅ 已讀取 step2_batch_7.json，目前總文章數：484
✅ 已讀取 step2_batch_8.json，目前總文章數：534
✅ 已讀取 step2_batch_9.json，目前總文章數：584
💾 已輸出 step2_all.json 和 step2_all.csv，共 6068 筆 entities
