In [1]:
import pandas as pd
import openai
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv

# How many rows of articles.csv to process in this run.
article_num = 2

# Pull OPENAI_API_KEY from a .env file / the environment and create the client.
load_dotenv()
_api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=_api_key)

# Input tables: the articles to label, the concept definitions, and up to
# 150 hand-labeled sample sentences.
_all_articles = pd.read_csv("articles.csv")
articles_df = _all_articles.head(article_num)
definitions_df = pd.read_excel("racism_types_definitions.xlsx")
_all_samples = pd.read_excel("sample_racism_classification.xlsx")
samples_df = _all_samples.head(150)

# One "concept: definition" line per concept; each definition is cut to its
# first 300 characters to keep the prompt small.
_definition_lines = [
    f"{record['concepts']}: {record['definitions'][:300]}"
    for _, record in definitions_df.iterrows()
]
concept_defs = "\n".join(_definition_lines)

# One '"sentence" → label' line per sample row.
_example_lines = [
    f'"{record["annotated_sentence"]}" → {record["annotation_content"]}'
    for _, record in samples_df.iterrows()
]
examples = "\n".join(_example_lines)

# Split long articles into ~3000‑char chunks at sentence boundaries
def split_text(text, max_chars=3000):
    """Split *text* into chunks of whole sentences, each at most ``max_chars`` long.

    Sentences are delimited by ``". "``. They are packed greedily: a sentence
    is added to the current chunk while the chunk stays within ``max_chars``,
    otherwise the current chunk is closed and a new one is started.

    Args:
        text: The article body. Any non-string value (e.g. ``None`` or the
            NaN pandas produces for an empty cell) is treated as "no text".
        max_chars: Upper bound on chunk length. A single sentence that is
            longer than this is emitted as its own oversized chunk rather
            than being cut mid-sentence.

    Returns:
        A list of non-empty chunk strings, in document order. Empty or
        whitespace-only input yields ``[]``.
    """
    if not isinstance(text, str):
        return []

    pieces = text.split('. ')
    last_index = len(pieces) - 1
    chunks, buf = [], ""
    for index, piece in enumerate(pieces):
        sent = piece.strip()
        if not sent:
            continue
        # split() consumed the ". " that followed every piece except the last
        # one, so only those pieces get their period restored. The last piece
        # keeps whatever punctuation the text really ended with.
        if index != last_index:
            sent += "."

        if not buf:
            # Never flush an empty buffer: an over-long first sentence simply
            # becomes the start of the first chunk.
            buf = sent
        elif len(buf) + 1 + len(sent) <= max_chars:
            buf = f"{buf} {sent}"
        else:
            chunks.append(buf)
            buf = sent
    if buf:
        chunks.append(buf)
    return chunks


def build_stage1_prompt():
    """Return the Stage 1 prompt, which presents the concept definitions.

    The text embeds the module-level ``concept_defs`` string and asks the
    model to reply with exactly "I understand." (or the stated refusal).
    """
    prompt_lines = [
        "",
        # The two trailing spaces on the heading are part of the prompt text.
        "Stage 1: Definitions  ",
        "You are a sociology professor with 30 years of experience studying racism against Asians.",
        "Read these racism concept definitions carefully—you need to internalize them so you can spot matching quotes later.",
        "",
        "Definitions:",
        f"{concept_defs}",
        "",
        "If you understand these definitions and are ready for the next step, respond **exactly** with:",
        "I understand.",
        "Otherwise, respond **exactly** with:",
        "I do not understand the task.",
        "",
    ]
    return "\n".join(prompt_lines)

def build_stage2_prompt():
    """Return the Stage 2 prompt, which presents the labeled example quotes.

    The text embeds the module-level ``examples`` string and asks the model
    to reply with exactly "I understand." (or the stated refusal).
    """
    prompt_lines = [
        "",
        # The two trailing spaces on the heading are part of the prompt text.
        "Stage 2: Examples  ",
        "Now that you’ve read the definitions, here are some example quotes labeled with their concepts.",
        "Study them so you see how quotes map to definitions in practice:",
        "",
        f"{examples}",
        "",
        "If you understand how to use these examples to guide your labeling, respond **exactly** with:",
        "I understand.",
        "Otherwise, respond **exactly** with:",
        "I do not understand the task.",
        "",
    ]
    return "\n".join(prompt_lines)

def build_stage3_prompt(chunk_text):
    """Return the Stage 3 prompt asking the model to label one article chunk.

    Args:
        chunk_text: The article excerpt to embed under "ARTICLE CHUNK:".

    Returns:
        The prompt text, which requests a JSON array of objects with the
        keys "quote", "context", "concepts" and "victim".
    """
    prompt_lines = [
        "",
        # The two trailing spaces on the next two lines are part of the prompt.
        "Stage 3: Labeling  ",
        "You will now read the ARTICLE CHUNK below.  ",
        "For each quote matching ≥1 concept, output a JSON array of objects with keys:",
        '- "quote": the exact text (if <50 chars, include one sentence before & after as "context" instead),',
        '- "concepts": list of matching concept names,',
        '- "victim": the race of the victim. If the race cannot be inferred, label it as unknown,',
        '- "context": only when quote was expanded (otherwise can repeat "quote").',
        "",
        "ARTICLE CHUNK:",
        f"{chunk_text}",
        "",
        "Return only the JSON array, e.g.:",
        "",
        "[",
        '  { "quote": "...", "context": "...", "concepts": ["C1","C2"], "victim": "Asian" },',
        "  ...",
        "]",
        "",
    ]
    return "\n".join(prompt_lines)

# Column order of the output CSV. Passing it explicitly means a run that
# produced no labels still writes a file with a header row.
_RESULT_COLUMNS = ["article_id", "title", "quote", "context", "concepts", "victim"]

_SYSTEM_MESSAGE = {
    "role": "system",
    "content": "You are a sociology professor analyzing racism in text.",
}


def _chat(messages):
    """Send *messages* to the chat model and return the reply text ("" if none)."""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content or ""


def _prime_conversation(article_id, chunk_i):
    """Run Stage 1 and Stage 2 as ONE conversation and return its transcript.

    Each chat-completion request is independent: the model only sees what is
    in that request's ``messages``. The definitions and examples therefore
    have to travel with the Stage 3 request, so the transcript built here
    (system message, both prompts, both acknowledgements) is returned for
    the caller to prepend to every labeling request.

    ``article_id`` and ``chunk_i`` are only used in the halt message.

    Raises:
        SystemExit: if the model does not acknowledge a stage with exactly
            "I understand.".
    """
    history = [_SYSTEM_MESSAGE]
    stage_prompts = (build_stage1_prompt(), build_stage2_prompt())
    for stage_no, prompt in enumerate(stage_prompts, start=1):
        history.append({"role": "user", "content": prompt})
        reply = _chat(history).strip()
        if reply != "I understand.":
            print(f"❌ Halt at Stage {stage_no} for article {article_id}, chunk {chunk_i}: {reply}")
            # `exit` is an interactive helper injected by `site`; raise the
            # exception it would raise so this also works without it.
            raise SystemExit(1)
        history.append({"role": "assistant", "content": reply})
    return history


def _parse_labels(raw):
    """Return the JSON array contained in the model output *raw*.

    Text around the outermost ``[...]`` (for example a Markdown code fence)
    is ignored.

    Raises:
        ValueError: if no valid JSON array can be decoded
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    start, end = raw.find("["), raw.rfind("]")
    payload = raw[start:end + 1] if 0 <= start < end else raw
    labels = json.loads(payload)
    if not isinstance(labels, list):
        raise ValueError(f"expected a JSON array, got {type(labels).__name__}")
    return labels


all_results = []

# Stage 1+2 transcript. Built on the first chunk that actually needs labeling
# (so a run with nothing to label makes no API calls) and then reused: the
# priming prompts do not depend on the article, so repeating them per chunk
# only costs two extra requests each time.
primed_history = None

for _, row in tqdm(articles_df.iterrows(), total=len(articles_df)):
    article_id = row["id"]
    title = row["title"]
    article_text = row["ARTICLE_TEXT"]
    if not isinstance(article_text, str):
        # An empty CSV cell is read as NaN (a float), which cannot be split.
        print(f"⚠️ Skipping article {article_id}: no article text")
        continue
    chunks = split_text(article_text)

    for chunk_i, chunk in enumerate(chunks):
        if not chunk.strip():
            continue  # nothing to label; do not spend a request on it

        if primed_history is None:
            primed_history = _prime_conversation(article_id, chunk_i)

        # --- Stage 3: Labeling (definitions and examples ride along) ---
        # NOTE(review): this request now carries definitions + examples +
        # chunk; confirm the total fits the model's context window.
        resp3 = _chat(
            primed_history
            + [{"role": "user", "content": build_stage3_prompt(chunk)}]
        )

        # Parse JSON output
        try:
            labels = _parse_labels(resp3)
        except ValueError as e:
            print(f"⚠️ JSON parse error for article {article_id}, chunk {chunk_i}: {e}")
            print("Raw output:", resp3)
            continue

        for item in labels:
            # Validate each item on its own so one malformed object does not
            # discard the remaining labels of the chunk.
            try:
                quote = item["quote"]
                concepts = item["concepts"]
                if isinstance(concepts, str):
                    # A bare string would otherwise be joined char by char.
                    concepts = [concepts]
                result_row = {
                    "article_id": article_id,
                    "title": title,
                    "quote": quote,
                    "context": item.get("context", quote),
                    "concepts": ";".join(concepts),
                    "victim": item["victim"],
                }
            except (KeyError, TypeError, AttributeError) as e:
                print(f"⚠️ Malformed label for article {article_id}, chunk {chunk_i}: {e!r}")
                print("Raw item:", item)
                continue
            all_results.append(result_row)

# Save flattened results
results_df = pd.DataFrame(all_results, columns=_RESULT_COLUMNS)
results_df.to_csv("classification_results_with_victims_and_stages.csv", index=False)
print("✅ Done — all stages completed and results saved.")

100%|██████████| 2/2 [02:58<00:00, 89.27s/it]

✅ Done! Saved to classification_results.csv



