In [None]:
import pandas as pd
import openai
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv
# Number of leading rows of articles.csv to process in this run.
article_num = 2

# Load API key from .env file
load_dotenv()
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load your files
articles_df = pd.read_csv("articles.csv").head(article_num)
definitions_df = pd.read_excel("racism_types_definitions.xlsx")
samples_df = pd.read_excel("sample_racism_classification.xlsx").head(150)

# One "concept: definition" line per concept, with each definition truncated to
# 300 characters. Blank spreadsheet cells are read as NaN: a NaN definition
# cannot be sliced (TypeError) and a NaN concept would be rendered as the
# literal text "nan", so incomplete rows are left out of the prompt text.
# The DataFrames themselves are not modified.
concept_defs = "\n".join(
    f"{row['concepts']}: {str(row['definitions'])[:300]}"
    for _, row in definitions_df.dropna(
        subset=["concepts", "definitions"]
    ).iterrows()
)

# One '"sentence" → label' line per labeled example; rows with a blank
# sentence or a blank label are skipped for the same reason as above.
examples = "\n".join(
    f'"{row["annotated_sentence"]}" → {row["annotation_content"]}'
    for _, row in samples_df.dropna(
        subset=["annotated_sentence", "annotation_content"]
    ).iterrows()
)
def split_text(text, max_chars=3000):
    """Split text into chunks of roughly max_chars, breaking at sentence boundaries.

    Sentences are delimited by the literal separator '. '. Consecutive
    sentences are packed into the current chunk until adding the next one
    would reach max_chars; that sentence then starts a new chunk.

    Args:
        text: The text to split. Any non-string value (for example ``None``,
            or the NaN pandas yields for a blank cell) is treated as having
            nothing to split.
        max_chars: Soft upper bound on chunk length. A single sentence that is
            longer than this is kept whole in a chunk of its own.

    Returns:
        A list of non-empty, whitespace-stripped chunks. The list is empty
        when the input is empty, whitespace-only, or not a string.
    """
    if not isinstance(text, str):
        return []

    pieces = text.split('. ')
    last_index = len(pieces) - 1

    chunks = []
    current_parts = []
    current_len = 0

    for index, piece in enumerate(pieces):
        # Re-attach the separator only where split() actually removed it, so
        # the final sentence does not gain a period the source never had.
        sentence = piece + '. ' if index < last_index else piece

        if current_len + len(piece) >= max_chars:
            # Flush the current chunk. It is empty when the very first
            # sentence is already over the limit, so guard the append.
            chunk = ''.join(current_parts).strip()
            if chunk:
                chunks.append(chunk)
            current_parts = []
            current_len = 0

        current_parts.append(sentence)
        current_len += len(sentence)

    tail = ''.join(current_parts).strip()
    if tail:
        chunks.append(tail)
    return chunks

# Prompt builder
def build_full_article_prompt(article_text, concept_defs, examples):
    """Assemble the labeling prompt for one piece of article text.

    The prompt states the persona and task, then embeds the concept
    definitions, the labeled example quotes and the article text, and ends
    with the expected output format for quote/concept pairs.

    Args:
        article_text: Article text to be labeled.
        concept_defs: Concept definitions, inserted verbatim.
        examples: Labeled example quotes, inserted verbatim.

    Returns:
        The prompt as a single string. It begins and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are a sociology professor with 30 years of experience analyzing the effects and causes of Asian racism.",
        "Your task is to identify the quotes in articles that match your list of types of racism concepts.",
        "",
        "First, read through the racism concept definitions. ",
        "You need to understand these definitions so you can accurately recognize when a quote fits one or more of these concepts:",
        f"{concept_defs}",
        "",
        "Next, review the example labeled quotes provided. ",
        "You need to study these examples to see how quotes have been matched to concepts in practice, which will guide your own labeling decisions:",
        f"{examples}",
        "",
        "Now, read the article below. For each quote that matches a concept, return:",
        "- The quote (exact text from article)",
        "- The matched concept(s)",
        "",
        "ARTICLE:",
        f"{article_text}",
        "",
        "Return a list of quote/concept pairs in this format:",
        "[",
        '  {"quote": "...", "concepts": ["concept1", "concept2"]},',
        "  ...",
        "]",
        "",
    ]
    return "\n".join(prompt_lines)

def _extract_quote_labels(output):
    """Recover the list of quote/concept items from the model's reply text.

    The reply is first parsed as-is. If that fails (for example because the
    JSON is wrapped in a Markdown code fence or surrounded by prose), the span
    from the first '[' to the last ']' is parsed instead.

    Args:
        output: Text content of the model reply.

    Returns:
        A list of parsed items. A single JSON object is wrapped in a list.

    Raises:
        ValueError: If the reply has no text or no JSON list can be recovered
            (json.JSONDecodeError is a ValueError subclass).
    """
    if not isinstance(output, str):
        raise ValueError("model returned no text content")
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError:
        start, end = output.find("["), output.rfind("]")
        if start == -1 or end < start:
            raise
        parsed = json.loads(output[start:end + 1])
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError("expected a JSON list of quote/concept items")
    return parsed


# Output container: one row per (quote, concept) pair
all_results = []

# Loop through a few articles (start small to avoid token overload).
# total comes from the frame itself so the bar is right even when the CSV
# holds fewer rows than article_num.
for idx, row in tqdm(articles_df.iterrows(), total=len(articles_df)):
    article_text = row["ARTICLE_TEXT"]
    article_id = row["id"]
    title = row["title"]

    # A blank cell is read as NaN (a float); there is nothing to classify.
    if not isinstance(article_text, str) or not article_text.strip():
        print(f"⚠️ Skipping article {article_id}: no article text")
        continue

    # Split into chunks
    chunks = split_text(article_text)

    for chunk_idx, chunk_text in enumerate(chunks):
        # Never spend an API call on an empty chunk.
        if not chunk_text:
            continue

        prompt = build_full_article_prompt(chunk_text, concept_defs, examples)

        # Best-effort: a failed request is reported and the run moves on to
        # the next chunk.
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a sociology professor analyzing racism in text. Label quotes using provided concepts."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.2
            )
            output = response.choices[0].message.content
        except Exception as e:
            print(f"❌ API error for article {article_id} chunk {chunk_idx}: {e}")
            continue

        # Parse JSON output
        try:
            quote_labels = _extract_quote_labels(output)
        except ValueError as e:
            print(f"⚠️ JSON error for article {article_id} chunk {chunk_idx}: {e}")
            print("🔍 Model output:\n", output)
            continue

        for item in quote_labels:
            concepts = item.get("concepts") if isinstance(item, dict) else None
            # A bare string would otherwise be iterated character by character.
            if isinstance(concepts, str):
                concepts = [concepts]
            if (
                not isinstance(item, dict)
                or "quote" not in item
                or not isinstance(concepts, list)
            ):
                # Report and skip just this item so one malformed entry does
                # not discard the rest of the reply.
                print(f"⚠️ Skipping malformed item for article {article_id} chunk {chunk_idx}: {item!r}")
                continue
            for concept in concepts:
                all_results.append({
                    "article_id": article_id,
                    "title": title,
                    "quote": item["quote"],
                    "concept": concept
                })

# Save results. Explicit columns keep the CSV header even when nothing matched.
results_df = pd.DataFrame(
    all_results, columns=["article_id", "title", "quote", "concept"]
)
results_df.to_csv("classification_results.csv", index=False)
print("✅ Done! Saved to classification_results.csv")

 50%|█████     | 1/2 [00:46<00:46, 46.43s/it]