In [49]:
import requests
import pandas as pd
import json
import os
import time
from docx import Document
import pdfplumber

# Set DeepSeek API Key
with open("DEEPSEEK_API_KEY.txt", "r") as file:
    deepseek_api_key = file.read().strip()

DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Load Concept Definitions CSV (Manually defined concepts)
concepts_df = pd.read_csv("concepts_definitions.csv")
# Build a dictionary mapping concept name to its definition.
concept_definitions = {row["concepts"]: row["definitions"] for _, row in concepts_df.iterrows()}

# Function to extract text from Word files
def extract_text_from_word(word_path):
    doc = Document(word_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

# Function to load glossary terms from files (Word and PDF)
def load_glossary_from_folder(glossary_folder_path, max_entries=5):
    glossary = {}
    entries_added = 0
    for file_name in os.listdir(glossary_folder_path):
        if entries_added >= max_entries:
            break  # Limit glossary size to avoid exceeding token limits
        file_path = os.path.join(glossary_folder_path, file_name)
        if file_name.endswith(".docx"):
            text = extract_text_from_word(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip unsupported file types
        glossary[file_name] = text[:2000]  # Limit text length per glossary file
        entries_added += 1
    return glossary

# Load limited glossary terms (max 5 entries)
glossary_folder = "glossary"
glossary = load_glossary_from_folder(glossary_folder)

# Load articles CSV
articles_df = pd.read_csv("articles.csv")

# Function to format concepts for DeepSeek API.
# Manual concepts now include the concept name (with definition and source),
# and glossary concepts are appended with their source.
def format_concepts_for_deepseek(max_manual=10, max_glossary=2):
    manual_concepts = "\n".join([
        f"{concept} (Definition: {definition}, Source: concepts_definitions.csv)"
        for concept, definition in list(concept_definitions.items())[:max_manual]
    ])
    
    glossary_concepts = "\n\n".join([
        f"{file_name} Terms: {text[:1000]} (Source: {file_name})"
        for file_name, text in list(glossary.items())[:max_glossary]
    ])
    
    return manual_concepts, glossary_concepts

# Function to classify articles and extract multiple concepts using DeepSeek.
# The prompt instructs the AI to find all relevant concepts and even keywords that exactly match.
def classify_article_with_deepseek(title, text):
    manual_concepts, glossary_concepts = format_concepts_for_deepseek()

    # Revised prompt:
    prompt = f"""
You are an expert in text analysis for racism-related themes. Below is a list of defined concepts and glossary keywords.

Manual Concepts:
{manual_concepts}

Glossary Concepts:
{glossary_concepts}

Now, read the following article and do the following:
1. Identify all concepts that are relevant to the article.
2. For each detected concept, if the concept name or an exact keyword appears in the article, output that exact word as the "Quote".
3. If the concept is only inferred (and no exact keyword appears), then include the most relevant excerpt from the article as the Quote.
4. Include the source for each concept (either "concepts_definitions.csv" or the corresponding glossary file name).

Article Title: {title}
Article Text: {text}

Provide the output in the following format:
- Concept: [Detected Concept]
  Quote: [Exact matching keyword or relevant excerpt]
  Source: [Concept Source]
Make sure to list every relevant concept separately.
    """

    headers = {
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "deepseek-coder",  # You can adjust the model as needed
        "messages": [
            {"role": "system", "content": "You are an expert in text analysis."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 2000
    }

    try:
        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ DeepSeek API failed: {e}")
        return ""

# Process only the first article for testing
row = articles_df.iloc[0]
title = row["title"]
article_text = row["ARTICLE_TEXT"]

print(f"🔄 Processing single test article using DeepSeek: {title}")

try:
    print("⏳ Waiting for DeepSeek response...")
    extracted_info = classify_article_with_deepseek(title, article_text)
    print("✅ Response received!")
except Exception as e:
    print(f"❌ DeepSeek API failed: {e}")
    extracted_info = ""

# Parse response to extract multiple concepts, quotes, and sources
results = []
try:
    # We expect the output to be in a structured format with each concept starting with "- Concept:"
    lines = extracted_info.split("\n")
    concept, quote, source = None, None, None

    for line in lines:
        if line.startswith("- Concept:"):
            if concept and quote and source:
                results.append({"title": title, "concept": concept, "quote": quote, "source": source})
            concept = line.replace("- Concept:", "").strip()
            quote = None
            source = None
        elif line.startswith("  Quote:"):
            quote = line.replace("  Quote:", "").strip()
        elif line.startswith("  Source:"):
            source = line.replace("  Source:", "").strip()
    if concept and quote and source:
        results.append({"title": title, "concept": concept, "quote": quote, "source": source})
except Exception as e:
    print(f"❌ Error parsing response: {e}")

# Print results
for idx, res in enumerate(results):
    print(f"📝 Concept {idx+1}: {res['concept']}")
    print(f"📌 Quote: {res['quote']}")
    print(f"📖 Source: {res['source']}\n")

# Save the result to a test CSV
test_df = pd.DataFrame(results)
test_df.to_csv("test_concept_extraction_deepseek.csv", index=False)

print("🎉 Test complete! Results saved in test_concept_extraction_deepseek.csv")

🔄 Processing single test article using DeepSeek: Racism is the other virus sweeping America during this pandemic
⏳ Waiting for DeepSeek response...
✅ Response received!
📝 Concept 1: Verbal harassment
📌 Quote: "they are hearing the all-too-familiar vitriol toward Asian Americans being spewed from the dark and angry corners of social media and beyond."
📖 Source: concepts_definitions.csv

📝 Concept 2: Online harassment
📌 Quote: "they are hearing the all-too-familiar vitriol toward Asian Americans being spewed from the dark and angry corners of social media and beyond."
📖 Source: concepts_definitions.csv

📝 Concept 3: Worry about safety
📌 Quote: "This treatment of Asian Americans is immoral, but also dangerous to public health. Research shows that hateful speech and other actions against racial and ethnic minorities -- even seemingly small slights -- might make people sick, contributing to heart disease, respiratory illness and other chronic diseases."
📖 Source: concepts_definitions.csv

📝

In [14]:
import requests
import pandas as pd
import json
import os
import time
from docx import Document
import pdfplumber

# Set DeepSeek API Key
with open("DEEPSEEK_API_KEY.txt", "r") as file:
    deepseek_api_key = file.read().strip()

DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Load Concept Definitions CSV (Manually defined concepts)
concepts_df = pd.read_csv("racism_types_definitions.csv")
# Build a dictionary mapping concept name to its definition.
concept_definitions = {row["concepts"]: row["definitions"] for _, row in concepts_df.iterrows()}

# Function to extract text from Word files
def extract_text_from_word(word_path):
    doc = Document(word_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

# Function to load glossary terms from files (Word and PDF)
def load_glossary_from_folder(glossary_folder_path, max_entries=5):
    glossary = {}
    entries_added = 0
    for file_name in os.listdir(glossary_folder_path):
        if entries_added >= max_entries:
            break  # Limit glossary size to avoid exceeding token limits
        file_path = os.path.join(glossary_folder_path, file_name)
        if file_name.endswith(".docx"):
            text = extract_text_from_word(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip unsupported file types
        glossary[file_name] = text[:2000]  # Limit text length per glossary file
        entries_added += 1
    return glossary

# Load limited glossary terms (max 5 entries)
glossary_folder = "glossary"
glossary = load_glossary_from_folder(glossary_folder)

# Load articles CSV
articles_df = pd.read_csv("articles.csv")

# Function to format concepts for DeepSeek API.
# Manual concepts now include the concept name (with definition and source),
# and glossary concepts are appended with their source.
def format_concepts_for_deepseek(max_manual=10, max_glossary=2):
    manual_concepts = "\n".join([
        f"{concept} (Definition: {definition}, Source: concepts_definitions.csv)"
        for concept, definition in list(concept_definitions.items())[:max_manual]
    ])
    
    glossary_concepts = "\n\n".join([
        f"{file_name} Terms: {text[:1000]} (Source: {file_name})"
        for file_name, text in list(glossary.items())[:max_glossary]
    ])
    
    return manual_concepts, glossary_concepts

# Function to classify articles and extract multiple concepts using DeepSeek.
# The prompt instructs the AI to find all relevant concepts and even keywords that exactly match.
def classify_article_with_deepseek(title, text):
    manual_concepts, glossary_concepts = format_concepts_for_deepseek()

    # Revised prompt:
    prompt = f"""
You are an expert in text analysis for racism-related themes. Below is a list of defined racism types concepts and glossary keywords.

Manual Concepts:
{manual_concepts}

Glossary Concepts:
{glossary_concepts}

Now, read the following article and do the following:
1. For each racism type, search the article for any occurrence of its name or known synonyms. For example, if the concept "Bigotry/prejudice" is defined and the article contains the keyword "bigotry", then that concept should be detected.
2. When a matching keyword or synonym is found, output the **entire sentence** that contains the match as the "Quote".
3. Use the standardized concept label from the manual concepts (e.g. "Bigotry/prejudice") as the "Concept".
4. If the concept is not directly mentioned but is inferred from the context, output the sentence that best represents it.
5. If a racism type is detected but no manual concepts match, then you can label it with the glossary concepts.  
6. Always include the source for each concept (e.g. "concepts_definitions.csv" or the corresponding glossary file name).
6. There are many synonyms; if multiple variants are found in one sentence, still output that sentence once, with the standardized concept label.

Article Title: {title}
Article Text: {text}

Provide the output in the following format:
- Concept: [Detected Concept]
  Quote: [Exact matching keyword or relevant excerpt]
  Source: [Concept Source]
Make sure to list every relevant concept separately.
    """

    headers = {
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "deepseek-coder",  # You can adjust the model as needed
        "messages": [
            {"role": "system", "content": "You are an expert in text analysis."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 2000
    }

    try:
        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ DeepSeek API failed: {e}")
        return ""

# Process only the first article for testing
row = articles_df.iloc[0]
title = row["title"]
article_text = row["ARTICLE_TEXT"]

print(f"🔄 Processing single test article using DeepSeek: {title}")

try:
    print("⏳ Waiting for DeepSeek response...")
    extracted_info = classify_article_with_deepseek(title, article_text)
    print("✅ Response received!")
except Exception as e:
    print(f"❌ DeepSeek API failed: {e}")
    extracted_info = ""

# Parse response to extract multiple concepts, quotes, and sources
results = []
try:
    # We expect the output to be in a structured format with each concept starting with "- Concept:"
    lines = extracted_info.split("\n")
    concept, quote, source = None, None, None

    for line in lines:
        if line.startswith("- Concept:"):
            if concept and quote and source:
                results.append({"title": title, "concept": concept, "quote": quote, "source": source})
            concept = line.replace("- Concept:", "").strip()
            quote = None
            source = None
        elif line.startswith("  Quote:"):
            quote = line.replace("  Quote:", "").strip()
        elif line.startswith("  Source:"):
            source = line.replace("  Source:", "").strip()
    if concept and quote and source:
        results.append({"title": title, "concept": concept, "quote": quote, "source": source})
except Exception as e:
    print(f"❌ Error parsing response: {e}")

# Print results
for idx, res in enumerate(results):
    print(f"📝 Concept {idx+1}: {res['concept']}")
    print(f"📌 Quote: {res['quote']}")
    print(f"📖 Source: {res['source']}\n")

# Save the result to a test CSV
test_df = pd.DataFrame(results)
test_df.to_csv("test_concept_extraction_deepseek.csv", index=False)

print("🎉 Test complete! Results saved in test_concept_extraction_deepseek.csv")

🔄 Processing single test article using DeepSeek: Racism is the other virus sweeping America during this pandemic
⏳ Waiting for DeepSeek response...
✅ Response received!
📝 Concept 1: Xenophobia
📌 Quote: "With the coronavirus pandemic today, they are hearing the all-too-familiar vitriol toward Asian Americans being spewed from the dark and angry corners of social media and beyond."
📖 Source: concepts_definitions.csv

📝 Concept 2: Verbal harassment
📌 Quote: "With the coronavirus pandemic today, they are hearing the all-too-familiar vitriol toward Asian Americans being spewed from the dark and angry corners of social media and beyond."
📖 Source: concepts_definitions.csv

📝 Concept 3: Online harassment
📌 Quote: "With the coronavirus pandemic today, they are hearing the all-too-familiar vitriol toward Asian Americans being spewed from the dark and angry corners of social media and beyond."
📖 Source: concepts_definitions.csv

📝 Concept 4: Anti-Asian hate crimes(general)
📌 Quote: "Now, I must 

With synonyms & glossary
Still using concept_definitions.csv(should change to racism_types_definitions.xlsx)

In [25]:
import requests
import pandas as pd
import json
import os
import re
import time
from docx import Document
import pdfplumber

# -----------------------
# SETUP & UTILITY FUNCTIONS
# -----------------------

# Set DeepSeek API Key
with open("DEEPSEEK_API_KEY.txt", "r") as file:
    deepseek_api_key = file.read().strip()

DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Load Concept Definitions CSV (Manually defined concepts)
concepts_df = pd.read_csv("concepts_definitions.csv")
# Build a dictionary mapping concept name to its definition.
concept_definitions = {row["concepts"]: row["definitions"] for _, row in concepts_df.iterrows()}

# Optionally, define synonyms mapping.
# For example, if "bigotry" appears in an article, treat it as "Bigotry/prejudice".
synonyms = {
    "bigotry": "Bigotry/prejudice",
    "prejudice": "Bigotry/prejudice",
    "Asian virus": "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "kung flu": "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "ramen noodle flu" : "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "racial injustice": "racial injustice/inequity and oppression",
    "inequity and oppression" :"racial injustice/inequity and oppression"

}

# Function to extract text from Word files
def extract_text_from_word(word_path):
    doc = Document(word_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

# Function to load glossary terms from files (Word and PDF)
def load_glossary_from_folder(glossary_folder_path, max_entries=5):
    glossary = {}
    entries_added = 0
    for file_name in os.listdir(glossary_folder_path):
        if entries_added >= max_entries:
            break  # Limit glossary size
        file_path = os.path.join(glossary_folder_path, file_name)
        if file_name.endswith(".docx"):
            text = extract_text_from_word(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        else:
            continue
        glossary[file_name] = text[:2000]  # Limit text length per file
        entries_added += 1
    return glossary

# Load limited glossary terms (max 5 entries)
glossary_folder = "glossary"
glossary = load_glossary_from_folder(glossary_folder)

# Function to format concepts for DeepSeek API.
def format_concepts_for_deepseek(max_manual=10, max_glossary=2):
    manual_concepts = "\n".join([
        f"{concept} (Definition: {definition}, Source: concepts_definitions.csv)"
        for concept, definition in list(concept_definitions.items())[:max_manual]
    ])
    glossary_concepts = "\n\n".join([
        f"{file_name} Terms: {text[:1000]} (Source: {file_name})"
        for file_name, text in list(glossary.items())[:max_glossary]
    ])
    return manual_concepts, glossary_concepts

# -----------------------
# DEEPSEEK CALL & EXACT MATCHING
# -----------------------

# Function to classify an article using DeepSeek and also perform exact word matching.
def classify_article_with_deepseek(title, text):
    results = []

    # --- Step 1: Exact matching for each concept (from concepts_definitions.csv)
    for concept in concept_definitions.keys():
        # Check if the exact concept (as a whole word) appears in the text.
        if re.search(rf'\b{re.escape(concept)}\b', text, flags=re.IGNORECASE):
            results.append({
                "concept": concept,
                "quote": concept,  # Use the matched word as quote.
                "source": "concepts_definitions.csv"
            })
    
    # --- Step 2: Check synonyms.
    for term, mapped_concept in synonyms.items():
        if re.search(rf'\b{re.escape(term)}\b', text, flags=re.IGNORECASE):
            # Add the mapped concept if not already added.
            already_added = any(r["concept"].lower() == mapped_concept.lower() for r in results)
            if not already_added:
                results.append({
                    "concept": mapped_concept,
                    "quote": term,  # Use the exact matching keyword.
                    "source": "concepts_definitions.csv"
                })

    # --- Step 3: DeepSeek API call for additional concepts.
    manual_concepts, glossary_concepts = format_concepts_for_deepseek()
    prompt = f"""
You are an expert in text analysis for racism-related themes. Below is a list of defined concepts and glossary keywords.

Manual Concepts:
{manual_concepts}

Glossary Concepts:
{glossary_concepts}

Now, read the following article and do the following. There's no need to further response like "Here are the relevant concepts detected in the article along with their corresponding quotes and sources::
1. For each concept, search the article for any occurrence of its name or known synonyms. For example, if the concept "Bigotry/prejudice" is defined and the article contains the keyword "bigotry", then that concept should be detected.
2. When a matching keyword or synonym is found, output the **entire sentence** that contains the match as the "Quote".
3. Use the standardized concept label from the provided list (e.g. "Bigotry/prejudice") as the "Concept".
4. If the concept is not directly mentioned but is inferred from the context, output the sentence that best represents it.
5. Always include the source for each concept (e.g. "concepts_definitions.csv" or the corresponding glossary file name).
6. There are many synonyms; if multiple variants are found in one sentence, still output that sentence once, with the standardized concept label.

Article Title: {title}
Article Text: {text}

Provide the output in the following format:
- Concept: [Detected Concept]
  Quote: [Exact matching keyword or relevant excerpt]
  Source: [Concept Source]
Make sure to list every relevant concept separately.
    """
    headers = {
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "deepseek-coder",  # Adjust model as needed.
        "messages": [
            {"role": "system", "content": "You are an expert in text analysis."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 2000
    }
    try:
        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        ai_output = response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ DeepSeek API failed: {e}")
        ai_output = ""
    print(ai_output)
    # --- Step 4: Parse the AI output.
    ai_matches = []
    try:
        lines = ai_output.split("\n")
        concept, quote, source = None, None, None
        for line in lines:
            if line.startswith("- Concept:"):
                if concept and quote and source:
                    ai_matches.append({"concept": concept, "quote": quote, "source": source})
                concept = line.replace("- Concept:", "").strip()
                quote = None
                source = None
            elif line.startswith("  Quote:"):
                quote = line.replace("  Quote:", "").strip()
            elif line.startswith("  Source:"):
                source = line.replace("  Source:", "").strip()
        if concept and quote and source:
            ai_matches.append({"concept": concept, "quote": quote, "source": source})
    except Exception as e:
        print(f"❌ Error parsing AI output: {e}")
    
    # --- Step 5: Merge the exact matches with AI matches, avoiding duplicates.
    merged = {r["concept"].lower(): r for r in results}  # Use lower-case keys.
    for m in ai_matches:
        key = m["concept"].lower()
        #if key not in merged:
        merged[key] = m
    return list(merged.values())

# -----------------------
# READ ARTICLES FROM TXT FILES
# -----------------------

# Assume the txt files are named "0.txt", "1.txt", ... in the folder "txt".
txt_folder = "txt"
article_files = sorted([f for f in os.listdir(txt_folder) if f.endswith(".txt")],
                       key=lambda x: int(os.path.splitext(x)[0]))

all_results = []
# Process each txt file.
for file_name in article_files:
    file_path = os.path.join(txt_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        article_text = f.read()
    # Use file name (without extension) as article title.
    title = os.path.splitext(file_name)[0]
    print(f"🔄 Processing article {title} from file {file_name}...")
    try:
        detected_concepts = classify_article_with_deepseek(title, article_text)
        # Add a title field to each result.
        for item in detected_concepts:
            item["title"] = title
        all_results.extend(detected_concepts)
        print(f"✅ Processed article {title}: {len(detected_concepts)} concept(s) detected.")
    except Exception as e:
        print(f"❌ Failed processing article {title}: {e}")

# -----------------------
# SAVE RESULTS TO CSV
# -----------------------
if all_results:
    output_df = pd.DataFrame(all_results)
    output_df.to_csv("test_concept_extraction_deepseek.csv", index=False)
    print("🎉 All articles processed! Results saved in test_concept_extraction_deepseek.csv")
else:
    print("No results to save.")


🔄 Processing article 0 from file 0.txt...
- Concept: Worry about safety  
  Quote: “I'm scared. Everybody's scared," she said. “I hope that the coronavirus passes soon and fast, and Chinatown comes back."  
  Source: concepts_definitions.csv  

- Concept: Verbal harassment  
  Quote: “All the financial factors are compounded by the stirring racism," said Church, a longtime resident of Chinatown. “Everybody's concerned about the so-called 'Chinese Virus.'"  
  Source: concepts_definitions.csv  

- Concept: Support Asian Americans  
  Quote: “There's been a long fight to keep Chinatown affordable and not displace the residents and the businesses that make the community special," said Wu.  
  Source: concepts_definitions.csv  

- Concept: Worry about safety  
  Quote: “This is going to be a long haul for everyone," said Boston City Councilor Michelle Wu. “But there is especially pain in the Asian-American community in Boston and across the country."  
  Source: concepts_definitions.csv  


PermissionError: [Errno 13] Permission denied: 'test_concept_extraction_deepseek.csv'

Without Glossary

In [None]:
import requests
import pandas as pd
import json
import os
import re
import time
from docx import Document
import pdfplumber

# -----------------------
# SETUP & UTILITY FUNCTIONS
# -----------------------

# Set DeepSeek API Key
with open("DEEPSEEK_API_KEY.txt", "r") as file:
    deepseek_api_key = file.read().strip()

DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Load Concept Definitions CSV (Manually defined concepts)
concepts_df = pd.read_excel("racism_types_definitions.xlsx")
# Build a dictionary mapping concept name to its definition.
concept_definitions = {row["concepts"]: row["definitions"] for _, row in concepts_df.iterrows()}

# Optionally, define synonyms mapping.
# For example, if "bigotry" appears in an article, treat it as "Bigotry/prejudice".
synonyms = {
    "bigotry": "Bigotry/prejudice",
    "prejudice": "Bigotry/prejudice",
    "Asian virus": "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "kung flu": "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "ramen noodle flu" : "China/Chinese/Asian virus”/“Kung flu/plague/Ramen noodle flu",
    "racial injustice": "racial injustice/inequity and oppression",
    "inequity and oppression" :"racial injustice/inequity and oppression"

}
# Function to format concepts for DeepSeek API.
def format_concepts_for_deepseek(max_manual=10):
    manual_concepts = "\n".join([
        f"{concept} (Definition: {definition}, Source: racism_types_definitions.xlsx)"
        for concept, definition in list(concept_definitions.items())[:max_manual]
    ])
    return manual_concepts

# -----------------------
# DEEPSEEK CALL & EXACT MATCHING
# -----------------------

# Function to classify an article using DeepSeek and also perform exact word matching.
def classify_article_with_deepseek(title, text):
    results = []

    # --- Step 1: Exact matching for each concept (from concepts_definitions.csv)
    for concept in concept_definitions.keys():
        # Check if the exact concept (as a whole word) appears in the text.
        if re.search(rf'\b{re.escape(concept)}\b', text, flags=re.IGNORECASE):
            results.append({
                "concept": concept,
                "quote": concept,  # Use the matched word as quote.
                "source": "racism_types_definitions.xlsx"
            })
    
    # --- Step 2: Check synonyms.
    for term, mapped_concept in synonyms.items():
        if re.search(rf'\b{re.escape(term)}\b', text, flags=re.IGNORECASE):
            # Add the mapped concept if not already added.
            already_added = any(r["concept"].lower() == mapped_concept.lower() for r in results)
            if not already_added:
                results.append({
                    "concept": mapped_concept,
                    "quote": term,  # Use the exact matching keyword.
                    "source": "racism_types_definitions.xlsx"
                })

    # --- Step 3: DeepSeek API call for additional concepts.
    manual_concepts = format_concepts_for_deepseek()
    prompt = f"""
You are an expert in text analysis for racism-related themes. Below is a list of defined racism types with their definitions.

Defined Racism Types:
{manual_concepts}

Now, read the following article and identify instances of these racism types. For each detected racism type, output an object with the following keys:
- "Concept": standardized racism type label from the provided list.
- "Quote": the entire sentence from the article where the racism type or its known synonym appears.
- "Source": "racism_types_definitions.xlsx".


Article Title: {title}
Article Text: {text}

Provide the output in the following format,There's no need to further response like "Here are the relevant concepts detected in the article along with their corresponding quotes and sources:
- Concept: [Detected Concept]
  Quote: [Exact matching keyword or relevant excerpt]
  Source: [Concept Source]
Make sure to list every relevant concept separately.
    """
    headers = {
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "deepseek-coder",  # Adjust model as needed.
        "messages": [
            {"role": "system", "content": "You are an expert in text analysis."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 2000
    }
    try:
        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        ai_output = response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"❌ DeepSeek API failed: {e}")
        ai_output = ""
    print(ai_output)
    # --- Step 4: Parse the AI output.
    ai_matches = []
    try:
        lines = ai_output.split("\n")
        concept, quote, source = None, None, None
        for line in lines:
            if line.startswith("- Concept:"):
                if concept and quote and source:
                    ai_matches.append({"concept": concept, "quote": quote, "source": source})
                concept = line.replace("- Concept:", "").strip()
                quote = None
                source = None
            elif line.startswith("  Quote:"):
                quote = line.replace("  Quote:", "").strip()
            elif line.startswith("  Source:"):
                source = line.replace("  Source:", "").strip()
        if concept and quote and source:
            ai_matches.append({"concept": concept, "quote": quote, "source": source})
    except Exception as e:
        print(f"❌ Error parsing AI output: {e}")
    
    # --- Step 5: Merge the exact matches with AI matches, avoiding duplicates.
    merged = {r["concept"].lower(): r for r in results}  # Use lower-case keys.
    for m in ai_matches:
        key = m["concept"].lower()
        #if key not in merged:
        merged[key] = m
    return list(merged.values())

# -----------------------
# READ ARTICLES FROM TXT FILES
# -----------------------

# Assume the txt files are named "0.txt", "1.txt", ... in the folder "txt".
txt_folder = "txt"
article_files = sorted([f for f in os.listdir(txt_folder) if f.endswith(".txt")],
                       key=lambda x: int(os.path.splitext(x)[0]))

all_results = []
# Process each txt file.
for file_name in article_files:
    file_path = os.path.join(txt_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        article_text = f.read()
    # Use file name (without extension) as article title.
    title = os.path.splitext(file_name)[0]
    print(f"🔄 Processing article {title} from file {file_name}...")
    try:
        detected_concepts = classify_article_with_deepseek(title, article_text)
        # Add a title field to each result.
        for item in detected_concepts:
            item["title"] = title
        all_results.extend(detected_concepts)
        print(f"✅ Processed article {title}: {len(detected_concepts)} concept(s) detected.")
    except Exception as e:
        print(f"❌ Failed processing article {title}: {e}")

# -----------------------
# SAVE RESULTS TO CSV
# -----------------------
if all_results:
    output_df = pd.DataFrame(all_results)
    output_df.to_csv("test_concept_extraction_deepseek.csv", index=False)
    print("🎉 All articles processed! Results saved in test_concept_extraction_deepseek.csv")
else:
    print("No results to save.")

Whole article passed into prompt

In [49]:
import pandas as pd
import openai
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv
article_num = 2

# Load API key from .env file
load_dotenv()
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load your files
articles_df = pd.read_csv("articles.csv").head(article_num)
definitions_df = pd.read_excel("racism_types_definitions.xlsx")
samples_df = pd.read_excel("sample_racism_classification.xlsx").head(140)

concept_defs = "\n".join(
    f"{row['concepts']}: {row['definitions'][:300]}"
    for _, row in definitions_df.iterrows()
)

examples = "\n".join(
    f'"{row["annotated_sentence"]}" → {row["annotation_content"]}'
    for _, row in samples_df.iterrows()
)

# Prompt builder
def build_full_article_prompt(article_text, concept_defs, examples):
    return f"""
You are a sociology professor with 30 years of experience analyzing the effects and causes of Asian racism.
Your task is to identify the quotes in articles that match your list of types of racism concepts.

First, read through the racism concept definitions:
{concept_defs}

Next, read through some example labeled quotes:
{examples}

Now, read the article below. For each quote that matches a concept, return:
- The quote (exact text from article)
- The matched concept(s)

ARTICLE:
{article_text}

Return a list of quote/concept pairs in this format:
[
  {{"quote": "...", "concepts": ["concept1", "concept2"]}},
  ...
]
"""

# Output container
all_results = []

# Loop through a few articles (start small to avoid token overload)
for idx, row in tqdm(articles_df.iterrows(), total=article_num):
    article_text = row["ARTICLE_TEXT"]
    article_id = row["id"]
    title = row["title"]

    prompt = build_full_article_prompt(article_text, concept_defs, examples)

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a sociology professor analyzing racism in text. Label quotes using provided concepts."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )

        output = response.choices[0].message.content

        # Parse the model's JSON response
        try:
            quote_labels = json.loads(output)
            for q in quote_labels:
                for concept in q["concepts"]:
                    all_results.append({
                        "article_id": article_id,
                        "title": title,
                        "quote": q["quote"],
                        "concept": concept
                    })
        except Exception as e:
            print(f"⚠️ JSON error for article {article_id}: {e}")
            print("🔍 Model output:\n", output)

    except Exception as e:
        print(f"❌ API error for article {article_id}: {e}")

# Save results
results_df = pd.DataFrame(all_results)
results_df.to_csv("classification_results.csv", index=False)
print("✅ Done! Saved to classification_results.csv")

100%|██████████| 2/2 [01:01<00:00, 30.89s/it]

⚠️ JSON error for article 2: Unterminated string starting at: line 13 column 4 (char 2426)
🔍 Model output:
 [
  {"quote": "F--- China!", "concepts": ["Verbal harassment"]},
  {"quote": "Then the man spat on Nguyen, he said. The saliva splattered on his jacket.", "concepts": ["Physical harassment"]},
  {"quote": "Nguyen worries that East Asians in the United States will face even more harassment and attacks as coronavirus cases continue to rise.", "concepts": ["Anti-Asian hate crimes(general)", "COVID-19 or coronavirus or pandemic"]},
  {"quote": "He said he believes President Donald Trump stoked such hate-filled reaction during a news briefing last week when he defended his use of 'Chinese virus.'", "concepts": ["Donald Trump", "“China/Chinese virus” or “Kung flu/plague” or “Wuhan virus” or “Diseased Chinese” or “Asian Virus” or “Ramen Noodle flu”"]},
  {"quote": "Nguyen and other Asians in Chicago said they have felt growing apprehension that people take the president's comments as a 




Divide articles into chunks

In [None]:
import pandas as pd
import openai
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv
article_num = 2

# Load API key from .env file
load_dotenv()
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load your files
articles_df = pd.read_csv("articles.csv").head(article_num)
definitions_df = pd.read_excel("racism_types_definitions.xlsx")
samples_df = pd.read_excel("sample_racism_classification.xlsx").head(150)

concept_defs = "\n".join(
    f"{row['concepts']}: {row['definitions'][:300]}"
    for _, row in definitions_df.iterrows()
)

examples = "\n".join(
    f'"{row["annotated_sentence"]}" → {row["annotation_content"]}'
    for _, row in samples_df.iterrows()
)
def split_text(text, max_chars=3000):
    """Splits text into chunks of approximately max_chars, preferably at sentence boundaries."""
    sentences = text.split('. ')
    chunks = []
    current_chunk = ''

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chars:
            current_chunk += sentence + '. '
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + '. '
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# Prompt builder
def build_full_article_prompt(article_text, concept_defs, examples):
    return f"""
You are a sociology professor with 30 years of experience analyzing the effects and causes of Asian racism.
Your task is to identify the quotes in articles that match your list of types of racism concepts.

First, read through the racism concept definitions. 
You need to understand these definitions so you can accurately recognize when a quote fits one or more of these concepts:
{concept_defs}

Next, review the example labeled quotes provided. 
You need to study these examples to see how quotes have been matched to concepts in practice, which will guide your own labeling decisions:
{examples}

Now, read the article below. For each quote that matches a concept, return:
- The quote (exact text from article)
- The matched concept(s)

ARTICLE:
{article_text}

Return a list of quote/concept pairs in this format:
[
  {{"quote": "...", "concepts": ["concept1", "concept2"]}},
  ...
]
"""

# Output container
all_results = []

# Loop through a few articles (start small to avoid token overload)
for idx, row in tqdm(articles_df.iterrows(), total=article_num):
    article_text = row["ARTICLE_TEXT"]
    article_id = row["id"]
    title = row["title"]

    # Split into chunks
    chunks = split_text(article_text)

    for chunk_idx, chunk_text in enumerate(chunks):
        prompt = build_full_article_prompt(chunk_text, concept_defs, examples)

        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a sociology professor analyzing racism in text. Label quotes using provided concepts."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.2
            )

            output = response.choices[0].message.content

            # Parse JSON output
            try:
                quote_labels = json.loads(output)
                for q in quote_labels:
                    for concept in q["concepts"]:
                        all_results.append({
                            "article_id": article_id,
                            "title": title,
                            "quote": q["quote"],
                            "concept": concept
                        })
            except Exception as e:
                print(f"⚠️ JSON error for article {article_id} chunk {chunk_idx}: {e}")
                print("🔍 Model output:\n", output)

        except Exception as e:
            print(f"❌ API error for article {article_id} chunk {chunk_idx}: {e}")

# Save results
results_df = pd.DataFrame(all_results)
results_df.to_csv("classification_results.csv", index=False)
print("✅ Done! Saved to classification_results.csv")