In [None]:
# Classification of headlines mentioning "Obama" using 1) the Support model and 2) the Performance model
# Original model: mlburnham/Political_DEBATE_DeBERTa_large_v1.1 on huggingface
# Further trained on sample of candidate related headlines


# Support model: zkava01/AuthorSupport_Oct29 on huggingface
# Performance model: zkava01/AuthorPerformance_Oct29 on huggingface

In [1]:
import os
import pandas as pd
import re
from transformers import pipeline
from tqdm import tqdm  

In [2]:
# Location of the input CSV (a leading "~" would be expanded to the home directory)
dropbox_path = os.path.expanduser("FILEHERE.csv")

# Read the whole file; low_memory=False is passed through to pandas
df = pd.read_csv(dropbox_path, low_memory=False)

# Parse the "date" column as datetimes; unparseable values are coerced to NaT
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Date window for the analysis -- edit these two bounds to target other election years.
# Both ends are inclusive; rows whose date is NaT fall outside the window and are dropped.
start_date = "2007-11-01"
end_date = "2016-11-15"
df = df.loc[df["date"].between(start_date, end_date)]

# Political figures keyed by full name; each value lists the last-name
# spellings that should be recognised as referring to that figure.
figures_full = {
    "Harry Truman": ["Truman"],
    "Thomas Dewey": ["Dewey"],
    "Strom Thurmond": ["Thurmond"],
    "Dwight Eisenhower": ["Eisenhower"],
    "Adlai Stevenson": ["Stevenson"],
    "John F. Kennedy": ["Kennedy"],
    "Richard Nixon": ["Nixon"],
    "Lyndon B. Johnson": ["Johnson"],
    "Barry Goldwater": ["Goldwater"],
    "Hubert Humphrey": ["Humphrey"],
    "George Wallace": ["Wallace"],
    "George McGovern": ["McGovern"],
    "Jimmy Carter": ["Carter"],
    "Gerald Ford": ["Ford"],
    "Ronald Reagan": ["Reagan"],
    "John B. Anderson": ["Anderson"],
    "Walter Mondale": ["Mondale"],
    "George H.W Bush": ["Bush"],
    "Michael Dukakis": ["Dukakis"],
    "Bill Clinton": ["Clinton"],
    "Ross Perot": ["Perot"],
    "Bob Dole": ["Dole"],
    "Al Gore": ["Gore"],
    "John Kerry": ["Kerry"],
    "Barack Obama": ["Obama"],
    "John McCain": ["McCain"],
    "Mitt Romney": ["Romney"],
    "Donald Trump": ["Trump"],
    "Joe Biden": ["Biden"],
    "Kamala Harris": ["Harris"]
}

# Reverse lookup: lower-cased last name -> full figure name
last_name_to_full = dict(
    (surname.lower(), person)
    for person, surnames in figures_full.items()
    for surname in surnames
)

# Whole-word alternation over the (lower-cased) last names only.
# The pattern string itself is lower case, so callers must pass
# re.IGNORECASE to match capitalised names in headlines.
figure_pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, last_name_to_full)))


In [3]:

# Rebuild the regex pattern from last_name_to_full and compile it once,
# rather than re-parsing the pattern string for every headline.
figure_pattern = r'\b(?:' + '|'.join(re.escape(name) for name in last_name_to_full.keys()) + r')\b'
figure_regex = re.compile(figure_pattern, flags=re.IGNORECASE)

# Choose the figures you're interested in
key_figures = ["Barack Obama"]  # or ["Donald Trump", "Joe Biden", "Kamala Harris", ...]

# Expand into long format: one row per (headline, matched figure)
records = []
for _, row in df.iterrows():
    # str() so a missing / non-string title cannot break the regex search
    title = str(row["title"])

    # Map every hit to its full figure name BEFORE de-duplicating.
    # De-duplicating the raw matches is not enough: the search is case
    # insensitive, so "Obama" and "OBAMA" in one title are different strings
    # that resolve to the same figure and would emit the same
    # (headline, figure) row twice. dict.fromkeys keeps first-appearance
    # order, so the row order is deterministic across runs.
    matched_figures = dict.fromkeys(
        last_name_to_full[hit.lower()] for hit in figure_regex.findall(title)
    )

    for full_name in matched_figures:
        if full_name in key_figures:
            record = row.copy()
            record["figure"] = full_name  # figure this copy of the headline is about
            records.append(record)

# Final long-format dataframe for classification.
# Rows keep the index label of the source headline, so labels repeat when one
# headline matched several key figures.
df_filtered = pd.DataFrame(records)


In [None]:
#### Updated loop with new models - Support and Performance ####

# Load both classifiers: one zero-shot pipeline per fine-tuned checkpoint,
# with the tokenizer taken from the same repository as the model.
support_classifier, performance_classifier = (
    pipeline("zero-shot-classification", model=repo_id, tokenizer=repo_id)
    for repo_id in (
        "zkava01/AuthorSupport_Oct29",
        "zkava01/AuthorPerformance_Oct29",
    )
)

# Hypothesis templates. Each list is ordered [positive, negative]; the "{}"
# left in every string is filled with the figure's name via str.format.
support_hypotheses_template = [
    "The author of this text {} {{}}".format(verb_phrase)
    for verb_phrase in ("supports", "does not support")
]

performance_hypotheses_template = [
    "The author of this text believes {{}} is performing/performed/will perform {}".format(adverb)
    for adverb in ("well", "poorly")
]

# Score a hypothesis must strictly exceed to be accepted, per model.
SUPPORT_THRESHOLD = 0.99
PERFORMANCE_THRESHOLD = 0.5


def _stance_label(classifier, text, hypotheses, threshold):
    """Classify `text` against a [positive, negative] hypothesis pair.

    Args:
        classifier: zero-shot pipeline; called as
            classifier(text, hypotheses, multi_label=True).
        text: the headline to classify.
        hypotheses: two candidate statements, positive first, negative second.
        threshold: a hypothesis is accepted only if its score is strictly
            greater than this value.

    Returns:
        1 if the positive hypothesis is accepted, else -1 if the negative
        hypothesis is accepted, else 0. The positive hypothesis is checked
        first, so it wins when both scores exceed the threshold.
    """
    # NOTE(review): no hypothesis_template is passed, so the pipeline applies
    # its own default template around each hypothesis string -- confirm this
    # matches how the models were fine-tuned and validated.
    output = classifier(text, hypotheses, multi_label=True)
    scores = dict(zip(output["labels"], output["scores"]))

    if scores.get(hypotheses[0], 0) > threshold:
        return 1
    if scores.get(hypotheses[1], 0) > threshold:
        return -1
    return 0


# Chunk setup
chunk_size = 500
# Ceiling division, so an exact multiple of chunk_size does not create a
# trailing empty chunk. max(1, ...) keeps a single (empty) chunk when there
# are no rows at all, so the final concat below always has an input.
num_chunks = max(1, (len(df_filtered) + chunk_size - 1) // chunk_size)
output_dir = "OUTPUTHERE"
chunk_dir = os.path.join(output_dir, "chunks")
output_base = "Obama"
os.makedirs(chunk_dir, exist_ok=True)

merged_chunks = []

print(f"Running support and performance classification in {num_chunks} chunks...")

for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(df_filtered))
    chunk = df_filtered.iloc[start_idx:end_idx].copy()

    # Labels are collected positionally and assigned as whole columns after
    # the loop. Writing per row by index label is unsafe here: df_filtered
    # can contain repeated index labels (one headline expanded into several
    # figure rows), and a label-based write would hit every row sharing it.
    support_labels = []
    performance_labels = []

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {i+1}/{num_chunks}"):
        title = row["title"]
        figure = row["figure"]

        if figure in key_figures:
            # Author Support
            support_hypotheses = [h.format(figure) for h in support_hypotheses_template]
            support_labels.append(
                _stance_label(support_classifier, title, support_hypotheses, SUPPORT_THRESHOLD)
            )

            # Author Performance
            performance_hypotheses = [h.format(figure) for h in performance_hypotheses_template]
            performance_labels.append(
                _stance_label(performance_classifier, title, performance_hypotheses, PERFORMANCE_THRESHOLD)
            )
        else:
            # Rows for figures outside key_figures are left unclassified
            support_labels.append("NA")
            performance_labels.append("NA")

    # Add output columns (support first, then performance)
    chunk["debate_support"] = support_labels
    chunk["debate_performance"] = performance_labels

    # Save chunk so finished work is on disk before the next chunk starts
    chunk_filename = f"{output_base}_chunk{i+1}.csv"
    chunk_path = os.path.join(chunk_dir, chunk_filename)
    chunk.to_csv(chunk_path, index=False)
    print(f"Saved {chunk_filename}")
    merged_chunks.append(chunk)

# Merge and save all chunks
final_df = pd.concat(merged_chunks, ignore_index=True)
final_output_path = os.path.join(output_dir, "ObamaHeadlines.csv")
final_df.to_csv(final_output_path, index=False)
print(f"Saved final combined file: {final_output_path}")
