# This notebook is going to preprocess and clean the 500 annotated data that Yalda created on June 2025, to compare the performance of the best trained models (Roberta) with LLMs

In [1]:
import pandas as pd
import os

In [2]:
# Define the data types for each column
dtype_dict = {
    'full_text': str,
    'stance': str,
    'moral': int,
    'care': int,
    'fairness': int,
    'authority': int,
    'loyalty': int,
    'purity': int
}

# Read the CSV with proper data types
annotated_tweets_df = pd.read_csv(os.path.join("../raw/500_annotated_tweets.csv"), dtype=dtype_dict)

# Display basic information about the dataset
print("Dataset shape:", annotated_tweets_df.shape)
print("\nData types:")
print(annotated_tweets_df.dtypes)
print("\nFirst few rows:")
annotated_tweets_df.head()

Dataset shape: (500, 8)

Data types:
full_text    object
stance       object
moral         int64
care          int64
fairness      int64
authority     int64
loyalty       int64
purity        int64
dtype: object

First few rows:


Unnamed: 0,full_text,stance,moral,care,fairness,authority,loyalty,purity
0,The conversations I have with #prolife org rep...,Prochoice,1,1,0,0,0,0
1,This is why @VoteChoice Local Impact Report ma...,Prochoice,1,0,0,1,0,0
2,Just 1 more way we seek to empower women. #pro...,Prolife,0,0,0,0,0,0
3,This fight about Roe V Wade is insane. Doesn't...,Prochoice,1,0,1,0,0,0
4,Rise &amp; shine! It's a new day and you're al...,Prolife,1,0,0,1,0,1


In [3]:
# assert that there are no NAs or problematic values in the dataset
assert annotated_tweets_df.isna().sum().sum() == 0

In [4]:
# What are the unique values in the stance column?
annotated_tweets_df["stance"].unique()

array(['Prochoice ', 'Prolife ', 'Neutral ', 'Throw out'], dtype=object)

In [5]:
# What are the unique values in the moral column?
print(annotated_tweets_df["moral"].unique())

# What are the unique values in the care column?
print(annotated_tweets_df["care"].unique())

# What are the unique values in the fairness column?
print(annotated_tweets_df["fairness"].unique())

# What are the unique values in the authority column?
print(annotated_tweets_df["authority"].unique())

[1 0]
[1 0]
[0 1]
[0 1]


In [6]:
# Assert that if 'moral' column is 1, the OR of 'care', 'fairness', 'authority', 'loyalty', and 'purity' has to be 1
for index, row in annotated_tweets_df.iterrows():
    if row["moral"] == 1:
        assert any(row[["care", "fairness", "authority", "loyalty", "purity"]]) == 1, f"Row {index} has moral=1 but OR of care, fairness, authority, loyalty, and purity is not 1"

# Assert that if 'moral' column is 0, the OR of 'care', 'fairness', 'authority', 'loyalty', and 'purity' has to be 0
for index, row in annotated_tweets_df.iterrows():
    if row["moral"] == 0:
        assert not any(row[["care", "fairness", "authority", "loyalty", "purity"]])

In [7]:
# Clean up the stance column values
annotated_tweets_df["stance"] = annotated_tweets_df["stance"].str.strip()
annotated_tweets_df["stance"] = annotated_tweets_df["stance"].replace({
    "Prochoice": "choice",
    "Prolife": "life", 
    "Neutral": "neutral",
    "Throw out": "throw_out"
})

# Verify the changes
print("Updated stance column values:")
print(annotated_tweets_df["stance"].unique())

# Assert that the stance column has only ['choice', 'life', 'neutral', 'throw_out'] values
assert annotated_tweets_df["stance"].isin(["choice", "life", "neutral", "throw_out"]).all()

# Assert that the moral column has only [0, 1] values
assert annotated_tweets_df["moral"].isin([0, 1]).all()

Updated stance column values:
['choice' 'life' 'neutral' 'throw_out']


In [9]:
# assert that these are among the column names: "care", "purity", "loyalty", "authority", "fairness"
assert all(col in annotated_tweets_df.columns for col in ["care", "purity", "loyalty", "authority", "fairness"])

In [10]:
# rename the "stance" column to "prolife_prochoice"
# rename the "moral" column to "binary_morality"
annotated_tweets_df.rename(columns={"stance": "prolife_prochoice", "moral": "binary_morality"}, inplace=True)
annotated_tweets_df

Unnamed: 0,full_text,prolife_prochoice,binary_morality,care,fairness,authority,loyalty,purity
0,The conversations I have with #prolife org rep...,choice,1,1,0,0,0,0
1,This is why @VoteChoice Local Impact Report ma...,choice,1,0,0,1,0,0
2,Just 1 more way we seek to empower women. #pro...,life,0,0,0,0,0,0
3,This fight about Roe V Wade is insane. Doesn't...,choice,1,0,1,0,0,0
4,Rise &amp; shine! It's a new day and you're al...,life,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...
495,We're going to put Roe v Wade on the SD ballot...,neutral,0,0,0,0,0,0
496,Almost 3 decades ago I had callers to my talk ...,life,1,1,0,0,0,0
497,If you still fucking with this economy and pos...,choice,0,0,0,0,0,0
498,Well they got Roe vs Wade overturned which is ...,choice,0,0,0,0,0,0


In [11]:
# if 'binary_morality' column is set to 1, set it to 'moral' and if it is set to 0, set it to 'non-moral'
annotated_tweets_df["binary_morality"] = annotated_tweets_df["binary_morality"].map({1: "moral", 0: "non-moral"})
annotated_tweets_df

Unnamed: 0,full_text,prolife_prochoice,binary_morality,care,fairness,authority,loyalty,purity
0,The conversations I have with #prolife org rep...,choice,moral,1,0,0,0,0
1,This is why @VoteChoice Local Impact Report ma...,choice,moral,0,0,1,0,0
2,Just 1 more way we seek to empower women. #pro...,life,non-moral,0,0,0,0,0
3,This fight about Roe V Wade is insane. Doesn't...,choice,moral,0,1,0,0,0
4,Rise &amp; shine! It's a new day and you're al...,life,moral,0,0,1,0,1
...,...,...,...,...,...,...,...,...
495,We're going to put Roe v Wade on the SD ballot...,neutral,non-moral,0,0,0,0,0
496,Almost 3 decades ago I had callers to my talk ...,life,moral,1,0,0,0,0
497,If you still fucking with this economy and pos...,choice,non-moral,0,0,0,0,0
498,Well they got Roe vs Wade overturned which is ...,choice,non-moral,0,0,0,0,0


In [12]:
annotated_tweets_df

Unnamed: 0,full_text,prolife_prochoice,binary_morality,care,fairness,authority,loyalty,purity
0,The conversations I have with #prolife org rep...,choice,moral,1,0,0,0,0
1,This is why @VoteChoice Local Impact Report ma...,choice,moral,0,0,1,0,0
2,Just 1 more way we seek to empower women. #pro...,life,non-moral,0,0,0,0,0
3,This fight about Roe V Wade is insane. Doesn't...,choice,moral,0,1,0,0,0
4,Rise &amp; shine! It's a new day and you're al...,life,moral,0,0,1,0,1
...,...,...,...,...,...,...,...,...
495,We're going to put Roe v Wade on the SD ballot...,neutral,non-moral,0,0,0,0,0
496,Almost 3 decades ago I had callers to my talk ...,life,moral,1,0,0,0,0
497,If you still fucking with this economy and pos...,choice,non-moral,0,0,0,0,0
498,Well they got Roe vs Wade overturned which is ...,choice,non-moral,0,0,0,0,0


In [13]:
# save it along the original csv file with the name "preprocessed_500_annotated_tweets.csv"
annotated_tweets_df.to_csv(os.path.join("../raw/preprocessed_500_annotated_tweets.csv"), index=False)