In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

# Statistical and ML imports
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

# Load the Anthropic Persuasion Dataset
print("Loading Anthropic Persuasion Dataset...")
ds = load_dataset("Anthropic/persuasion")
print(f"Dataset loaded successfully!")
print(f"Split: {ds.keys()}")
print(f"Dataset size: {len(ds['train'])} records")

Loading Anthropic Persuasion Dataset...




README.md: 0.00B [00:00, ?B/s]

persuasion_data.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3939 [00:00<?, ? examples/s]

Dataset loaded successfully!
Split: dict_keys(['train'])
Dataset size: 3939 records


In [2]:
# Dataset Structure
# Convert to DataFrame for easier manipulation
df = pd.DataFrame(ds['train'])
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumn names and types:")
print(df.dtypes)
print(f"\nFirst few rows:")
df.head()


DataFrame shape: (3939, 8)

Column names and types:
worker_id                object
claim                    object
argument                 object
source                   object
prompt_type              object
rating_initial           object
rating_final             object
persuasiveness_metric     int64
dtype: object

First few rows:


Unnamed: 0,worker_id,claim,argument,source,prompt_type,rating_initial,rating_final,persuasiveness_metric
0,PQVTZECGNK3K,Governments and technology companies must do m...,It's time for governments and tech companies t...,Claude 2,Expert Writer Rhetorics,7 - Strongly support,7 - Strongly support,0
1,3KTT9HNPV9WX,Governments and technology companies must do m...,"In today's hyper-connected world, our personal...",Claude 3 Haiku,Expert Writer Rhetorics,7 - Strongly support,7 - Strongly support,0
2,M76GMRF46C69,Cultured/lab-grown meats should be allowed to ...,The future of food must include cultured/lab-g...,Claude 2,Compelling Case,3 - Somewhat oppose,5 - Somewhat support,2
3,3W4KKCTPTP7R,Social media companies should be required to l...,Social media companies should be required to l...,Claude 2,Compelling Case,3 - Somewhat oppose,6 - Support,3
4,QQDKMRY3HRXJ,Employers should be allowed to monitor employe...,Allowing employers to monitor employees throug...,Claude 3 Opus,Logical Reasoning,5 - Somewhat support,5 - Somewhat support,0


In [3]:
# Statistical Summary
print("\nNumerical Columns Summary:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())

print("\nCategorical Variables Unique Values:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    if df[col].nunique() <= 10:
        print(f"  Values: {df[col].unique()}")


Numerical Columns Summary:
       persuasiveness_metric
count            3939.000000
mean                0.417111
std                 0.886626
min                -2.000000
25%                 0.000000
50%                 0.000000
75%                 1.000000
max                 5.000000

Missing Values:
worker_id                  0
claim                      0
argument                   0
source                     0
prompt_type              522
rating_initial             0
rating_final               0
persuasiveness_metric      0
dtype: int64

Categorical Variables Unique Values:
worker_id: 3832 unique values
claim: 75 unique values
argument: 1313 unique values
source: 7 unique values
  Values: ['Claude 2' 'Claude 3 Haiku' 'Claude 3 Opus' 'Claude Instant 1.2'
 'Claude 1.3' 'Human' 'Control']
prompt_type: 5 unique values
  Values: ['Expert Writer Rhetorics' 'Compelling Case' 'Logical Reasoning'
 'Deceptive' None 'Control Prompt']
rating_initial: 7 unique values
  Values: ['7 - Strongl

In [4]:
# DATA PRE-PROCESSING
import re
import string

def clean_text(text):
    """
    Apply basic text cleaning: lowercase, remove punctuation, remove extra whitespace
    """
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 1. Rating Conversion (Text to Numeric)
rating_map = {
    '1 - Strongly oppose': 1,
    '2 - Oppose': 2,
    '3 - Somewhat oppose': 3,
    '4 - Neither oppose nor support': 4,
    '5 - Somewhat support': 5,
    '6 - Support': 6,
    '7 - Strongly support': 7
}

print("Converting categorical ratings to numeric scale (1-7)...")
df['rating_initial_num'] = df['rating_initial'].map(rating_map)
df['rating_final_num'] = df['rating_final'].map(rating_map)

# Replace original columns with numeric ones for compatibility with existing code
df['rating_initial'] = df['rating_initial_num']
df['rating_final'] = df['rating_final_num']

# 2. Text Cleaning
print("Cleaning argument text for feature extraction...")
df['argument_cleaned'] = df['argument'].apply(clean_text)

# 3. Handle Missing Values
print("Handling missing values...")
df['prompt_type'] = df['prompt_type'].fillna('Unknown')
# Drop rows where ratings couldn't be converted
df = df.dropna(subset=['rating_initial', 'rating_final']).copy()

# 4. Source Type Classification
print("Classifying sources (AI vs Human)...")
df['source_type'] = df['source'].apply(lambda x: 'AI' if any(ai in x.lower() for ai in ['claude', 'gpt', 'ai']) else 'Human')

print(f"\nPre-processing complete. Processed {len(df)} records.")
df[['source', 'source_type', 'rating_initial', 'rating_final']].head()

Converting categorical ratings to numeric scale (1-7)...
Cleaning argument text for feature extraction...
Handling missing values...
Classifying sources (AI vs Human)...

Pre-processing complete. Processed 3939 records.


Unnamed: 0,source,source_type,rating_initial,rating_final
0,Claude 2,AI,7,7
1,Claude 3 Haiku,AI,7,7
2,Claude 2,AI,3,5
3,Claude 2,AI,3,6
4,Claude 3 Opus,AI,5,5
