In [5]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# Read the training and test splits from their CSV files (training first)
train_df, test_df = map(pd.read_csv, ('Training_data.csv', 'Test_data.csv'))

# P-Hacked Heuristic: Counts occurrences of the word 'the'
def count_the(text):
    """Count whitespace-delimited occurrences of the word 'the' in ``text``.

    Matching is case-insensitive and token-exact: the text is lower-cased and
    split on whitespace, so 'The' counts but 'the,' or 'then' do not.

    Args:
        text: Document text. Non-string values (for example the float NaN
            that pandas produces for an empty CSV cell, or None) are treated
            as an empty document.

    Returns:
        int: Number of tokens exactly equal to 'the'; 0 for non-string input.
    """
    # A missing cell is not a str; without this guard .lower() raises
    # AttributeError and aborts the whole Series.apply() call.
    if not isinstance(text, str):
        return 0
    return text.lower().split().count('the')

# Score every document in both splits with the same heuristic
for frame in (train_df, test_df):
    frame['The_Score'] = frame['Document'].apply(count_the)

# Function to Extract Scores for Human and AI
def get_scores(df, score_column):
    """Split one score column into Human and AI groups.

    Args:
        df: DataFrame with a 'Human OR AI' label column and ``score_column``.
        score_column: Name of the column whose values are returned.

    Returns:
        tuple: ``(human_scores, ai_scores)`` — the ``score_column`` values of
        the rows labelled 'Human' and 'AI' respectively, each keeping the
        original row index. Rows with any other label appear in neither.
    """
    labels = df['Human OR AI']
    is_human = labels == 'Human'
    is_ai = labels == 'AI'
    return df.loc[is_human, score_column], df.loc[is_ai, score_column]

# Split the 'The_Score' column by label for each dataset (via get_scores)
train_human, train_ai = get_scores(train_df, 'The_Score')
test_human, test_ai = get_scores(test_df, 'The_Score')

# Two-sample t-tests, Human vs. AI, run separately on each dataset.
# equal_var=False selects Welch's t-test (no equal-variance assumption).
# Samples are passed as (human, ai), so a positive t-stat means the Human
# sample mean is the larger of the two.
train_t_stat, train_p_value = ttest_ind(train_human, train_ai, equal_var=False)
test_t_stat, test_p_value = ttest_ind(test_human, test_ai, equal_var=False)

# Report both t-test results, formatted to four decimal places
print("True P-Hacked Algorithm Evaluation (Word 'the')")
print(f"Training Data: t-stat={train_t_stat:.4f}, p-value={train_p_value:.4f}")
print(f"Test Data: t-stat={test_t_stat:.4f}, p-value={test_p_value:.4f}")

# Confusion Matrix Calculation Function
def build_confusion_matrix(df, score_column, threshold):
    """Classify rows by thresholding a score and tally the confusion matrix.

    A row is predicted 'AI' when its score is strictly greater than
    ``threshold``; scores equal to the threshold are predicted 'Human'.

    Side effect: the boolean predictions are written to ``df`` in place as a
    'Predicted AI' column (overwriting any existing column of that name).

    Args:
        df: DataFrame with a 'Human OR AI' label column and ``score_column``.
        score_column: Name of the numeric score column to threshold.
        threshold: Cut-off; scores above it are predicted 'AI'.

    Returns:
        dict: Counts under the keys 'TP', 'TN', 'FP', 'FN' as plain Python
        ints, with 'AI' as the positive class. Rows whose label is neither
        'AI' nor 'Human' are not counted in any cell.
    """
    predicted_ai = df[score_column] > threshold
    # Kept for backward compatibility: callers may read this column afterwards.
    df['Predicted AI'] = predicted_ai

    labels = df['Human OR AI']
    is_ai = labels == 'AI'
    is_human = labels == 'Human'

    # int() converts numpy integers so the dict prints as {'TP': 4, ...}
    # rather than {'TP': np.int64(4), ...} and stays JSON-serialisable.
    return {
        'TP': int((predicted_ai & is_ai).sum()),
        'TN': int((~predicted_ai & is_human).sum()),
        'FP': int((predicted_ai & is_human).sum()),
        'FN': int((~predicted_ai & is_ai).sum()),
    }

# Decision threshold: midpoint between the Human and AI mean scores.
# It is computed from the training data only and reused unchanged for the
# test data below.
threshold = (train_human.mean() + train_ai.mean()) / 2

# Build confusion matrices for both datasets with that single threshold.
# NOTE(review): build_confusion_matrix predicts 'AI' for scores *above* the
# threshold, which is only sensible when the AI mean is the larger one.
# Nothing here checks which class mean is larger — confirm the direction
# against the sign of the training t-statistic (positive => Human mean larger).
train_conf_matrix = build_confusion_matrix(train_df, 'The_Score', threshold)
test_conf_matrix = build_confusion_matrix(test_df, 'The_Score', threshold)

# Print the TP/TN/FP/FN counts for each dataset
print("\nConfusion Matrix on Training Data:", train_conf_matrix)
print("Confusion Matrix on Test Data:", test_conf_matrix)


True P-Hacked Algorithm Evaluation (Word 'the')
Training Data: t-stat=1.4904, p-value=0.1647
Test Data: t-stat=0.5382, p-value=0.6172

Confusion Matrix on Training Data: {'TP': np.int64(4), 'TN': np.int64(2), 'FP': np.int64(6), 'FN': np.int64(4)}
Confusion Matrix on Test Data: {'TP': np.int64(2), 'TN': np.int64(2), 'FP': np.int64(2), 'FN': np.int64(2)}
