# Setup

# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

# Read data

In [3]:
# Load the trial table into `df`, the DataFrame that the cells below read from.
df = pd.read_csv("trials_with_predictions.csv")

In [4]:

# Normalise the table in a single chained expression:
#   * missing "Annotation_disagreement" flags become 0,
#   * missing "Disagreement_reason" entries become "No disagreement",
#   * upper-case label spellings (including "NEGATIVE." with a trailing dot)
#     anywhere in the table are mapped to "Positive" / "Negative".
# `replace` only matches whole cell values, so other spellings are left as-is.
df = df.assign(
    Annotation_disagreement=df["Annotation_disagreement"].fillna(0),
    Disagreement_reason=df["Disagreement_reason"].fillna("No disagreement"),
).replace(
    {
        "POSITIVE": "Positive",
        "NEGATIVE": "Negative",
        "NEGATIVE.": "Negative",
    }
)

In [2]:
# Write the current `df` to a separate CSV, omitting the index column.
# `df` must already exist, so this cell has to run after the loading cell.
df.to_csv("trials_with_predictions_cleaned.csv", index=False)

NameError: name 'df' is not defined

# Compute annotator agreement

In [None]:
# Tally the disagreement flag once and reuse it for both the table and the
# summary line.
disagreement_counts = df["Annotation_disagreement"].value_counts()
print(disagreement_counts)

# `.get(1, 0)` reports zero disagreements instead of raising KeyError when no
# trial carries the flag value 1.
n_disagreements = disagreement_counts.get(1, 0)

print(f"Annotators disagreed on {n_disagreements / df.shape[0]:.2%} (n={n_disagreements}) of trials")


# Analyze Characteristics

In [None]:
# Show the class balance of the annotator labels twice: first as absolute
# counts, then as fractions of all labelled trials.
for as_fraction in (False, True):
    print(df["Annotation_accept"].value_counts(normalize=as_fraction))


# Analyze Performance


## Conclusion

### GPT-3.5 Turbo

In [None]:
# Confusion table for GPT-3.5 Turbo reading only the conclusion:
# rows are the annotator labels, columns are the model predictions.
gpt35turbo_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt35turbo_temp100_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
gpt35turbo_conclusion_tp = gpt35turbo_conclusion_crosstab.loc["Positive", "Positive"]
gpt35turbo_conclusion_fp = gpt35turbo_conclusion_crosstab.loc["Negative", "Positive"]
gpt35turbo_conclusion_fn = gpt35turbo_conclusion_crosstab.loc["Positive", "Negative"]
gpt35turbo_conclusion_tn = gpt35turbo_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
gpt35turbo_conclusion_n_scored = gpt35turbo_conclusion_tp + gpt35turbo_conclusion_tn + gpt35turbo_conclusion_fp + gpt35turbo_conclusion_fn
gpt35turbo_conclusion_n_predicted_positive = gpt35turbo_conclusion_tp + gpt35turbo_conclusion_fp
gpt35turbo_conclusion_n_actual_positive = gpt35turbo_conclusion_tp + gpt35turbo_conclusion_fn

# Accuracy, precision, recall, and F1 score
gpt35turbo_conclusion_accuracy = (gpt35turbo_conclusion_tp + gpt35turbo_conclusion_tn) / gpt35turbo_conclusion_n_scored
gpt35turbo_conclusion_precision = gpt35turbo_conclusion_tp / gpt35turbo_conclusion_n_predicted_positive
gpt35turbo_conclusion_recall = gpt35turbo_conclusion_tp / gpt35turbo_conclusion_n_actual_positive
gpt35turbo_conclusion_f1 = 2 * (gpt35turbo_conclusion_precision * gpt35turbo_conclusion_recall) / (gpt35turbo_conclusion_precision + gpt35turbo_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
gpt35turbo_conclusion_accuracy_ci = 1.96 * np.sqrt(gpt35turbo_conclusion_accuracy * (1 - gpt35turbo_conclusion_accuracy) / gpt35turbo_conclusion_n_scored)
gpt35turbo_conclusion_precision_ci = 1.96 * np.sqrt(gpt35turbo_conclusion_precision * (1 - gpt35turbo_conclusion_precision) / gpt35turbo_conclusion_n_predicted_positive)
gpt35turbo_conclusion_recall_ci = 1.96 * np.sqrt(gpt35turbo_conclusion_recall * (1 - gpt35turbo_conclusion_recall) / gpt35turbo_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
gpt35turbo_conclusion_f1_ci = 1.96 * np.sqrt(gpt35turbo_conclusion_f1 * (1 - gpt35turbo_conclusion_f1) / gpt35turbo_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", gpt35turbo_conclusion_accuracy, gpt35turbo_conclusion_accuracy_ci),
    ("Precision", gpt35turbo_conclusion_precision, gpt35turbo_conclusion_precision_ci),
    ("Recall", gpt35turbo_conclusion_recall, gpt35turbo_conclusion_recall_ci),
    ("F1", gpt35turbo_conclusion_f1, gpt35turbo_conclusion_f1_ci),
]:
    print(f"GPT-3.5 Turbo Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")


In [None]:
# Confusion-matrix heatmap for GPT-3.5 Turbo (conclusion only)
plt.rcParams.update({'font.size': 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_gpt35turbo_conclusion = gpt35turbo_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): gpt35turbo_conclusion_tn,
    (0, 1): gpt35turbo_conclusion_fp,
    (1, 0): gpt35turbo_conclusion_fn,
    (1, 1): gpt35turbo_conclusion_tp,
}.items():
    labels_gpt35turbo_conclusion[cell_position] = f"{labels_gpt35turbo_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    gpt35turbo_conclusion_crosstab,
    annot=labels_gpt35turbo_conclusion,
    fmt="",
    cmap="Blues",
    annot_kws={'size': 12},
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/gpt35turbo_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### GPT-4o

In [None]:
# Confusion table for GPT-4o reading only the conclusion:
# rows are the annotator labels, columns are the model predictions.
gpt4o_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt4o_temp100_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
gpt4o_conclusion_tp = gpt4o_conclusion_crosstab.loc["Positive", "Positive"]
gpt4o_conclusion_fp = gpt4o_conclusion_crosstab.loc["Negative", "Positive"]
gpt4o_conclusion_fn = gpt4o_conclusion_crosstab.loc["Positive", "Negative"]
gpt4o_conclusion_tn = gpt4o_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
gpt4o_conclusion_n_scored = gpt4o_conclusion_tp + gpt4o_conclusion_tn + gpt4o_conclusion_fp + gpt4o_conclusion_fn
gpt4o_conclusion_n_predicted_positive = gpt4o_conclusion_tp + gpt4o_conclusion_fp
gpt4o_conclusion_n_actual_positive = gpt4o_conclusion_tp + gpt4o_conclusion_fn

# Accuracy, precision, recall, and F1 score
gpt4o_conclusion_accuracy = (gpt4o_conclusion_tp + gpt4o_conclusion_tn) / gpt4o_conclusion_n_scored
gpt4o_conclusion_precision = gpt4o_conclusion_tp / gpt4o_conclusion_n_predicted_positive
gpt4o_conclusion_recall = gpt4o_conclusion_tp / gpt4o_conclusion_n_actual_positive
gpt4o_conclusion_f1 = 2 * (gpt4o_conclusion_precision * gpt4o_conclusion_recall) / (gpt4o_conclusion_precision + gpt4o_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
gpt4o_conclusion_accuracy_ci = 1.96 * np.sqrt(gpt4o_conclusion_accuracy * (1 - gpt4o_conclusion_accuracy) / gpt4o_conclusion_n_scored)
gpt4o_conclusion_precision_ci = 1.96 * np.sqrt(gpt4o_conclusion_precision * (1 - gpt4o_conclusion_precision) / gpt4o_conclusion_n_predicted_positive)
gpt4o_conclusion_recall_ci = 1.96 * np.sqrt(gpt4o_conclusion_recall * (1 - gpt4o_conclusion_recall) / gpt4o_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
gpt4o_conclusion_f1_ci = 1.96 * np.sqrt(gpt4o_conclusion_f1 * (1 - gpt4o_conclusion_f1) / gpt4o_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", gpt4o_conclusion_accuracy, gpt4o_conclusion_accuracy_ci),
    ("Precision", gpt4o_conclusion_precision, gpt4o_conclusion_precision_ci),
    ("Recall", gpt4o_conclusion_recall, gpt4o_conclusion_recall_ci),
    ("F1", gpt4o_conclusion_f1, gpt4o_conclusion_f1_ci),
]:
    print(f"GPT-4o Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")

In [None]:
# Confusion-matrix heatmap for GPT-4o (conclusion only)
plt.rcParams.update({'font.size': 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_gpt4o_conclusion = gpt4o_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): gpt4o_conclusion_tn,
    (0, 1): gpt4o_conclusion_fp,
    (1, 0): gpt4o_conclusion_fn,
    (1, 1): gpt4o_conclusion_tp,
}.items():
    labels_gpt4o_conclusion[cell_position] = f"{labels_gpt4o_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    gpt4o_conclusion_crosstab,
    annot=labels_gpt4o_conclusion,
    fmt="",
    cmap="Blues",
    annot_kws={'size': 12},
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/gpt4o_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### o1

In [None]:
# Confusion table for o1 reading only the conclusion:
# rows are the annotator labels, columns are the model predictions.
o1_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["o1_temp100_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
o1_conclusion_tp = o1_conclusion_crosstab.loc["Positive", "Positive"]
o1_conclusion_fp = o1_conclusion_crosstab.loc["Negative", "Positive"]
o1_conclusion_fn = o1_conclusion_crosstab.loc["Positive", "Negative"]
o1_conclusion_tn = o1_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
o1_conclusion_n_scored = o1_conclusion_tp + o1_conclusion_tn + o1_conclusion_fp + o1_conclusion_fn
o1_conclusion_n_predicted_positive = o1_conclusion_tp + o1_conclusion_fp
o1_conclusion_n_actual_positive = o1_conclusion_tp + o1_conclusion_fn

# Accuracy, precision, recall, and F1 score
o1_conclusion_accuracy = (o1_conclusion_tp + o1_conclusion_tn) / o1_conclusion_n_scored
o1_conclusion_precision = o1_conclusion_tp / o1_conclusion_n_predicted_positive
o1_conclusion_recall = o1_conclusion_tp / o1_conclusion_n_actual_positive
o1_conclusion_f1 = 2 * (o1_conclusion_precision * o1_conclusion_recall) / (o1_conclusion_precision + o1_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
o1_conclusion_accuracy_ci = 1.96 * np.sqrt(o1_conclusion_accuracy * (1 - o1_conclusion_accuracy) / o1_conclusion_n_scored)
o1_conclusion_precision_ci = 1.96 * np.sqrt(o1_conclusion_precision * (1 - o1_conclusion_precision) / o1_conclusion_n_predicted_positive)
o1_conclusion_recall_ci = 1.96 * np.sqrt(o1_conclusion_recall * (1 - o1_conclusion_recall) / o1_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
o1_conclusion_f1_ci = 1.96 * np.sqrt(o1_conclusion_f1 * (1 - o1_conclusion_f1) / o1_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", o1_conclusion_accuracy, o1_conclusion_accuracy_ci),
    ("Precision", o1_conclusion_precision, o1_conclusion_precision_ci),
    ("Recall", o1_conclusion_recall, o1_conclusion_recall_ci),
    ("F1", o1_conclusion_f1, o1_conclusion_f1_ci),
]:
    print(f"o1 Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")

In [None]:
# Confusion-matrix heatmap for o1 (conclusion only)
plt.rcParams.update({'font.size': 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_o1_conclusion = o1_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): o1_conclusion_tn,
    (0, 1): o1_conclusion_fp,
    (1, 0): o1_conclusion_fn,
    (1, 1): o1_conclusion_tp,
}.items():
    labels_o1_conclusion[cell_position] = f"{labels_o1_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    o1_conclusion_crosstab,
    annot=labels_o1_conclusion,
    fmt="",
    cmap="Blues",
    annot_kws={'size': 12},
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/o1_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

## Methods and Conclusion

### GPT-3.5 Turbo

In [None]:
# Confusion table for GPT-3.5 Turbo reading the methods and conclusion:
# rows are the annotator labels, columns are the model predictions.
gpt35turbo_methods_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt35turbo_temp100_methods_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
gpt35turbo_methods_conclusion_tp = gpt35turbo_methods_conclusion_crosstab.loc["Positive", "Positive"]
gpt35turbo_methods_conclusion_fp = gpt35turbo_methods_conclusion_crosstab.loc["Negative", "Positive"]
gpt35turbo_methods_conclusion_fn = gpt35turbo_methods_conclusion_crosstab.loc["Positive", "Negative"]
gpt35turbo_methods_conclusion_tn = gpt35turbo_methods_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
gpt35turbo_methods_conclusion_n_scored = gpt35turbo_methods_conclusion_tp + gpt35turbo_methods_conclusion_tn + gpt35turbo_methods_conclusion_fp + gpt35turbo_methods_conclusion_fn
gpt35turbo_methods_conclusion_n_predicted_positive = gpt35turbo_methods_conclusion_tp + gpt35turbo_methods_conclusion_fp
gpt35turbo_methods_conclusion_n_actual_positive = gpt35turbo_methods_conclusion_tp + gpt35turbo_methods_conclusion_fn

# Accuracy, precision, recall, and F1 score
gpt35turbo_methods_conclusion_accuracy = (gpt35turbo_methods_conclusion_tp + gpt35turbo_methods_conclusion_tn) / gpt35turbo_methods_conclusion_n_scored
gpt35turbo_methods_conclusion_precision = gpt35turbo_methods_conclusion_tp / gpt35turbo_methods_conclusion_n_predicted_positive
gpt35turbo_methods_conclusion_recall = gpt35turbo_methods_conclusion_tp / gpt35turbo_methods_conclusion_n_actual_positive
gpt35turbo_methods_conclusion_f1 = 2 * (gpt35turbo_methods_conclusion_precision * gpt35turbo_methods_conclusion_recall) / (gpt35turbo_methods_conclusion_precision + gpt35turbo_methods_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
gpt35turbo_methods_conclusion_accuracy_ci = 1.96 * np.sqrt(gpt35turbo_methods_conclusion_accuracy * (1 - gpt35turbo_methods_conclusion_accuracy) / gpt35turbo_methods_conclusion_n_scored)
gpt35turbo_methods_conclusion_precision_ci = 1.96 * np.sqrt(gpt35turbo_methods_conclusion_precision * (1 - gpt35turbo_methods_conclusion_precision) / gpt35turbo_methods_conclusion_n_predicted_positive)
gpt35turbo_methods_conclusion_recall_ci = 1.96 * np.sqrt(gpt35turbo_methods_conclusion_recall * (1 - gpt35turbo_methods_conclusion_recall) / gpt35turbo_methods_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
gpt35turbo_methods_conclusion_f1_ci = 1.96 * np.sqrt(gpt35turbo_methods_conclusion_f1 * (1 - gpt35turbo_methods_conclusion_f1) / gpt35turbo_methods_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", gpt35turbo_methods_conclusion_accuracy, gpt35turbo_methods_conclusion_accuracy_ci),
    ("Precision", gpt35turbo_methods_conclusion_precision, gpt35turbo_methods_conclusion_precision_ci),
    ("Recall", gpt35turbo_methods_conclusion_recall, gpt35turbo_methods_conclusion_recall_ci),
    ("F1", gpt35turbo_methods_conclusion_f1, gpt35turbo_methods_conclusion_f1_ci),
]:
    print(f"GPT-3.5 Turbo Methods and Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")

In [None]:
# Confusion-matrix heatmap for GPT-3.5 Turbo (methods and conclusion)
plt.rcParams.update({"font.size": 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_gpt35turbo_methods_conclusion = gpt35turbo_methods_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): gpt35turbo_methods_conclusion_tn,
    (0, 1): gpt35turbo_methods_conclusion_fp,
    (1, 0): gpt35turbo_methods_conclusion_fn,
    (1, 1): gpt35turbo_methods_conclusion_tp,
}.items():
    labels_gpt35turbo_methods_conclusion[cell_position] = f"{labels_gpt35turbo_methods_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    gpt35turbo_methods_conclusion_crosstab,
    annot=labels_gpt35turbo_methods_conclusion,
    fmt="",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/gpt35turbo_methods_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### GPT-4o

In [None]:
# Confusion table for GPT-4o reading the methods and conclusion:
# rows are the annotator labels, columns are the model predictions.
gpt4o_methods_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt4o_temp100_methods_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
gpt4o_methods_conclusion_tp = gpt4o_methods_conclusion_crosstab.loc["Positive", "Positive"]
gpt4o_methods_conclusion_fp = gpt4o_methods_conclusion_crosstab.loc["Negative", "Positive"]
gpt4o_methods_conclusion_fn = gpt4o_methods_conclusion_crosstab.loc["Positive", "Negative"]
gpt4o_methods_conclusion_tn = gpt4o_methods_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
gpt4o_methods_conclusion_n_scored = gpt4o_methods_conclusion_tp + gpt4o_methods_conclusion_tn + gpt4o_methods_conclusion_fp + gpt4o_methods_conclusion_fn
gpt4o_methods_conclusion_n_predicted_positive = gpt4o_methods_conclusion_tp + gpt4o_methods_conclusion_fp
gpt4o_methods_conclusion_n_actual_positive = gpt4o_methods_conclusion_tp + gpt4o_methods_conclusion_fn

# Accuracy, precision, recall, and F1 score
gpt4o_methods_conclusion_accuracy = (gpt4o_methods_conclusion_tp + gpt4o_methods_conclusion_tn) / gpt4o_methods_conclusion_n_scored
gpt4o_methods_conclusion_precision = gpt4o_methods_conclusion_tp / gpt4o_methods_conclusion_n_predicted_positive
gpt4o_methods_conclusion_recall = gpt4o_methods_conclusion_tp / gpt4o_methods_conclusion_n_actual_positive
gpt4o_methods_conclusion_f1 = 2 * (gpt4o_methods_conclusion_precision * gpt4o_methods_conclusion_recall) / (gpt4o_methods_conclusion_precision + gpt4o_methods_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
gpt4o_methods_conclusion_accuracy_ci = 1.96 * np.sqrt(gpt4o_methods_conclusion_accuracy * (1 - gpt4o_methods_conclusion_accuracy) / gpt4o_methods_conclusion_n_scored)
gpt4o_methods_conclusion_precision_ci = 1.96 * np.sqrt(gpt4o_methods_conclusion_precision * (1 - gpt4o_methods_conclusion_precision) / gpt4o_methods_conclusion_n_predicted_positive)
gpt4o_methods_conclusion_recall_ci = 1.96 * np.sqrt(gpt4o_methods_conclusion_recall * (1 - gpt4o_methods_conclusion_recall) / gpt4o_methods_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
gpt4o_methods_conclusion_f1_ci = 1.96 * np.sqrt(gpt4o_methods_conclusion_f1 * (1 - gpt4o_methods_conclusion_f1) / gpt4o_methods_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", gpt4o_methods_conclusion_accuracy, gpt4o_methods_conclusion_accuracy_ci),
    ("Precision", gpt4o_methods_conclusion_precision, gpt4o_methods_conclusion_precision_ci),
    ("Recall", gpt4o_methods_conclusion_recall, gpt4o_methods_conclusion_recall_ci),
    ("F1", gpt4o_methods_conclusion_f1, gpt4o_methods_conclusion_f1_ci),
]:
    print(f"GPT-4o Methods and Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")


In [None]:
# Confusion-matrix heatmap for GPT-4o (methods and conclusion)
plt.rcParams.update({"font.size": 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_gpt4o_methods_conclusion = gpt4o_methods_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): gpt4o_methods_conclusion_tn,
    (0, 1): gpt4o_methods_conclusion_fp,
    (1, 0): gpt4o_methods_conclusion_fn,
    (1, 1): gpt4o_methods_conclusion_tp,
}.items():
    labels_gpt4o_methods_conclusion[cell_position] = f"{labels_gpt4o_methods_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    gpt4o_methods_conclusion_crosstab,
    annot=labels_gpt4o_methods_conclusion,
    fmt="",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/gpt4o_methods_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### o1

In [None]:
# Confusion table for o1 reading the methods and conclusion:
# rows are the annotator labels, columns are the model predictions.
o1_methods_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["o1_temp100_methods_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
o1_methods_conclusion_tp = o1_methods_conclusion_crosstab.loc["Positive", "Positive"]
o1_methods_conclusion_fp = o1_methods_conclusion_crosstab.loc["Negative", "Positive"]
o1_methods_conclusion_fn = o1_methods_conclusion_crosstab.loc["Positive", "Negative"]
o1_methods_conclusion_tn = o1_methods_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
o1_methods_conclusion_n_scored = o1_methods_conclusion_tp + o1_methods_conclusion_tn + o1_methods_conclusion_fp + o1_methods_conclusion_fn
o1_methods_conclusion_n_predicted_positive = o1_methods_conclusion_tp + o1_methods_conclusion_fp
o1_methods_conclusion_n_actual_positive = o1_methods_conclusion_tp + o1_methods_conclusion_fn

# Accuracy, precision, recall, and F1 score
o1_methods_conclusion_accuracy = (o1_methods_conclusion_tp + o1_methods_conclusion_tn) / o1_methods_conclusion_n_scored
o1_methods_conclusion_precision = o1_methods_conclusion_tp / o1_methods_conclusion_n_predicted_positive
o1_methods_conclusion_recall = o1_methods_conclusion_tp / o1_methods_conclusion_n_actual_positive
o1_methods_conclusion_f1 = 2 * (o1_methods_conclusion_precision * o1_methods_conclusion_recall) / (o1_methods_conclusion_precision + o1_methods_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
o1_methods_conclusion_accuracy_ci = 1.96 * np.sqrt(o1_methods_conclusion_accuracy * (1 - o1_methods_conclusion_accuracy) / o1_methods_conclusion_n_scored)
o1_methods_conclusion_precision_ci = 1.96 * np.sqrt(o1_methods_conclusion_precision * (1 - o1_methods_conclusion_precision) / o1_methods_conclusion_n_predicted_positive)
o1_methods_conclusion_recall_ci = 1.96 * np.sqrt(o1_methods_conclusion_recall * (1 - o1_methods_conclusion_recall) / o1_methods_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
o1_methods_conclusion_f1_ci = 1.96 * np.sqrt(o1_methods_conclusion_f1 * (1 - o1_methods_conclusion_f1) / o1_methods_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", o1_methods_conclusion_accuracy, o1_methods_conclusion_accuracy_ci),
    ("Precision", o1_methods_conclusion_precision, o1_methods_conclusion_precision_ci),
    ("Recall", o1_methods_conclusion_recall, o1_methods_conclusion_recall_ci),
    ("F1", o1_methods_conclusion_f1, o1_methods_conclusion_f1_ci),
]:
    print(f"o1 Methods and Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")


In [None]:
# Confusion-matrix heatmap for o1 (methods and conclusion)
plt.rcParams.update({"font.size": 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_o1_methods_conclusion = o1_methods_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): o1_methods_conclusion_tn,
    (0, 1): o1_methods_conclusion_fp,
    (1, 0): o1_methods_conclusion_fn,
    (1, 1): o1_methods_conclusion_tp,
}.items():
    labels_o1_methods_conclusion[cell_position] = f"{labels_o1_methods_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    o1_methods_conclusion_crosstab,
    annot=labels_o1_methods_conclusion,
    fmt="",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/o1_methods_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

## Methods, Results, and Conclusion

### GPT-3.5 Turbo

In [None]:
# Confusion table for GPT-3.5 Turbo reading the methods, results, and conclusion:
# rows are the annotator labels, columns are the model predictions.
gpt35turbo_methods_results_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt35turbo_temp100_methods_results_conclusion_response_raw"]
)

# Confusion-matrix cells, treating "Positive" as the positive class
gpt35turbo_methods_results_conclusion_tp = gpt35turbo_methods_results_conclusion_crosstab.loc["Positive", "Positive"]
gpt35turbo_methods_results_conclusion_fp = gpt35turbo_methods_results_conclusion_crosstab.loc["Negative", "Positive"]
gpt35turbo_methods_results_conclusion_fn = gpt35turbo_methods_results_conclusion_crosstab.loc["Positive", "Negative"]
gpt35turbo_methods_results_conclusion_tn = gpt35turbo_methods_results_conclusion_crosstab.loc["Negative", "Negative"]

# Each proportion is estimated on its own subset of trials; that subset size
# (not the total number of rows) is the n of its confidence interval.
gpt35turbo_methods_results_conclusion_n_scored = gpt35turbo_methods_results_conclusion_tp + gpt35turbo_methods_results_conclusion_tn + gpt35turbo_methods_results_conclusion_fp + gpt35turbo_methods_results_conclusion_fn
gpt35turbo_methods_results_conclusion_n_predicted_positive = gpt35turbo_methods_results_conclusion_tp + gpt35turbo_methods_results_conclusion_fp
gpt35turbo_methods_results_conclusion_n_actual_positive = gpt35turbo_methods_results_conclusion_tp + gpt35turbo_methods_results_conclusion_fn

# Accuracy, precision, recall, and F1 score
gpt35turbo_methods_results_conclusion_accuracy = (gpt35turbo_methods_results_conclusion_tp + gpt35turbo_methods_results_conclusion_tn) / gpt35turbo_methods_results_conclusion_n_scored
gpt35turbo_methods_results_conclusion_precision = gpt35turbo_methods_results_conclusion_tp / gpt35turbo_methods_results_conclusion_n_predicted_positive
gpt35turbo_methods_results_conclusion_recall = gpt35turbo_methods_results_conclusion_tp / gpt35turbo_methods_results_conclusion_n_actual_positive
gpt35turbo_methods_results_conclusion_f1 = 2 * (gpt35turbo_methods_results_conclusion_precision * gpt35turbo_methods_results_conclusion_recall) / (gpt35turbo_methods_results_conclusion_precision + gpt35turbo_methods_results_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) intervals: 1.96 * sqrt(p * (1 - p) / n)
gpt35turbo_methods_results_conclusion_accuracy_ci = 1.96 * np.sqrt(gpt35turbo_methods_results_conclusion_accuracy * (1 - gpt35turbo_methods_results_conclusion_accuracy) / gpt35turbo_methods_results_conclusion_n_scored)
gpt35turbo_methods_results_conclusion_precision_ci = 1.96 * np.sqrt(gpt35turbo_methods_results_conclusion_precision * (1 - gpt35turbo_methods_results_conclusion_precision) / gpt35turbo_methods_results_conclusion_n_predicted_positive)
gpt35turbo_methods_results_conclusion_recall_ci = 1.96 * np.sqrt(gpt35turbo_methods_results_conclusion_recall * (1 - gpt35turbo_methods_results_conclusion_recall) / gpt35turbo_methods_results_conclusion_n_actual_positive)
# NOTE(review): F1 is not a binomial proportion, so this interval is only a
# rough approximation; a bootstrap over trials would be more defensible.
gpt35turbo_methods_results_conclusion_f1_ci = 1.96 * np.sqrt(gpt35turbo_methods_results_conclusion_f1 * (1 - gpt35turbo_methods_results_conclusion_f1) / gpt35turbo_methods_results_conclusion_n_scored)

# Print each metric as "estimate (lower - upper)"
for metric_name, estimate, half_width in [
    ("Accuracy", gpt35turbo_methods_results_conclusion_accuracy, gpt35turbo_methods_results_conclusion_accuracy_ci),
    ("Precision", gpt35turbo_methods_results_conclusion_precision, gpt35turbo_methods_results_conclusion_precision_ci),
    ("Recall", gpt35turbo_methods_results_conclusion_recall, gpt35turbo_methods_results_conclusion_recall_ci),
    ("F1", gpt35turbo_methods_results_conclusion_f1, gpt35turbo_methods_results_conclusion_f1_ci),
]:
    print(f"GPT-3.5 Turbo Methods, Results, and Conclusion {metric_name}: {estimate:.2f} ({estimate - half_width:.2f} - {estimate + half_width:.2f})")

In [None]:
# Confusion-matrix heatmap for GPT-3.5 Turbo (methods, results, and conclusion)
plt.rcParams.update({"font.size": 14})  # Increase font size
fig, ax = plt.subplots(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# Assumes the crosstab is a 2x2 table ordered Negative, Positive on both axes.
labels_gpt35turbo_methods_results_conclusion = gpt35turbo_methods_results_conclusion_crosstab.to_numpy().astype(str)
for cell_position, cell_count in {
    (0, 0): gpt35turbo_methods_results_conclusion_tn,
    (0, 1): gpt35turbo_methods_results_conclusion_fp,
    (1, 0): gpt35turbo_methods_results_conclusion_fn,
    (1, 1): gpt35turbo_methods_results_conclusion_tp,
}.items():
    labels_gpt35turbo_methods_results_conclusion[cell_position] = f"{labels_gpt35turbo_methods_results_conclusion[cell_position]} ({100 * cell_count / (df.shape[0]):.1f}%)"

# Draw the heatmap on the explicit axes
sns.heatmap(
    gpt35turbo_methods_results_conclusion_crosstab,
    annot=labels_gpt35turbo_methods_results_conclusion,
    fmt="",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Ground truth", fontsize=14)
# Remove the colorbar but keep the width of the heatmap
ax.collections[0].colorbar.remove()
fig.savefig("plots/gpt35turbo_methods_results_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### GPT-4o

In [None]:
# Cross-tabulate the annotated ground truth (rows) against the GPT-4o predictions
# made from the methods/results/conclusion sections (columns)
gpt4o_methods_results_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt4o_temp100_methods_results_conclusion_response_raw"]
)

# Pull the four confusion-matrix cells, treating "Positive" as the positive class
gpt4o_methods_results_conclusion_tp = gpt4o_methods_results_conclusion_crosstab.loc["Positive", "Positive"]
gpt4o_methods_results_conclusion_fp = gpt4o_methods_results_conclusion_crosstab.loc["Negative", "Positive"]
gpt4o_methods_results_conclusion_fn = gpt4o_methods_results_conclusion_crosstab.loc["Positive", "Negative"]
gpt4o_methods_results_conclusion_tn = gpt4o_methods_results_conclusion_crosstab.loc["Negative", "Negative"]

# Each metric is a proportion over a different group of trials; keep those group
# sizes so the confidence intervals below use the matching sample size
gpt4o_methods_results_conclusion_n_evaluated = (
    gpt4o_methods_results_conclusion_tp
    + gpt4o_methods_results_conclusion_tn
    + gpt4o_methods_results_conclusion_fp
    + gpt4o_methods_results_conclusion_fn
)
gpt4o_methods_results_conclusion_n_predicted_positive = gpt4o_methods_results_conclusion_tp + gpt4o_methods_results_conclusion_fp
gpt4o_methods_results_conclusion_n_actual_positive = gpt4o_methods_results_conclusion_tp + gpt4o_methods_results_conclusion_fn

# Calculate the accuracy, precision, recall, and F1 score
gpt4o_methods_results_conclusion_accuracy = (gpt4o_methods_results_conclusion_tp + gpt4o_methods_results_conclusion_tn) / gpt4o_methods_results_conclusion_n_evaluated
gpt4o_methods_results_conclusion_precision = gpt4o_methods_results_conclusion_tp / gpt4o_methods_results_conclusion_n_predicted_positive
gpt4o_methods_results_conclusion_recall = gpt4o_methods_results_conclusion_tp / gpt4o_methods_results_conclusion_n_actual_positive
gpt4o_methods_results_conclusion_f1 = 2 * (gpt4o_methods_results_conclusion_precision * gpt4o_methods_results_conclusion_recall) / (gpt4o_methods_results_conclusion_precision + gpt4o_methods_results_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) confidence intervals.
# The sample size of each interval is the denominator of its own proportion,
# not the total number of rows in df.
gpt4o_methods_results_conclusion_accuracy_ci = 1.96 * np.sqrt((gpt4o_methods_results_conclusion_accuracy * (1 - gpt4o_methods_results_conclusion_accuracy)) / gpt4o_methods_results_conclusion_n_evaluated)
gpt4o_methods_results_conclusion_precision_ci = 1.96 * np.sqrt((gpt4o_methods_results_conclusion_precision * (1 - gpt4o_methods_results_conclusion_precision)) / gpt4o_methods_results_conclusion_n_predicted_positive)
gpt4o_methods_results_conclusion_recall_ci = 1.96 * np.sqrt((gpt4o_methods_results_conclusion_recall * (1 - gpt4o_methods_results_conclusion_recall)) / gpt4o_methods_results_conclusion_n_actual_positive)
# NOTE(review): F1 is not a single binomial proportion, so this interval is only a
# rough approximation; it still uses the total row count of df. Consider a bootstrap interval.
gpt4o_methods_results_conclusion_f1_ci = 1.96 * np.sqrt((gpt4o_methods_results_conclusion_f1 * (1 - gpt4o_methods_results_conclusion_f1)) / df.shape[0])

# Print each metric as "value (lower bound - upper bound)"
for metric_name, metric_value, metric_ci in [
    ("Accuracy", gpt4o_methods_results_conclusion_accuracy, gpt4o_methods_results_conclusion_accuracy_ci),
    ("Precision", gpt4o_methods_results_conclusion_precision, gpt4o_methods_results_conclusion_precision_ci),
    ("Recall", gpt4o_methods_results_conclusion_recall, gpt4o_methods_results_conclusion_recall_ci),
    ("F1", gpt4o_methods_results_conclusion_f1, gpt4o_methods_results_conclusion_f1_ci),
]:
    print(f"GPT-4o Methods, Results, and Conclusion {metric_name}: {metric_value:.2f} ({metric_value - metric_ci:.2f} - {metric_value + metric_ci:.2f})")


In [None]:
# Draw the GPT-4o methods/results/conclusion confusion matrix as an annotated heatmap
fig = plt.figure(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# The position-to-count mapping assumes rows/columns are ordered Negative, Positive.
labels_gpt4o_methods_results_conclusion = gpt4o_methods_results_conclusion_crosstab.values.astype(str)
for (row, col), cell_count in {
    (0, 0): gpt4o_methods_results_conclusion_tn,
    (0, 1): gpt4o_methods_results_conclusion_fp,
    (1, 0): gpt4o_methods_results_conclusion_fn,
    (1, 1): gpt4o_methods_results_conclusion_tp,
}.items():
    cell_share = 100 * cell_count / df.shape[0]
    labels_gpt4o_methods_results_conclusion[row, col] = (
        f"{labels_gpt4o_methods_results_conclusion[row, col]} ({cell_share:.1f}%)"
    )

# Render the heatmap with a larger font; the axes are created by seaborn after the font change
plt.rcParams.update({"font.size": 14})
heatmap_ax = sns.heatmap(
    gpt4o_methods_results_conclusion_crosstab,
    annot=labels_gpt4o_methods_results_conclusion,
    fmt="",
    cmap="Blues",
)
heatmap_ax.set_xlabel("Predicted", fontsize=14)
heatmap_ax.set_ylabel("Ground truth", fontsize=14)
# Drop the colorbar only after drawing, so the heatmap keeps the width it was given
heatmap_ax.collections[0].colorbar.remove()
plt.savefig("plots/gpt4o_methods_results_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()


### o1

In [None]:
# Cross-tabulate the annotated ground truth (rows) against the o1 predictions
# made from the methods/results/conclusion sections (columns)
o1_methods_results_conclusion_crosstab = pd.crosstab(
    df["Annotation_accept"], df["o1_temp100_methods_results_conclusion_response_raw"]
)

# Pull the four confusion-matrix cells, treating "Positive" as the positive class
o1_methods_results_conclusion_tp = o1_methods_results_conclusion_crosstab.loc["Positive", "Positive"]
o1_methods_results_conclusion_fp = o1_methods_results_conclusion_crosstab.loc["Negative", "Positive"]
o1_methods_results_conclusion_fn = o1_methods_results_conclusion_crosstab.loc["Positive", "Negative"]
o1_methods_results_conclusion_tn = o1_methods_results_conclusion_crosstab.loc["Negative", "Negative"]

# Each metric is a proportion over a different group of trials; keep those group
# sizes so the confidence intervals below use the matching sample size
o1_methods_results_conclusion_n_evaluated = (
    o1_methods_results_conclusion_tp
    + o1_methods_results_conclusion_tn
    + o1_methods_results_conclusion_fp
    + o1_methods_results_conclusion_fn
)
o1_methods_results_conclusion_n_predicted_positive = o1_methods_results_conclusion_tp + o1_methods_results_conclusion_fp
o1_methods_results_conclusion_n_actual_positive = o1_methods_results_conclusion_tp + o1_methods_results_conclusion_fn

# Calculate the accuracy, precision, recall, and F1 score
o1_methods_results_conclusion_accuracy = (o1_methods_results_conclusion_tp + o1_methods_results_conclusion_tn) / o1_methods_results_conclusion_n_evaluated
o1_methods_results_conclusion_precision = o1_methods_results_conclusion_tp / o1_methods_results_conclusion_n_predicted_positive
o1_methods_results_conclusion_recall = o1_methods_results_conclusion_tp / o1_methods_results_conclusion_n_actual_positive
o1_methods_results_conclusion_f1 = 2 * (o1_methods_results_conclusion_precision * o1_methods_results_conclusion_recall) / (o1_methods_results_conclusion_precision + o1_methods_results_conclusion_recall)

# Half-widths of the 95% normal-approximation (Wald) confidence intervals.
# The sample size of each interval is the denominator of its own proportion,
# not the total number of rows in df.
o1_methods_results_conclusion_accuracy_ci = 1.96 * np.sqrt((o1_methods_results_conclusion_accuracy * (1 - o1_methods_results_conclusion_accuracy)) / o1_methods_results_conclusion_n_evaluated)
o1_methods_results_conclusion_precision_ci = 1.96 * np.sqrt((o1_methods_results_conclusion_precision * (1 - o1_methods_results_conclusion_precision)) / o1_methods_results_conclusion_n_predicted_positive)
o1_methods_results_conclusion_recall_ci = 1.96 * np.sqrt((o1_methods_results_conclusion_recall * (1 - o1_methods_results_conclusion_recall)) / o1_methods_results_conclusion_n_actual_positive)
# NOTE(review): F1 is not a single binomial proportion, so this interval is only a
# rough approximation; it still uses the total row count of df. Consider a bootstrap interval.
o1_methods_results_conclusion_f1_ci = 1.96 * np.sqrt((o1_methods_results_conclusion_f1 * (1 - o1_methods_results_conclusion_f1)) / df.shape[0])

# Print each metric as "value (lower bound - upper bound)"
for metric_name, metric_value, metric_ci in [
    ("Accuracy", o1_methods_results_conclusion_accuracy, o1_methods_results_conclusion_accuracy_ci),
    ("Precision", o1_methods_results_conclusion_precision, o1_methods_results_conclusion_precision_ci),
    ("Recall", o1_methods_results_conclusion_recall, o1_methods_results_conclusion_recall_ci),
    ("F1", o1_methods_results_conclusion_f1, o1_methods_results_conclusion_f1_ci),
]:
    print(f"o1 Methods, Results, and Conclusion {metric_name}: {metric_value:.2f} ({metric_value - metric_ci:.2f} - {metric_value + metric_ci:.2f})")


In [None]:
# Draw the o1 methods/results/conclusion confusion matrix as an annotated heatmap
fig = plt.figure(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# The position-to-count mapping assumes rows/columns are ordered Negative, Positive.
labels_o1_methods_results_conclusion = o1_methods_results_conclusion_crosstab.values.astype(str)
for (row, col), cell_count in {
    (0, 0): o1_methods_results_conclusion_tn,
    (0, 1): o1_methods_results_conclusion_fp,
    (1, 0): o1_methods_results_conclusion_fn,
    (1, 1): o1_methods_results_conclusion_tp,
}.items():
    cell_share = 100 * cell_count / df.shape[0]
    labels_o1_methods_results_conclusion[row, col] = (
        f"{labels_o1_methods_results_conclusion[row, col]} ({cell_share:.1f}%)"
    )

# Render the heatmap with a larger font; the axes are created by seaborn after the font change
plt.rcParams.update({"font.size": 14})
heatmap_ax = sns.heatmap(
    o1_methods_results_conclusion_crosstab,
    annot=labels_o1_methods_results_conclusion,
    fmt="",
    cmap="Blues",
)
heatmap_ax.set_xlabel("Predicted", fontsize=14)
heatmap_ax.set_ylabel("Ground truth", fontsize=14)
# Drop the colorbar only after drawing, so the heatmap keeps the width it was given
heatmap_ax.collections[0].colorbar.remove()
plt.savefig("plots/o1_methods_results_conclusion_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()


## Title and Abstract

### GPT-3.5 Turbo


In [None]:
# Cross-tabulate the annotated ground truth (rows) against the GPT-3.5 Turbo
# predictions made from the title/abstract (columns)
gpt35turbo_title_abstract_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt35turbo_temp100_title_abstract_response_raw"]
)

# Pull the four confusion-matrix cells, treating "Positive" as the positive class
gpt35turbo_title_abstract_tp = gpt35turbo_title_abstract_crosstab.loc["Positive", "Positive"]
gpt35turbo_title_abstract_fp = gpt35turbo_title_abstract_crosstab.loc["Negative", "Positive"]
gpt35turbo_title_abstract_fn = gpt35turbo_title_abstract_crosstab.loc["Positive", "Negative"]
gpt35turbo_title_abstract_tn = gpt35turbo_title_abstract_crosstab.loc["Negative", "Negative"]

# Each metric is a proportion over a different group of trials; keep those group
# sizes so the confidence intervals below use the matching sample size
gpt35turbo_title_abstract_n_evaluated = (
    gpt35turbo_title_abstract_tp
    + gpt35turbo_title_abstract_tn
    + gpt35turbo_title_abstract_fp
    + gpt35turbo_title_abstract_fn
)
gpt35turbo_title_abstract_n_predicted_positive = gpt35turbo_title_abstract_tp + gpt35turbo_title_abstract_fp
gpt35turbo_title_abstract_n_actual_positive = gpt35turbo_title_abstract_tp + gpt35turbo_title_abstract_fn

# Calculate the accuracy, precision, recall, and F1 score
gpt35turbo_title_abstract_accuracy = (gpt35turbo_title_abstract_tp + gpt35turbo_title_abstract_tn) / gpt35turbo_title_abstract_n_evaluated
gpt35turbo_title_abstract_precision = gpt35turbo_title_abstract_tp / gpt35turbo_title_abstract_n_predicted_positive
gpt35turbo_title_abstract_recall = gpt35turbo_title_abstract_tp / gpt35turbo_title_abstract_n_actual_positive
gpt35turbo_title_abstract_f1 = 2 * (gpt35turbo_title_abstract_precision * gpt35turbo_title_abstract_recall) / (gpt35turbo_title_abstract_precision + gpt35turbo_title_abstract_recall)

# Half-widths of the 95% normal-approximation (Wald) confidence intervals.
# The sample size of each interval is the denominator of its own proportion,
# not the total number of rows in df.
gpt35turbo_title_abstract_accuracy_ci = 1.96 * np.sqrt((gpt35turbo_title_abstract_accuracy * (1 - gpt35turbo_title_abstract_accuracy)) / gpt35turbo_title_abstract_n_evaluated)
gpt35turbo_title_abstract_precision_ci = 1.96 * np.sqrt((gpt35turbo_title_abstract_precision * (1 - gpt35turbo_title_abstract_precision)) / gpt35turbo_title_abstract_n_predicted_positive)
gpt35turbo_title_abstract_recall_ci = 1.96 * np.sqrt((gpt35turbo_title_abstract_recall * (1 - gpt35turbo_title_abstract_recall)) / gpt35turbo_title_abstract_n_actual_positive)
# NOTE(review): F1 is not a single binomial proportion, so this interval is only a
# rough approximation; it still uses the total row count of df. Consider a bootstrap interval.
gpt35turbo_title_abstract_f1_ci = 1.96 * np.sqrt((gpt35turbo_title_abstract_f1 * (1 - gpt35turbo_title_abstract_f1)) / df.shape[0])

# Print each metric as "value (lower bound - upper bound)"
for metric_name, metric_value, metric_ci in [
    ("Accuracy", gpt35turbo_title_abstract_accuracy, gpt35turbo_title_abstract_accuracy_ci),
    ("Precision", gpt35turbo_title_abstract_precision, gpt35turbo_title_abstract_precision_ci),
    ("Recall", gpt35turbo_title_abstract_recall, gpt35turbo_title_abstract_recall_ci),
    ("F1", gpt35turbo_title_abstract_f1, gpt35turbo_title_abstract_f1_ci),
]:
    print(f"GPT-3.5 Turbo Title and Abstract {metric_name}: {metric_value:.2f} ({metric_value - metric_ci:.2f} - {metric_value + metric_ci:.2f})")


In [None]:
# Draw the GPT-3.5 Turbo title/abstract confusion matrix as an annotated heatmap
fig = plt.figure(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# The position-to-count mapping assumes rows/columns are ordered Negative, Positive.
labels_gpt35turbo_title_abstract = gpt35turbo_title_abstract_crosstab.values.astype(str)
for (row, col), cell_count in {
    (0, 0): gpt35turbo_title_abstract_tn,
    (0, 1): gpt35turbo_title_abstract_fp,
    (1, 0): gpt35turbo_title_abstract_fn,
    (1, 1): gpt35turbo_title_abstract_tp,
}.items():
    cell_share = 100 * cell_count / df.shape[0]
    labels_gpt35turbo_title_abstract[row, col] = (
        f"{labels_gpt35turbo_title_abstract[row, col]} ({cell_share:.1f}%)"
    )

# Render the heatmap with a larger font; the axes are created by seaborn after the font change
plt.rcParams.update({"font.size": 14})
heatmap_ax = sns.heatmap(
    gpt35turbo_title_abstract_crosstab,
    annot=labels_gpt35turbo_title_abstract,
    fmt="",
    cmap="Blues",
)
heatmap_ax.set_xlabel("Predicted", fontsize=14)
heatmap_ax.set_ylabel("Ground truth", fontsize=14)
# Drop the colorbar only after drawing, so the heatmap keeps the width it was given
heatmap_ax.collections[0].colorbar.remove()
plt.savefig("plots/gpt35turbo_title_abstract_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()


### GPT-4o

In [None]:
# Cross-tabulate the annotated ground truth (rows) against the GPT-4o
# predictions made from the title/abstract (columns)
gpt4o_title_abstract_crosstab = pd.crosstab(
    df["Annotation_accept"], df["gpt4o_temp100_title_abstract_response_raw"]
)

# Pull the four confusion-matrix cells, treating "Positive" as the positive class
gpt4o_title_abstract_tp = gpt4o_title_abstract_crosstab.loc["Positive", "Positive"]
gpt4o_title_abstract_fp = gpt4o_title_abstract_crosstab.loc["Negative", "Positive"]
gpt4o_title_abstract_fn = gpt4o_title_abstract_crosstab.loc["Positive", "Negative"]
gpt4o_title_abstract_tn = gpt4o_title_abstract_crosstab.loc["Negative", "Negative"]

# Each metric is a proportion over a different group of trials; keep those group
# sizes so the confidence intervals below use the matching sample size
gpt4o_title_abstract_n_evaluated = (
    gpt4o_title_abstract_tp
    + gpt4o_title_abstract_tn
    + gpt4o_title_abstract_fp
    + gpt4o_title_abstract_fn
)
gpt4o_title_abstract_n_predicted_positive = gpt4o_title_abstract_tp + gpt4o_title_abstract_fp
gpt4o_title_abstract_n_actual_positive = gpt4o_title_abstract_tp + gpt4o_title_abstract_fn

# Calculate the accuracy, precision, recall, and F1 score
gpt4o_title_abstract_accuracy = (gpt4o_title_abstract_tp + gpt4o_title_abstract_tn) / gpt4o_title_abstract_n_evaluated
gpt4o_title_abstract_precision = gpt4o_title_abstract_tp / gpt4o_title_abstract_n_predicted_positive
gpt4o_title_abstract_recall = gpt4o_title_abstract_tp / gpt4o_title_abstract_n_actual_positive
gpt4o_title_abstract_f1 = 2 * (gpt4o_title_abstract_precision * gpt4o_title_abstract_recall) / (gpt4o_title_abstract_precision + gpt4o_title_abstract_recall)

# Half-widths of the 95% normal-approximation (Wald) confidence intervals.
# The sample size of each interval is the denominator of its own proportion,
# not the total number of rows in df.
gpt4o_title_abstract_accuracy_ci = 1.96 * np.sqrt((gpt4o_title_abstract_accuracy * (1 - gpt4o_title_abstract_accuracy)) / gpt4o_title_abstract_n_evaluated)
gpt4o_title_abstract_precision_ci = 1.96 * np.sqrt((gpt4o_title_abstract_precision * (1 - gpt4o_title_abstract_precision)) / gpt4o_title_abstract_n_predicted_positive)
gpt4o_title_abstract_recall_ci = 1.96 * np.sqrt((gpt4o_title_abstract_recall * (1 - gpt4o_title_abstract_recall)) / gpt4o_title_abstract_n_actual_positive)
# NOTE(review): F1 is not a single binomial proportion, so this interval is only a
# rough approximation; it still uses the total row count of df. Consider a bootstrap interval.
gpt4o_title_abstract_f1_ci = 1.96 * np.sqrt((gpt4o_title_abstract_f1 * (1 - gpt4o_title_abstract_f1)) / df.shape[0])

# Print each metric as "value (lower bound - upper bound)"
for metric_name, metric_value, metric_ci in [
    ("Accuracy", gpt4o_title_abstract_accuracy, gpt4o_title_abstract_accuracy_ci),
    ("Precision", gpt4o_title_abstract_precision, gpt4o_title_abstract_precision_ci),
    ("Recall", gpt4o_title_abstract_recall, gpt4o_title_abstract_recall_ci),
    ("F1", gpt4o_title_abstract_f1, gpt4o_title_abstract_f1_ci),
]:
    print(f"GPT-4o Title and Abstract {metric_name}: {metric_value:.2f} ({metric_value - metric_ci:.2f} - {metric_value + metric_ci:.2f})")

In [None]:
# Draw the GPT-4o title/abstract confusion matrix as an annotated heatmap
fig = plt.figure(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# The position-to-count mapping assumes rows/columns are ordered Negative, Positive.
labels_gpt4o_title_abstract = gpt4o_title_abstract_crosstab.values.astype(str)
for (row, col), cell_count in {
    (0, 0): gpt4o_title_abstract_tn,
    (0, 1): gpt4o_title_abstract_fp,
    (1, 0): gpt4o_title_abstract_fn,
    (1, 1): gpt4o_title_abstract_tp,
}.items():
    cell_share = 100 * cell_count / df.shape[0]
    labels_gpt4o_title_abstract[row, col] = (
        f"{labels_gpt4o_title_abstract[row, col]} ({cell_share:.1f}%)"
    )

# Render the heatmap with a larger font; the axes are created by seaborn after the font change
plt.rcParams.update({"font.size": 14})
heatmap_ax = sns.heatmap(
    gpt4o_title_abstract_crosstab,
    annot=labels_gpt4o_title_abstract,
    fmt="",
    cmap="Blues",
)
heatmap_ax.set_xlabel("Predicted", fontsize=14)
heatmap_ax.set_ylabel("Ground truth", fontsize=14)
# Drop the colorbar only after drawing, so the heatmap keeps the width it was given
heatmap_ax.collections[0].colorbar.remove()
plt.savefig("plots/gpt4o_title_abstract_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

### o1

In [None]:
# Cross-tabulate the annotated ground truth (rows) against the o1
# predictions made from the title/abstract (columns)
o1_title_abstract_crosstab = pd.crosstab(
    df["Annotation_accept"], df["o1_temp100_title_abstract_response_raw"]
)

# Pull the four confusion-matrix cells, treating "Positive" as the positive class
o1_title_abstract_tp = o1_title_abstract_crosstab.loc["Positive", "Positive"]
o1_title_abstract_fp = o1_title_abstract_crosstab.loc["Negative", "Positive"]
o1_title_abstract_fn = o1_title_abstract_crosstab.loc["Positive", "Negative"]
o1_title_abstract_tn = o1_title_abstract_crosstab.loc["Negative", "Negative"]

# Each metric is a proportion over a different group of trials; keep those group
# sizes so the confidence intervals below use the matching sample size
o1_title_abstract_n_evaluated = (
    o1_title_abstract_tp
    + o1_title_abstract_tn
    + o1_title_abstract_fp
    + o1_title_abstract_fn
)
o1_title_abstract_n_predicted_positive = o1_title_abstract_tp + o1_title_abstract_fp
o1_title_abstract_n_actual_positive = o1_title_abstract_tp + o1_title_abstract_fn

# Calculate the accuracy, precision, recall, and F1 score
o1_title_abstract_accuracy = (o1_title_abstract_tp + o1_title_abstract_tn) / o1_title_abstract_n_evaluated
o1_title_abstract_precision = o1_title_abstract_tp / o1_title_abstract_n_predicted_positive
o1_title_abstract_recall = o1_title_abstract_tp / o1_title_abstract_n_actual_positive
o1_title_abstract_f1 = 2 * (o1_title_abstract_precision * o1_title_abstract_recall) / (o1_title_abstract_precision + o1_title_abstract_recall)

# Half-widths of the 95% normal-approximation (Wald) confidence intervals.
# The sample size of each interval is the denominator of its own proportion,
# not the total number of rows in df.
o1_title_abstract_accuracy_ci = 1.96 * np.sqrt((o1_title_abstract_accuracy * (1 - o1_title_abstract_accuracy)) / o1_title_abstract_n_evaluated)
o1_title_abstract_precision_ci = 1.96 * np.sqrt((o1_title_abstract_precision * (1 - o1_title_abstract_precision)) / o1_title_abstract_n_predicted_positive)
o1_title_abstract_recall_ci = 1.96 * np.sqrt((o1_title_abstract_recall * (1 - o1_title_abstract_recall)) / o1_title_abstract_n_actual_positive)
# NOTE(review): F1 is not a single binomial proportion, so this interval is only a
# rough approximation; it still uses the total row count of df. Consider a bootstrap interval.
o1_title_abstract_f1_ci = 1.96 * np.sqrt((o1_title_abstract_f1 * (1 - o1_title_abstract_f1)) / df.shape[0])

# Print each metric as "value (lower bound - upper bound)"
for metric_name, metric_value, metric_ci in [
    ("Accuracy", o1_title_abstract_accuracy, o1_title_abstract_accuracy_ci),
    ("Precision", o1_title_abstract_precision, o1_title_abstract_precision_ci),
    ("Recall", o1_title_abstract_recall, o1_title_abstract_recall_ci),
    ("F1", o1_title_abstract_f1, o1_title_abstract_f1_ci),
]:
    print(f"o1 Title and Abstract {metric_name}: {metric_value:.2f} ({metric_value - metric_ci:.2f} - {metric_value + metric_ci:.2f})")

In [None]:
# Draw the o1 title/abstract confusion matrix as an annotated heatmap
fig = plt.figure(figsize=(5, 5))

# Annotate every cell with its count and its share of all rows in df.
# The position-to-count mapping assumes rows/columns are ordered Negative, Positive.
labels_o1_title_abstract = o1_title_abstract_crosstab.values.astype(str)
for (row, col), cell_count in {
    (0, 0): o1_title_abstract_tn,
    (0, 1): o1_title_abstract_fp,
    (1, 0): o1_title_abstract_fn,
    (1, 1): o1_title_abstract_tp,
}.items():
    cell_share = 100 * cell_count / df.shape[0]
    labels_o1_title_abstract[row, col] = (
        f"{labels_o1_title_abstract[row, col]} ({cell_share:.1f}%)"
    )

# Render the heatmap with a larger font; the axes are created by seaborn after the font change
plt.rcParams.update({"font.size": 14})
heatmap_ax = sns.heatmap(
    o1_title_abstract_crosstab,
    annot=labels_o1_title_abstract,
    fmt="",
    cmap="Blues",
)
heatmap_ax.set_xlabel("Predicted", fontsize=14)
heatmap_ax.set_ylabel("Ground truth", fontsize=14)
# Drop the colorbar only after drawing, so the heatmap keeps the width it was given
heatmap_ax.collections[0].colorbar.remove()
plt.savefig("plots/o1_title_abstract_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

## Investigation of incorrect predictions

### o1

In [36]:
# Trials that o1 labelled "Positive" from the conclusion alone although the annotation is "Negative"
o1_conclusion_fp_mask = df["o1_temp100_conclusion_response_raw"].eq("Positive") & df["Annotation_accept"].eq("Negative")
o1_conclusion_false_positives = df.loc[o1_conclusion_fp_mask]

# Of those, keep the trials where the title/abstract prediction was "Negative", i.e. correct
o1_title_abstract_negative_mask = o1_conclusion_false_positives["o1_temp100_title_abstract_response_raw"].eq("Negative")
o1_conclusion_fp_title_abstract_correct = o1_conclusion_false_positives.loc[o1_title_abstract_negative_mask]

# Report how many such trials there are, followed by their DOIs
print(
    "Number of trials with false positive prediction by o1 based on conclusion "
    f"but correct based on title/abstract: {len(o1_conclusion_fp_title_abstract_correct)}"
)
print(o1_conclusion_fp_title_abstract_correct["doi"].tolist())



Number of trials with false positive prediction by o1 based on conclusion but correct based on title/abstract: 10
['10.1200/JCO.20.02529', '10.1200/JCO.2015.62.1474', '10.1200/JCO.2005.03.0551', '10.1200/JCO.2012.44.7920', '10.1200/JCO.22.01805', '10.1016/S0140-6736(10)62312-4', '10.1200/JCO.2005.05.112', '10.1016/S1470-2045(13)70539-4', '10.1200/JCO.2006.06.0483', '10.1200/JCO.2015.62.4734']


## Check whether true negatives show the same patterns

### o1

In [7]:
# Draw a reproducible random sample of 10 trials that o1 labelled "Negative" from the
# conclusion and whose annotation is also "Negative" (true negatives)
o1_conclusion_tn_mask = df["o1_temp100_conclusion_response_raw"].eq("Negative") & df["Annotation_accept"].eq("Negative")
o1_conclusion_true_negatives = df.loc[o1_conclusion_tn_mask].sample(n=10, random_state=1)

# List the DOIs of the sampled trials
print(o1_conclusion_true_negatives["doi"].tolist())

['10.1016/S1470-2045(18)30193-1', '10.1200/JCO.2012.43.8820', '10.1016/S1470-2045(13)70510-2', '10.1200/JCO.2005.05.098', '10.1200/JCO.2005.04.5252', '10.1001/jamaoncol.2020.4574', '10.1200/JCO.20.02824', '10.1001/jamaoncol.2017.1269', '10.1001/jama.2014.2626', '10.1200/JCO.2015.61.4578']
