In [None]:
from prompt_systematic_review.automated_review import review_abstract_title_categorical
import pandas as pd
import os
from dotenv import load_dotenv
import openai
import tqdm
from prompt_systematic_review.utils import process_paper_title

# Read the variables defined in ../.env into the process environment
load_dotenv(dotenv_path="../.env")

# Pass the OpenAI key from the environment to the client library
# (None when OPENAI_API_KEY is not set)
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Load the arXiv papers table; later cells read its "title" and "abstract" columns.
df = pd.read_csv("arxiv_papers_with_abstract.csv")

In [None]:
# Review results, one per paper, in the same order as the rows of df.
# This stays an explicit append loop (not a comprehension) so that everything
# gathered so far remains in `results` if a call raises or the cell is
# interrupted part-way through.
results = []

# iterrows() is a generator with no length, so the row count is passed
# explicitly; without it tqdm can only show a running counter, not a
# percentage or an ETA.
for _idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    # Review one paper from its title and abstract
    result = review_abstract_title_categorical(
        title=row["title"],
        abstract=row["abstract"],
        model="gpt-4-1106-preview",
    )
    results.append(result)

In [None]:
# Attach the review output to df as two new columns.
# results[k] belongs to the k-th row of df (positional), so the results are
# labelled with df's own index before assignment. Writing df.loc[k, ...] with
# an enumerate counter only hits the right rows when df has a default 0..n-1
# index; on any other index it would write to the wrong rows or append new ones.
# Rows beyond len(results) (e.g. after an interrupted review loop) are left NaN.
# NOTE(review): assumes every entry of `results` is a dict with "Probability"
# and "Reasoning" keys - confirm against review_abstract_title_categorical.
ai_labels = pd.DataFrame(
    results,
    columns=["Probability", "Reasoning"],
    index=df.index[: len(results)],
)
df["Probability"] = ai_labels["Probability"]
df["Reasoning"] = ai_labels["Reasoning"]

In [None]:
# Persist the papers together with their AI labels. No `index` argument is
# given, so pandas' default applies and the index is written as the first column.
df.to_csv("arxiv_papers_with_ai_labels.csv")

In [None]:
# Load the blacklist and run its titles through the same processing function
# as the paper titles, so the two columns can be compared against each other.
blacklist = pd.read_csv("../data/blacklist.csv")
blacklist["Title"] = blacklist["Title"].apply(process_paper_title)
df["title"] = df["title"].apply(process_paper_title)

# Evaluate on a copy of the full table; slice here (e.g. .iloc[400:800])
# to restrict the comparison to a subset of the papers.
df_limited = df.copy()

In [None]:
# Human decision: a paper counts as kept when its processed title does not
# appear in the blacklist.
df_limited["human_review"] = ~df_limited["title"].isin(blacklist["Title"])

# AI decision: these labels count as "keep"; any other value (including a
# missing label) counts as a reject.
keepables = ["highly relevant", "somewhat relevant", "neutral"]

# Vectorised membership test instead of mapping a per-element
# `True if x in keepables else False` lambda.
df_limited["AI_keep"] = df_limited["Probability"].isin(keepables)

In [None]:
# Share of papers on which the AI decision and the human decision coincide.
agrees = df_limited["AI_keep"] == df_limited["human_review"]
num_same_rows = agrees.sum()
num_same_rows / len(df_limited)

In [None]:
# Confusion matrix of the AI decision (rows) against the human decision (columns).
# Reindexing pins both axes to [False, True], so a class that never occurs shows
# up as a zero count instead of making the .loc lookups below raise KeyError.
agreement_grid = pd.crosstab(
    df_limited["AI_keep"], df_limited["human_review"]
).reindex(index=[False, True], columns=[False, True], fill_value=0)
print(agreement_grid)

# The human decision is treated as the reference: first key is AI, second is human.
true_positives = agreement_grid.loc[True, True]  # both kept
false_positives = agreement_grid.loc[True, False]  # AI kept, human rejected
false_negatives = agreement_grid.loc[False, True]  # AI rejected, human kept

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)

# Harmonic mean of precision and recall; last expression, so it is displayed.
f1_score = 2 * (precision * recall) / (precision + recall)
f1_score

In [None]:
# Number of papers the AI decided to keep (sum of the boolean AI_keep flags).
df_limited["AI_keep"].sum()

In [None]:
# Number of papers kept by human review, i.e. not found in the blacklist.
df_limited["human_review"].sum()

In [None]:
# False negatives: papers the AI rejected although human review kept them.
ai_rejected = ~df_limited["AI_keep"]
human_kept = df_limited["human_review"]
papers = df_limited[ai_rejected & human_kept]

# Show up to 100 of the missed titles, one per line
for missed_title in papers["title"].head(100):
    print(missed_title)

In [None]:
# Spot check: title of the paper at row position 53.
df["title"].iloc[53]