In [None]:
"""This script downloads and processes the entire dataset"""

%load_ext autoreload
%autoreload 2

from prompt_systematic_review.download_arxiv_query import query_archive
from prompt_systematic_review.run_semantic_scholar import query_semantic_scholar

# from prompt_systematic_review.arxiv_source
# from prompt_systematic_review.semantic_scholar_source import
import pandas as pd

from prompt_systematic_review.utils import process_paper_title

import openai
import tqdm
import os
from dotenv import load_dotenv

# Read the entries of the repository-level .env file into the process environment
load_dotenv(dotenv_path="../.env")

# Hand the key from the environment to the openai client (None when the variable is unset)
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Download the arXiv results by calling query_archive with verbose output enabled.
# The result is kept in arxiv_df; later cells index it by column ("title", "source"),
# so it is presumably a pandas DataFrame -- confirm against query_archive.
arxiv_df = query_archive(verbose=True)

In [None]:
# Run every arXiv title through process_paper_title and record where the rows came from
arxiv_df["title"] = arxiv_df["title"].apply(process_paper_title)
arxiv_df["source"] = "arXiv"
len(arxiv_df)

In [None]:
# Download the Semantic Scholar results by calling query_semantic_scholar with verbose
# output enabled. The result is kept in semantic_scholar_df; later cells index it by
# column ("title", "source"), so it is presumably a pandas DataFrame -- confirm.
semantic_scholar_df = query_semantic_scholar(verbose=True)

In [None]:
# Run every Semantic Scholar title through process_paper_title and record the source
semantic_scholar_df["title"] = semantic_scholar_df["title"].apply(process_paper_title)
semantic_scholar_df["source"] = "Semantic Scholar"
len(semantic_scholar_df)

In [None]:
# Stack both result sets into one frame: Semantic Scholar rows first, then arXiv rows.
# The original row labels of each frame are kept (no ignore_index).
source_frames = [semantic_scholar_df, arxiv_df]
combined_df = pd.concat(source_frames)
len(combined_df)

In [None]:
# Collapse rows that share a title, keeping the first occurrence of each
deduplicated_df = combined_df.drop_duplicates(subset=["title"], keep="first")
len(deduplicated_df)

In [None]:
# Load the blacklist and pass its titles through process_paper_title, the same
# helper applied to the downloaded result sets, so the two can be compared by title.
blacklist = pd.read_csv("../data/blacklist.csv")
blacklist["title"] = blacklist["title"].apply(process_paper_title)
blacklist

In [None]:
# Drop every paper whose title appears in the blacklist
is_blacklisted = deduplicated_df["title"].isin(blacklist["title"])
deduplicated_df = deduplicated_df[~is_blacklisted]
len(deduplicated_df)

In [None]:
# this code hangs at about 2801 papers
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor


def downloadPaper(url: str, title: str, timeout: float = 30.0):
    """Download ``url`` and save the response body as ``papers/<title>.pdf``.

    The request is attempted once and then retried up to five more times,
    sleeping ``2 * attempt`` seconds before each retry (so the first retry is
    immediate), until a 200 response with a non-empty body is received. If
    every attempt fails, nothing is written.

    Args:
        url: Address of the file to download.
        title: Paper title; it is passed through ``process_paper_title`` to
            build the file name.
        timeout: Seconds ``requests`` waits on the server before an attempt is
            abandoned. Without a timeout a single stalled connection blocks
            its worker thread forever, which makes the whole download hang.

    Returns:
        None.
    """
    max_retries = 5

    def fetch():
        """Return the body of one download attempt, or None if it failed."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Timeouts, connection errors and invalid URLs count as a failed
            # attempt instead of silently killing the task inside the executor.
            return None
        if response.status_code == 200 and len(response.content) != 0:
            return response.content
        return None

    content = fetch()
    recurse = 0
    while content is None and recurse < max_retries:
        # if failed to download try again after waiting 2*recurse seconds
        time.sleep(2 * recurse)
        content = fetch()
        recurse += 1

    if content is None:
        return

    # build the file name from the processed title
    title = process_paper_title(title=title)
    name = title + ".pdf"
    # make sure the output directory exists before the first write
    os.makedirs("papers", exist_ok=True)
    with open(os.path.join("papers", name), "wb") as f:
        f.write(content)


# deduplicated_df is expected to carry a "url" and a "title" column; pair them up
# so each download task gets one (url, title) tuple.
download_urls = deduplicated_df["url"].tolist()
download_titles = deduplicated_df["title"].tolist()
data = list(zip(download_urls, download_titles))

NUM_PROCESSES = 12  # number of worker threads; adjust as needed per your machine
with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
    # map() takes one iterable per positional argument of downloadPaper.
    # Leaving the with-block waits for all submitted downloads to finish.
    executor.map(downloadPaper, download_urls, download_titles)

In [None]:
import os
import PyPDF2
import pandas as pd
import tqdm

# Titles of papers whose PDF was readable but never mentions "prompt"
new_blacklist = []
# Titles of every paper whose PDF was deleted (no "prompt" or unreadable).
# The dataframe is filtered once after the loop instead of once per file.
removed_titles = []

# Iterate over the files in the directory
for filename in tqdm.tqdm(os.listdir("papers")):
    if not filename.endswith(".pdf"):
        continue

    file_path = os.path.join("papers", filename)
    # file name without the ".pdf" extension, matched against the "title" column
    title = filename[:-4]

    try:
        with open(file_path, "rb") as file:
            pdf = PyPDF2.PdfReader(file)
            # any() stops at the first page that contains the word
            contains_prompt = any(
                "prompt" in page.extract_text().lower() for page in pdf.pages
            )
    except Exception as e:
        # Delete the file if it can't be read. The existence check keeps a
        # failed open() from raising a second time here and aborting the loop.
        if os.path.exists(file_path):
            os.remove(file_path)
        removed_titles.append(title)
        print(f"Error processing {filename}: {e}")
        continue

    if not contains_prompt:
        # Delete the file
        os.remove(file_path)
        removed_titles.append(title)
        # Add the paper to the new blacklist. append() stores the whole title;
        # `+=` with a string would extend the list one character at a time.
        new_blacklist.append(title)

# Drop the rows of every removed paper from the dataframe
deduplicated_df = deduplicated_df[~deduplicated_df["title"].isin(removed_titles)]

# Concatenate the old and new blacklist dataframes
# blacklist = pd.concat([blacklist, new_blacklist], ignore_index=True)

# Reset the index of the dataframe after dropping rows
# deduplicated_df.reset_index(drop=True, inplace=True)

# # Save the updated blacklist dataframe
# blacklist.to_csv('../data/blacklist.csv', index=False)

In [None]:
# Show how many papers are left in deduplicated_df at this point
len(deduplicated_df)

In [None]:
# TODO: something weird is going on here (not yet investigated)

# Collect the title of every PDF in the directory: the file name minus ".pdf"
paper_titles = []
for pdf_name in os.listdir("papers"):
    if pdf_name.endswith(".pdf"):
        paper_titles.append(pdf_name[: -len(".pdf")])

# Keep only the rows of deduplicated_df whose title has a PDF on disk
has_pdf = deduplicated_df["title"].isin(paper_titles)
deduplicated_df = deduplicated_df[has_pdf]

len(deduplicated_df)

In [None]:
# Load the papers listed for human review and pass their titles through
# process_paper_title so they can be matched against deduplicated_df
df_for_review = pd.read_csv("../data/arxiv_papers_for_human_review.csv")
df_for_review["title"] = df_for_review["title"].apply(process_paper_title)

titles_for_review = df_for_review["title"].tolist()

# True for rows whose title is in the human-review list
in_review_list = deduplicated_df["title"].isin(titles_for_review)

# have been human reviewed as correct
df_safe = deduplicated_df[in_review_list]
# need ai review
df_for_ai_review = deduplicated_df[~in_review_list]

print(len(df_for_ai_review))
print(len(df_safe))

In [None]:
from prompt_systematic_review.automated_review import review_abstract_title_categorical

results = []

# Review each paper's title and abstract, one row at a time. `total` lets the
# progress bar show a percentage, since iterrows() has no length of its own.
for _, row in tqdm.tqdm(df_for_ai_review.iterrows(), total=len(df_for_ai_review)):
    result = review_abstract_title_categorical(
        title=row["title"],
        abstract=row["abstract"],
        model="gpt-4-1106-preview",
    )
    results.append(result)

# Attach the results by position. `results` is in row order, but the frame's
# index labels are neither 0..n-1 nor unique after the earlier concat and
# filtering, so `.loc[i, ...]` with a running counter would write to the wrong
# rows or create new ones. assign() also returns a new frame instead of
# writing into a slice of deduplicated_df.
df_for_ai_review = df_for_ai_review.assign(
    Probability=[result["Probability"] for result in results],
    Reasoning=[result["Reasoning"] for result in results],
)

In [None]:
# Ratings that keep a paper in the dataset, and ratings that exclude it
keepables = ["highly relevant", "somewhat relevant", "neutral"]
others = ["somewhat irrelevant", "highly irrelevant"]

# Split the AI-reviewed papers on the rating stored in "Probability"
ai_rating = df_for_ai_review["Probability"]
df_ai_reviewed_positive = df_for_ai_review[ai_rating.isin(keepables)]
df_ai_reviewed_negative = df_for_ai_review[ai_rating.isin(others)]

In [None]:
# for i in df_ai_reviewed_negative["title"]:
#     print(i)
# Spot check: display the rejected paper at position 21 (0-based) of the negative set.
# NOTE(review): the position is hard-coded; .iloc[21] raises IndexError when fewer
# than 22 papers were rejected.
df_ai_reviewed_negative.iloc[21]

In [None]:
# Final candidate set: human-approved papers followed by the papers the AI review kept.
# ignore_index gives the combined frame a fresh 0..n-1 index.
kept_frames = [df_safe, df_ai_reviewed_positive]
df_combined = pd.concat(kept_frames, ignore_index=True)
len(df_combined)

In [None]:
# Collect the title of every PDF in the directory: the file name minus ".pdf"
paper_titles = []
for pdf_name in os.listdir("papers"):
    if pdf_name.endswith(".pdf"):
        paper_titles.append(pdf_name[: -len(".pdf")])

# Keep only the rows of df_combined whose title has a PDF on disk
has_pdf = df_combined["title"].isin(paper_titles)
df_combined = df_combined[has_pdf]

len(df_combined)

In [None]:
import os

# Titles that made it into the final dataframe
df_titles = df_combined["title"].tolist()
c = 0

# Find the PDFs in "papers" whose title (file name minus ".pdf") is not in df_combined
orphan_pdfs = [
    pdf_name
    for pdf_name in os.listdir("papers")
    if pdf_name.endswith(".pdf") and pdf_name[:-4] not in df_titles
]

# Delete them so the directory only holds papers that are in df_combined
for pdf_name in orphan_pdfs:
    os.remove("papers/" + pdf_name)

In [None]:
# Sanity check: the "papers" directory must hold exactly one entry per row of df_combined.
# Every directory entry is counted, not only the .pdf files.
papers_dir_entries = os.listdir("papers")
assert len(df_combined) == len(papers_dir_entries)

In [None]:
# Display the "Reasoning" column of the combined frame. It is filled in by the AI
# review step, so rows that came from df_safe presumably show NaN here -- confirm.
df_combined["Reasoning"]

In [None]:
# TODO: fix this
from prompt_systematic_review.utils import auto_pipeline

# Write the final paper list to disk, then give that CSV and the PDF directory to auto_pipeline
master_csv_path = "master_papers.csv"
df_combined.to_csv(master_csv_path)

auto_pipeline(master_csv_path, "papers")