In [40]:
import csv
import io
import os
import shelve
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def csv_to_markdown(paper_id):
    """Download one paper's Q&A CSV and write it as ``./papers/<paper_id>.md``.

    The CSV is fetched from the ``taesiri/arxiv_qa`` dataset on Hugging Face and
    the paper title from the arXiv export API. The markdown file starts with a
    ``# <title>`` heading followed by one ``## <question>`` section per CSV row.

    Args:
        paper_id: Paper identifier; used in both URLs and as the output file name.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        ValueError: If the arXiv response contains no ``<title>...</title>`` pair.
        KeyError: If the CSV has no ``question`` or ``answer`` column.
    """
    # Without a timeout a stalled server would block the notebook forever.
    request_timeout = 30  # seconds

    # Fetch CSV from Hugging Face website
    CSV_URL = f"https://huggingface.co/datasets/taesiri/arxiv_qa/raw/main/papers/{paper_id}.csv"
    csv_response = requests.get(CSV_URL, timeout=request_timeout)
    csv_response.raise_for_status()
    csv_content = csv_response.text

    # Fetch the paper title using the arXiv API
    ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
    title_response = requests.get(
        ARXIV_API_ENDPOINT.format(paper_id), timeout=request_timeout
    )
    title_response.raise_for_status()
    xml_content = title_response.content.decode("utf-8")

    # str.find returns -1 on a miss; check for it explicitly instead of letting
    # "-1 + 7" silently slice an unrelated part of the response.
    open_tag, close_tag = "<title>", "</title>"
    tag_pos = xml_content.find(open_tag)
    title_start = tag_pos + len(open_tag)
    title_end = xml_content.find(close_tag, title_start) if tag_pos != -1 else -1
    if title_end == -1:
        raise ValueError(
            f"No <title> element in the arXiv API response for {paper_id!r}"
        )
    paper_title = xml_content[title_start:title_end].strip()
    paper_title = paper_title.replace("\n", " ").replace("\r", "")

    # Convert CSV to markdown
    markdown_lines = ["# " + paper_title]

    # Feed the csv module the text with its original line endings intact
    # (newline=""): splitting the text into lines first would drop the newlines
    # that live inside quoted, multi-line answers.
    csv_reader = csv.DictReader(io.StringIO(csv_content, newline=""))
    for row in csv_reader:
        markdown_lines.append("\n## " + row["question"])
        markdown_lines.append("\n" + row["answer"].strip() + "\n")

    markdown_content = "\n".join(markdown_lines)

    # Save to .md file named using the paper_id; create the folder on first use
    # and pin the encoding so the output does not depend on the platform locale.
    os.makedirs("./papers", exist_ok=True)
    with open(f"./papers/{paper_id}.md", "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

In [41]:
def get_paper_ids_from_repo(repo_url):
    """Scrape paper IDs from an HTML page that links to ``<paper_id>.csv`` files.

    Args:
        repo_url: URL of the page listing the repository's papers folder.

    Returns:
        A list with one entry per ``<a>`` element whose ``href`` ends in
        ``.csv``: the last path component of the link without that extension.
        Entries appear in page order and are not de-duplicated.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    # Get the content of the repository's papers folder; the timeout keeps a
    # stalled server from blocking the notebook forever.
    response = requests.get(repo_url, timeout=30)
    response.raise_for_status()

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    suffix = ".csv"
    paper_ids = []
    for link in soup.find_all("a"):
        href = link.get("href", "")
        if not href.endswith(suffix):
            continue
        filename = href.split("/")[-1]
        # Cut only the trailing extension; str.replace would also remove any
        # ".csv" that happens to occur earlier in the name.
        paper_ids.append(filename[: -len(suffix)])

    return paper_ids


# Get the list of paper IDs from the repository
# NOTE(review): this call passes a web URL, so it needs the URL-based
# get_paper_ids_from_repo defined earlier in this cell. A later cell rebinds
# that function name to a variant taking a local path, and also reassigns
# REPO_URL, so re-running this line after those cells will not work as-is.
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa/tree/main/papers"
paper_ids = get_paper_ids_from_repo(REPO_URL)

In [42]:
import csv
import os
import shelve
from bs4 import BeautifulSoup
from datetime import datetime
from tqdm import tqdm
import git
import requests
import shelve


def csv_to_markdown(paper_id, repo_path, cache):
    """Convert one paper's Q&A CSV from a local checkout into ``./papers/<paper_id>.md``.

    The markdown file starts with a ``# <title>`` heading followed by one
    ``## <question>`` section per CSV row. The title is taken from ``cache``
    when present; otherwise it is fetched from the arXiv export API and stored
    in ``cache`` under ``paper_id``.

    Args:
        paper_id: Paper identifier; names both the input CSV and the output file.
        repo_path: Directory containing a ``papers`` folder with the CSV files.
        cache: Mapping of paper ID to title (e.g. an open ``shelve`` shelf);
            updated in place on a cache miss.

    Raises:
        FileNotFoundError: If ``<repo_path>/papers/<paper_id>.csv`` does not exist.
        requests.HTTPError: If the arXiv request returns an error status.
        ValueError: If the arXiv response contains no ``<title>...</title>`` pair
            (nothing is cached in that case).
        KeyError: If the CSV has no ``question`` or ``answer`` column.
    """
    # Read CSV from the cloned repository. newline="" hands the csv module the
    # raw line endings so newlines inside quoted, multi-line answers survive;
    # splitting the text into lines beforehand would drop them.
    csv_path = os.path.join(repo_path, "papers", f"{paper_id}.csv")
    with open(csv_path, "r", newline="", encoding="utf-8") as file:
        qa_rows = list(csv.DictReader(file))

    # Check if the paper title is in the cache
    if paper_id in cache:
        paper_title = cache[paper_id]
    else:
        # Fetch the paper title using the arXiv API; the timeout keeps a
        # stalled server from blocking the whole batch.
        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        title_response = requests.get(ARXIV_API_ENDPOINT.format(paper_id), timeout=30)
        title_response.raise_for_status()
        xml_content = title_response.content.decode("utf-8")

        # str.find returns -1 on a miss; check for it explicitly so that a
        # response without a title can never be sliced into a bogus title and
        # then persisted in the cache.
        open_tag, close_tag = "<title>", "</title>"
        tag_pos = xml_content.find(open_tag)
        title_start = tag_pos + len(open_tag)
        title_end = xml_content.find(close_tag, title_start) if tag_pos != -1 else -1
        if title_end == -1:
            raise ValueError(
                f"No <title> element in the arXiv API response for {paper_id!r}"
            )
        paper_title = xml_content[title_start:title_end].strip()
        paper_title = paper_title.replace("\n", " ").replace("\r", "")

        # Cache the paper title
        cache[paper_id] = paper_title

    # Convert CSV to markdown
    markdown_lines = ["# " + paper_title]
    for row in qa_rows:
        markdown_lines.append("\n## " + row["question"])
        markdown_lines.append("\n" + row["answer"].strip() + "\n")

    markdown_content = "\n".join(markdown_lines)

    # Save to .md file named using the paper_id; create the folder on first use
    # and pin the encoding so the output does not depend on the platform locale.
    os.makedirs("./papers", exist_ok=True)
    with open(f"./papers/{paper_id}.md", "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)


def get_paper_ids_from_repo(repo_path):
    """List the paper IDs available in a local checkout.

    Args:
        repo_path: Directory containing a ``papers`` folder of ``<paper_id>.csv`` files.

    Returns:
        Sorted list of the ``.csv`` file names in ``<repo_path>/papers`` with
        the extension removed. Entries that do not end in ``.csv`` are ignored.

    Raises:
        FileNotFoundError: If ``<repo_path>/papers`` does not exist.
    """
    suffix = ".csv"
    papers_dir = os.path.join(repo_path, "papers")

    # os.listdir returns entries in arbitrary order, so sort for a reproducible
    # result. Slice off only the trailing extension: str.replace would also
    # remove any ".csv" that happens to occur earlier in a file name.
    return sorted(
        filename[: -len(suffix)]
        for filename in os.listdir(papers_dir)
        if filename.endswith(suffix)
    )


# # Clone the repository using GitPython
# REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa.git"
# REPO_PATH = "./arxiv_qa_repo"
# if not os.path.exists(REPO_PATH):
#     git.Repo.clone_from(REPO_URL, REPO_PATH)

# # Get the list of paper IDs from the cloned repository
# paper_ids = get_paper_ids_from_repo(REPO_PATH)

# # Open a shelve cache
# with shelve.open("arxiv_cache") as cache:
#     # Convert each paper's CSV to markdown
#     for paper_id in tqdm(paper_ids):
#         csv_to_markdown(paper_id, REPO_PATH, cache)

In [43]:
# Make sure a local checkout of the dataset exists and is current: an existing
# checkout is updated with a pull from its "origin" remote, otherwise the
# repository is cloned from scratch.
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa.git"
REPO_PATH = "./arxiv_qa_repo"
if os.path.exists(REPO_PATH):
    repo = git.Repo(REPO_PATH)
    origin = repo.remotes.origin
    origin.pull()
else:
    git.Repo.clone_from(REPO_URL, REPO_PATH)

# Each CSV in the checkout's papers folder corresponds to one paper ID
paper_ids = get_paper_ids_from_repo(REPO_PATH)

# Render one markdown file per paper, sharing a single on-disk title cache
# ("arxiv_cache") across the whole run.
with shelve.open("arxiv_cache") as cache:
    for paper_id in tqdm(paper_ids):
        csv_to_markdown(paper_id, REPO_PATH, cache)

100%|██████████| 648/648 [00:00<00:00, 2249.74it/s]


In [44]:
# import os
# from git import Repo


# def get_paper_ids_from_repo(repo_url, local_dir="./temp_repo"):
#     # Clone the repository
#     Repo.clone_from(repo_url, local_dir)
#     repo = Repo(local_dir)

#     # Get list of CSV filenames in the 'papers' directory
#     csv_files = [
#         item.path
#         for item in repo.tree().traverse()
#         if item.path.endswith(".csv") and item.path.startswith("papers/")
#     ]

#     # Extract paper IDs from filenames
#     paper_ids = [
#         os.path.basename(filename).replace(".csv", "") for filename in csv_files
#     ]

#     # Cleanup: Remove the cloned repository from local storage
#     for root, dirs, files in os.walk(local_dir, topdown=False):
#         for name in files:
#             os.remove(os.path.join(root, name))
#         for name in dirs:
#             os.rmdir(os.path.join(root, name))
#     os.rmdir(local_dir)

#     return paper_ids


# # Get the list of paper IDs from the repository
# REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa"
# paper_ids = get_paper_ids_from_repo(REPO_URL)

In [45]:
# print(len(paper_ids))

In [46]:
# # Convert each CSV to Markdown
# for paper_id in tqdm(paper_ids):
#     csv_to_markdown(paper_id)

In [47]:
# def get_paper_title(paper_id):
#     """Retrieve the paper title from the arXiv API for a given paper_id."""
#     ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
#     response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
#     response.raise_for_status()
#     xml_content = response.content.decode("utf-8")
#     title_start = xml_content.find("<title>") + 7
#     title_end = xml_content.find("</title>", title_start)
#     return xml_content[title_start:title_end].strip()


# def create_parent_md(paper_ids, output_file="./papers.md"):
#     """Create a parent Markdown file listing all papers."""
#     lines = ["# List of Papers\n"]
#     for paper_id in paper_ids:
#         title = get_paper_title(paper_id)
#         arxiv_link = f"https://arxiv.org/abs/{paper_id}"
#         md_link = f"./papers/{paper_id}.md"
#         lines.append(f"- {title} - [[Arxiv]({arxiv_link})] [[QA]({md_link})].\n")

#     with open(output_file, "w") as md_file:
#         md_file.writelines(lines)


# # Assuming paper_ids is the list of paper IDs you've fetched before
# # create_parent_md(paper_ids)

In [48]:
# from glob import glob
# paper_ids = [x.split("/")[-1].split(".md")[0] for x in glob("./papers/*.md")]

In [49]:
# Drop duplicate paper IDs. A set has no defined iteration order, so the order
# of the resulting list is arbitrary.
paper_ids = list(set(paper_ids))

In [50]:
def get_paper_details(paper_id, cache_file="paper_details_cache"):
    """Retrieve the paper title and publication date from the arXiv API for a given paper_id.

    Results are memoised in a ``shelve`` database so each paper is requested
    from the API at most once per cache file.

    Args:
        paper_id: Paper identifier passed to the API's ``id_list`` parameter.
        cache_file: Path of the ``shelve`` database used as the cache.

    Returns:
        ``(title, pub_date)``: the stripped text of the first ``<title>``
        element and the first ``<published>`` timestamp parsed into a naive
        ``datetime``. On a cache hit the stored value is returned unchanged.

    Raises:
        requests.HTTPError: If the API request returns an error status.
        ValueError: If the response lacks a ``<title>`` or ``<published>``
            element, or the timestamp is not in ``%Y-%m-%dT%H:%M:%SZ`` form.
            Nothing is cached in that case.
    """
    with shelve.open(cache_file) as cache:
        # Check if the result is already in the cache
        if paper_id in cache:
            return cache[paper_id]

        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        # The timeout keeps a stalled server from blocking the whole batch.
        response = requests.get(ARXIV_API_ENDPOINT.format(paper_id), timeout=30)
        response.raise_for_status()
        xml_content = response.content.decode("utf-8")

        def _element_text(tag):
            # Stripped text between the first "<tag>" and the next "</tag>".
            # str.find returns -1 on a miss; fail loudly instead of slicing an
            # unrelated part of the response and caching it permanently.
            open_tag, close_tag = f"<{tag}>", f"</{tag}>"
            tag_pos = xml_content.find(open_tag)
            text_start = tag_pos + len(open_tag)
            text_end = xml_content.find(close_tag, text_start) if tag_pos != -1 else -1
            if text_end == -1:
                raise ValueError(
                    f"No <{tag}> element in the arXiv API response for {paper_id!r}"
                )
            return xml_content[text_start:text_end].strip()

        # Extract title
        title = _element_text("title")

        # Extract publication date
        date_str = _element_text("published")
        pub_date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")

        # Store the result in the cache
        cache[paper_id] = (title, pub_date)

    return title, pub_date


def create_parent_md(paper_ids, output_file="./README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date.

    Papers are listed newest first under ``## <year>`` and
    ``### <month name> <year>`` headings. Each entry links to the paper's
    arXiv abstract page and to its ``./papers/<paper_id>.md`` file.

    Args:
        paper_ids: Iterable of paper identifiers to list.
        output_file: Path of the Markdown file to write (overwritten if present).
    """
    paper_details = []

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date, newest first. Papers sharing a timestamp
    # are ordered by ID so the output does not depend on the input order.
    paper_details.sort(key=lambda item: (item[2], item[0]), reverse=True)

    # Generate Markdown content
    lines = ["# List of Papers\n"]
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header if the year changes
        if year != prev_year:
            lines.append(f"\n## {year}\n")
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"./papers/{paper_id}.md"

        # Titles may contain line breaks; a list item has to stay on one line.
        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[ArXiv]({arxiv_link})] [[QA]({md_link})].\n")

    # Pin the encoding so titles with non-ASCII characters are written the same
    # way regardless of the platform locale.
    with open(output_file, "w", encoding="utf-8") as md_file:
        md_file.writelines(lines)

In [51]:
# Write the paper index to the function's default output file (./README.md).
create_parent_md(paper_ids)

Fetching paper details: 100%|██████████████████████████████████| 648/648 [00:00<00:00, 20639.63it/s]


In [52]:
# Sanity check: number of paper IDs in the list.
len(paper_ids)

648