In [958]:
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from tqdm import tqdm
import shelve
import os
import git
import requests

In [959]:
def get_paper_ids_from_repo(repo_url):
    # Get the content of the repository's papers folder
    response = requests.get(repo_url)
    response.raise_for_status()

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract all CSV filenames
    csv_files = [
        link.get("href")
        for link in soup.find_all("a")
        if link.get("href", "").endswith(".csv")
    ]

    # Extract paper IDs from filenames
    paper_ids = [filename.split("/")[-1].replace(".csv", "") for filename in csv_files]

    return paper_ids


# Get the list of paper IDs from the repository
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa"
paper_ids = get_paper_ids_from_repo(REPO_URL)

In [960]:
# def csv_to_markdown(paper_id, repo_path, cache):
#     # Read CSV from the cloned repository
#     with open(os.path.join(repo_path, "papers", f"{paper_id}.csv"), "r") as file:
#         csv_content = file.read()

#     # Check if the paper title is in the cache
#     if paper_id in cache:
#         paper_title = cache[paper_id]
#     else:
#         # Fetch the paper title using the arXiv API
#         ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
#         title_response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
#         title_response.raise_for_status()
#         xml_content = title_response.content.decode("utf-8")
#         title_start = xml_content.find("<title>") + 7
#         title_end = xml_content.find("</title>", title_start)
#         paper_title = xml_content[title_start:title_end].strip()
#         paper_title = paper_title.replace("\n", " ").replace("\r", "")

#         # Cache the paper title
#         cache[paper_id] = paper_title

#     # Convert CSV to markdown
#     markdown_lines = []
#     paper_title_link = f"[{paper_title}](https://arxiv.org/abs/{paper_id})"
#     markdown_lines.append("# " + paper_title_link)

#     csv_reader = csv.DictReader(csv_content.splitlines())
#     for row in csv_reader:
#         markdown_lines.append("\n## " + row["question"])
#         markdown_lines.append("\n" + row["answer"] + "\n")

#     markdown_content = "\n".join(markdown_lines)

#     # Save to .md file named using the paper_id
#     with open(f"./papers/{paper_id}.md", "w") as md_file:
#         md_file.write(markdown_content)


# def get_paper_ids_from_repo(repo_path):
#     # Get the list of CSV files in the papers directory of the cloned repository
#     csv_files = [
#         filename
#         for filename in os.listdir(os.path.join(repo_path, "papers"))
#         if filename.endswith(".csv")
#     ]

#     # Extract paper IDs from filenames
#     paper_ids = [filename.replace(".csv", "") for filename in csv_files]

#     # Extract base IDs and ensure only one version of each paper is included
#     base_ids = {paper_id.split('v')[0] for paper_id in paper_ids}
#     unique_paper_ids = []
#     for base_id in base_ids:
#         versions = [pid for pid in paper_ids if pid.startswith(base_id)]
#         unique_paper_ids.append(sorted(versions)[0])  # Add the earliest version

#     return unique_paper_ids


In [961]:
import pandas as pd

def csv_to_markdown(paper_id, repo_path, cache, fetch_title_online=True):
    # Read CSV from the cloned repository
    try:
        df = pd.read_csv(os.path.join(repo_path, "papers", f"{paper_id}.csv"))
    except FileNotFoundError:
        print(f"No CSV found for paper ID: {paper_id}")
        return
    
    # Check if the paper title is in the cache
    if paper_id in cache:
        paper_title = cache[paper_id]
    elif fetch_title_online:
        # Fetch the paper title using the arXiv API
        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        try:
            title_response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
            title_response.raise_for_status()
            xml_content = title_response.content.decode("utf-8")
            title_start = xml_content.find("<title>") + 7
            title_end = xml_content.find("</title>", title_start)
            paper_title = xml_content[title_start:title_end].strip()
            paper_title = paper_title.replace("\n", " ").replace("\r", "")
        except requests.RequestException as e:
            print(f"Error fetching title for {paper_id}: {str(e)}")
            return
        # Cache the paper title
        cache[paper_id] = paper_title
    else:
        paper_title = paper_id  # or another default/fallback title
    
    # Convert DataFrame to markdown
    markdown_lines = []
    paper_title_link = f"[{paper_title}](https://arxiv.org/abs/{paper_id})"
    markdown_lines.append("# " + paper_title_link)
    
    for _, row in df.iterrows():
        markdown_lines.append("\n## " + row["question"])
        markdown_lines.append("\n" + str(row["answer"]) + "\n")
    
    markdown_content = "\n".join(markdown_lines)
    
    # Ensure the directory exists
    os.makedirs(os.path.join("./papers"), exist_ok=True)
    
    # Save to .md file named using the paper_id
    with open(f"./papers/{paper_id}.md", "w") as md_file:
        md_file.write(markdown_content)


def get_paper_ids_from_repo(repo_path):
    # Get the list of CSV files in the papers directory of the cloned repository
    csv_files = [
        filename
        for filename in os.listdir(os.path.join(repo_path, "papers"))
        if filename.endswith(".csv")
    ]

    # Extract paper IDs from filenames
    paper_ids = [filename.replace(".csv", "") for filename in csv_files]

    # Extract base IDs and ensure only one version of each paper is included
    base_ids = {paper_id.split('v')[0] for paper_id in paper_ids}
    unique_paper_ids = []
    for base_id in base_ids:
        versions = [pid for pid in paper_ids if pid.startswith(base_id)]
        unique_paper_ids.append(sorted(versions)[0])  # Add the earliest version

    return unique_paper_ids


In [962]:
# Clone the repository using GitPython or pull the latest changes if it exists
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa.git"
REPO_PATH = "./arxiv_qa_repo"
if not os.path.exists(REPO_PATH):
    git.Repo.clone_from(REPO_URL, REPO_PATH)
else:
    repo = git.Repo(REPO_PATH)
    origin = repo.remotes.origin
    origin.pull()

# Get the list of paper IDs from the cloned repository
paper_ids = get_paper_ids_from_repo(REPO_PATH)

# Open a shelve cache
with shelve.open("arxiv_cache") as cache:
    # Convert each paper's CSV to markdown
    for paper_id in tqdm(paper_ids):
        csv_to_markdown(paper_id, REPO_PATH, cache)

100%|██████████| 5051/5051 [00:19<00:00, 262.82it/s]


In [963]:
paper_ids = list(set(paper_ids))

In [964]:
def get_paper_details(paper_id, cache_file="paper_details_cache"):
    """Retrieve the paper title and publication date from the arXiv API for a given paper_id."""
    with shelve.open(cache_file) as cache:
        # Check if the result is already in the cache
        if paper_id in cache:
            return cache[paper_id]

        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
        response.raise_for_status()
        xml_content = response.content.decode("utf-8")

        # Extract title
        title_start = xml_content.find("<title>") + 7
        title_end = xml_content.find("</title>", title_start)
        title = xml_content[title_start:title_end].strip()

        # Extract publication date
        date_start = xml_content.find("<published>") + 11
        date_end = xml_content.find("</published>", date_start)
        date_str = xml_content[date_start:date_end].strip()
        pub_date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")

        # Store the result in the cache
        cache[paper_id] = (title, pub_date)

    return title, pub_date


def create_parent_md(paper_ids, output_file="./README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    lines = ["# List of Papers\n"]
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header if the year changes
        if year != prev_year:
            lines.append(f"\n## {year}\n")
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[ArXiv]({arxiv_link})] [[QA]({md_link})].\n")

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)

In [965]:
def create_parent_md(paper_ids, output_file="./README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    lines = ["# List of Papers\n"]
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header and accordion structure if the year changes
        if year != prev_year:
            if prev_year is not None:
                lines.append("</div></details>")  # Close div for previous year
            lines.extend(
                [
                    f"\n<details open>",  # The "open" attribute makes the year visible by default
                    f"<summary><strong>{year}</strong></summary>",
                    f"<div>\n",
                ]
            )
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        # Use the paper_id as the link to the paper
        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[{paper_id}]({arxiv_link})] [[QA]({md_link})].\n")

    lines.append("</div></details>")  # Close last year's div

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)


In [966]:
create_parent_md(paper_ids)

Fetching paper details: 100%|█████████████████████████████████| 5051/5051 [00:01<00:00, 2682.30it/s]


In [967]:
len(paper_ids)

5051

In [968]:
with open("paper_ids.txt", "w") as f:
    for paper_id in paper_ids:
        f.write(paper_id + "\n")


### HF README

In [969]:
def create_parent_md_hf(paper_ids, output_file="./HF/README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # open header and add header to the beginning of the file
    with open('./HF/HEADER.md', 'r') as file:
        header = file.read()
    
    lines = [header] + ["\n# List of Papers\n"]

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header and accordion structure if the year changes
        if year != prev_year:
            if prev_year is not None:
                lines.append("</div></details>")  # Close div for previous year
            lines.extend(
                [
                    f"\n<details open>",  # The "open" attribute makes the year visible by default
                    f"<summary><strong>{year}</strong></summary>",
                    f"<div>\n",
                ]
            )
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        # Use the paper_id as the link to the paper
        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[{paper_id}]({arxiv_link})] [[QA]({md_link})].\n")

    lines.append("</div></details>")  # Close last year's div

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)

In [970]:
create_parent_md_hf(paper_ids)

Fetching paper details: 100%|█████████████████████████████████| 5051/5051 [00:01<00:00, 3966.93it/s]


In [971]:
import glob

# Get a list of all markdown files in the directory
md_files = glob.glob("./papers/*.md")
bad_files = []
# Loop through each file and check if the phrase is present
count = 0
for file in md_files:
    with open(file, "r") as f:
        content = f.read()
        if "without having access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1
        elif "access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1
        elif "without access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1

print(f"{count} files contain the phrase 'bad phrases'")

44 files contain the phrase 'bad phrases'


In [972]:
bad_files

['./papers/2301.13616.md',
 './papers/2308.16463.md',
 './papers/2212.08073.md',
 './papers/2309.0791.md',
 './papers/2107.13586.md',
 './papers/2305.19835.md',
 './papers/2306.17194.md',
 './papers/2306.04031.md',
 './papers/2304.04746.md',
 './papers/2305.15581.md',
 './papers/2310.03714.md',
 './papers/2305.07185.md',
 './papers/2207.05739.md',
 './papers/2302.04023.md',
 './papers/2303.17651.md',
 './papers/2307.01848.md',
 './papers/2303.04673.md',
 './papers/2307.04577.md',
 './papers/2307.10350.md',
 './papers/2306.17582.md',
 './papers/2308.11551.md',
 './papers/2305.09515.md',
 './papers/2205.11916.md',
 './papers/2309.08637.md',
 './papers/2309.15129.md',
 './papers/2305.10855.md',
 './papers/2309.03409.md',
 './papers/2304.10970.md',
 './papers/2201.07207.md',
 './papers/2305.14540.md',
 './papers/2308.01313.md',
 './papers/1803.11203.md',
 './papers/2309.16588.md',
 './papers/2308.13954.md',
 './papers/2306.09539.md',
 './papers/2309.16235.md',
 './papers/2303.14027.md',
 '

In [973]:
['./papers/2301.13616.md',
 './papers/2308.16463.md',
 './papers/2212.08073.md',
 './papers/2309.0791.md',
 './papers/2107.13586.md',
 './papers/2305.19835.md',
 './papers/2306.17194.md',
 './papers/2306.04031.md',
 './papers/2304.04746.md',
 './papers/2305.15581.md',
 './papers/2310.03714.md',
 './papers/2305.07185.md',
 './papers/2207.05739.md',
 './papers/2302.04023.md',
 './papers/2303.17651.md',
 './papers/2307.01848.md',
 './papers/2303.04673.md',
 './papers/2307.04577.md',
 './papers/2307.10350.md',
 './papers/2306.17582.md',
 './papers/2308.11551.md',
 './papers/2305.09515.md',
 './papers/2205.11916.md',
 './papers/2309.08637.md',
 './papers/2309.15129.md',
 './papers/2305.10855.md',
 './papers/2309.03409.md',
 './papers/2304.10970.md',
 './papers/2201.07207.md',
 './papers/2305.14540.md',
 './papers/2308.01313.md',
 './papers/1803.11203.md',
 './papers/2309.16588.md',
 './papers/2308.13954.md',
 './papers/2306.09539.md',
 './papers/2309.16235.md',
 './papers/2305.16960.md',
 './papers/2307.16715.md',
 './papers/2306.09557.md',
 './papers/2306.05425.md',
 './papers/2107.03374.md',
 './papers/2210.09261.md',
 './papers/2304.13169.md']

['./papers/2301.13616.md',
 './papers/2308.16463.md',
 './papers/2212.08073.md',
 './papers/2309.0791.md',
 './papers/2107.13586.md',
 './papers/2305.19835.md',
 './papers/2306.17194.md',
 './papers/2306.04031.md',
 './papers/2304.04746.md',
 './papers/2305.15581.md',
 './papers/2310.03714.md',
 './papers/2305.07185.md',
 './papers/2207.05739.md',
 './papers/2302.04023.md',
 './papers/2303.17651.md',
 './papers/2307.01848.md',
 './papers/2303.04673.md',
 './papers/2307.04577.md',
 './papers/2307.10350.md',
 './papers/2306.17582.md',
 './papers/2308.11551.md',
 './papers/2305.09515.md',
 './papers/2205.11916.md',
 './papers/2309.08637.md',
 './papers/2309.15129.md',
 './papers/2305.10855.md',
 './papers/2309.03409.md',
 './papers/2304.10970.md',
 './papers/2201.07207.md',
 './papers/2305.14540.md',
 './papers/2308.01313.md',
 './papers/1803.11203.md',
 './papers/2309.16588.md',
 './papers/2308.13954.md',
 './papers/2306.09539.md',
 './papers/2309.16235.md',
 './papers/2305.16960.md',
 '