In [33]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from tqdm import tqdm
import shelve
import os
import git

In [34]:
def get_paper_ids_from_repo(repo_url):
    # Get the content of the repository's papers folder
    response = requests.get(repo_url)
    response.raise_for_status()

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract all CSV filenames
    csv_files = [
        link.get("href")
        for link in soup.find_all("a")
        if link.get("href", "").endswith(".csv")
    ]

    # Extract paper IDs from filenames
    paper_ids = [filename.split("/")[-1].replace(".csv", "") for filename in csv_files]

    return paper_ids


# Get the list of paper IDs from the repository
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa"
paper_ids = get_paper_ids_from_repo(REPO_URL)

In [35]:
import pandas as pd

def csv_to_markdown(paper_id, repo_path, cache, fetch_title_online=True):
    # Read CSV from the cloned repository
    try:
        df = pd.read_csv(os.path.join(repo_path, "papers", f"{paper_id}.csv"))
    except FileNotFoundError:
        print(f"No CSV found for paper ID: {paper_id}")
        return
    
    # Check if the paper title is in the cache
    if paper_id in cache:
        paper_title = cache[paper_id]
    elif fetch_title_online:
        # Fetch the paper title using the arXiv API
        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        try:
            title_response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
            title_response.raise_for_status()
            xml_content = title_response.content.decode("utf-8")
            title_start = xml_content.find("<title>") + 7
            title_end = xml_content.find("</title>", title_start)
            paper_title = xml_content[title_start:title_end].strip()
            paper_title = paper_title.replace("\n", " ").replace("\r", "")
        except requests.RequestException as e:
            print(f"Error fetching title for {paper_id}: {str(e)}")
            return
        # Cache the paper title
        cache[paper_id] = paper_title
    else:
        paper_title = paper_id  # or another default/fallback title
    
    # Convert DataFrame to markdown
    markdown_lines = []
    paper_title_link = f"[{paper_title}](https://arxiv.org/abs/{paper_id})"
    markdown_lines.append("# " + paper_title_link)
    
    for _, row in df.iterrows():
        markdown_lines.append("\n## " + row["question"])
        markdown_lines.append("\n" + str(row["answer"]) + "\n")
    
    markdown_content = "\n".join(markdown_lines)
    
    # Ensure the directory exists
    os.makedirs(os.path.join("./papers"), exist_ok=True)
    
    # Save to .md file named using the paper_id
    with open(f"./papers/{paper_id}.md", "w") as md_file:
        md_file.write(markdown_content)


def get_paper_ids_from_repo(repo_path):
    # Get the list of CSV files in the papers directory of the cloned repository
    csv_files = [
        filename
        for filename in os.listdir(os.path.join(repo_path, "papers"))
        if filename.endswith(".csv")
    ]

    # Extract paper IDs from filenames
    paper_ids = [filename.replace(".csv", "") for filename in csv_files]

    # Extract base IDs and ensure only one version of each paper is included
    base_ids = {paper_id.split('v')[0] for paper_id in paper_ids}
    unique_paper_ids = []
    for base_id in base_ids:
        versions = [pid for pid in paper_ids if pid.startswith(base_id)]
        unique_paper_ids.append(sorted(versions)[0])  # Add the earliest version

    return unique_paper_ids


In [36]:
# Clone the repository using GitPython or pull the latest changes if it exists
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa.git"
REPO_PATH = "./arxiv_qa_repo"
if not os.path.exists(REPO_PATH):
    git.Repo.clone_from(REPO_URL, REPO_PATH)
else:
    repo = git.Repo(REPO_PATH)
    origin = repo.remotes.origin
    origin.pull()

# Get the list of paper IDs from the cloned repository
paper_ids = get_paper_ids_from_repo(REPO_PATH)

# Open a shelve cache
with shelve.open("arxiv_cache") as cache:
    # Convert each paper's CSV to markdown
    for paper_id in tqdm(paper_ids):
        csv_to_markdown(paper_id, REPO_PATH, cache)

 16%|█▌        | 935/6019 [00:11<03:12, 26.41it/s] 

Error fetching title for 1910.11480: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 19%|█▉        | 1147/6019 [00:16<02:55, 27.70it/s] 

Error fetching title for 2212.12645: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 22%|██▏       | 1303/6019 [00:21<03:52, 20.27it/s]

Error fetching title for 2303.02375: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 22%|██▏       | 1346/6019 [00:25<04:59, 15.61it/s]

Error fetching title for 2303.07622: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 24%|██▍       | 1452/6019 [00:30<03:05, 24.58it/s]

Error fetching title for 2111.13196: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 27%|██▋       | 1635/6019 [00:35<02:35, 28.17it/s]

Error fetching title for 2303.15083: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 34%|███▍      | 2033/6019 [00:41<02:18, 28.75it/s] 

Error fetching title for 2303.14435: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 38%|███▊      | 2267/6019 [00:46<02:11, 28.62it/s] 

Error fetching title for 2202.08335: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 42%|████▏     | 2519/6019 [00:51<01:52, 31.17it/s] 

Error fetching title for 2211.07273: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 46%|████▌     | 2749/6019 [00:56<01:59, 27.44it/s] 

Error fetching title for 2005.00558: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 46%|████▋     | 2787/6019 [01:00<03:19, 16.17it/s]

Error fetching title for 2211.14086: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 48%|████▊     | 2889/6019 [01:05<02:35, 20.18it/s]

Error fetching title for 2304.02950: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 57%|█████▋    | 3410/6019 [01:11<01:18, 33.39it/s] 

Error fetching title for 2307.13702: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 62%|██████▏   | 3704/6019 [01:16<01:12, 31.93it/s] 

Error fetching title for 2303.11366: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 63%|██████▎   | 3801/6019 [01:21<01:28, 25.09it/s]

Error fetching title for 2304.01663: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Error fetching title for 2212.12249: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 64%|██████▍   | 3855/6019 [01:29<02:56, 12.24it/s]

Error fetching title for 2304.03283: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 65%|██████▌   | 3941/6019 [01:34<02:10, 15.98it/s]

Error fetching title for 2003.01964: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 70%|███████   | 4216/6019 [01:39<01:00, 29.66it/s] 

Error fetching title for 2303.11681: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 72%|███████▏  | 4358/6019 [01:44<01:08, 24.38it/s]

Error fetching title for 2111.07832: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 76%|███████▌  | 4571/6019 [01:49<00:52, 27.63it/s] 

Error fetching title for 2309.01523: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 79%|███████▉  | 4780/6019 [01:54<00:43, 28.51it/s] 

Error fetching title for 2309.11009: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 80%|████████  | 4834/6019 [01:58<00:59, 19.92it/s]

Error fetching title for 2310.09118: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 82%|████████▏ | 4958/6019 [02:03<00:45, 23.10it/s]

Error fetching title for 2307.03109: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 83%|████████▎ | 4981/6019 [02:07<01:21, 12.79it/s]

Error fetching title for 2212.09072: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 84%|████████▎ | 5026/6019 [02:12<01:20, 12.38it/s]

Error fetching title for 2210.06284: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 84%|████████▍ | 5067/6019 [02:16<01:18, 12.16it/s]

Error fetching title for 2308.12351: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 86%|████████▌ | 5166/6019 [02:20<00:48, 17.60it/s]

Error fetching title for 2010.07492: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 87%|████████▋ | 5255/6019 [02:25<00:43, 17.69it/s]

Error fetching title for 2308.01263: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 89%|████████▉ | 5355/6019 [02:30<00:35, 18.90it/s]

Error fetching title for 2309.11081: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 93%|█████████▎| 5626/6019 [02:35<00:14, 26.55it/s] 

Error fetching title for 2108.05540: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 98%|█████████▊| 5915/6019 [02:41<00:03, 28.95it/s] 

Error fetching title for 1902.11038: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


100%|██████████| 6019/6019 [02:45<00:00, 36.32it/s]

Error fetching title for 2303.10475: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))





In [37]:
paper_ids = list(set(paper_ids))

In [38]:
def get_paper_details(paper_id, cache_file="paper_details_cache"):
    """Retrieve the paper title and publication date from the arXiv API for a given paper_id."""
    with shelve.open(cache_file) as cache:
        # Check if the result is already in the cache
        if paper_id in cache:
            return cache[paper_id]

        ARXIV_API_ENDPOINT = "http://export.arxiv.org/api/query?id_list={}"
        response = requests.get(ARXIV_API_ENDPOINT.format(paper_id))
        response.raise_for_status()
        xml_content = response.content.decode("utf-8")

        # Extract title
        title_start = xml_content.find("<title>") + 7
        title_end = xml_content.find("</title>", title_start)
        title = xml_content[title_start:title_end].strip()

        # Extract publication date
        date_start = xml_content.find("<published>") + 11
        date_end = xml_content.find("</published>", date_start)
        date_str = xml_content[date_start:date_end].strip()
        pub_date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")

        # Store the result in the cache
        cache[paper_id] = (title, pub_date)

    return title, pub_date


def create_parent_md(paper_ids, output_file="./README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    lines = ["# List of Papers\n"]
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header if the year changes
        if year != prev_year:
            lines.append(f"\n## {year}\n")
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[ArXiv]({arxiv_link})] [[QA]({md_link})].\n")

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)

In [39]:
def create_parent_md(paper_ids, output_file="./README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    lines = ["# List of Papers\n"]
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header and accordion structure if the year changes
        if year != prev_year:
            if prev_year is not None:
                lines.append("</div></details>")  # Close div for previous year
            lines.extend(
                [
                    f"\n<details open>",  # The "open" attribute makes the year visible by default
                    f"<summary><strong>{year}</strong></summary>",
                    f"<div>\n",
                ]
            )
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        # Use the paper_id as the link to the paper
        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[{paper_id}]({arxiv_link})] [[QA]({md_link})].\n")

    lines.append("</div></details>")  # Close last year's div

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)


In [40]:
create_parent_md(paper_ids)

Fetching paper details: 100%|█████████████████████████████████| 6019/6019 [00:01<00:00, 4378.11it/s]


In [41]:
len(paper_ids)

6019

In [42]:
with open("paper_ids.txt", "w") as f:
    for paper_id in paper_ids:
        f.write(paper_id + "\n")


### HF README

In [43]:
def create_parent_md_hf(paper_ids, output_file="./HF/README.md"):
    """Create a parent Markdown file listing all papers, sorted by publication date."""
    paper_details = []

    # open header and add header to the beginning of the file
    with open('./HF/HEADER.md', 'r') as file:
        header = file.read()
    
    lines = [header] + ["\n# List of Papers\n"]

    # Fetch titles and dates for all papers
    for paper_id in tqdm(paper_ids, desc="Fetching paper details", ncols=100):
        title, pub_date = get_paper_details(paper_id)
        paper_details.append((paper_id, title, pub_date))

    # Sort papers by publication date
    paper_details.sort(key=lambda x: x[2], reverse=True)

    # Generate Markdown content
    prev_year, prev_month = None, None

    for paper_id, title, pub_date in paper_details:
        year = pub_date.strftime("%Y")
        month_name = pub_date.strftime("%B")

        # Add Year header and accordion structure if the year changes
        if year != prev_year:
            if prev_year is not None:
                lines.append("</div></details>")  # Close div for previous year
            lines.extend(
                [
                    f"\n<details open>",  # The "open" attribute makes the year visible by default
                    f"<summary><strong>{year}</strong></summary>",
                    f"<div>\n",
                ]
            )
            prev_year = year
            prev_month = None  # Reset month whenever year changes

        # Add Month header if the month changes
        if month_name != prev_month:
            lines.append(f"\n### {month_name} {year}\n")
            prev_month = month_name

        date_str = pub_date.strftime("%Y/%m")
        arxiv_link = f"https://arxiv.org/abs/{paper_id}"
        md_link = f"https://github.com/taesiri/ArXivQA/blob/main/papers/{paper_id}.md"

        # Use the paper_id as the link to the paper
        title = title.replace("\n", " ").replace("\r", "")
        lines.append(f"- {title} - [[{paper_id}]({arxiv_link})] [[QA]({md_link})].\n")

    lines.append("</div></details>")  # Close last year's div

    with open(output_file, "w") as md_file:
        md_file.writelines(lines)

In [44]:
create_parent_md_hf(paper_ids)

Fetching paper details: 100%|█████████████████████████████████| 6019/6019 [00:01<00:00, 5179.15it/s]


In [45]:
import glob

# Get a list of all markdown files in the directory
md_files = glob.glob("./papers/*.md")
bad_files = []
# Loop through each file and check if the phrase is present
count = 0
for file in md_files:
    with open(file, "r") as f:
        content = f.read()
        if "without having access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1
        elif "access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1
        elif "without access to the full paper" in content.lower():
            bad_files.append(file)
            count += 1

print(f"{count} files contain the phrase 'bad phrases'")

4 files contain the phrase 'bad phrases'


In [46]:
bad_files

['./papers/2307.01848.md',
 './papers/2205.11916.md',
 './papers/2309.08637.md',
 './papers/2305.10855.md']

In [47]:
# import os

# # Define the path to the papers folder
# papers_folder = "/Github/arxiv_qa/papers"

# # Loop through each file in bad_files and delete it
# for file in bad_files:
#     file = file.replace("./papers/", "")
#     file = file.replace(".md", ".csv")

#     file_path = os.path.join(papers_folder, file)
#     print(f"Deleting {file_path}")
#     os.remove(file_path)


In [48]:
# import os

# # Define the path to the papers folder

# # Loop through each file in bad_files and delete it
# for file in bad_files:
#     file_path = os.path.join(file)
#     print(f"Deleting {file_path}")
#     os.remove(file_path)
