# Two pipelines builder -- arXiv paper scraper & Feed

1. arXiv paper scraper (RAG-focused):
    - Search arXiv in all CS categories (cat:cs.*) for papers that mention “Retrieval Augmented Generation”.
    - Iterate results (up to a large max_results) and download PDFs into a local folder, skipping anything already downloaded (tracked by a text file).
    - Print progress and robustly handle HTTP/network errors.


2. Feed → JSONL converter (with granary + BeautifulSoup):
    - Fetch one or more RSS/Atom feeds.

    - Convert them to **JSON Feed** format using **granary**.

    - Strip HTML to plain text using BeautifulSoup.

    - Append each item as one line in feed.jsonl on disk.

## 1). arXiv scrape

In [9]:
# test the search string
import arxiv

# Define the search query with a category filter for Computer Science
query = '"Retrieval Augmented Generation"'

# Perform the search 
search = arxiv.Search(
    query=query,
    max_results=100,  # Limit the number of results
    sort_by=arxiv.SortCriterion.Relevance  # Sort by relevance
)
print(len(list(search.results())))

# Display the results
for result in search.results():
    print(f"Title: {result.title}")
    #print(f"Authors: {', '.join(author.name for author in result.authors)}")
    #print(f"Published: {result.published}")
    #print(f"Summary: {result.summary}")
    #print(f"PDF Link: {result.pdf_url}")
    #print("-" * 80)

  print(len(list(search.results())))


100


  for result in search.results():


Title: End-to-End Trainable Retrieval-Augmented Generation for Relation Extraction
Title: DuetRAG: Collaborative Retrieval-Augmented Generation
Title: Meta-prompting Optimized Retrieval-augmented Generation
Title: ALoFTRAG: Automatic Local Fine Tuning for Retrieval Augmented Generation
Title: A Retrieval-Augmented Generation Framework for Academic Literature Navigation in Data Science
Title: Benchmarking Large Language Models in Retrieval-Augmented Generation
Title: Similarity is Not All You Need: Endowing Retrieval Augmented Generation with Multi Layered Thoughts
Title: Retrieval Augmented Generation and Representative Vector Summarization for large unstructured textual data in Medical Education
Title: Harnessing Retrieval-Augmented Generation (RAG) for Uncovering Knowledge Gaps
Title: Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models
Title: Retrieval-Augmented Generation for Generative Artificial Intelligence in Medicine
Title: Towards Retrieva

In [8]:
# use in rag env

import arxiv
import os
import requests
from requests.exceptions import HTTPError, RequestException
import urllib.error


# Define the search query with a category filter for Computer Science
query = 'cat:cs.* AND "Retrieval Augmented Generation"'
#query = 'ti:"Retrieval-Augmented Generation"'

# Perform the search
search = arxiv.Search(
    query=query,
    max_results=100,  # Limit the number of results
    sort_by=arxiv.SortCriterion.Relevance  # Sort by relevance
)

currentlist=search.results()
print(len(list(search.results())))
download_directory = "/Users/wenzheng/Desktop/LLM CS quant/ZZW-LLM/RAGAnalyzer"
downloadeded_list_path = "./downloadeded_list"

from itertools import islice
start_index = 0
currentlist = islice(search.results(), start_index, None)


# Check if the file "files_to_download.txt" exists
 # Ensure the download directory exists
os.makedirs(download_directory, exist_ok=True)

# Check if the downloaded list file exists, if not, create it
if not os.path.exists(downloadeded_list_path):
    with open(downloadeded_list_path, 'w') as f:
        pass  # Create an empty file

# Read the downloaded list into a set for quick lookup
with open(downloadeded_list_path, 'r') as f:
    downloaded_list = set(line.strip() for line in f)
count=0



# Process the search results
for result in currentlist:
    paper_id = result.entry_id.split('/')[-1]
    if paper_id in downloaded_list:
        print(f"Paper already downloaded, skipping: {result.title}")
        continue

    try:
        # Download the paper
        pdf_filename = f"{paper_id}.pdf"
        pdf_path = os.path.join(download_directory, pdf_filename)
        result.download_pdf(dirpath=download_directory)
        print(f"Downloaded: {pdf_filename}")

        # Add the paper ID to the downloaded list
        with open(downloadeded_list_path, 'a') as f:
            f.write(paper_id + '\n')
        downloaded_list.add(paper_id)
        count += 1
        print("-" * 80 + str(count))
    except (FileNotFoundError, HTTPError, RequestException, urllib.error.HTTPError) as e:
        print(f"Error downloading {result.title}: {e}")
        print("Skipping to the next paper.")
        continue
    except Exception as e:
        print(f"Unexpected error downloading {result.title}: {e}")
        continue


  currentlist=search.results()
  print(len(list(search.results())))


UnexpectedEmptyPageError: Page of results was unexpectedly empty (https://export.arxiv.org/api/query?search_query=cat%3Acs.%2A+AND+%22Retrieval+Augmented+Generation%22&id_list=&sortBy=relevance&sortOrder=descending&start=100&max_results=100)

## 2). Json feed

In [None]:
# I was recently working on a project to plot historical trends showing mentions of a keyword in a given field of interest.
#  For this project, I needed to retrieve abstracts from specific categories (i.e. cs.CV) on Arxiv over periods of time (i.e. the last 180 days). 
# I wrote a script in Python to do this:

#set PYTHONUTF8=1
#pip install granary



import json
import requests
from granary import jsonfeed, rss
from datetime import datetime, timedelta
from tqdm import tqdm
from bs4 import BeautifulSoup

days = 10
day_timestamps = [(datetime.now() - timedelta(days=i)).strftime('%Y%m%d') for i in range(1, days)]

BASE_URL = 'http://export.arxiv.org/api/query?search_query=cat:cs.CV+AND+submittedDate:[{day}0000+TO+{day}2359]&max_results=1000'

feeds = [BASE_URL.format(day=date) for date in day_timestamps]

for feed in tqdm(feeds):
    try:
        resp = requests.get(
            feed, headers={"User-Agent": "arxiv-poll"}
        )
        resp.raise_for_status()
    except requests.RequestException:
        print("Failed to fetch", feed)
        continue

    activities = jsonfeed.activities_to_jsonfeed(rss.to_activities(resp.text))

    print("Fetched", feed, "with", len(activities["items"]), "activities using feed type", content_type)

    activities["items"] = [
        {
            **activity,
            "content_html": BeautifulSoup(activity["content_html"], "html.parser").get_text()
        }
        for activity in activities["items"]
    ]

    with open("feed.jsonl", "a+") as f:
        for activity in activities["items"]:
            f.write(json.dumps(activity) + "\n")

ModuleNotFoundError: No module named 'granary'