# AI research Papers

> Turns out, the summary information is also provided by the arxiv RSS feed. However, that feed has a limited set of entries. This API allows for a much larger collection of data.

Since I am asked to look at AI feeds, I could scan those for links to papers etc. However, the main point was to get papers and then process them. Defaulting to arxiv for now.

Normally folks would keep doing this on a weekly or daily basis to keep track of trends _(which are invariably historical comparisons)_. However, I have to download a bunch of these to build up my history.

Arxiv TOS specifies a rate limit of 1 request per 3 seconds, which applies across all the machines controlled by an entity. In my case that is just one machine, so I will see how long it takes. Maybe cycle through each day/month so I'll at least have some data for each month and it'll keep adding to it.

 - python arxiv package
 - python ratelimit package
 - PDF to text conversion. Save as json.

The result has the following useful attribs (fields/methods) for my use case

 - 'authors'
 - 'links'
 - ✔️'categories'
 - 'comment'
 - 'doi'
 - 'download_pdf' _method_
 - 'download_source' _method_
 - 'entry_id'
 - 'get_short_id' _method_
 - 'pdf_url'
 - 'primary_category'
 - ✔️'published'
 - ✔️'summary'
 - ✔️'title'

 If there is a summary, then I don't need to deal with the PDFs, converting them to text etc. yet. Can do that at the end if end-to-end gets done!

# What to download

There are tons of papers every day at arxiv. To manage the load but still get enough time-spread, sample daily.
 - 10 papers per day in `cs.AI`
 - Spread over 3 years (`days_lookback=3*365` in the load cell below)

## Notebook setup 

In [1]:
# Setup paths to our libs
import os
import sys
from pathlib import Path

# Make the notebook-local "lib" directory importable. The membership check
# keeps re-runs of this cell from piling up duplicate sys.path entries.
lib_path = (Path(os.getcwd()) / "lib").resolve()
if str(lib_path) not in sys.path:
    sys.path.append(str(lib_path))

# Import jupyter utils
import logging
from util import jupyter_util
from util.jupyter_util import DisplayHTML as jh
from util.jupyter_util import DisplayMarkdown as jm

# Init jupyter env with the WARNING log level. Set to DEBUG if you want to
# see the gory details of schemas and such.
# NOTE(review): presumably DEBUG also surfaces the logging.debug(...) progress
# messages emitted by the download cells below - confirm against jupyter_util.
jupyter_util.setup_logging(logging.WARNING)

In [2]:
import arxiv
import logging
from ratelimit import limits, sleep_and_retry
from datetime import datetime, timedelta

# Timestamp layout used inside the arXiv `submittedDate:[... TO ...]` query
# filter: year, month, day, hour, minute with no separators (e.g. 202501010000).
ARXIV_STRFTIME_FMT = "%Y%m%d%H%M"

# Shared API client; query_arxiv_for_day() reads this module-level name.
client = arxiv.Client()

# allow $calls within $period seconds.
# https://info.arxiv.org/help/api/tou.html says no more than 1 call every 3 seconds
@sleep_and_retry
@limits(calls=1,period=3)
def query_arxiv_for_day(start: datetime, max_results: int) -> list:
    """Fetch cs.AI papers submitted in the day-long window beginning at *start*.

    Calls are throttled by the decorators above to one every 3 seconds.

    Args:
        start: Beginning of the window; the window ends 23h59m later.
        max_results: Upper bound on the number of results requested.

    Returns:
        The results yielded by the module-level ``client`` for the search,
        collected into a list and sorted by submission date.
    """
    # Day interval, rendered in the format the submittedDate filter expects.
    window_end = start + timedelta(hours=23, minutes=59)
    arxiv_from = start.strftime(ARXIV_STRFTIME_FMT)
    arxiv_to = window_end.strftime(ARXIV_STRFTIME_FMT)

    search = arxiv.Search(
        query=f"cat:cs.AI AND submittedDate:[{arxiv_from} TO {arxiv_to}]",
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    # Drain the result iterator here so the fetch happens inside this
    # rate-limited call, and only report success once it has completed.
    results = list(client.results(search))

    # The date is formatted outside any f-string: reusing the same quote type
    # inside an f-string expression is a SyntaxError before Python 3.12.
    logging.debug("Got results for %s", start.strftime("%m/%d/%Y"))
    return results
    

In [24]:
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel

# Flat, JSON-friendly record for a single arXiv search result. Instances are
# built in get_arxiv_items() and serialised with model_dump() when saving.
class ArxivResultItem(BaseModel):
    """Subset of an arXiv result's fields kept for downstream processing."""

    # Author display names.
    authors: List[str]
    title : str
    # Abstract text as returned by the API.
    summary: str
    # Publication date, stored pre-formatted as a string (get_arxiv_items()
    # uses "%m/%d/%Y").
    published: str

    primary_category: str
    categories: List[str]

    pdf_url : str
    # The result's entry_id (its arXiv abstract-page URL).
    entry_url : str
    

In [29]:
from datetime import datetime, timedelta

# Rebinds the module-level `client` that query_arxiv_for_day() uses.
# NOTE(review): an identical client is already created next to that function;
# presumably repeated so this cell can be run on its own - confirm.
client = arxiv.Client()

def get_arxiv_items(days_lookback: int = 1, max_results: int = 10):
    """Collect a per-day sample of cs.AI papers for the preceding days.

    Args:
        days_lookback: Number of whole days before today to query. Today
            itself is not included; a value of 0 (or less) yields an empty dict.
        max_results: Maximum number of papers requested for each day.

    Returns:
        Dict keyed by each day's start (a naive ``datetime`` at midnight),
        mapping to the list of ``ArxivResultItem`` built for that day.
    """
    # Start with today but reset time to start of the day.
    # NOTE(review): datetime.today() is naive local time, whereas the arXiv API
    # manual describes submittedDate in GMT - day windows may be shifted by the
    # local UTC offset. Confirm whether that matters for the sampling.
    today_0 = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
    start_t = today_0 - timedelta(days=days_lookback)

    # Key this by date
    daily_results_dict = {}

    for day_offset in range(days_lookback):
        day = start_t + timedelta(days=day_offset)
        day_results = query_arxiv_for_day(day, max_results=max_results)

        daily_results_dict[day] = [
            ArxivResultItem(
                title=r.title,
                authors=[a.name for a in r.authors],
                summary=r.summary,
                categories=r.categories,
                primary_category=r.primary_category,
                published=r.published.strftime("%m/%d/%Y"),  # good enough
                entry_url=r.entry_id,
                pdf_url=r.pdf_url,
            )
            for r in day_results
        ]

    return daily_results_dict

In [33]:
# Load data
# while testing use just a day or two.
daily_results_dict = get_arxiv_items(
    #days_lookback=1
    days_lookback=3*365 # 3 years (3 * 365 days)
    )

## Examine by printing it nicely formatted in markdown

In [34]:
def pretty_print_arxiv_entries(results_dict_by_day: dict) -> None:
    """Render every day's entries as markdown in the notebook output.

    Prints the number of days first, then for each day emits a level-2
    heading followed by the title and summary of each entry.

    Args:
        results_dict_by_day: Mapping of day (an object with ``strftime``) to
            a list of items exposing ``title`` and ``summary``.
    """
    print(len(results_dict_by_day))

    for day, daily_items in results_dict_by_day.items():
        # Format outside the f-string: nesting the same quote type inside an
        # f-string expression is a SyntaxError before Python 3.12.
        day_label = day.strftime("%m/%d/%Y")
        jm.h(f"Entries for {day_label}", level=2)
        for e in daily_items:
            jm.md(f"""
**Title**: {e.title}

**Summary**: {e.summary}

----
""")
            
#pretty_print_arxiv_entries(daily_results_dict)

## Save to a single JSON file

In [None]:
import os
import json
from pathlib import Path

# Raw feed downloads live under <cwd>/data/feed/raw; make sure the whole
# directory chain exists before anything is written into it.
DATA_DIR = Path.cwd() / "data"
FEED_RAW_DATA_DIR = DATA_DIR.joinpath("feed", "raw")

FEED_RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

def results_dict_to_json(results_dict):
    """Turn the per-day results mapping into a JSON-serialisable dict.

    Each key is formatted as "MM/DD/YYYY" and each item is replaced by the
    value of its ``model_dump()`` call; insertion order is preserved.
    """
    return {
        when.strftime("%m/%d/%Y"): [entry.model_dump() for entry in entries]
        for when, entries in results_dict.items()
    }

# Write into the raw feeds dir but separate name from the arxiv rss feed.
out_path = FEED_RAW_DATA_DIR / "Arxiv_csAI_API_dailysampled_3y.json"

# Build the payload before opening the file: mode "w" truncates on open, so a
# failure during conversion would otherwise wipe a previously saved download.
json_dict = results_dict_to_json(daily_results_dict)
with open(out_path, "w", encoding="utf-8") as outfile:
    json.dump(json_dict, outfile, indent=4)

logging.debug(f"Wrote arxiv API dict out to {str(out_path)}")

# Scratchpad

In [11]:
#!pip install arxiv
import arxiv

# Query syntax reference:
#   https://info.arxiv.org/help/api/user-manual.html#query_details
#   cat:<category>  - one of the categories
#   submittedDate:[YYYYMMDDTTTT TO YYYYMMDDTTTT]  - submission window; in the
#       URL form the spaces are replaced with "+",
#       e.g. submittedDate:[202501010000+TO+202501312359]
client = arxiv.Client()
search = arxiv.Search(
    query="cat:cs.AI AND submittedDate:[202501010000 TO 202501302359]",
    sort_by=arxiv.SortCriterion.SubmittedDate,
    max_results=1,
)
all_results = [*client.results(search)]

In [31]:
# I have seen some 100 entries daily. Such productivity!!
# - Sample at 5 per day for the last two years

# The data in the search is like a digest. The actual PDfs have to be downloaded separately
# Still. it takes 40s to get and do a list(results)
# 3/10 to 3/18 there are 1000! entries and this took 38 seconds to pull!!
# extrapolates to roughly 4000 entries per month at this rate. 
# 
# Can I specify search range in the URL ?
# Dump the fields of interest for every fetched result; an empty result
# list simply prints nothing.
for paper in all_results:
    print(f"""
 Title : {paper.title}
Author : {paper.authors}
Link   : {paper.links}
Categories: {paper.categories}
entry_id: {paper.entry_id}
pdf_url:  {paper.pdf_url}
primary_category:  {paper.primary_category}
published:  {paper.published}
summary:  {paper.summary}
""")


 Title : Every Image Listens, Every Image Dances: Music-Driven Image Animation
Author : [arxiv.Result.Author('Zhikang Dong'), arxiv.Result.Author('Weituo Hao'), arxiv.Result.Author('Ju-Chiang Wang'), arxiv.Result.Author('Peng Zhang'), arxiv.Result.Author('Pawel Polak')]
Link   : [arxiv.Result.Link('http://arxiv.org/abs/2501.18801v1', title=None, rel='alternate', content_type=None), arxiv.Result.Link('http://arxiv.org/pdf/2501.18801v1', title='pdf', rel='related', content_type=None)]
Categories: ['cs.CV', 'cs.AI']
entry_id: http://arxiv.org/abs/2501.18801v1
pdf_url:  http://arxiv.org/pdf/2501.18801v1
primary_category:  cs.CV
published:  2025-01-30 23:38:51+00:00
summary:  Image animation has become a promising area in multimodal research, with a
focus on generating videos from reference images. While prior work has largely
emphasized generic video generation guided by text, music-driven dance video
generation remains underexplored. In this paper, we introduce MuseDance, an
innovative e

In [25]:
from datetime import datetime, timedelta

# Current local date/time (naive, no timezone attached).
t = datetime.today()
# NOTE: despite the "_2y" suffix this steps back 3*365 days, i.e. roughly
# three years (leap days are not accounted for).
t_minus_2y = t - timedelta(days=3*365)

print(t)
print(t_minus_2y)

2025-03-19 14:46:25.776500
2022-03-20 14:46:25.776500


In [15]:
# Older ratelimiter is broken in 3.11 as its use of asyncio.coroutine
# has been removed
from ratelimit import limits, sleep_and_retry

# Value printed by the next do_foo() call.
counter = 1

# allow $calls within $period seconds.
@sleep_and_retry
@limits(calls=1,period=1)
def do_foo():
    """Print the module-level counter, then increment it.

    Decorated to allow one call per 1-second period; presumably
    sleep_and_retry makes extra calls wait rather than fail.
    """
    global counter
    print(counter)
    counter += 1

# Nine calls in a row (i = 1..9) to watch the throttling in action.
for i in range(1,10):
    do_foo()

1
2
3
4
5
6
7
8
9


In [40]:
import re
# Captures whatever follows the last "/" in a URL (for an arXiv pdf_url that
# is the paper id plus version, e.g. "2203.10675v1").
tail_pat = re.compile(r'^.*?/([^/]*)$')

for k,v in daily_results_dict.items():
    # A day with no sampled papers has nothing to inspect, and indexing v[0]
    # on it would raise IndexError - move on to the next day instead.
    if not v:
        continue

    print(f"PDF for {k}")

    item = v[0]
    print(item.pdf_url)
    m = tail_pat.match(item.pdf_url)
    if m:
        pdf_outfile = f"{m.group(1)}.pdf"
        print(f"Saving to {pdf_outfile}")
    # One example is enough for this experiment.
    break

PDF for 2022-03-20 00:00:00
http://arxiv.org/pdf/2203.10675v1
Saving to 2203.10675v1.pdf
