<a href="https://colab.research.google.com/github/wizard339/article_finder/blob/main/article_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
import urllib.parse
import argparse
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Root of the arXiv API query endpoint; the search expression is appended to it.
BASE_URL = 'http://export.arxiv.org/api/query?search_query='

# Human-readable search field names mapped to the short field prefixes that
# are placed in front of the keywords when a query URL is built.
PREFIX = dict([
    ('Title', 'ti'),
    ('Author', 'au'),
    ('Abstract', 'abs'),
    ('Comment', 'co'),
    ('Journal Reference', 'jr'),
    ('Subject Category', 'cat'),
    ('Report Number', 'rn'),
    ('ID', 'id'),
    ('All', 'all'),
])

# Interactive alternative to the hard-coded search phrase below:
# input_keywords = input('Please enter the keywords or search phrases separated by commas: ')
input_keywords = 'reinforcement learning'

In [11]:
def _node_text(node):
    """Return the text content of a parsed XML node as a plain ``str``, or ``None`` if the node is missing."""
    return None if node is None else node.get_text()


def make_query(url=BASE_URL, prefix=PREFIX['All'], keywords=input_keywords,
               start=0, max_results=1000):
    """Query the arXiv API and return the matching articles as a DataFrame.

    Parameters
    ----------
    url : str
        Base query URL; the search expression is appended to it.
    prefix : str
        Search field prefix (one of the values of ``PREFIX``).
    keywords : str
        Search phrase; it is URL-encoded with ``urllib.parse.quote_plus``.
    start : int, default 0
        Value sent as the ``start`` query parameter.
    max_results : int, default 1000
        Value sent as the ``max_results`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per ``<entry>`` in the response, indexed by the entry id,
        with columns ``updated`` (datetime), ``title``, ``summary``,
        ``author`` (comma-separated names) and ``link`` (the PDF URL, or
        ``None`` when the entry has no link titled ``pdf``).

    Raises
    ------
    ValueError
        If the response status is not 200.
    """
    keywords = urllib.parse.quote_plus(keywords)
    url = (f'{url}{prefix}:{keywords}&start={start}&max_results={max_results}'
           '&sortBy=lastUpdatedDate&sortOrder=descending')
    print(url)
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        if response.status != 200:
            raise ValueError('Please, check the correctness of the request')
        # Parse while the response is still open: the parser reads from it.
        soup = BeautifulSoup(response, 'xml')

    cols = ['updated', 'title', 'summary', 'author', 'link']
    entry_ids = []
    records = []
    for tag in soup.find_all('entry'):
        updated_text = _node_text(tag.updated)
        pdf_link = tag.find(title='pdf')
        entry_ids.append(_node_text(tag.id))
        records.append({
            # Keep only the date part of the timestamp (text before 'T').
            'updated': updated_text.split('T')[0] if updated_text else None,
            'title': _node_text(tag.title),
            'summary': _node_text(tag.summary),
            'author': ", ".join(n.get_text() for n in tag.find_all('name')),
            # Some entries may lack a PDF link; do not crash on them.
            'link': pdf_link.get('href') if pdf_link is not None else None,
        })

    # Build the frame once instead of concatenating row by row (quadratic).
    articles = pd.DataFrame(records, columns=cols, index=entry_ids)
    articles['updated'] = pd.to_datetime(articles['updated'])

    return articles

In [12]:
# Run the query with the function's default arguments and keep the result.
articles = make_query()

http://export.arxiv.org/api/query?search_query=all:reinforcement+learning&start=0&max_results=1000&sortBy=lastUpdatedDate&sortOrder=descending


In [15]:
# Preview 10 randomly selected rows of the result.
articles.sample(10)

Unnamed: 0,updated,title,summary,author,link
http://arxiv.org/abs/2202.09954v2,2022-08-27,Theoretical Analysis of Deep Neural Networks i...,"Recently, deep neural network (DNN)-based ph...","Jun Liu, Haitao Zhao, Dongtang Ma, Kai Mei, Ji...",http://arxiv.org/pdf/2202.09954v2
http://arxiv.org/abs/2203.01376v3,2022-08-24,Homogeneous ice nucleation in an ab initio mac...,Molecular simulations have provided valuable...,"Pablo M. Piaggi, Jack Weis, Athanassios Z. Pan...",http://arxiv.org/pdf/2203.01376v3
http://arxiv.org/abs/2208.11907v1,2022-08-25,Time Series Clustering with an EM algorithm fo...,"In this paper, we consider the task of clust...","Ryohei Umatani, Takashi Imai, Kaoru Kawamoto, ...",http://arxiv.org/pdf/2208.11907v1
http://arxiv.org/abs/2208.09735v2,2022-08-26,How Small Amount of Data Sharing Benefits Dist...,While distributed optimization algorithms ha...,"Mingxi Zhu, Yinyu Ye",http://arxiv.org/pdf/2208.09735v2
http://arxiv.org/abs/2208.13027v1,2022-08-27,Improving debris flow evacuation alerts in Tai...,Taiwan has the highest susceptibility to and...,"Yi-Lin Tsai, Jeremy Irvin, Suhas Chundi, João ...",http://arxiv.org/pdf/2208.13027v1
http://arxiv.org/abs/2208.13600v1,2022-08-29,Towards Robust Face Recognition with Comprehen...,"Data cleaning, architecture, and loss functi...","Manyuan Zhang, Guanglu Song, Yu Liu, Hongsheng Li",http://arxiv.org/pdf/2208.13600v1
http://arxiv.org/abs/2203.15722v2,2022-08-23,Transformer Network-based Reinforcement Learni...,"In this article, for the first time, we prop...","Hyunwook Park, Minsu Kim, Seongguk Kim, Keunwo...",http://arxiv.org/pdf/2203.15722v2
http://arxiv.org/abs/2208.13154v1,2022-08-28,Asynchronous Training Schemes in Distributed L...,"In the context of distributed deep learning,...","Haoxiang Wang, Zhanhong Jiang, Chao Liu, Soumi...",http://arxiv.org/pdf/2208.13154v1
http://arxiv.org/abs/2208.13685v1,2022-08-29,FedEgo: Privacy-preserving Personalized Federa...,As special information carriers containing b...,"Taolin Zhang, Chuan Chen, Yaomin Chang, Lin Sh...",http://arxiv.org/pdf/2208.13685v1
http://arxiv.org/abs/2203.07825v2,2022-08-29,SPA-VAE: Similar-Parts-Assignment for Unsuperv...,This paper addresses the problem of unsuperv...,"Shidi Li, Christian Walder, Miaomiao Liu",http://arxiv.org/pdf/2203.07825v2


In [17]:
# Count how many of the returned articles fall on each value of `updated`.
articles.updated.value_counts()

2022-08-24    213
2022-08-26    197
2022-08-25    192
2022-08-29    149
2022-08-23     86
2022-08-27     82
2022-08-28     81
Name: updated, dtype: int64

In [14]:
# Summarize the frame: index range, columns, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, http://arxiv.org/abs/2208.13764v1 to http://arxiv.org/abs/2208.10993v1
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   updated  1000 non-null   datetime64[ns]
 1   title    1000 non-null   object        
 2   summary  1000 non-null   object        
 3   author   1000 non-null   object        
 4   link     1000 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 46.9+ KB
