<a href="https://colab.research.google.com/github/wizard339/article_finder/blob/main/article_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
import urllib.parse
import argparse
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Endpoint the search expression is appended to when a query URL is built.
BASE_URL = 'http://export.arxiv.org/api/query?search_query='

# Human-readable field label -> short field code; the code is placed in front
# of the keywords as `<code>:<keywords>` when the query URL is assembled.
PREFIX = {
    'Title': 'ti',
    'Author': 'au',
    'Abstract': 'abs',
    'Comment': 'co',
    'Journal Reference': 'jr',
    'Subject Category': 'cat',
    'Report Number': 'rn',
    'ID': 'id',
    'All': 'all',
}

# Search phrase used by default. To ask the user interactively instead, swap
# the assignment below for:
#   input('Please enter the keywords or search phrases separated by commas: ')
input_keywords = 'reinforcement learning'

In [25]:
def _entry_to_record(entry):
    """Flatten one ``<entry>`` tag of the feed into ``(entry_id, row_dict)``.

    Every value is extracted as plain text (or a pandas ``Timestamp`` for the
    date), never as a BeautifulSoup node, so the resulting table does not keep
    references into the parsed document.
    """
    # Only the date part of the timestamp (the text before 'T') is kept.
    updated = pd.to_datetime(entry.updated.get_text().split('T')[0])
    authors = ", ".join(name.get_text() for name in entry.find_all('name'))
    # An entry without a link titled 'pdf' yields None instead of crashing.
    pdf_link = entry.find(title='pdf')
    row = {'updated': updated,
           'title': entry.title.get_text(),
           'summary': entry.summary.get_text(),
           'author': authors,
           'link': pdf_link.get('href') if pdf_link is not None else None}
    return entry.id.get_text(), row


def make_query(url=BASE_URL, prefix=PREFIX['All'], keywords=input_keywords):
    """Run a search request and return the matching entries as a DataFrame.

    The request URL is ``{url}{prefix}:{keywords}`` with the keywords
    URL-encoded, followed by ``sortBy=lastUpdatedDate&sortOrder=descending``.
    The final URL is printed before the request is sent.

    Parameters
    ----------
    url : str
        Base URL the search expression is appended to.
    prefix : str
        Field code placed in front of the keywords (one of the ``PREFIX``
        values).
    keywords : str
        Search phrase. Note that the default is the value ``input_keywords``
        had when this function was defined; reassigning ``input_keywords``
        later has no effect unless this cell is re-run.

    Returns
    -------
    pandas.DataFrame
        One row per ``<entry>`` in the response, indexed by the entry id, with
        columns ``updated`` (datetime), ``title``, ``summary``, ``author``
        (comma-separated names) and ``link`` (href of the link titled 'pdf',
        or ``None`` when the entry has no such link). The frame is empty, with
        the same columns, when the response contains no entries.

    Raises
    ------
    ValueError
        If the response status is not 200. Errors raised by ``urlopen`` itself
        are not caught and propagate unchanged.
    """
    keywords = urllib.parse.quote_plus(keywords)
    url = f'{url}{prefix}:{keywords}&sortBy=lastUpdatedDate&sortOrder=descending'
    print(url)
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        if response.status != 200:
            raise ValueError('Please, check the correctness of the request')
        # Parse while the connection is still open; the parser reads from it.
        soup = BeautifulSoup(response, 'xml')

    cols = ['updated', 'title', 'summary', 'author', 'link']

    # Collect all rows first and build the frame once, rather than growing it
    # with a concat per entry.
    ids, rows = [], []
    for entry in soup.find_all('entry'):
        entry_id, row = _entry_to_record(entry)
        ids.append(entry_id)
        rows.append(row)

    articles = pd.DataFrame(rows, index=ids, columns=cols)
    # Keeps the column datetime-typed even when there are no rows.
    articles['updated'] = pd.to_datetime(articles['updated'])
    return articles

In [26]:
# Run the search with make_query's default arguments (sends a network request
# and prints the URL that was queried).
articles = make_query()

http://export.arxiv.org/api/query?search_query=all:reinforcement+learning&sortBy=lastUpdatedDate&sortOrder=descending


In [27]:
# Show the result table returned by make_query.
articles

Unnamed: 0,updated,title,summary,author,link
http://arxiv.org/abs/2208.12262v1,2022-08-25,MaskCLIP: Masked Self-Distillation Advances Co...,This paper presents a simple yet effective f...,"Xiaoyi Dong, Yinglin Zheng, Jianmin Bao, Ting ...",http://arxiv.org/pdf/2208.12262v1
http://arxiv.org/abs/2208.12261v1,2022-08-25,Synthetic End-User Testing: Modeling Realistic...,For software interacting directly with real-...,"Pasquale Salza, Marco Edoardo Palma, Harald C....",http://arxiv.org/pdf/2208.12261v1
http://arxiv.org/abs/2207.01567v2,2022-08-25,Back to MLP: A Simple Baseline for Human Motio...,This paper tackles the problem of human moti...,"Wen Guo, Yuming Du, Xi Shen, Vincent Lepetit, ...",http://arxiv.org/pdf/2207.01567v2
http://arxiv.org/abs/2208.12251v1,2022-08-25,A Gis Aided Approach for Geolocalizing an Unma...,The Global Positioning System (GPS) has beco...,"Jianli Wei, Deniz Karakay, Alper Yilmaz",http://arxiv.org/pdf/2208.12251v1
http://arxiv.org/abs/2208.12246v1,2022-08-25,Guarantees for Spontaneous Synchronization on ...,The Kuramoto model is an important mathemati...,"Pedro Abdalla, Afonso S. Bandeira, Clara Inver...",http://arxiv.org/pdf/2208.12246v1
http://arxiv.org/abs/2208.12242v1,2022-08-25,DreamBooth: Fine Tuning Text-to-Image Diffusio...,Large text-to-image models achieved a remark...,"Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yae...",http://arxiv.org/pdf/2208.12242v1
http://arxiv.org/abs/2208.12238v1,2022-08-25,Supervised Contrastive Learning for Affect Mod...,"Affect modeling is viewed, traditionally, as...","Kosmas Pinitas, Konstantinos Makantasis, Anton...",http://arxiv.org/pdf/2208.12238v1
http://arxiv.org/abs/2208.12232v1,2022-08-25,Skin Lesion Analysis: A State-of-the-Art Surve...,The Computer-aided Diagnosis (CAD) system fo...,"Md. Kamrul Hasan, Md. Asif Ahamad, Choon Hwai ...",http://arxiv.org/pdf/2208.12232v1
http://arxiv.org/abs/2208.12230v1,2022-08-25,Semantic Preserving Adversarial Attack Generat...,Widely used deep learning models are found t...,"Xinyi Wang, Simon Yusuf Enoch, Dong Seong Kim",http://arxiv.org/pdf/2208.12230v1
http://arxiv.org/abs/2207.08272v2,2022-08-25,BIP: Boost Invariant Polynomials for Efficient...,Deep Learning approaches are becoming the go...,"Jose M Munoz, Ilyes Batatia, Christoph Ortner",http://arxiv.org/pdf/2207.08272v2


In [6]:
# Summarise the columns of the result: dtypes and non-null counts.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, http://arxiv.org/abs/2001.09608v1 to http://arxiv.org/abs/2011.13577v1
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   updated  10 non-null     datetime64[ns]
 1   title    10 non-null     object        
 2   summary  10 non-null     object        
 3   author   10 non-null     object        
 4   link     10 non-null     object        
dtypes: datetime64[ns](1), object(4)
memory usage: 480.0+ bytes
