In [1]:
def arxiv(arxiv_id, dirname='../', seq=',', output=True):
    import os
    import urllib.request as request
    from bs4 import BeautifulSoup
    import re
    import json
    
    arxiv_id = arxiv_id.strip()
    
    # URL
    url_index = 'https://arxiv.org/'
    url_abs = url_index +'abs/' + arxiv_id
    url_pdf = url_index + 'pdf/' + arxiv_id + '.pdf'

    # Set BS4
    rqt = request.urlopen(url_abs)
    print(url_abs)
    soup = BeautifulSoup(rqt, "lxml")

    # Set File name
    title = soup.find('head')
    title = title.find('title')
    title = title.get_text()
    pattern = re.compile('\n')
    title = re.sub(pattern, ' ', title)
    pattern = re.compile('\s+')
    title = re.sub(pattern, ' ', title)
    title_new = title + '.pdf'
    
    pattern = re.compile('^\[[a-z-]+/[0-9]+\]')
    title = re.sub(pattern, '', title, count=1)
    
    pattern = re.compile('^\[([0-9]{4}\.[0-9]+)\]')
    title = re.sub(pattern, '', title, count=1)
    
    # Set authers
    authers = soup.find('div', class_='authors')
    authers = authers.find_all('a')
    f = lambda x:x.get_text()
    authers = list(map(f, authers))
    
    # Set citation date
    citation_date = soup.find('meta', attrs={'name':'citation_date'})
    citation_date = citation_date.get('content')
    ####### Data Type from String Type #######            
    def str2time(time):
        import datetime as dt
        try:
            time = dt.datetime.strptime(time, '%Y/%m/%d')
        except TypeError:
            time = dt.datetime(1900, 1, 1, 0, 0)
        return time
    citation_date = str2time(citation_date)
    citation_year = str(citation_date.year)

    dirname = os.path.join(dirname, citation_year)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    pattern = '¥|/|:|\*|<|>|\|'
    title_new = re.sub(pattern, '_', title_new)
    filename = os.path.join(dirname, title_new)
    
    if not os.path.exists(filename):
        request.urlretrieve(url_pdf, '{0}'.format(filename))
        print('Save: ', filename)
    else:
        print('Allready, the file exists. ')
    contents = {
        arxiv_id:{
            'Year':citation_year, 
            'title':title, 
            'url':url_pdf, 
            'authers': authers
        }
    }
    
    for key in contents.keys():
        print(key, contents[key])
    print('')

    if output == True:
        with open('paper_list.txt', 'a') as f:
            text_line = arxiv_id + seq + title + seq + str(authers) + '\n'
            f.write(text_line)
    else:
        pass
        
    text_line = '|'+citation_year+'|'+arxiv_id + '|' + '['+title+']'+'('+url_pdf+')' + '|' + ','.join(authers)+'|' + '\n'
    print(text_line)
    return contents

In [7]:
arxivstr = """
    1709.01066
    1310.4546
    1301.3781
    1512.01237
    1702.02138
    1703.0687
    1312.5258
    1611.07074
    1512.01237
    1512.01237
    1601.07558
    1001.0785
    1701.04579
    0911.3635
    1312.5258
    1612.01928
    1612.03809
    1612.07837
    1702.08431
    1703.08002
    1706.05394
    1409.0473
    1504.00702
    hep-th/9810056
    1511.06410
    1409.4842
    1512.03385
"""


import re
pattern = re.compile('\s+')
arxivlist = re.split(pattern, arxivstr)
arxivlist = list(filter(lambda x:x!='', arxivlist))
list(map(arxiv, arxivlist))

In [5]:
L = [
    '1709.01066',
    '1310.4546',
    '1301.3781',
    '1512.01237',
    '1702.02138',
    '1703.0687',
    '1312.5258',
    '1611.07074',
    '1512.01237',
    '1512.01237',
    '1601.07558',
    '1001.0785',
    '1701.04579',
    '0911.3635',
    '1312.5258',
    '1612.01928',
    '1612.03809',
    '1612.07837',
    '1702.08431',
    '1703.08002',
    '1706.05394',
    '1409.0473',
    '1504.00702',
    'hep-th/9810056',
    '1511.06410', 
    '1409.4842', 
    '1512.03385', 
]

In [6]:
for i in L:
    arxiv(i, output=False)

https://arxiv.org/abs/1709.01066
Allready, the file exists. 
1709.01066 {'Year': '2017', 'title': ' Quantum Decimation in Hilbert Space: Coarse-Graining without Structure', 'url': 'https://arxiv.org/pdf/1709.01066.pdf', 'authers': ['Ashmeet Singh', 'Sean M. Carroll']}

|2017|1709.01066|[ Quantum Decimation in Hilbert Space: Coarse-Graining without Structure](https://arxiv.org/pdf/1709.01066.pdf)|Ashmeet Singh,Sean M. Carroll|

https://arxiv.org/abs/1310.4546
Allready, the file exists. 
1310.4546 {'Year': '2013', 'title': ' Distributed Representations of Words and Phrases and their Compositionality', 'url': 'https://arxiv.org/pdf/1310.4546.pdf', 'authers': ['Tomas Mikolov', 'Ilya Sutskever', 'Kai Chen', 'Greg Corrado', 'Jeffrey Dean']}

|2013|1310.4546|[ Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/pdf/1310.4546.pdf)|Tomas Mikolov,Ilya Sutskever,Kai Chen,Greg Corrado,Jeffrey Dean|

https://arxiv.org/abs/1301.3781
Allready, the file exist

https://arxiv.org/abs/1703.08002
Allready, the file exists. 
1703.08002 {'Year': '2017', 'title': ' A network of deep neural networks for distant speech recognition', 'url': 'https://arxiv.org/pdf/1703.08002.pdf', 'authers': ['Mirco Ravanelli', 'Philemon Brakel', 'Maurizio Omologo', 'Yoshua Bengio']}

|2017|1703.08002|[ A network of deep neural networks for distant speech recognition](https://arxiv.org/pdf/1703.08002.pdf)|Mirco Ravanelli,Philemon Brakel,Maurizio Omologo,Yoshua Bengio|

https://arxiv.org/abs/1706.05394
Allready, the file exists. 
1706.05394 {'Year': '2017', 'title': ' A Closer Look at Memorization in Deep Networks', 'url': 'https://arxiv.org/pdf/1706.05394.pdf', 'authers': ['Devansh Arpit', 'Stanisław Jastrzębski', 'Nicolas Ballas', 'David Krueger', 'Emmanuel Bengio', 'Maxinder S. Kanwal', 'Tegan Maharaj', 'Asja Fischer', 'Aaron Courville', 'Yoshua Bengio', 'Simon Lacoste-Julien']}

|2017|1706.05394|[ A Closer Look at Memorization in Deep Networks](https://arxiv.org/pdf

In [35]:
while True:
    arxiv(input('arXiv'))

arXiv1504.00702
https://arxiv.org/abs/1504.00702
Save:  ../2015/[1504.00702] End-to-End Training of Deep Visuomotor Policies.pdf
1504.00702 {'title': ' End-to-End Training of Deep Visuomotor Policies', 'authers': ['Sergey Levine', 'Chelsea Finn', 'Trevor Darrell', 'Pieter Abbeel']}



KeyboardInterrupt: 

In [24]:
arxiv('hep-th/9810056')
arxiv('1709.01066')
arxiv('1310.4546')
arxiv('1301.3781')
arxiv('1512.01237')
arxiv('1702.02138')
arxiv('1703.06870')
arxiv('1312.5258')
arxiv('1611.07074')
arxiv('1512.01237')
arxiv('1512.01237')
arxiv('1601.07558')
arxiv('1001.0785')
arxiv('1701.04579')
arxiv('0911.3635')
arxiv('1312.5258')
arxiv('1612.01928')
arxiv('1612.03809')
arxiv('1612.07837')
arxiv('1702.08431')
arxiv('1703.08002')
arxiv('1706.05394')
arxiv('1409.0473')

https://arxiv.org/abs/hep-th/9810056
Allready, the file exists. 
hep-th/9810056 {'title': '[hep-th/9810056] Wilson Renormalization Group and Continuum Effective Field Theories', 'authers': ['Chanju Kim']}

https://arxiv.org/abs/1709.01066
Allready, the file exists. 
1709.01066 {'title': ' Quantum Decimation in Hilbert Space: Coarse-Graining without Structure', 'authers': ['Ashmeet Singh', 'Sean M. Carroll']}

https://arxiv.org/abs/1310.4546
Allready, the file exists. 
1310.4546 {'title': ' Distributed Representations of Words and Phrases and their Compositionality', 'authers': ['Tomas Mikolov', 'Ilya Sutskever', 'Kai Chen', 'Greg Corrado', 'Jeffrey Dean']}

https://arxiv.org/abs/1301.3781
Allready, the file exists. 
1301.3781 {'title': ' Efficient Estimation of Word Representations in Vector Space', 'authers': ['Tomas Mikolov', 'Kai Chen', 'Greg Corrado', 'Jeffrey Dean']}

https://arxiv.org/abs/1512.01237
Allready, the file exists. 
1512.01237 {'title': ' Quantum mechanics of 4-derivat

{'1409.0473': {'title': ' Neural Machine Translation by Jointly Learning to Align and Translate',
  'authers': ['Dzmitry Bahdanau', 'Kyunghyun Cho', 'Yoshua Bengio']}}

In [36]:
import pandas as pd
list(map(lambda x:x.split(',')[0], pd.read_table('./paper_list.txt', header=-1)[0]))

['hep-th/9810056',
 '1709.01066',
 '1310.4546',
 '1301.3781',
 '1512.01237',
 '1702.02138',
 '1703.06870',
 '1312.5258',
 '1611.07074',
 '1512.01237',
 '1512.01237',
 '1601.07558',
 '1001.0785',
 '1701.04579',
 '0911.3635',
 '1312.5258',
 '1612.01928',
 '1612.03809',
 '1612.07837',
 '1702.08431',
 '1703.08002',
 '1706.05394',
 '1409.0473',
 '1504.00702']