In [14]:
import pandas as pd

infile = '' # input file name (final data ready for OA retrieval and XML creation)
# NOTE: later cells read a DataFrame named `data`; no other definition of it is visible in this
# notebook, so set `infile` and uncomment one of the two loaders below before running them.
#data = pd.read_excel(infile, sheet_name='Sheet4') # change/remove sheet name if necessary
#data = pd.read_csv(infile, encoding='utf-8-sig')

In [17]:
# Split names and return a list of individual authors, so they can be specified in the XML to properly be entered as individual creators
def split_names(names):
    """Split an author string into a list of individual author names.

    ``names`` is cut at every ', ' and, within each resulting piece, the first
    space is replaced by ', ' -- e.g. 'Smith J., Doe A.B.' becomes
    ['Smith, J.', 'Doe, A.B.'].
    """
    return [author.replace(' ', ', ', 1) for author in names.split(', ')]

In [84]:
# Need to remove certain characters from item text to prevent errors in DSpace batch import. An initial list is here:
# https://www.tdl.org/wp-content/uploads/2009/04/DSpaceBatchImportFormat.pdf
# and may receive more from Felicity.
# So we create a dictionary to map the special character to the escaped version (or other change to problematic characters)
# then create a translation table with str.maketrans. This will then be used in the 'build_xml' function to translate each field
# as it is inserted into the XML.
# Characters that are markup in XML and therefore have to be written as entities.
_xml_entities = {
    '&': '&amp;',
    "'": '&apos;',
    '"': '&quot;',
    '<': '&lt;',
    '>': '&gt;',
}

# Other problematic characters, swapped for a plain-text description or a
# numeric character reference.
_symbol_replacements = {
    '%': ' percent',
    '°': ' degrees',
    '≥': '[greater than or equal to]',
    '≤': '[less than or equal to]',
    '©': '[copyright]',
    '™': '[trademark]',
    '—': '--',
    'α': '[alpha]',
    'β': '[beta]',
    'μ': '&#181;',
    '×': 'x',
    '±': '[plus or minus]',
    '~': '&#126;',
    '♭': '[flat]',
    '’': "'",
}

# Single mapping (entities first, then symbols) and the table used by str.translate().
trans_dict = {**_xml_entities, **_symbol_replacements}
trans_table = str.maketrans(trans_dict)

# based on https://www.semicolonworld.com/question/56290/how-do-convert-a-pandas-dataframe-to-xml
def build_xml_block(row):
    """Build the ``dublin_core.xml`` text for one record.

    Walks over the labels in ``row.index`` and emits one ``<dcvalue>`` line for
    each recognised column (Title, Authors, corr department, Abstract, DOI,
    Source title, Document Type, Year, Publisher, best_oa_license, Author
    Keywords).  A citation string assembled from authors, year, title, source
    title, volume, issue and DOI is appended as a second ``source`` element.

    Parameters
    ----------
    row : pandas.Series
        One record; values are looked up with ``row[column_name]``.

    Returns
    -------
    str
        The XML declaration, the ``<dublin_core>`` element and its
        ``<dcvalue>`` children, joined with newlines.

    Raises
    ------
    KeyError
        If any of Authors, Year, Title, Source title or DOI is missing from
        ``row.index`` (they are required for the citation).
    AttributeError
        If a text column that is not null-checked holds a non-string value
        (for example NaN).
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    from xml.sax.saxutils import escape as xml_escape

    def dcvalue(element, qualifier, text):
        # `text` must already be XML-safe: escaping is done exactly once, by the caller.
        return '    <dcvalue element="{0}" qualifier="{1}" language="eng">{2}</dcvalue>'.format(
            element, qualifier, text)

    licenses = {
        'cc-by': 'https://creativecommons.org/licenses/by/4.0',
        'cc-by-nc': 'https://creativecommons.org/licenses/by-nc/4.0',
        'cc-by-nc-nd': 'https://creativecommons.org/licenses/by-nc-nd/4.0',
        'cc0': 'https://creativecommons.org/publicdomain/zero/1.0/',
        'cc-by-nc-sa': 'https://creativecommons.org/licenses/by-nc-sa/4.0'
    }

    lines = ['<?xml version="1.0" encoding="UTF-8"?>\n<dublin_core>']
    # Raw (untranslated) citation parts; the assembled citation is translated once
    # at the end, so '&' becomes '&amp;' rather than '&amp;amp;'.
    citation = {}
    for field in row.index:
        value = row[field]
        if field == "Title":
            lines.append(dcvalue('title', 'none', value.translate(trans_table)))
            citation['title'] = value
        elif field == "Authors":
            authors = split_names(value)
            # '||' separates multiple values inside a single field.
            lines.append(dcvalue('contributor', 'author', '||'.join(authors).translate(trans_table)))
            citation['authors'] = ', '.join(authors)
        elif field == "corr department":
            lines.append(dcvalue('contributor', 'deptlab', value.translate(trans_table)))
        elif field == "Abstract":
            lines.append(dcvalue('description', 'abstract', value.translate(trans_table)))
        elif field == "DOI":
            # Only escape XML markup here: the full translation table would also
            # rewrite characters such as '%' and change the identifier itself.
            safe_doi = xml_escape(value)
            lines.append(dcvalue('identifier', 'uri', 'https://dx.doi.org/' + safe_doi))
            lines.append(dcvalue('identifier', 'none', safe_doi))
            citation['doi'] = value
        elif field == "Source title":
            lines.append(dcvalue('source', 'none', value.translate(trans_table)))
            citation['source'] = value
        elif field == "Document Type":
            lines.append(dcvalue('type', 'none', value))
        #elif field == "pubdate":
        elif field == "Year":
            lines.append(dcvalue('date', 'issued', value))
            citation['year'] = value
        elif field == "Publisher":
            lines.append(dcvalue('publisher', 'none', value.translate(trans_table)))
        elif field == "best_oa_license":
            lines.append(dcvalue('rights', 'license', value))
            # Unknown licences give an empty value instead of the text "None".
            lines.append(dcvalue('rights', 'license', licenses.get(value, '')))
        elif field == "Author Keywords":
            if not pd.isnull(value):
                lines.append(dcvalue('subject', 'none', value.translate(trans_table)))
        elif field == "Volume":
            if not pd.isnull(value):
                citation['volume'] = value
        elif field == "Issue":
            if not pd.isnull(value):
                citation['issue'] = '(' + str(value) + ')'

    # Volume and issue are optional; only add the ", <volume>(<issue>)" part when
    # at least one of them is present.
    locator = '{0}{1}'.format(citation.get('volume', ''), citation.get('issue', ''))
    source_part = citation['source']
    if locator:
        source_part = '{0}, {1}'.format(source_part, locator)

    citation_string = '{0}. ({1}). {2}. {3}. {4}'.format(
        citation['authors'], citation['year'], citation['title'], source_part, citation['doi']
    ).translate(trans_table)

    lines.append(dcvalue('source', 'none', citation_string))
    lines.append('</dublin_core>')
    return '\n'.join(lines)

In [None]:
# Build the XML for the three rows in the slice 35:38 and print the blocks separated by a blank line.
print('\n\n'.join(data[35:38].apply(build_xml_block, axis=1))) #take a look/print/output XML

In [None]:
# Show the 'best_oa_url_for_pdf' value for key 0 (presumably the first row -- assumes the default integer index).
data['best_oa_url_for_pdf'][0] #just looking at this field

In [135]:
# https://likegeeks.com/downloading-files-using-python/
import requests, os
# https://stackoverflow.com/questions/34446172/open-url-in-new-tab-from-ipython-notebook-jupyter-cell
import webbrowser

def get_articles(start, stop, timeout=60):
    """Create one import folder per row of ``data`` whose index is in ``[start, stop)``.

    For every selected row the folder
    ``pdfs/<corr surname> - <first 20 characters of Title>`` receives

    * ``<corr surname>.pdf`` -- downloaded from ``best_oa_url_for_pdf`` when that
      URL is present and is not on academic.oup.com; otherwise (or when the
      download fails) an empty placeholder to be overwritten by hand,
    * ``dublin_core.xml`` -- the output of ``build_xml_block(row)``,
    * ``contents`` -- a file holding the PDF file name.

    Rows whose PDF file already exists are skipped.

    Parameters
    ----------
    start, stop
        Bounds compared with the index values yielded by ``data.iterrows()``:
        rows with ``index < start`` are skipped and iteration ends at the first
        ``index >= stop``.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled server cannot block the
        loop forever.

    Reads the module-level DataFrame ``data``; returns ``None``.
    """
    def write_placeholder(path):
        # An empty file marks where a manually downloaded PDF has to be saved.
        with open(path, 'wb'):
            pass

    for index, row in data.iterrows():
        if index < start:
            continue
        elif index >= stop:
            break

        surname = row['corr surname']
        # Sanitise the folder *name* only. Filtering the already-joined path also
        # removed the '/' that separates it from 'pdfs' on POSIX systems.
        folder_name = (surname + ' - ' + row['Title'][:20]).rstrip()
        folder_name = "".join(ch for ch in folder_name if ch not in ':*?"<>|/\\').rstrip()
        local_path = os.path.join('pdfs', folder_name)
        pdf_name = surname + '.pdf'
        pdf_path = os.path.join(local_path, pdf_name)

        print(index, surname + ' - ' + row['Title'])

        if os.path.exists(pdf_path):
            print('File already downloaded...skipping\n')
            continue

        os.makedirs(local_path, exist_ok=True)
        pdf_url = row['best_oa_url_for_pdf']

        # OUP seems to close connection immediately to non-browser requests? Check both for no direct pdf link AND that it's not
        # an OUP link. If either of those, must download manually.
        if not pd.isnull(pdf_url) and 'academic.oup.com' not in str(pdf_url):
            print(pdf_url, '\n')

            try:
                response = requests.get(pdf_url, timeout=timeout)
                # Must be *called*: the bare attribute never raises, so error
                # pages would be saved as if they were the PDF.
                response.raise_for_status()
            except requests.exceptions.HTTPError as errh:
                print("HTTP Error:", errh, '\n')
                write_placeholder(pdf_path)
            except requests.exceptions.ConnectionError as errc:
                print("Error Connecting:", errc, '\n')
                write_placeholder(pdf_path)
            except requests.exceptions.Timeout as errt:
                print("Timeout Error:", errt, '\n')
                write_placeholder(pdf_path)
            except requests.exceptions.RequestException as err:
                print("Error:", err, '\n')
                write_placeholder(pdf_path)
                # Not every request failure carries a response object.
                if getattr(err, 'response', None) is not None:
                    print(err.response.text)
            else:
                with open(pdf_path, 'wb') as pdf_file:
                    pdf_file.write(response.content)
                print('Response code: ', response.status_code)
                print('Complete: ', len(response.content), 'bytes\n')
        else:
            print('NO DIRECT URL')
            print(row['best_oa_url'])
            #webbrowser.open(row['best_oa_url'])
            write_placeholder(pdf_path)
            print('Download manually and overwrite empty pdf: ', pdf_path, '\n')

        # The metadata files are written whether or not the PDF could be fetched.
        with open(os.path.join(local_path, 'dublin_core.xml'), 'w', encoding='UTF-8') as xml_file:
            xml_file.write(build_xml_block(row))
        with open(os.path.join(local_path, 'contents'), 'w', encoding='UTF-8') as contents_file:
            contents_file.write(pdf_name)


In [None]:
# Process the rows of `data` whose index is 0 through 49 (stop is exclusive).
get_articles(0, 50) #get first 50 articles

In [None]:
# Positional lookup of the final row of `data`.
data.iloc[-1] #take a look at last entry

In [41]:
# List the column labels of `data`; the functions above look fields up by these names.
data.keys() #check keys

Index(['Number', 'Authors', 'Author(s) ID', 'Title', 'Year', 'Source title',
       'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count',
       'Cited by', 'DOI', 'Link', 'Affiliations', 'Authors with affiliations',
       'Abstract', 'Author Keywords', 'Funding Details',
       'Correspondence Address', 'Publisher', 'ISSN', 'ISBN', 'CODEN',
       'Document Type', 'Publication Stage', 'Access Type', 'Source', 'EID',
       'corr email', 'corr given name', 'corr surname', 'corr department',
       'best_oa_license', 'best_oa_url', 'best_oa_url_for_pdf',
       'best_oa_version', 'best_oa_evidence', 'librarian_name',
       'librarian_email'],
      dtype='object')