In [7]:
import os
import time

import requests

In [134]:
def collect_papers(target, query, limit, offset=0, max_error_retries=5):
    """Search for papers, classify them by open-access status and download the PDFs.

    Pages through ``search_papers`` results, fetches details for each paper with
    ``fetch_paper_details`` and sorts every paper into one of three buckets:
    open access, non-open access, or "error" (details could not be fetched).
    Papers in the error bucket are retried through ``execute_error_list``.
    Finally the open-access papers are sorted by citation count (descending)
    and handed to ``download_pdf``.

    Args:
        target: Number of search results to process. Processing stops as soon
            as this many papers have been numbered, even in the middle of a page.
        query: Search string forwarded to ``search_papers``.
        limit: Page size forwarded to ``search_papers``; also the offset step.
        offset: Offset of the first search result to request.
        max_error_retries: Maximum number of retry passes over the error list.
            Bounds the retry loop so a permanently failing paper cannot make
            the function spin forever.

    Returns:
        None. The sorted open-access list is published through the module-level
        global ``open_access_papers_sorted`` (kept for other notebook cells).
    """
    global open_access_papers_sorted

    open_access_papers = []
    non_open_access_papers = []
    papers_in_error_list = []
    counter = 1  # 1-based serial of the next paper; counter - 1 papers are done

    while counter <= target:

        # fetch paper list
        paper_list = search_papers(query=query, limit=limit, offset=offset)

        # An empty page or a failed search (None) ends the collection. This
        # must be checked before iterating, otherwise None raises TypeError.
        if not paper_list:
            break

        # Never process more papers than are still needed to reach the target.
        remaining = target - counter + 1

        # fetch paper details
        for basic_paper_info in paper_list[:remaining]:
            numbered_paper_info = {'serial': counter, **basic_paper_info}
            additional_paper_info = fetch_paper_details(basic_paper_info['paperId'])

            # categorize papers in open access, non open access and error list
            if additional_paper_info is None:
                papers_in_error_list.append(numbered_paper_info)
                print(f'==> ERROR => Paper {counter}')
            else:
                paper_profile = {**numbered_paper_info, **additional_paper_info}
                if paper_profile.get('isOpenAccess'):
                    open_access_papers.append(paper_profile)
                    print(f'Retrieved => Paper {counter}')
                else:
                    non_open_access_papers.append(paper_profile)
                    print(f'Non-open Access => Paper {counter}')

            time.sleep(1)  # throttle between detail requests
            counter += 1

        offset += limit
        time.sleep(1)  # throttle between search pages

    # fetch paper details for error list
    print('\nChecking error list...')
    print(f'\nPapers in error list: {len(papers_in_error_list)}\n')
    if papers_in_error_list:
        retries = 0
        while papers_in_error_list and retries < max_error_retries:
            open_, non_open, error = execute_error_list(papers_in_error_list)
            open_access_papers.extend(open_)
            non_open_access_papers.extend(non_open)
            papers_in_error_list = list(error)
            retries += 1

        if papers_in_error_list:
            failed_serials = [paper.get('serial') for paper in papers_in_error_list]
            print(f'\nGiving up on {len(papers_in_error_list)} paper(s) '
                  f'after {retries} retries: {failed_serials}')
        else:
            print('\nError list cleared!')

    # Rearrange the list in descending order based on 'citationCount';
    # a missing or null count is treated as 0 so the sort cannot raise.
    open_access_papers_sorted = sorted(
        open_access_papers,
        key=lambda paper: paper.get('citationCount') or 0,
        reverse=True,
    )

    print(f'\n{len(open_access_papers_sorted)} Open access papers: {open_access_papers_sorted}')
    print(f'\n{len(non_open_access_papers)} Non-open access papers: {non_open_access_papers}')

    print('\nDownloading PDFs of open access papers...\n')
    download_pdf(open_access_papers_sorted)
    print('\nPapers dowloaded!')

In [135]:
def search_papers(query, limit, offset):
    """Search the Semantic Scholar Graph API for papers matching ``query``.

    The API key is read from the ``SEMANTIC_SCHOLAR_API_KEY`` environment
    variable rather than being stored in the notebook; when the variable is
    unset the request is sent without an ``x-api-key`` header.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to request for this page.
        offset: Index of the first result to request.

    Returns:
        The list found under the ``data`` key of the JSON response. An empty
        list is returned when the request fails, the status code is not 200,
        or the response carries no ``data``, so callers can always iterate
        over the result.
    """
    search_url = 'https://api.semanticscholar.org/graph/v1/paper/search'
    search_parameters = {
        'query': query,
        'limit': limit,
        'offset': offset
    }
    api_key = os.environ.get('SEMANTIC_SCHOLAR_API_KEY')
    headers = {'x-api-key': api_key} if api_key else {}

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(search_url, params=search_parameters,
                                headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f"Request failed: {error}")
        return []

    if response.status_code == 200:
        # 'data' can be absent (e.g. no matches); normalise that to [].
        return response.json().get('data') or []

    print(f"Request failed with status code {response.status_code}: {response.text}")
    return []

In [136]:
def fetch_paper_details(paper_id):
    """Fetch citation and open-access details for one paper.

    Requests the ``citationCount``, ``influentialCitationCount``,
    ``isOpenAccess`` and ``openAccessPdf`` fields from the Semantic Scholar
    Graph API. The API key is read from the ``SEMANTIC_SCHOLAR_API_KEY``
    environment variable; when it is unset no ``x-api-key`` header is sent.

    Args:
        paper_id: Paper identifier appended to the paper endpoint URL.

    Returns:
        The decoded JSON response as a dict, or ``None`` when the request
        raises a network error or the status code is not 200. Callers in this
        notebook treat ``None`` as "put the paper on the error list".
    """
    base_url = 'https://api.semanticscholar.org/graph/v1/paper/' + paper_id
    paper_data = {
        'fields': 'citationCount,influentialCitationCount,isOpenAccess,openAccessPdf'
    }
    api_key = os.environ.get('SEMANTIC_SCHOLAR_API_KEY')
    headers = {'x-api-key': api_key} if api_key else {}

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url, params=paper_data, headers=headers, timeout=30)
    except requests.RequestException:
        # Same outcome as a non-200 status: the caller will retry later.
        return None

    if response.status_code != 200:
        return None
    return response.json()
    

In [137]:
def execute_error_list(paper_list):
    """Retry the detail lookup for papers whose first lookup failed.

    Each entry of ``paper_list`` is a dict carrying at least ``'paperId'`` and
    ``'serial'``. The entry is merged with the result of
    ``fetch_paper_details`` and sorted by its ``'isOpenAccess'`` flag; entries
    whose lookup returns ``None`` again are handed back unchanged.

    Returns:
        A ``(open_access, non_open_access, still_in_error)`` tuple of lists.
    """
    recovered_open = []
    recovered_closed = []
    still_failing = []

    for entry in paper_list:
        details = fetch_paper_details(entry['paperId'])

        if details is None:
            # Lookup failed again: keep the untouched entry for another pass.
            still_failing.append(entry)
            print(f'==> ERROR => Paper {entry["serial"]}')
        else:
            merged = {**entry, **details}
            if merged['isOpenAccess']:
                recovered_open.append(merged)
                print(f'Retrieved => Paper {entry["serial"]}')
            else:
                recovered_closed.append(merged)
                print(f'Non-open Access => Paper {entry["serial"]}')

        # Pause after every lookup, successful or not.
        time.sleep(2)

    return recovered_open, recovered_closed, still_failing

In [138]:
def download_pdf(open_access_papers_sorted):
    """Download the open-access PDF of each paper into the ``downloads/`` folder.

    Each file is named ``sl_<serial>_citation_<citationCount>.pdf``. A paper
    that cannot be downloaded is reported and skipped, so one bad entry does
    not abort the remaining downloads.

    Args:
        open_access_papers_sorted: Iterable of paper dicts carrying
            ``'serial'``, ``'citationCount'`` and ``'openAccessPdf'`` (a dict
            with a ``'url'`` key, or ``None`` when no PDF location is known).

    Returns:
        None.
    """
    # The target folder may not exist yet on a fresh checkout.
    os.makedirs('downloads', exist_ok=True)

    for paper in open_access_papers_sorted:

        # 'openAccessPdf' may be None or lack a URL even for open-access papers.
        pdf_locator = (paper.get('openAccessPdf') or {}).get('url')
        if not pdf_locator:
            print(f'No PDF URL for paper {paper.get("serial")}; skipping.')
            continue

        # Send a GET request to the URL; the timeout prevents a stalled host
        # from blocking every later download.
        try:
            response = requests.get(pdf_locator, timeout=60)
        except requests.RequestException:
            print("Failed to download the PDF.")
            continue

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Save the PDF content to a file
            with open(f'downloads/sl_{paper["serial"]}_citation_{paper["citationCount"]}.pdf', 'wb') as pdf_file:
                pdf_file.write(response.content)
            print("PDF downloaded successfully.")
        else:
            print("Failed to download the PDF.")

In [139]:
# Run the whole pipeline for the query 'cellulose materials'.
# This performs live API requests and writes PDF files under downloads/.
collect_papers(target=10, query='cellulose materials', limit=10, offset=0)

==> ERROR => Paper 1
==> ERROR => Paper 2
Non-open Access => Paper 3
Non-open Access => Paper 4
Retrieved => Paper 5
Non-open Access => Paper 6
Retrieved => Paper 7
Retrieved => Paper 8
Non-open Access => Paper 9
Non-open Access => Paper 10

Checking error list...

Papers in error list: 2

Retrieved => Paper 1
Retrieved => Paper 2

Error list cleared!

5 Open access papers: [{'serial': 8, 'paperId': 'c74c1051ef4b6b0fe44278a4f569e8c853b15727', 'title': 'Recent Strategies in Preparation of Cellulose Nanocrystals and Cellulose Nanofibrils Derived from Raw Cellulose Materials', 'citationCount': 208, 'influentialCitationCount': 7, 'isOpenAccess': True, 'openAccessPdf': {'url': 'https://downloads.hindawi.com/journals/ijps/2018/7923068.pdf', 'status': 'GOLD'}}, {'serial': 5, 'paperId': 'd519be630c979d637adfb29f7db02f8fe398ee3b', 'title': '3D printing with cellulose materials', 'citationCount': 201, 'influentialCitationCount': 1, 'isOpenAccess': True, 'openAccessPdf': {'url': 'https://link.spr

In [102]:
# NOTE(review): this import sits outside the notebook's import cell, and this
# cell's execution count (102) is lower than those of the cells above it —
# confirm the notebook still works after Restart Kernel -> Run All.
import datetime as dt

# Print the current local date and time.
print(dt.datetime.now())

2024-04-20 11:55:19.072094
