<a href="https://colab.research.google.com/github/tonydiccion/Literature-Search-Using-Scopus-Scrapus/blob/main/Literature_Review_Scopus_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Article Scraping**

In [None]:
!pip install ScopusScrapus
import os

import numpy as np
import pandas as pd
import requests.exceptions
from ScopusScrapus import ScopusSearchQuery

In [None]:
# Prefer the SCOPUS_API_KEY environment variable so a real key never has to be
# saved inside the notebook. When it is unset or empty, fall back to the
# placeholder below, which must be replaced with a key obtained from the
# Elsevier developer portal: https://dev.elsevier.com/index.html
MY_API_KEY = os.environ.get("SCOPUS_API_KEY") or "INSERT API KEY HERE FROM ELSEVIER DEV PORTAL"

In [None]:
from typing_extensions import Concatenate

API_KEY = MY_API_KEY

# Search parameters handed to ScopusSearchQuery as-is.
# The 'query' uses Scopus search syntax (e.g., TITLE-ABS-KEY, AU-ID, AFFIL, etc.)
search_params = {
    'query': 'TITLE-ABS-KEY("computer vision" OR " intelligent transportation systems" OR "traffic monitoring" AND "transport*" AND "Philippines")',
    # Date range is optional
    'date': '2015-2025',
    # 'view' can be 'STANDARD' or 'COMPLETE'.
    # 'COMPLETE' requires a Scopus subscription. Use 'STANDARD' if unsure.
    'view': 'STANDARD',
    # 'count' sets results per page (max 100)
    'count': 25
}

# Define the maximum number of papers to retrieve in total.
MAX_PAPERS_TO_RETRIEVE = 5000


def _text_or_default(paper, key, default):
    """Return ``paper[key]`` as a string, or ``default`` when the key is missing or empty."""
    value = paper.get(key)
    return str(value) if value else default


def _first_affiliation_name(paper):
    """Return the 'affilname' of the first affiliation entry, or a placeholder.

    Tolerates a missing key, ``None``, an empty list, or a single dict instead
    of a list, so one oddly shaped record cannot abort the whole search.
    """
    affiliations = paper.get('affiliation') or []
    if isinstance(affiliations, dict):
        affiliations = [affiliations]
    if affiliations and isinstance(affiliations[0], dict):
        return affiliations[0].get('affilname') or 'Unknown Affiliation'
    return 'Unknown Affiliation'


def _paper_to_row(paper):
    """Flatten one search result (a dict) into a 7-item row.

    Row order: title, journal, DOI, DOI link, first author, affiliation, abstract.
    """
    raw_doi = paper.get('prism:doi')
    if raw_doi:
        doi = str(raw_doi).replace(' ', '')
        doi_link = "https://doi.org/" + doi
    else:
        # Keep the placeholder readable and never build a link from it.
        doi = doi_link = 'No DOI Found'

    return [
        _text_or_default(paper, 'dc:title', 'No Title Found'),
        _text_or_default(paper, 'prism:publicationName', 'Unknown Journal'),
        doi,
        doi_link,
        _text_or_default(paper, 'dc:creator', 'Unknown Author'),
        _first_affiliation_name(paper),
        # NOTE(review): 'dc:description' may only be returned with the
        # 'COMPLETE' view -- confirm against the Scopus Search API docs.
        _text_or_default(paper, 'dc:description', 'No Abstract Found'),
    ]


print(f"--- Starting Scopus Search for: {search_params['query']} ---")
print(f"Retrieving a maximum of {MAX_PAPERS_TO_RETRIEVE} papers...")

# paper_count is the number of rows stored so far (it always equals len(paper_list)).
paper_count = 0
paper_list = []

try:
    # Instantiate the search query object
    ssq = ScopusSearchQuery(API_KEY, search_params)

    # Iterate through the results. ScopusScrapus handles pagination automatically.
    for paper in ssq:
        # Build the full row first and append once, so a failure while
        # extracting never leaves a partial (ragged) row in paper_list.
        paper_list.append(_paper_to_row(paper))
        paper_count += 1

        # Stop after reaching the defined limit
        if paper_count >= MAX_PAPERS_TO_RETRIEVE:
            print("\n--- Reached retrieval limit. Stopping search. ---")
            break

except requests.exceptions.ReadTimeout:
    print("\n API Request timed out. Check your network connection or try a shorter search.")
except requests.exceptions.HTTPError as e:
    # This often catches 401 Unauthorized (invalid key) or 400 Bad Request (invalid query syntax)
    print(f"\n HTTP Error occurred: {e}")
    if '401' in str(e):
        print("  -> Authentication Failed. **Check your API Key.**")
    elif '400' in str(e):
        print("  -> Bad Request. **Check your Scopus Query syntax.**")
except Exception as e:
    # Rows collected before the failure are kept in paper_list.
    print(f"\n An unexpected error occurred: {e}")

print(f"\n--- Search Complete. Total papers processed: {paper_count} ---")

In [None]:

# Build the results table straight from the list of rows (one 7-item list per paper).
# Converting to a NumPy array first breaks when the search returned nothing:
# np.array([]) is one-dimensional, so pandas raises ValueError for the 7 column names.
df = pd.DataFrame(
    paper_list,
    columns=['Title', 'Journal', 'DOI', 'DOI_HTTP', 'First Author', 'Affiliation', 'Abstract'],
)

print(df)

In [None]:
# Write the results table to an Excel workbook in the working directory.
# Any failure is reported on stdout rather than raised, so the notebook keeps running.
try:
    df.to_excel('output.xlsx', index=False)
except Exception as write_error:
    print(f"\n An error occurred while writing to Excel: {write_error}")
else:
    print("\n Successfully extracted DataFrame to 'output.xlsx'")

# **You can extend the analysis using text analysis tools such as LDA, BERT, and others.**

**Reference**


Estrada, P. (2018). ScopusScrapus: A few small routines to scrape the data from Elsevier's Scopus API (Version 0.0.2). GitHub. https://github.com/pabloem/ScopusScrapus