In [1]:
import requests
import random
from datetime import datetime
import pandas as pd



In [2]:
# Function to parse JSON response
def parse_scopus_response(json_response):
    """Extract article records from a decoded Scopus search response.

    Iterates over ``json_response['search-results']['entry']`` and keeps each
    entry whose ``prism:coverDate`` parses as ``YYYY-MM-DD`` and is not later
    than today's date. Entries whose cover date is missing, malformed, or in
    the future are skipped instead of aborting the whole page.

    Args:
        json_response (dict): Decoded JSON body of a search response.

    Returns:
        list[dict]: One dict per kept entry with the keys ``title``,
        ``author``, ``publicationName``, ``cover_date``, ``scopus_id``,
        ``cited_by_count`` and ``open_access``. Values are copied from the
        entry unchanged and are ``None`` when the entry lacks the field.
    """
    # `or` also covers keys that are present but null, which would otherwise
    # break the chained lookup / the iteration below.
    raw_entries = (json_response.get('search-results') or {}).get('entry') or []

    # Computed once so every entry on the page is compared to the same cutoff.
    today = datetime.now().date()

    entries = []
    for entry in raw_entries:
        cover_date = entry.get('prism:coverDate')
        if not cover_date:
            continue

        try:
            published = datetime.strptime(cover_date, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            # Not a YYYY-MM-DD string: drop this entry, keep the others.
            continue

        if published > today:
            continue

        # 'prism:volume' and 'prism:doi' are intentionally not collected.
        entries.append({
            'title': entry.get('dc:title'),
            'author': entry.get('dc:creator'),
            'publicationName': entry.get('prism:publicationName'),
            'cover_date': cover_date,
            'scopus_id': entry.get('dc:identifier'),
            'cited_by_count': entry.get('citedby-count'),
            'open_access': entry.get('openaccessFlag'),
        })
    return entries


In [3]:
# Function to fetch data from Scopus API with pagination
def fetch_scopus_data(api_key, query, max_records=1000, count=25, timeout=30):
    """Page through the Scopus search endpoint and collect parsed articles.

    Requests pages of ``count`` results, advancing ``start`` by ``count``
    after each successful page, and feeds every page through
    ``parse_scopus_response``. Collection stops when ``max_records`` articles
    have been gathered, when a page yields no kept articles, or on the first
    failure (network error, non-200 status, undecodable body). On failure a
    message is printed and whatever was collected so far is returned.

    Note: because ``parse_scopus_response`` drops entries by cover date,
    a page whose entries are all dropped ends the pagination even if later
    pages would contain usable entries.

    Args:
        api_key (str): Value sent as the ``apiKey`` query parameter.
        query (str): Search expression sent as the ``query`` parameter.
        max_records (int): Upper bound on the number of returned articles.
        count (int): Page size sent as the ``count`` parameter.
        timeout (float | tuple): Passed to ``requests.get`` so that a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        list[dict]: At most ``max_records`` article dicts, in fetch order.
    """
    base_url = "https://api.elsevier.com/content/search/scopus"
    headers = {'Accept': 'application/json'}  # identical for every page
    all_entries = []
    start_index = 0

    while len(all_entries) < max_records:
        params = {
            'query': query,
            'start': start_index,
            'count': count,
            'apiKey': api_key
        }

        try:
            response = requests.get(
                base_url, headers=headers, params=params, timeout=timeout
            )
        except requests.RequestException as exc:
            # Same policy as an HTTP error below: report, keep partial results.
            print(f"Error: request failed - {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        try:
            json_response = response.json()
        except ValueError as exc:
            print(f"Error: could not decode JSON response - {exc}")
            break

        articles = parse_scopus_response(json_response)
        if not articles:
            print("No more articles found or all articles have future dates.")
            break  # Stop if no articles are found in the response

        all_entries.extend(articles)
        start_index += count

    # The last page may overshoot the cap; trim once here.
    return all_entries[:max_records]

In [4]:
# Function to randomly pick articles
def pick_random_articles(articles, total=1000):
    """Return ``total`` articles drawn at random without replacement.

    When fewer than ``total`` articles are supplied, a warning is printed and
    the input list itself is returned unchanged.

    Args:
        articles (list): Candidate articles.
        total (int): Number of articles to draw.

    Returns:
        list: A new list of ``total`` sampled articles, or ``articles`` itself
        when it holds fewer than ``total`` items.
    """
    enough_available = len(articles) >= total
    if enough_available:
        return random.sample(articles, total)

    print(f"Warning: Only {len(articles)} articles available. Returning all.")
    return articles

In [5]:
# Main script
if __name__ == "__main__":
    import os  # local import: only this cell needs it

    # Credentials must not live in the notebook: read the key from the
    # environment. A key that was previously committed here should be
    # treated as leaked and rotated.
    API_KEY = os.environ.get("SCOPUS_API_KEY")
    if not API_KEY:
        raise RuntimeError(
            "Set the SCOPUS_API_KEY environment variable to your Scopus API key."
        )

    QUERY = 'TITLE("data science")'
    TOTAL_ARTICLES = 1000
    RANDOM_SEED = 42

    # Seed the generator so the random selection is reproducible on re-run.
    random.seed(RANDOM_SEED)

    # Fetch data from API
    articles = fetch_scopus_data(API_KEY, QUERY, max_records=TOTAL_ARTICLES)

    # Randomly pick TOTAL_ARTICLES articles (or all of them if fewer exist).
    # NOTE(review): the fetch is capped at TOTAL_ARTICLES and the sample size
    # is TOTAL_ARTICLES too, so when the cap is reached this only shuffles the
    # fetched articles; fetch a larger pool if a true random subset is wanted.
    selected_articles = pick_random_articles(articles, total=TOTAL_ARTICLES)

    df = pd.DataFrame(selected_articles)
    print(df.head())

                                               title        author  \
0  Emerging Trends in Data Science and Big Data A...   Nageye A.Y.   
1  Applications of Data Science and Artificial In...    Costa C.J.   
2      Advancing Social Justice through Data Science    Bartucz J.   
3  Data-science-guided calibration curve predicti...   Howard J.R.   
4  Constraint-Driven Complexity-Aware Data Scienc...  Siriweera A.   

                                     publicationName  cover_date  \
0  SSRG International Journal of Electronics and ...  2024-05-01   
1                     Applied Sciences (Switzerland)  2023-08-01   
2  SIGCSE 2024 - Proceedings of the 55th ACM Tech...  2024-03-14   
3                                               Chem  2024-07-11   
4                      IEEE Transactions on Big Data  2023-12-01   

               scopus_id cited_by_count  open_access  
0  SCOPUS_ID:85196083856              0         True  
1  SCOPUS_ID:85167914055              2         True  
2  SC

In [6]:
# Sanity check: (rows, columns) of the DataFrame built from the selected articles.
df.shape

(1000, 7)