### Scrape from DBLP

In [7]:
import json
import time
import xml.etree.ElementTree as ET

import requests

# Define base URL for DBLP API
BASE_URL = 'https://dblp.org/search/publ/api'

# Tunables for this cell
total_records = 1000        # Target number of papers
records_per_request = 100   # Page size sent to the API as the 'h' parameter
REQUEST_TIMEOUT = 30        # Seconds before a stalled HTTP request is abandoned
REQUEST_DELAY = 60          # Seconds to pause between consecutive requests

# Query parameters
query_params = {
    'q': 'computer science',      # Query for computer science papers
    'format': 'xml',              # Response format
    'h': records_per_request,     # Number of results per page
    'f': 0                        # Starting index for pagination
}


def _element_text(parent, tag, default):
    """Return the text of the first `tag` descendant of `parent`, or `default`.

    The text is collected with itertext() so that any text sitting inside
    nested child elements is kept instead of being silently dropped.
    `default` is returned when the element is missing or has no text at all.
    """
    node = parent.find(f".//{tag}")
    if node is None:
        return default
    text = "".join(node.itertext())
    return text if text else default


# Storage for results
all_papers = []

while len(all_papers) < total_records:
    # Make API request; the timeout keeps a hung connection from blocking the cell forever
    response = requests.get(BASE_URL, params=query_params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Parse the raw bytes so the XML parser applies the encoding declared in the
    # document rather than the text decoding guessed by requests.
    root = ET.fromstring(response.content)

    hits = root.findall(".//hit")
    if not hits:
        # The result set is exhausted (or the query matched nothing). Without this
        # guard the loop would never reach total_records and would spin forever.
        print("No more results returned; stopping early.")
        break

    # Find and store metadata for each paper
    for hit in hits:
        all_papers.append({
            "title": _element_text(hit, "title", "Unknown"),
            "year": _element_text(hit, "year", "Unknown"),
            "authors": ["".join(author.itertext()) for author in hit.findall(".//author")],
            "venue": _element_text(hit, "venue", "Unknown"),
            "url": _element_text(hit, "ee", "No URL"),
        })

    print(f"Retrieved {len(all_papers)} papers so far...")

    # Update for pagination
    query_params['f'] += records_per_request  # Increment starting index for next batch

    # Respect rate limits, but do not wait after the final batch has been collected
    if len(all_papers) < total_records:
        time.sleep(REQUEST_DELAY)

# Trim to the number of records
all_papers = all_papers[:total_records]

# Save results to a JSON file
with open('dblp_computer_science_papers.json', 'w') as f:
    json.dump(all_papers, f, indent=2)

print(f"Total papers retrieved and saved: {len(all_papers)}")

Retrieved 100 papers so far...
Retrieved 200 papers so far...
Retrieved 300 papers so far...
Retrieved 400 papers so far...
Retrieved 500 papers so far...
Retrieved 600 papers so far...
Retrieved 700 papers so far...
Retrieved 800 papers so far...
Retrieved 900 papers so far...
Retrieved 1000 papers so far...
Total papers retrieved and saved: 1000


### Scrape from Scopus

In [2]:
import json
import os

import requests
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set up your API key and base URL
API_KEY = os.getenv('ELSEVIER_API_KEY')
if not API_KEY:
    # Fail before any request is made: without a key every call is rejected and
    # the cell would otherwise overwrite the output file with an empty list.
    raise RuntimeError(
        "ELSEVIER_API_KEY is not set; define it in the environment or in the .env file."
    )
BASE_URL = 'https://api.elsevier.com/content/search/scopus'

# Tunables for this cell
total_records = 1000               # Target number of records
records_per_request = 25           # Page size sent to the API as the 'count' parameter
REQUEST_TIMEOUT = 30               # Seconds before a stalled HTTP request is abandoned

# Define query parameters for computer science papers
query_params = {
    'query': 'SUBJAREA(COMP)',     # Scopus field code for computer science
    'count': records_per_request,  # Results per request
    'start': 0,                    # Starting index for pagination
    'sort': 'citedby-count',
}

headers = {
    'X-ELS-APIKey': API_KEY
}

# Initialize storage for results
all_papers = []

# Retrieve data in batches
while len(all_papers) < total_records:
    # Make API request; the timeout keeps a hung connection from blocking the cell forever
    response = requests.get(
        BASE_URL, headers=headers, params=query_params, timeout=REQUEST_TIMEOUT
    )
    try:
        data = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page from a proxy or gateway)
        print(f"Error retrieving data: HTTP {response.status_code} with a non-JSON body")
        break

    if 'search-results' not in data:
        print("Error retrieving data:", data)
        break

    # Extract results. NOTE(review): Scopus appears to signal an empty page with a
    # single placeholder entry carrying an 'error' key -- such entries are dropped
    # here so they are not stored as papers; confirm against the API documentation.
    entries = [
        entry
        for entry in data['search-results'].get('entry', [])
        if 'error' not in entry
    ]
    if not entries:
        # The result set is exhausted. Without this guard the loop would never
        # reach total_records and would keep requesting pages forever.
        print("No more results returned; stopping early.")
        break

    all_papers.extend(entries)
    print(f"Retrieved {len(all_papers)} papers so far...")

    # Update start index for the next batch
    query_params['start'] += records_per_request

# Trim in case the last page overshot the target
all_papers = all_papers[:total_records]

# Display summary of results
print(f"Total papers retrieved: {len(all_papers)}")

# Save results to a file
with open('scopus_computer_science_papers.json', 'w') as f:
    json.dump(all_papers, f, indent=2)

Retrieved 25 papers so far...
Retrieved 50 papers so far...
Retrieved 75 papers so far...
Retrieved 100 papers so far...
Retrieved 125 papers so far...
Retrieved 150 papers so far...
Retrieved 175 papers so far...
Retrieved 200 papers so far...
Retrieved 225 papers so far...
Retrieved 250 papers so far...
Retrieved 275 papers so far...
Retrieved 300 papers so far...
Retrieved 325 papers so far...
Retrieved 350 papers so far...
Retrieved 375 papers so far...
Retrieved 400 papers so far...
Retrieved 425 papers so far...
Retrieved 450 papers so far...
Retrieved 475 papers so far...
Retrieved 500 papers so far...
Retrieved 525 papers so far...
Retrieved 550 papers so far...
Retrieved 575 papers so far...
Retrieved 600 papers so far...
Retrieved 625 papers so far...
Retrieved 650 papers so far...
Retrieved 675 papers so far...
Retrieved 700 papers so far...
Retrieved 725 papers so far...
Retrieved 750 papers so far...
Retrieved 775 papers so far...
Retrieved 800 papers so far...
Retrieved 8