In [18]:
import requests
import json

# Semantic Scholar Graph API "bulk" paper-search URL used by the fetch helpers
# in the following cells. NOTE: a later cell rebinds this name to the non-bulk
# ".../paper/search" URL, so which endpoint is hit depends on cell run order.
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
# HTTP headers sent with every request; the User-Agent value is "DeepCite/1.0".
# NOTE: a later cell also rebinds this name to a different header dict.
HEADERS = {"User-Agent": "DeepCite/1.0"}

In [19]:
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount", limit=1000):
    """Query Semantic Scholar for ``keywords`` and return up to ``limit`` paper records.

    Pages are requested from ``SEMANTIC_SCHOLAR_BASE_URL`` (sending ``HEADERS``)
    until one of the following happens:

    * ``limit`` records have been collected,
    * a page comes back with no records,
    * the response advertises no further page (neither a ``token`` nor a
      ``next`` entry), or
    * a non-200 status code is received (a message is printed and whatever
      was collected so far is returned).

    Args:
        keywords: Search string forwarded as the ``query`` parameter.
        fields: Comma-separated field names forwarded as the ``fields`` parameter.
        limit: Maximum number of records to return.

    Returns:
        list: At most ``limit`` items taken from the ``"data"`` entry of each
        response, in the order they were received.
    """
    papers = []
    offset = 0
    token = None  # continuation token; only token-paginated (bulk) responses carry one
    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(limit - len(papers), 100),  # Fetch in batches of 100
            "offset": offset
        }
        if token:
            # Without the token a token-paginated endpoint would serve the
            # first page again and the result would fill up with duplicates.
            params["token"] = token

        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)
        if response.status_code != 200:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break

        data = response.json()
        new_papers = data.get("data") or []  # also tolerates an explicit null
        if not new_papers:
            # Nothing was added, so the loop condition could never become
            # false; stop instead of requesting the same empty page forever.
            break

        papers.extend(new_papers)
        offset += len(new_papers)  # advance by what was actually received

        # Offset-paginated responses announce the next page via "next",
        # token-paginated ones via "token". If neither is present there is
        # nothing left to fetch.
        token = data.get("token")
        if not token and "next" not in data:
            break
    return papers[:limit]

def save_papers_to_json(query, filename="papers.json"):
    """Fetch papers for ``query`` and write a trimmed-down copy to a JSON file.

    The records come from ``fetch_papers_by_keywords(query)``. Each one is
    reduced to the keys ``title``, ``abstract``, ``year``, ``url`` and
    ``citation_count``. A placeholder is substituted when a field is missing
    *or* present with the value ``None`` (``dict.get`` alone only covers the
    missing-key case, so JSON ``null`` values would otherwise be written
    through as ``null``).

    Args:
        query: Search string passed on to ``fetch_papers_by_keywords``.
        filename: Path of the JSON file to create or overwrite.

    Returns:
        None. When no papers were retrieved a message is printed and no file
        is written.
    """
    papers = fetch_papers_by_keywords(query)

    if not papers:
        print("No papers retrieved.")
        return

    def _field_or(paper, key, fallback):
        # Return paper[key], or `fallback` when the key is absent or None.
        # An explicit None check keeps legitimate falsy values such as a
        # citation count of 0 or an empty string.
        value = paper.get(key)
        return fallback if value is None else value

    # Format data to only include necessary fields
    formatted_papers = [
        {
            "title": _field_or(paper, "title", "No title available"),
            "abstract": _field_or(paper, "abstract", "No abstract available"),
            "year": _field_or(paper, "year", "Unknown"),
            "url": _field_or(paper, "url", "Unknown"),
            "citation_count": _field_or(paper, "citationCount", 0),
        }
        for paper in papers
    ]

    # Save to JSON file. json.dump escapes non-ASCII by default, so the
    # explicit encoding only pins down how those ASCII bytes are written.
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(formatted_papers, f, indent=4)

# Demo run: search one topic and dump the formatted results to disk.
query = "Supervised machine learning use in healthcare"
save_papers_to_json(query, filename="top_1000_papers.json")

# This message is printed unconditionally, i.e. even when the call above
# reported "No papers retrieved." and returned without writing a file.
print("Papers saved to top_1000_papers.json")

Papers saved to top_1000_papers.json


In [21]:
def fetch_papers_by_keywords(keywords, fields=None, limit=10):
    """Inspection variant: fetch up to ``limit`` papers and print a sample record per page.

    Rebinds the ``fetch_papers_by_keywords`` name defined in an earlier cell.
    Compared with that version, ``fields`` is optional (it is only sent when
    truthy), the default ``limit`` is 10, and the first record of every page
    is pretty-printed together with its key names.

    Pages are requested from ``SEMANTIC_SCHOLAR_BASE_URL`` (sending ``HEADERS``)
    until ``limit`` records are collected, a page is empty, the response
    advertises no further page (neither ``token`` nor ``next``), or a
    non-200 status code is received.

    Args:
        keywords: Search string forwarded as the ``query`` parameter.
        fields: Optional comma-separated field names for the ``fields`` parameter.
        limit: Maximum number of records to return.

    Returns:
        list: At most ``limit`` items taken from the ``"data"`` entry of each
        response, in the order they were received.
    """
    papers = []
    offset = 0
    token = None  # continuation token; only token-paginated (bulk) responses carry one
    while len(papers) < limit:
        params = {
            "query": keywords,
            "limit": min(limit - len(papers), 100),
            "offset": offset
        }
        if fields:
            params["fields"] = fields  # only add fields if specified
        if token:
            # Without the token a token-paginated endpoint would serve the
            # first page again and the result would fill up with duplicates.
            params["token"] = token

        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)
        if response.status_code != 200:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break

        data = response.json()
        new_papers = data.get("data") or []  # also tolerates an explicit null
        if not new_papers:
            # Nothing was added, so the loop condition could never become
            # false; stop instead of requesting the same empty page forever.
            break

        # Inspection aid: show the first record of this page (not every
        # record), followed by the field names it carries.
        print(json.dumps(new_papers[0], indent=4))
        print("Fields available:", list(new_papers[0].keys()))

        papers.extend(new_papers)
        offset += len(new_papers)  # advance by what was actually received

        # Offset-paginated responses announce the next page via "next",
        # token-paginated ones via "token". If neither is present there is
        # nothing left to fetch.
        token = data.get("token")
        if not token and "next" not in data:
            break
    return papers[:limit]

In [25]:
import requests
import json

# Rebinds the endpoint that an earlier cell set to ".../paper/search/bulk";
# from here on it is the non-bulk ".../paper/search" URL. Functions defined in
# earlier cells read this global at call time, so re-running them after this
# cell makes them hit this URL instead.
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
# Replaces the earlier User-Agent header dict with a single Accept header
# requesting "application/json".
HEADERS = {"Accept": "application/json"}

def fetch_and_save_papers(keywords, fields=None, limit=1000, output_file="papers.json"):
    """Page through the search endpoint and dump the raw records to a JSON file.

    Requests go to ``SEMANTIC_SCHOLAR_BASE_URL`` with ``HEADERS``, at most 100
    records per request, with the offset advanced by 100 after every
    successful page. Paging ends once ``limit`` records are held, an empty
    page arrives, or a non-200 status code is returned. Whatever has been
    collected at that point (possibly nothing) is written to ``output_file``.

    Args:
        keywords: Search string sent as the ``query`` parameter.
        fields: Optional comma-separated field list; sent only when truthy.
        limit: Upper bound on the number of records written.
        output_file: Path of the UTF-8 JSON file to create or overwrite.

    Returns:
        None. Progress and the final count are reported via ``print``.
    """
    papers = []
    next_offset = 0

    while len(papers) < limit:
        remaining = limit - len(papers)
        request_args = {
            "query": keywords,
            "limit": min(remaining, 100),  # never ask for more than 100 at once
            "offset": next_offset,
        }
        if fields:
            request_args["fields"] = fields

        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=request_args)

        # Guard clause: any non-200 answer ends paging with a message.
        if response.status_code != 200:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break

        batch = response.json().get("data", [])
        if not batch:
            print("No more papers found.")
            break

        papers.extend(batch)
        next_offset += 100
        print(f"Fetched {len(papers)} papers so far...")

    # Persist the (possibly partial) result, keeping non-ASCII text readable.
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(papers[:limit], handle, ensure_ascii=False, indent=4)

    print(f"Saved {len(papers[:limit])} papers to '{output_file}'.")



# Run the fetch-and-save helper for the healthcare query, requesting author
# and influential-citation fields in addition to the basic ones.
fetch_and_save_papers(
    "Supervised machine learning use in healthcare",
    "title,abstract,year,url,citationCount,influentialCitationCount,authors",
    limit=1000,
    output_file="papers.json",
)

Fetched 100 papers so far...
Fetched 200 papers so far...
Fetched 300 papers so far...
Fetched 400 papers so far...
Fetched 500 papers so far...
Fetched 600 papers so far...
Fetched 700 papers so far...
Fetched 800 papers so far...
Fetched 900 papers so far...
Fetched 1000 papers so far...
Saved 1000 papers to 'papers.json'.
