In [1]:
import json
import urllib.request
from pathlib import Path
from urllib.parse import urlparse

import requests
from dotenv import dotenv_values

In [2]:
# Settings are read from a .env file:
#   S2_API_KEY - API key sent with requests for dataset download links
#   DATA_DIR   - directory to download data to (e.g. an external hard drive)
config = dotenv_values()

# Fail here with a readable message instead of a bare KeyError (or a path that
# starts with the literal text "None") in a later cell.
missing_settings = [key for key in ('S2_API_KEY', 'DATA_DIR') if config.get(key) is None]
if missing_settings:
    raise KeyError(f"Missing required .env setting(s): {', '.join(missing_settings)}")

### Get info on the various datasets

In [4]:
# Fetch the metadata of the latest release (the list of datasets it contains).
# requests has no default timeout, so set one to avoid hanging indefinitely.
res_datasets = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest', timeout=30)
# Surface HTTP errors here rather than as a confusing KeyError when the body is parsed.
res_datasets.raise_for_status()

In [5]:
# Show every dataset of the release: name, description, then a divider line.
divider = '----------------------'
release_info = res_datasets.json()
for dataset in release_info['datasets']:
    print(dataset['name'], dataset['description'], divider, sep='\n')

abstracts
Paper abstract text, where available.
100M records in 30 1.8GB files.
----------------------
authors
The core attributes of an author (name, affiliation, paper count, etc.). Authors have an "authorId" field, which can be joined to the "authorId" field of the members of a paper's "authors" field.
75M records in 30 100MB files.
----------------------
citations
Instances where the bibliography of one paper (the "citingPaper") mentions another paper (the "citedPaper"), where both papers are identified by the "paperId" field. Citations have attributes of their own, (influential classification, intent classification, and citation context).
2.4B records in 30 8.5GB files.
----------------------
embeddings
A dense vector embedding representing the contents of the paper.
120M records in 30 28GB files.
----------------------
paper-ids
Mapping from sha-based ID to paper corpus ID.
450M records in 30 500MB files
----------------------
papers
The core attributes of a paper (title, authors

### Get download links for papers

In [4]:
# Request headers carrying the API key loaded from the .env file.
headers = {'x-api-key': config['S2_API_KEY']}

In [7]:
# Ask the API for the download links of the "papers" dataset.
# requests has no default timeout, so set one to avoid hanging indefinitely.
res_papers = requests.get(
    'https://api.semanticscholar.org/datasets/v1/release/latest/dataset/papers',
    headers=headers,
    timeout=30,
)
# An error response (e.g. a rejected API key) should fail here, not as a
# KeyError on 'files' in the download cell.
res_papers.raise_for_status()

### Download all papers files

In [10]:
# Download every file of the papers dataset into <DATA_DIR>/papers/.
# Building the path with pathlib works whether or not DATA_DIR ends with a separator.
papers_dir = Path(config["DATA_DIR"]) / 'papers'
# urlretrieve does not create missing directories.
papers_dir.mkdir(parents=True, exist_ok=True)

for i, file in enumerate(res_papers.json()['files']):
    # Keep only the last path segment of the link; urlparse leaves any query string out of .path.
    filename = urlparse(file).path.split('/')[-1]
    target = papers_dir / f'{i}-{filename}'
    if not target.exists():
        # Write to a temporary name and rename at the end, so an interrupted
        # transfer is never mistaken for a finished file on the next run.
        partial = target.with_name(target.name + '.part')
        urllib.request.urlretrieve(file, filename=str(partial))
        partial.replace(target)
    # Printed for skipped files too: the line means "this file is present locally".
    print(f'{i}-{filename}')

0-20230804_070310_00012_kr7cr_07dff03f-3fdb-40f4-b729-3fc954d82738.gz
1-20230804_070310_00012_kr7cr_10932806-30dc-445e-a2e7-ac7cfbc23dd4.gz
2-20230804_070310_00012_kr7cr_12114a8a-03a7-4c10-a1c4-42b6ae1c5839.gz
3-20230804_070310_00012_kr7cr_12def696-4de5-438e-9784-fa83fee96b8d.gz
4-20230804_070310_00012_kr7cr_14cb4a81-b587-41f7-b656-c43846057864.gz
5-20230804_070310_00012_kr7cr_1856420b-7d30-47e3-bfd5-51e552680403.gz
6-20230804_070310_00012_kr7cr_1dc79604-f7d8-4e5b-b553-ff3ccb79aea9.gz
7-20230804_070310_00012_kr7cr_1ea12c4e-0ce8-4d7f-a27a-0d16609d9b44.gz
8-20230804_070310_00012_kr7cr_20a44ff7-fb15-4daf-84bc-06cc1c412ff6.gz
9-20230804_070310_00012_kr7cr_3323dd31-9e7b-448e-8592-0ba78e793cea.gz
10-20230804_070310_00012_kr7cr_3919a642-52a1-4139-a501-00fc7555dcc6.gz
11-20230804_070310_00012_kr7cr_3f0e7bce-6e75-4658-b52c-dc51a4652934.gz
12-20230804_070310_00012_kr7cr_4352662c-54fe-48d5-82e2-868be584ea4f.gz
13-20230804_070310_00012_kr7cr_43b9221e-b83a-4dc9-87fd-aa7f238c0c42.gz
14-20230804_0703

### Get download links for citations

In [13]:
# Ask the API for the download links of the "citations" dataset.
# requests has no default timeout, so set one to avoid hanging indefinitely.
res_citations = requests.get(
    'https://api.semanticscholar.org/datasets/v1/release/latest/dataset/citations',
    headers=headers,
    timeout=30,
)
# An error response (e.g. a rejected API key) should fail here, not as a
# KeyError on 'files' in the download cell.
res_citations.raise_for_status()

In [15]:
## Download all citation files into <DATA_DIR>/citations/.
# Building the path with pathlib works whether or not DATA_DIR ends with a separator.
citations_dir = Path(config["DATA_DIR"]) / 'citations'
# urlretrieve does not create missing directories.
citations_dir.mkdir(parents=True, exist_ok=True)

for i, file in enumerate(res_citations.json()['files']):
    # Keep only the last path segment of the link; urlparse leaves any query string out of .path.
    filename = urlparse(file).path.split('/')[-1]
    target = citations_dir / f'{i}-{filename}'
    if not target.exists():
        # Write to a temporary name and rename at the end, so an interrupted
        # transfer is never mistaken for a finished file on the next run.
        partial = target.with_name(target.name + '.part')
        urllib.request.urlretrieve(file, filename=str(partial))
        partial.replace(target)
    # Printed for skipped files too: the line means "this file is present locally".
    print(f'{i}-{filename}')

0-20230804_071510_00042_uwzpg_07cef557-7075-4065-a2e4-3b44e12cbcdd.gz
1-20230804_071510_00042_uwzpg_2ca7e89b-7a77-4a44-8389-fb30a235c3d0.gz
2-20230804_071510_00042_uwzpg_359771b2-942e-4fd9-ab89-153ca89d115e.gz
3-20230804_071510_00042_uwzpg_49037c8e-2ff3-4aab-bf3c-36903681df98.gz
4-20230804_071510_00042_uwzpg_52f12080-b201-47c9-8188-b4d2103c0923.gz
5-20230804_071510_00042_uwzpg_54a20633-91e8-4721-9bac-0bd89600601f.gz
6-20230804_071510_00042_uwzpg_62501a92-1154-4723-b8d1-3cc0f86e8e41.gz
7-20230804_071510_00042_uwzpg_6b3eab58-5757-413d-8ec0-5184a975beb5.gz
8-20230804_071510_00042_uwzpg_6ccee109-be58-4b33-9e54-9bb5413074e9.gz
9-20230804_071510_00042_uwzpg_80fbdf77-0ad4-406e-bde3-79204bcb3d30.gz
10-20230804_071510_00042_uwzpg_8429dfd4-f8d1-48ed-a8e5-b35b0b4c319d.gz
11-20230804_071510_00042_uwzpg_857d7e25-a18e-4880-a51b-391f820f8e29.gz
12-20230804_071510_00042_uwzpg_8cdac8fd-518c-4918-9f74-87e17e794976.gz
13-20230804_071510_00042_uwzpg_91a79e35-4463-4fe1-9694-f68031287be2.gz
14-20230804_0715