In [None]:
# Follow instructions here:
# https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/overview/siim-cloud-healthcare-api-tutorial

# 1. Install the Google Cloud SDK (it provides the `gcloud` command used in step 3).
# 2. Join the competition group.
# 3. Authenticate from a shell using:
#    gcloud auth application-default login

# After that, the script below should work.
# Alternatively, the files are also available online, but that copy might be outdated:
# https://drive.google.com/drive/folders/1vNLfdw-X3Z2-n1j-KPGLbPQ5ciDydsiX?usp=drive_open

In [2]:
!pip install google-auth

Collecting google-auth
[?25l  Downloading https://files.pythonhosted.org/packages/c5/9b/ed0516cc1f7609fb0217e3057ff4f0f9f3e3ce79a369c6af4a6c5ca25664/google_auth-1.6.3-py2.py3-none-any.whl (73kB)
[K    100% |████████████████████████████████| 81kB 3.7MB/s ta 0:00:01
[?25hCollecting pyasn1-modules>=0.2.1 (from google-auth)
[?25l  Downloading https://files.pythonhosted.org/packages/91/f0/b03e00ce9fddf4827c42df1c3ce10c74eadebfb706231e8d6d1c356a4062/pyasn1_modules-0.2.5-py2.py3-none-any.whl (74kB)
[K    100% |████████████████████████████████| 81kB 8.7MB/s eta 0:00:01    96% |██████████████████████████████▊ | 71kB 14.4MB/s eta 0:00:01
[?25hCollecting rsa>=3.1.4 (from google-auth)
  Downloading https://files.pythonhosted.org/packages/02/e5/38518af393f7c214357079ce67a317307936896e961e35450b70fad2a9cf/rsa-4.0-py2.py3-none-any.whl
Collecting cachetools>=2.0.0 (from google-auth)
  Downloading https://files.pythonhosted.org/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d

In [3]:
!pip install retrying



In [1]:
"""Script to download all instances in a DICOM Store."""
import os
import posixpath
from concurrent import futures
from retrying import retry
import google.auth
from google.auth.transport.requests import AuthorizedSession

# Base URL of the Cloud Healthcare (CHC) API; the path pins API version v1beta1.
CHC_API_URL = 'https://healthcare.googleapis.com/v1beta1'
# Project, location and dataset path segments that download_all_instances
# joins onto CHC_API_URL to build the dicomWeb URL of a DICOM store.
PROJECT_ID = 'kaggle-siim-healthcare'
REGION = 'us-central1'
DATASET_ID = 'siim-pneumothorax'
# IDs of the two DICOM stores this script downloads; each ID is also used as
# the top-level local folder name for the saved .dcm files.
TRAIN_DICOM_STORE_ID = 'dicom-images-train'
TEST_DICOM_STORE_ID = 'dicom-images-test'


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000,
       stop_max_attempt_number=5)
def download_instance(dicom_web_url, dicom_store_id, study_uid, series_uid,
                      instance_uid, credentials):
    """Downloads a DICOM instance and saves it under the current folder.

    The instance is written to
    ``<dicom_store_id>/<study_uid>/<series_uid>/<instance_uid>.dcm`` relative
    to the current working directory; an existing file is overwritten.

    Args:
        dicom_web_url: Base dicomWeb URL of the DICOM store.
        dicom_store_id: DICOM store ID, used as the top-level output folder.
        study_uid: Study Instance UID of the instance.
        series_uid: Series Instance UID of the instance.
        instance_uid: SOP Instance UID of the instance.
        credentials: Google auth credentials used to authorize the request.

    Raises:
        Exception: the error of the last attempt (HTTP error, connection
            error, OSError, ...) once all retry attempts are exhausted.
    """
    instance_url = posixpath.join(dicom_web_url, 'studies', study_uid, 'series',
                                  series_uid, 'instances', instance_uid)
    # Close the session when done so each call does not leak its connections.
    with AuthorizedSession(credentials) as authed_session:
        response = authed_session.get(
            instance_url,
            headers={'Accept': 'application/dicom; transfer-syntax=*'})
        # Fail on HTTP errors so the request is retried instead of an error
        # body being silently written to disk as a .dcm file.
        response.raise_for_status()
        payload = response.content
    file_path = posixpath.join(dicom_store_id, study_uid, series_uid,
                               instance_uid)
    filename = '%s.dcm' % file_path
    # exist_ok: the directory may already exist (earlier attempt, another
    # instance of the same series, or a concurrent worker thread).
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(payload)


def download_all_instances(dicom_store_id, credentials):
    """Fetches every DICOM instance listed in the given DICOM store.

    The store's instance list is requested once (with ``limit=15000`` as a
    query parameter); each listed instance is then handed to
    ``download_instance`` on a thread pool. Progress is printed every 100
    successful downloads and when the last one succeeds; a failed download is
    reported with its study UID and does not abort the remaining ones.

    Args:
        dicom_store_id: ID of the DICOM store to download.
        credentials: Google auth credentials used to authorize the requests.

    Returns:
        None. If the listing request does not return HTTP 200, the response
        text is printed and nothing is downloaded.
    """
    # DICOM tag numbers, in order: study, series and SOP instance UID.
    uid_tags = ('0020000D', '0020000E', '00080018')

    store_url = posixpath.join(CHC_API_URL, 'projects', PROJECT_ID,
                               'locations', REGION, 'datasets', DATASET_ID,
                               'dicomStores', dicom_store_id, 'dicomWeb')
    session = AuthorizedSession(credentials)
    listing = session.get(posixpath.join(store_url, 'instances'),
                          params={'limit': '15000'})
    if listing.status_code != 200:
        print(listing.text)
        return

    instances = listing.json()
    total = len(instances)

    with futures.ThreadPoolExecutor() as pool:
        # Remember which study each future belongs to for error reporting.
        study_uid_by_future = {}
        for entry in instances:
            study_uid, series_uid, instance_uid = (
                entry[tag]['Value'][0] for tag in uid_tags)
            pending = pool.submit(download_instance, store_url, dicom_store_id,
                                  study_uid, series_uid, instance_uid,
                                  credentials)
            study_uid_by_future[pending] = study_uid

        done = 0
        for finished in futures.as_completed(study_uid_by_future):
            try:
                finished.result()
                done += 1
                if done == total or done % 100 == 0:
                    print('Processed instance %d out of %d' %
                          (done, total))
            except Exception as err:
                print('Failed to download a study. UID: %s \n exception: %s' %
                      (study_uid_by_future[finished], err))



# Obtain the default Google credentials (set up beforehand with
# `gcloud auth application-default login`) and download both stores in turn.
credentials, _ = google.auth.default()
for store_id in (TRAIN_DICOM_STORE_ID, TEST_DICOM_STORE_ID):
    print('Downloading all instances in %s DICOM store' % store_id)
    download_all_instances(store_id, credentials)



Downloading all instances in dicom-images-train DICOM store
Processed instance 100 out of 10675
Processed instance 200 out of 10675
Processed instance 300 out of 10675
Processed instance 400 out of 10675
Processed instance 500 out of 10675
Processed instance 600 out of 10675
Processed instance 700 out of 10675
Processed instance 800 out of 10675
Processed instance 900 out of 10675
Processed instance 1000 out of 10675
Processed instance 1100 out of 10675
Processed instance 1200 out of 10675
Processed instance 1300 out of 10675
Processed instance 1400 out of 10675
Processed instance 1500 out of 10675
Processed instance 1600 out of 10675
Processed instance 1700 out of 10675
Processed instance 1800 out of 10675
Processed instance 1900 out of 10675
Processed instance 2000 out of 10675
Processed instance 2100 out of 10675
Processed instance 2200 out of 10675
Processed instance 2300 out of 10675
Processed instance 2400 out of 10675
Processed instance 2500 out of 10675
Processed instance 2600 