# install packages

In [1]:
!pip install pyyaml==5.3.1
!pip install gen3

Collecting pyyaml==5.3.1
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.4/269.4 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pyyaml
  Building wheel for pyyaml (setup.py) ... [?25ldone
[?25h  Created wheel for pyyaml: filename=PyYAML-5.3.1-cp310-cp310-linux_x86_64.whl size=44635 sha256=1692dbaa7d1da4560f8fc7e844bd8f6e456cfec44561058fc7dd7ab33aa26281
  Stored in directory: /home/jovyan/.cache/pip/wheels/0b/a9/6a/d0a6981a8dbb698845178818642f72ce179f14336908c7df01
Successfully built pyyaml
Installing collected packages: pyyaml
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:
      Successfully uninstalled PyYAML-6.0
Successfully installed pyyaml-5.3.1
Collecting gen3
  Downloading gen3-4.21.0-py3-none-any.whl (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

# import packages

In [2]:
import gen3
import os
import time

# define individual process code

In [3]:
%%writefile download_data.py

import gen3
import os

output_dir = 'data'

guids = ['dg.MD1R/ea669b5e-ae51-40ba-b375-ed23a9cd1855',
         'dg.MD1R/a745ed98-0cb9-4537-826b-13b2e354e8bb',
         'dg.MD1R/e604979a-c71b-4ec6-b8a0-959837b86384',
         'dg.MD1R/b5cee98d-46ff-4438-aa00-90727a383340',
         'dg.MD1R/8a5a5579-7925-432d-a614-3ed208f1c182',
         'dg.MD1R/33034812-47f3-4c0e-b60b-fa7a2a04ecda',
         'dg.MD1R/5ca987c5-c660-4785-a67d-a3424cc8ec6e',
         'dg.MD1R/44148117-1858-49ef-b30f-d239abfaff80',
         'dg.MD1R/9ea205e8-a774-4318-a323-95eadda9bc5c',
         'dg.MD1R/09ece36f-a0fa-48e8-8fc2-62110eaae570']

midrc_creds_file = "/home/jovyan/pd/midrc_credentials.json"
endpoint = 'data.midrc.org'

for guid in guids:
    print('pulling guid {}'.format(guid))
    os.system("gen3 --auth {} --endpoint {} drs-pull object {}".format(midrc_creds_file, endpoint, guid))

Overwriting download_data.py


In [7]:
%%writefile main.nf
#!/usr/bin/env nextflow

// First task of the pipeline: runs download_data.py from the pipeline
// directory and emits (1) the captured stdout and (2) every *.dcm file found
// in sub-directories of the task directory.
process DownloadData {

    tag 'initial_task'
    label 'download_data'

    output:
    stdout emit: download_data_log
    path '**/*.dcm', emit: dicom_files

    script:
    """
    python3 ${baseDir}/download_data.py
    """
}

// Converts each staged DICOM file to an 8-bit PNG written next to it in the
// task directory. Emits the captured stdout and the resulting *.png files.
process dicom_to_png {

    label 'dcm2png'

    input:
    path(dicom_files)

    output:
    stdout emit: dicom_to_png_log
    path('*.png'), emit: png_files

    script:
    """
    #!/usr/bin/env python3

    import os

    import numpy as np
    import pydicom
    from PIL import Image

    # A path input holding several files is rendered as space-separated file
    # names, so split it and convert the files one by one. A single file
    # simply yields a one-element list.
    dicom_inputs = '$dicom_files'.split()

    def convert(dicom_input):
        '''Read one DICOM file and save its pixel data as <name>.png.'''
        png_out = os.path.splitext(os.path.basename(dicom_input))[0] + '.png'
        dicom_dataset = pydicom.dcmread(dicom_input)
        # Clip negative values to 0 before rescaling to the 0-255 range.
        pixels = np.maximum(dicom_dataset.pixel_array.astype(float), 0)
        peak = pixels.max()
        if peak > 0:
            scaled_pixels = pixels / peak * 255.0
        else:
            # No positive pixel: avoid dividing by zero and write a black image.
            scaled_pixels = np.zeros_like(pixels)
        Image.fromarray(np.uint8(scaled_pixels)).save(png_out)

    if __name__ == '__main__':
        for dicom_input in dicom_inputs:
            convert(dicom_input)
    """
}

// Collects DICOM header metadata for every file staged in the task directory
// into dicom-metadata.csv. Emits the captured stdout and the CSV file.
process extract_metadata {

    label 'ext_metadata'

    input:
    path(dicom_files)

    output:
    stdout emit: extract_metadata_log
    path('*.csv'), emit: csv_files

    script:
    """
    #!/usr/bin/env python3

    import pandas as pd
    from dicom_csv import join_tree

    # Names of the staged inputs. join_tree scans the whole task directory,
    # so this value is passed along but not used to select files.
    dicom_input = '$dicom_files'
    metadata_csv = 'dicom-metadata.csv'

    def main(dicom_input, metadata_csv):
        '''Write de-duplicated metadata rows that have PixelRepresentation set.'''
        metadata_df = join_tree('.', verbose=2)
        has_pixel_representation = metadata_df['PixelRepresentation'].notnull()
        # Chain drop_duplicates() instead of inplace=True: the filtered frame
        # is a derived copy, and mutating it in place can raise
        # SettingWithCopyWarning.
        dicom_metadata_df = metadata_df.loc[has_pixel_representation].drop_duplicates()
        return dicom_metadata_df.to_csv(metadata_csv)

    if __name__ == '__main__':
        main(dicom_input, metadata_csv)
    """

}
    
// Nextflow workflow entry point: download once, then hand the same DICOM
// channel to both downstream processes.
workflow {
    DownloadData()
    downloaded_dicoms = DownloadData.out.dicom_files

    dicom_to_png(downloaded_dicoms)
    extract_metadata(downloaded_dicoms)
}



Overwriting main.nf


In [8]:
%%writefile nextflow.config

process {
    // The download step runs with the local executor.
    withLabel: download_data {
        executor = 'local'
    }
    // The two DICOM-processing labels share identical AWS Batch settings, so
    // a single regular-expression label selector covers both.
    withLabel: 'dcm2png|ext_metadata' {
        executor = 'awsbatch'
        queue = 'placeholder'
        container = 'public.ecr.aws/l5b8a5z6/nextflow-approved:batch_poc2'
    }
}

// AWS settings: region, plus the AWS CLI path and job role for Batch jobs.
// NOTE(review): 'placeholder' presumably has to be replaced with a real role
// before running; confirm.
aws.region = 'us-east-1'
aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws'
aws.batch.jobRole = 'placeholder'

// Pipeline work directory.
// NOTE(review): 'placeholder' presumably needs a real location; confirm.
workDir = 'placeholder'

// Enable Docker execution for processes.
docker {
    enabled = true
}


Overwriting nextflow.config


In [1]:
# Launch the pipeline in main.nf from the notebook's working directory,
# passing -dsl2 to select Nextflow DSL2 syntax.
!nextflow run main.nf -dsl2