In [2]:
"""CT_Lymph_Nodes dataset."""

import tensorflow_datasets.public_api as tfds
import tensorflow as tf
from tensorflow_datasets.core import utils
import numpy as np
import os
import io
import pydicom
import nibabel 

# BibTeX citation
_CITATION = """
\@misc{CT_Lymph_Nodes_Citation,
  doi = {10.1007/978-3-319-10404-1_65},
  url = {https://wiki.cancerimagingarchive.net/display/Public/CT+Lymph+Nodes#12d41e510fe547b59000cd90afb8dbf2},
  author = {Roth, Holger R., Lu, Le, Seff, Ari, Cherry, Kevin M., Hoffman, Joanne, Wang, Shijun, Liu, Jiamin, Turkbey, Evrim and Summers, Ronald M.},
  title = {A New 2.5D Representation for Lymph Node Detection Using Random Sets of Deep Convolutional Neural Network Observations},
  publisher = {Springer International Publishing},
  year = {2014},
}
@article{TCIA_Citation,
  author = {
    K. Clark and B. Vendt and K. Smith and J. Freymann and J. Kirby and
    P. Koppel and S. Moore and S. Phillips and D. Maffitt and M. Pringle and
    L. Tarbox and F. Prior
  },
  title = {{The Cancer Imaging Archive (TCIA): Maintaining and Operating a
  Public Information Repository}},
  journal = {Journal of Digital Imaging},
  volume = {26},
  month = {Decembear},
  year = {2013},
  pages = {1045-1057},
}
"""

# Data Description
_DESCRIPTION = """
This dataset contains 110,013 Computed Tomography (CT) images of the mediastinum 
and abdomen in which lymph node positions are marked by radiologists at the 
National Institutes of Health, Clinical Center. These 10,013 images consist of 
388 mediastinal lymph nodes that come from 90 patients and a total of 595 
abdominal lymph nodes in 86 patients. All images are of 512*512 pixel arrays. 
"""



class CT_Lymph_Nodes(tfds.core.GeneratorBasedBuilder):
  """This is a dataset containing CT images of lymph nodes from NIH"""

  #Set up version.
  VERSION = tfds.core.Version('1.0.0')

  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
  You can download the images from
  https://console.cloud.google.com/storage/browser/bme590/jingjing
  Please put all files in manual_dir.
  """
  

  def _info(self):
    # Specifies the tfds.core.DatasetInfo object
    return tfds.core.DatasetInfo(
        builder=self,
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # tfds.features.FeatureConnectors
        features=tfds.features.FeaturesDict({
        
        #The CT image
        'image' : tfds.features.Tensor(shape=(512,512),dtype=tf.int16),
        ## The mask
        'mask' : tfds.features.Tensor(shape=(512,512),dtype = tf.int16),
        ## Patient id
        'id'   : tf.string,
        ## Patient Age
        'age'  : tf.string,
        ## Patient Sex
        'sex'  : tf.string,
        ## Body Part Examined
        'body_part'  : tf.string
        
            
        }),
        supervised_keys=('image','mask'),
        # Homepage of the dataset for documentation
        homepage='https://dataset-homepage/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""
    
    if not tf.io.gfile.exists(dl_manager.manual_dir):
        msg = "You must download the dataset files manually and place them in: "
        msg += dl_manager.manual_dir
        raise AssertionError(msg)
        
    # There is no predefined train/val/test split for this dataset
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                "filepath": 
            }
        ),  
    ]

  def _generate_examples(self,filepath):
    
    """Yields examples for the CT lymph nodes dataset
    Args:
        filepath: path to the CT lymph nodes files
    Yields:
        Dictionaries with images and masks
    
    """
    ## Each patient has his own folder of masks and images, and the patient id is the same in masks and images
    patients = tf.io.gfile.listdir(os.path.join(filepath,'MED_ABD_LYMPH_MASKS'))
    patients.sort()
    

    ## iterate over all masks folders
    mask_lst = []
    for patient_id in patients:
        try:
            mask = tf.io.gfile.listdir(os.path.join(filepath,'MED_ABD_LYMPH_MASKS',patient_id))
            if mask[0].endswith('.nii.gz'):
                file_name = os.path.join(filepath,'MED_ABD_LYMPH_MASKS',patient_id,mask[0])
                mask_lst.append((patient_id,nibabel.load(file_name)))
        except:
            pass

    ## iterate over all images folders
    for patient_id in patients:
        try:
            mask_file = [item for item in mask_lst if item[0] == patient_id ][0][1]
            ## files are stored in sub-directories, so go into the sub-directory where stores the images
            first = tf.io.gfile.listdir(os.path.join(filepath,'MED_ABD_LYMPH_IMAGES',patient_id))[0]
            second = tf.io.gfile.listdir(os.path.join(filepath,'MED_ABD_LYMPH_IMAGES',patient_id,first))[0]
            third = tf.io.gfile.listdir(os.path.join(filepath,'MED_ABD_LYMPH_IMAGES',patient_id,first,second))[0]
            file_name = os.path.join(filepath,'MED_ABD_LYMPH_IMAGES',patient_id,first,second,third)
            if file_name.endswith('dcm'):
                image_file = pydicom.read_file(file_name)
                yield file_name,
                {
                    'image':image_file.pixel_array,
                    'mask' : mask_file.get_fdata(),
                    'id' : image_file.PatientID,
                    'age' : image_file.PatientAge,
                    'sex' :image_file.PatientSex,
                    'body_part': image_file.BodyPartExamined

                }
        except:
            pass

    


SyntaxError: invalid syntax (<ipython-input-2-30cc1c8dbd1e>, line 106)

In [4]:
!pwd

/Users/Jingjing/project 1/datasets/tensorflow_datasets


In [1]:

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow_datasets.core import utils
import numpy as np
import os
import io
import pydicom
import nibabel 
tfds.load('',data_dir = None)

ValueError: Parsing builder name string  failed.
The builder name string must be of the following format:
  dataset_name[/config_name][:version][/kwargs]

  Where:

    * dataset_name and config_name are string following python variable naming.
    * version is of the form x.y.z where {x,y,z} can be any digit or *.
    * kwargs is a comma list separated of arguments and values to pass to
      builder.

  Examples:
    my_dataset
    my_dataset:1.2.*
    my_dataset/config1
    my_dataset/config1:1.*.*
    my_dataset/config1/arg1=val1,arg2=val2
    my_dataset/config1:1.2.3/right=True,foo=bar,rate=1.2


In [3]:
tfds.load('CT_Lymph_Nodes',download_and_prepare_kwargs = {'download_config' :
    tfds.download.DownloadConfig(manual_dir = None)})

DatasetNotFoundError: Dataset Failed to construct dataset ct__lymph__nodesDataset ct__lymph__nodes not found. Available datasets:
	- abstract_reasoning
	- aeslc
	- aflw2k3d
	- ai2_arc
	- amazon_us_reviews
	- anli
	- arc
	- bair_robot_pushing_small
	- beans
	- big_patent
	- bigearthnet
	- billsum
	- binarized_mnist
	- binary_alpha_digits
	- blimp
	- c4
	- caltech101
	- caltech_birds2010
	- caltech_birds2011
	- cars196
	- cassava
	- cats_vs_dogs
	- celeb_a
	- celeb_a_hq
	- cfq
	- chexpert
	- cifar10
	- cifar100
	- cifar10_1
	- cifar10_corrupted
	- citrus_leaves
	- cityscapes
	- civil_comments
	- clevr
	- clinc_oos
	- cmaterdb
	- cnn_dailymail
	- coco
	- coil100
	- colorectal_histology
	- colorectal_histology_large
	- common_voice
	- cos_e
	- cosmos_qa
	- covid19sum
	- crema_d
	- curated_breast_imaging_ddsm
	- cycle_gan
	- deep_weeds
	- definite_pronoun_resolution
	- dementiabank
	- diabetic_retinopathy_detection
	- div2k
	- dmlab
	- downsampled_imagenet
	- dsprites
	- dtd
	- duke_ultrasound
	- emnist
	- eraser_multi_rc
	- esnli
	- eurosat
	- fashion_mnist
	- flic
	- flores
	- food101
	- forest_fires
	- fuss
	- gap
	- geirhos_conflict_stimuli
	- german_credit_numeric
	- gigaword
	- glue
	- groove
	- higgs
	- horses_or_humans
	- i_naturalist2017
	- imagenet2012
	- imagenet2012_corrupted
	- imagenet2012_real
	- imagenet2012_subset
	- imagenet_a
	- imagenet_resized
	- imagenet_v2
	- imagenette
	- imagewang
	- imdb_reviews
	- irc_disentanglement
	- iris
	- kitti
	- kmnist
	- lfw
	- librispeech
	- librispeech_lm
	- libritts
	- ljspeech
	- lm1b
	- lost_and_found
	- lsun
	- malaria
	- math_dataset
	- mctaco
	- mnist
	- mnist_corrupted
	- movie_lens
	- movie_rationales
	- moving_mnist
	- multi_news
	- multi_nli
	- multi_nli_mismatch
	- my_dataset
	- natural_questions
	- newsroom
	- nsynth
	- nyu_depth_v2
	- omniglot
	- open_images_challenge2019_detection
	- open_images_v4
	- openbookqa
	- opinion_abstracts
	- opinosis
	- opus
	- oxford_flowers102
	- oxford_iiit_pet
	- para_crawl
	- patch_camelyon
	- pet_finder
	- pg19
	- places365_small
	- plant_leaves
	- plant_village
	- plantae_k
	- qa4mre
	- quickdraw_bitmap
	- reddit
	- reddit_disentanglement
	- reddit_tifu
	- resisc45
	- robonet
	- rock_paper_scissors
	- rock_you
	- samsum
	- savee
	- scan
	- scene_parse150
	- scicite
	- scientific_papers
	- shapes3d
	- smallnorb
	- snli
	- so2sat
	- speech_commands
	- squad
	- stanford_dogs
	- stanford_online_products
	- starcraft_video
	- stl10
	- sun397
	- super_glue
	- svhn_cropped
	- ted_hrlr_translate
	- ted_multi_translate
	- tedlium
	- tf_flowers
	- the300w_lp
	- tiny_shakespeare
	- titanic
	- trivia_qa
	- uc_merced
	- ucf101
	- vctk
	- vgg_face2
	- visual_domain_decathlon
	- voc
	- voxceleb
	- voxforge
	- waymo_open_dataset
	- web_questions
	- wider_face
	- wiki40b
	- wikihow
	- wikipedia
	- wikipedia_toxicity_subtypes
	- winogrande
	- wmt14_translate
	- wmt15_translate
	- wmt16_translate
	- wmt17_translate
	- wmt18_translate
	- wmt19_translate
	- wmt_t2t_translate
	- wmt_translate
	- wordnet
	- xnli
	- xsum
	- yelp_polarity_reviews
Check that:
    - if dataset was added recently, it may only be available
      in `tfds-nightly`
    - the dataset name is spelled correctly
    - dataset class defines all base class abstract methods
    - dataset class is not in development, i.e. if IN_DEVELOPMENT=True
    - the module defining the dataset class is imported
 not found. Available datasets:
	- abstract_reasoning
	- aeslc
	- aflw2k3d
	- ai2_arc
	- amazon_us_reviews
	- anli
	- arc
	- bair_robot_pushing_small
	- beans
	- big_patent
	- bigearthnet
	- billsum
	- binarized_mnist
	- binary_alpha_digits
	- blimp
	- c4
	- caltech101
	- caltech_birds2010
	- caltech_birds2011
	- cars196
	- cassava
	- cats_vs_dogs
	- celeb_a
	- celeb_a_hq
	- cfq
	- chexpert
	- cifar10
	- cifar100
	- cifar10_1
	- cifar10_corrupted
	- citrus_leaves
	- cityscapes
	- civil_comments
	- clevr
	- clinc_oos
	- cmaterdb
	- cnn_dailymail
	- coco
	- coil100
	- colorectal_histology
	- colorectal_histology_large
	- common_voice
	- cos_e
	- cosmos_qa
	- covid19sum
	- crema_d
	- curated_breast_imaging_ddsm
	- cycle_gan
	- deep_weeds
	- definite_pronoun_resolution
	- dementiabank
	- diabetic_retinopathy_detection
	- div2k
	- dmlab
	- downsampled_imagenet
	- dsprites
	- dtd
	- duke_ultrasound
	- emnist
	- eraser_multi_rc
	- esnli
	- eurosat
	- fashion_mnist
	- flic
	- flores
	- food101
	- forest_fires
	- fuss
	- gap
	- geirhos_conflict_stimuli
	- german_credit_numeric
	- gigaword
	- glue
	- groove
	- higgs
	- horses_or_humans
	- i_naturalist2017
	- imagenet2012
	- imagenet2012_corrupted
	- imagenet2012_real
	- imagenet2012_subset
	- imagenet_a
	- imagenet_resized
	- imagenet_v2
	- imagenette
	- imagewang
	- imdb_reviews
	- irc_disentanglement
	- iris
	- kitti
	- kmnist
	- lfw
	- librispeech
	- librispeech_lm
	- libritts
	- ljspeech
	- lm1b
	- lost_and_found
	- lsun
	- malaria
	- math_dataset
	- mctaco
	- mnist
	- mnist_corrupted
	- movie_lens
	- movie_rationales
	- moving_mnist
	- multi_news
	- multi_nli
	- multi_nli_mismatch
	- my_dataset
	- natural_questions
	- newsroom
	- nsynth
	- nyu_depth_v2
	- omniglot
	- open_images_challenge2019_detection
	- open_images_v4
	- openbookqa
	- opinion_abstracts
	- opinosis
	- opus
	- oxford_flowers102
	- oxford_iiit_pet
	- para_crawl
	- patch_camelyon
	- pet_finder
	- pg19
	- places365_small
	- plant_leaves
	- plant_village
	- plantae_k
	- qa4mre
	- quickdraw_bitmap
	- reddit
	- reddit_disentanglement
	- reddit_tifu
	- resisc45
	- robonet
	- rock_paper_scissors
	- rock_you
	- samsum
	- savee
	- scan
	- scene_parse150
	- scicite
	- scientific_papers
	- shapes3d
	- smallnorb
	- snli
	- so2sat
	- speech_commands
	- squad
	- stanford_dogs
	- stanford_online_products
	- starcraft_video
	- stl10
	- sun397
	- super_glue
	- svhn_cropped
	- ted_hrlr_translate
	- ted_multi_translate
	- tedlium
	- tf_flowers
	- the300w_lp
	- tiny_shakespeare
	- titanic
	- trivia_qa
	- uc_merced
	- ucf101
	- vctk
	- vgg_face2
	- visual_domain_decathlon
	- voc
	- voxceleb
	- voxforge
	- waymo_open_dataset
	- web_questions
	- wider_face
	- wiki40b
	- wikihow
	- wikipedia
	- wikipedia_toxicity_subtypes
	- winogrande
	- wmt14_translate
	- wmt15_translate
	- wmt16_translate
	- wmt17_translate
	- wmt18_translate
	- wmt19_translate
	- wmt_t2t_translate
	- wmt_translate
	- wordnet
	- xnli
	- xsum
	- yelp_polarity_reviews
Check that:
    - if dataset was added recently, it may only be available
      in `tfds-nightly`
    - the dataset name is spelled correctly
    - dataset class defines all base class abstract methods
    - dataset class is not in development, i.e. if IN_DEVELOPMENT=True
    - the module defining the dataset class is imported


In [None]:
from tensorflow_datasets.core import SplitGenerator
test_generator = SplitGenerator(name='test', gen_kwargs={"filepath": test_dir})

In [None]:
builder._split_generators = lambda _: [test_generator]

In [None]:
builder.download_and_prepare()

In [None]:
_masks_url = """https://wiki.cancerimagingarchive.net/download/attachments/19726546/MED_ABD_LYMPH_MASKS.zip?version=1&modificationDate=1449684916503&api=v2
"""