Skip to content

pathology-index

Compare
Choose a tag to compare
@vkt1414 vkt1414 released this 11 Feb 11:39
· 193 commits to main since this release

pathology_index (v1), compressed using the default 'snappy' compression algorithm

# Build pathology_index (v1): the distinct combinations of the selected
# attributes for instances whose Modality is 'SM', flattened out of the nested
# DICOM sequences of the IDC `dicom_all` table.
# `client` is expected to be a BigQuery client created elsewhere.
sql='''
SELECT DISTINCT
  sopInstanceuid,
  SeriesInstanceUID,
  crdc_instance_uuid,
  ROUND(SAFE_CAST(instance_size AS FLOAT64) / 1000000, 2) AS instance_size_MB,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when
  -- PixelSpacing is empty, so a single odd instance cannot abort the export.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
-- Each LEFT JOIN UNNEST flattens one repeated field; LEFT JOIN keeps the
-- instance row (with NULLs) when that field has no elements.
LEFT JOIN
  UNNEST (ImageType) AS it
LEFT JOIN
  UNNEST (idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST (sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST (SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST (sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST (pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST (sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST (sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST (spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST (spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST (OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST (ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST (ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
'''
# No compression argument is passed, so the Parquet writer's default applies.
client.query(sql).to_dataframe().to_parquet('path_index.parquet')

pathology_index (v2) compressed using zstd level 22.

# Build pathology_index (v2): the distinct combinations of the selected
# attributes for instances whose Modality is 'SM', written as a
# zstd-compressed (level 22) Parquet file.
# `client` is expected to be a BigQuery client created elsewhere.
sql="""
SELECT
  DISTINCT
  -- sopInstanceUID is not selected in v2 (the v1 query selected it).
  SeriesInstanceUID,
  FrameOfReferenceUID,
  crdc_instance_uuid,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when
  -- PixelSpacing is empty, so a single odd instance cannot abort the export.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
-- Each LEFT JOIN UNNEST flattens one repeated field; LEFT JOIN keeps the
-- instance row (with NULLs) when that field has no elements.
LEFT JOIN
  UNNEST (ImageType) AS it
LEFT JOIN
  UNNEST (idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST (sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST (SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST (sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST (pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST (sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST (sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST (spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST (spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST (OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST (ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST (ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
"""

# zstd at level 22 is requested explicitly instead of the writer's default codec.
client.query(sql).to_dataframe().to_parquet('path_index_v2.parquet', compression="zstd", compression_level=22)

nest_pathology_index (v1), compressed using the default 'snappy' compression algorithm

# Build the nested pathology index (v1): one row per group of series-level
# columns for instances whose Modality is 'SM', with the flattened
# per-instance attributes collected into the `Attributes` array.
# `client` is expected to be a BigQuery client created elsewhere.
sql='''
WITH series_sizes AS (
  -- Total the instance sizes BEFORE any UNNEST join. The joins in the main
  -- query repeat each instance row once per unnested combination (for example
  -- once per ImageType value), so summing instance_size there would count
  -- every instance several times and inflate series_size_MB.
  -- NOTE(review): assumes dicom_all holds one row per instance - confirm.
  SELECT
    SeriesInstanceUID,
    SUM(ROUND(SAFE_CAST(instance_size AS FLOAT64) / 1000000, 2)) AS total_size_MB
  FROM
    `bigquery-public-data.idc_current.dicom_all`
  WHERE
    Modality IN ('SM')
  GROUP BY
    SeriesInstanceUID
)
SELECT
  collection_id,
  source_DOI,
  PatientID,
  PatientAge,
  PatientSex,
  StudyInstanceUID,
  StudyDate,
  StudyDescription,
  BodyPartExamined,
  idc.SeriesInstanceUID,
  Modality,
  -- Manufacturer,
  -- ManufacturerModelName,
  SeriesDate,
  SeriesDescription,
  SeriesNumber,
  license_short_name,
  CONCAT("s3://", aws_bucket, "/", crdc_series_uuid, "/*") AS series_aws_url,
  -- One value per series, so ANY_VALUE just carries it through the GROUP BY.
  ANY_VALUE(ss.total_size_MB) AS series_size_MB,
  ARRAY_AGG(
    STRUCT(
      sopInstanceuid,
      ContainerIdentifier,
      -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when
      -- PixelSpacing is empty.
      pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
      `Rows`,
      `Columns`,
      TotalPixelMatrixRows,
      TotalPixelMatrixColumns,
      it AS ImageType,
      TransferSyntaxUID,
      pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
      pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
      pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
      pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
      pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
      pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
      sds.SpecimenUID,
      spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
      spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
      spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
      spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
      spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
      spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
      spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
      ops.LightPathFilterPassThroughWavelength,
      ops.IlluminationWavelength,
      ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
      ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
      ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
      ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
      ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
      ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
    )
  ) AS Attributes
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
-- LEFT JOIN so that no instance row is dropped by the size lookup.
LEFT JOIN
  series_sizes AS ss
  ON ss.SeriesInstanceUID = idc.SeriesInstanceUID
-- Each LEFT JOIN UNNEST flattens one repeated field; LEFT JOIN keeps the
-- instance row (with NULLs) when that field has no elements.
LEFT JOIN
  UNNEST (idc.ImageType) AS it
LEFT JOIN
  UNNEST (idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST (sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST (idc.SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST (sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST (pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST (sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST (sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST (spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST (spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST (idc.OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST (ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST (ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
GROUP BY
  collection_id,
  source_DOI,
  PatientID,
  PatientAge,
  PatientSex,
  StudyInstanceUID,
  StudyDate,
  StudyDescription,
  BodyPartExamined,
  idc.SeriesInstanceUID,
  Modality,
  SeriesDate,
  SeriesDescription,
  SeriesNumber,
  license_short_name,
  aws_bucket,
  crdc_series_uuid
'''
# No compression argument is passed, so the Parquet writer's default applies.
client.query(sql).to_dataframe().to_parquet('path_nested_index.parquet')

nest_pathology_index (v2), compressed using zstd level 22

# Query whose result is written to path_nested_index_v2.parquet with zstd
# level 22 compression.
# NOTE(review): this is a flat SELECT DISTINCT with the same column list and
# joins as the pathology_index (v2) query - it has no GROUP BY / ARRAY_AGG like
# the nested (v1) query, even though the output file is named as a nested
# index. Confirm whether the nested v2 query was meant to be shown here.
# `client` is expected to be a BigQuery client created elsewhere.
sql='''
SELECT
  DISTINCT
  -- sopInstanceUID is not selected in this query.
  SeriesInstanceUID,
  FrameOfReferenceUID,
  crdc_instance_uuid,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when
  -- PixelSpacing is empty, so a single odd instance cannot abort the export.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
-- Each LEFT JOIN UNNEST flattens one repeated field; LEFT JOIN keeps the
-- instance row (with NULLs) when that field has no elements.
LEFT JOIN
  UNNEST (ImageType) AS it
LEFT JOIN
  UNNEST (idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST (sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST (SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST (sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST (pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST (sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST (sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST (spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST (spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST (OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST (ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST (ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
'''

# zstd at level 22 is requested explicitly instead of the writer's default codec.
client.query(sql).to_dataframe().to_parquet('path_nested_index_v2.parquet', compression="zstd", compression_level=22)