pathology-index
pathology_index (v1), compressed using the default 'snappy' compression algorithm
# Flat pathology index (v1).
#
# Selects instance-level attributes of slide microscopy instances
# (Modality 'SM') from `bigquery-public-data.idc_current.dicom_all`.
#
# Every nested DICOM sequence is flattened with LEFT JOIN UNNEST, so a row is
# still produced (with NULLs) when a sequence is empty, and an instance with
# several ImageType values, specimens, preparation steps or optical paths fans
# out into several rows. SELECT DISTINCT then collapses rows that end up
# identical in the selected columns.
sql = '''
SELECT DISTINCT
  sopInstanceuid,
  SeriesInstanceUID,
  crdc_instance_uuid,
  -- instance_size divided by 1,000,000 and rounded to two decimals.
  ROUND(SAFE_CAST(instance_size AS FLOAT64) / 1000000, 2) AS instance_size_MB,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when the
  -- PixelSpacing array is empty.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
LEFT JOIN
  UNNEST(ImageType) AS it
LEFT JOIN
  UNNEST(idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST(sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST(SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST(sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST(pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST(sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST(sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST(spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST(spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST(OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST(ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST(ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
'''
# `client` is not defined in this snippet; presumably a BigQuery client created
# earlier -- confirm. No compression argument is passed, so pandas' default
# Parquet compression ('snappy') is used.
client.query(sql).to_dataframe().to_parquet('path_index.parquet')
pathology_index (v2), compressed using zstd at compression level 22
# Flat pathology index (v2).
#
# Same flattening of slide microscopy (Modality 'SM') instances as v1, with a
# different column set: sopInstanceUID is intentionally not selected (it was
# commented out of the v1 column list), FrameOfReferenceUID is added, and no
# instance size is computed.
#
# Nested sequences are flattened with LEFT JOIN UNNEST, so rows with empty
# sequences are kept with NULLs; SELECT DISTINCT removes rows that are
# identical in the selected columns.
sql = """
SELECT DISTINCT
  SeriesInstanceUID,
  FrameOfReferenceUID,
  crdc_instance_uuid,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when the
  -- PixelSpacing array is empty.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
LEFT JOIN
  UNNEST(ImageType) AS it
LEFT JOIN
  UNNEST(idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST(sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST(SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST(sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST(pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST(sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST(sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST(spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST(spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST(OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST(ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST(ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
"""
# `client` is not defined in this snippet; presumably a BigQuery client created
# earlier -- confirm. The result is written with zstd compression;
# compression_level=22 is forwarded by pandas to the Parquet engine.
client.query(sql).to_dataframe().to_parquet(
    'path_index_v2.parquet',
    compression="zstd",
    compression_level=22,
)
nest_pathology_index (v1), compressed using the default 'snappy' compression algorithm
# Nested pathology index (v1).
#
# Produces one row per group of series-level attributes for slide microscopy
# (Modality 'SM') data in `bigquery-public-data.idc_current.dicom_all`, with
# the flattened instance-level attributes collected into an `Attributes`
# array of structs.
#
# Fix: series_size_MB used to be SUM()'d over the rows produced by the
# LEFT JOIN UNNEST chain. Those joins emit several rows per instance (one per
# ImageType value, specimen, preparation content item, optical path, ...), so
# each instance's size was added once per fanned-out row and the series size
# was inflated. The size is now aggregated per series in a CTE that reads the
# table before any UNNEST, and the total is rounded once instead of summing
# per-row rounded values.
# NOTE(review): this assumes dicom_all holds one row per instance and that the
# series-level GROUP BY columns are constant within a series -- confirm.
sql = '''
WITH series_sizes AS (
  -- Summed before any UNNEST so that every instance is counted exactly once.
  SELECT
    SeriesInstanceUID,
    ROUND(SUM(SAFE_CAST(instance_size AS FLOAT64)) / 1000000, 2) AS series_size_MB
  FROM
    `bigquery-public-data.idc_current.dicom_all`
  WHERE
    Modality IN ('SM')
  GROUP BY
    SeriesInstanceUID
)
SELECT
  collection_id,
  source_DOI,
  PatientID,
  PatientAge,
  PatientSex,
  StudyInstanceUID,
  StudyDate,
  StudyDescription,
  BodyPartExamined,
  idc.SeriesInstanceUID,
  Modality,
  SeriesDate,
  SeriesDescription,
  SeriesNumber,
  license_short_name,
  CONCAT("s3://", aws_bucket, "/", crdc_series_uuid, "/*") AS series_aws_url,
  -- The value is identical for every row of a series, so ANY_VALUE is exact.
  ANY_VALUE(series_sizes.series_size_MB) AS series_size_MB,
  ARRAY_AGG(
    STRUCT(
      sopInstanceuid,
      ContainerIdentifier,
      -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when the
      -- PixelSpacing array is empty.
      pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
      `Rows`,
      `Columns`,
      TotalPixelMatrixRows,
      TotalPixelMatrixColumns,
      it AS ImageType,
      TransferSyntaxUID,
      pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
      pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
      pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
      pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
      pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
      pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
      sds.SpecimenUID,
      spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
      spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
      spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
      spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
      spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
      spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
      spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
      ops.LightPathFilterPassThroughWavelength,
      ops.IlluminationWavelength,
      ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
      ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
      ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
      ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
      ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
      ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
    )
  ) AS Attributes
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
INNER JOIN
  series_sizes
  ON series_sizes.SeriesInstanceUID = idc.SeriesInstanceUID
LEFT JOIN
  UNNEST(idc.ImageType) AS it
LEFT JOIN
  UNNEST(idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST(sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST(idc.SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST(sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST(pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST(sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST(sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST(spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST(spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST(idc.OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST(ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST(ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
GROUP BY
  collection_id,
  source_DOI,
  PatientID,
  PatientAge,
  PatientSex,
  StudyInstanceUID,
  StudyDate,
  StudyDescription,
  BodyPartExamined,
  idc.SeriesInstanceUID,
  Modality,
  SeriesDate,
  SeriesDescription,
  SeriesNumber,
  license_short_name,
  aws_bucket,
  crdc_series_uuid
'''
# `client` is not defined in this snippet; presumably a BigQuery client created
# earlier -- confirm. No compression argument is passed, so pandas' default
# Parquet compression ('snappy') is used.
client.query(sql).to_dataframe().to_parquet('path_nested_index.parquet')
nest_pathology_index (v2), compressed using zstd at compression level 22
# "Nested" pathology index (v2), written to path_nested_index_v2.parquet.
#
# NOTE(review): this query selects the same columns from the same joins as the
# flat v2 index -- it has no ARRAY_AGG and no GROUP BY -- so the result is a
# flat, de-duplicated table rather than a nested one. Confirm whether a nested
# v2 query was intended here.
#
# Nested sequences of slide microscopy (Modality 'SM') instances are flattened
# with LEFT JOIN UNNEST, so rows with empty sequences are kept with NULLs;
# SELECT DISTINCT removes rows that are identical in the selected columns.
# sopInstanceUID is intentionally not selected (it was commented out of the
# column list).
sql = '''
SELECT DISTINCT
  SeriesInstanceUID,
  FrameOfReferenceUID,
  crdc_instance_uuid,
  ContainerIdentifier,
  -- SAFE_OFFSET yields NULL instead of an out-of-bounds error when the
  -- PixelSpacing array is empty.
  pms.PixelSpacing[SAFE_OFFSET(0)] AS PixelSpacing,
  `Rows`,
  `Columns`,
  TotalPixelMatrixRows,
  TotalPixelMatrixColumns,
  it AS ImageType,
  TransferSyntaxUID,
  pass.CodeValue AS PrimaryAnatomicStructureSequence_CodeValue,
  pass.CodeMeaning AS PrimaryAnatomicStructureSequence_CodeMeaning,
  pass.CodingSchemeDesignator AS PrimaryAnatomicStructureSequence_CodingSchemeDesignator,
  pasms.CodeValue AS PrimaryAnatomicStructureModifierSequence_CodeValue,
  pasms.CodeMeaning AS PrimaryAnatomicStructureModifierSequence_CodeMeaning,
  pasms.CodingSchemeDesignator AS PrimaryAnatomicStructureModifierSequence_CodingSchemeDesignator,
  sds.SpecimenUID,
  spscis.ValueType AS SpecimenPreparationStepContentItemSequence_ValueType,
  spscis_cncs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeValue,
  spscis_cncs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodeMeaning,
  spscis_cncs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptNameCodeSequence_CodingSchemeDesignator,
  spscis_ccs.CodeValue AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeValue,
  spscis_ccs.CodeMeaning AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodeMeaning,
  spscis_ccs.CodingSchemeDesignator AS SpecimenPreparationStepContentItemSequence_ConceptCodeSequence_CodingSchemeDesignator,
  ops.LightPathFilterPassThroughWavelength,
  ops.IlluminationWavelength,
  ops_itcs.CodeValue AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeValue,
  ops_itcs.CodeMeaning AS OpticalPathSequence_IlluminationTypeCodeSequence_CodeMeaning,
  ops_itcs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationTypeCodeSequence_CodingSchemeDesignator,
  ops_iccs.CodeValue AS OpticalPathSequence_IlluminationColorCodeSequence_CodeValue,
  ops_iccs.CodeMeaning AS OpticalPathSequence_IlluminationColorCodeSequence_CodeMeaning,
  ops_iccs.CodingSchemeDesignator AS OpticalPathSequence_IlluminationColorCodeSequence_CodingSchemeDesignator
FROM
  `bigquery-public-data.idc_current.dicom_all` AS idc
LEFT JOIN
  UNNEST(ImageType) AS it
LEFT JOIN
  UNNEST(idc.SharedFunctionalGroupsSequence) AS sfgs
LEFT JOIN
  UNNEST(sfgs.PixelMeasuresSequence) AS pms
LEFT JOIN
  UNNEST(SpecimenDescriptionSequence) AS sds
LEFT JOIN
  UNNEST(sds.PrimaryAnatomicStructureSequence) AS pass
LEFT JOIN
  UNNEST(pass.PrimaryAnatomicStructureModifierSequence) AS pasms
LEFT JOIN
  UNNEST(sds.SpecimenPreparationSequence) AS sps
LEFT JOIN
  UNNEST(sps.SpecimenPreparationStepContentItemSequence) AS spscis
LEFT JOIN
  UNNEST(spscis.ConceptNameCodeSequence) AS spscis_cncs
LEFT JOIN
  UNNEST(spscis.ConceptCodeSequence) AS spscis_ccs
LEFT JOIN
  UNNEST(OpticalPathSequence) AS ops
LEFT JOIN
  UNNEST(ops.IlluminationTypeCodeSequence) AS ops_itcs
LEFT JOIN
  UNNEST(ops.IlluminationColorCodeSequence) AS ops_iccs
WHERE
  Modality IN ('SM')
'''
# `client` is not defined in this snippet; presumably a BigQuery client created
# earlier -- confirm. The result is written with zstd compression;
# compression_level=22 is forwarded by pandas to the Parquet engine.
client.query(sql).to_dataframe().to_parquet(
    'path_nested_index_v2.parquet',
    compression="zstd",
    compression_level=22,
)