In [9]:
# Imports and pandas display options
import os
import glob
import pydicom
import numpy as np
import pandas as pd

#pd.options.display.max_rows = 999
pd.set_option('display.max_colwidth', None)

In [10]:
# Import the dataset locations by name (instead of `import *`) so that every
# path variable used in this notebook can be traced back to the `definitions` module.
from definitions import (
    data_root,
    meta_path,
    mass_train_path,
    mass_test_path,
    calc_train_path,
    calc_test_path,
    new_mass_train_path,
    new_mass_test_path,
)

print(f"Dataset path: {data_root}")
print(f"Meta path: {meta_path}")
print(f"Mass training data path: {mass_train_path}")
print(f"Mass test data path: {mass_test_path}")
print(f"Calc training data path: {calc_train_path}")
print(f"Calc test data path: {calc_test_path}")

Dataset path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688
Meta path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688\metadata.csv
Mass training data path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688\mass_case_description_train_set.csv
Mass test data path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688\mass_case_description_test_set.csv
Calc training data path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688\calc_case_description_train_set.csv
Calc test data path: C:\Users\lejam\Desktop\CBIS-DDSM\manifest-1748122768688\calc_case_description_test_set.csv


In [29]:
# Load the metadata table; its 'Subject ID' and 'File Location' columns are used
# further down to find the files that belong to each subject
df_meta = pd.read_csv(meta_path)
df_meta.head(3)

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,File Size,File Location,Download Timestamp
0,1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,27.84 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC\08-29-2017-DDSM-NA-96009\1.000000-full mammogram images-63992,2025-05-24T22:44:39.334
1,1.3.6.1.4.1.9590.100.1.2.174390361112646747718661211471328897934,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.384159464510350889125645400702639717613,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,28.97 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO\08-29-2017-DDSM-NA-17613\1.000000-full mammogram images-97934,2025-05-24T22:44:43.456
2,1.3.6.1.4.1.9590.100.1.2.188613955710170417803011787532523988680,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.29112199613143138535387754440942211739,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.62 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO_1\08-29-2017-DDSM-NA-11739\1.000000-ROI mask images-88680,2025-05-24T22:44:44.687


In [61]:
df_meta.shape

(6775, 17)

In [66]:
class ImagePathUpdater:
    """Point one path column of a case-description table at the DICOM files found on disk.

    For every row of ``target_data`` the subject ID is taken from the first
    component of the current value in ``column``. Each 'File Location' listed
    for that subject in ``meta_data`` is scanned under ``root_path``, and the
    row's value is replaced by the path of a matching file, expressed from the
    subject ID onwards.

    Args:
        meta_data: DataFrame with 'Subject ID' and 'File Location' columns.
        target_data: DataFrame whose ``column`` is rewritten in place.
        column: Name of the path column to update. 'cropped image file path'
            and 'ROI mask file path' additionally filter files by pixel
            content; any other column accepts every readable file.
        root_path: Directory that the 'File Location' values are relative to.
        number: Only the first ``number`` rows are processed; ``None`` means
            all rows.
    """

    def __init__(self, meta_data, target_data, column, root_path, number=None):
        self.meta_data = meta_data
        self.target_data = target_data
        self.column = column
        self.root_path = root_path
        self.number = number

    def worker(self, index, current_path, subject_id):
        """Scan a glob pattern and store a matching file's path in row ``index``.

        Directories matched by ``current_path`` are searched recursively.
        Files are visited in sorted order; if several qualify, the last one
        wins. A file that cannot be read is reported and skipped without
        stopping the scan.

        Args:
            index: Row label in ``target_data`` to update.
            current_path: Glob pattern to expand.
            subject_id: Subject ID marking where the stored path starts.
        """
        for p in sorted(glob.glob(current_path)):
            if os.path.isdir(p):
                # Carry the row index and subject ID down into the sub-directory
                self.worker(index, os.path.join(p, '*'), subject_id)
                continue

            try:
                # Parse the DICOM file
                ds = pydicom.dcmread(p)
                # ROI mask files contain only two pixel values:
                # 0 (background) and 255 (region of interest)
                diff = np.setdiff1d(np.unique(ds.pixel_array), np.array([0, 255]))
            except Exception as ex:
                # Best effort: one unreadable file must not abort the whole scan
                print(f"Skipping {p}: {ex}")
                continue

            is_mask = diff.size == 0
            if self.column == 'cropped image file path' and is_mask:
                continue
            if self.column == 'ROI mask file path' and not is_mask:
                continue

            # Store the path from the subject ID onwards (drops the absolute root)
            start = p.find(subject_id)
            if start < 0:
                # The subject ID is not part of this path; slicing would store garbage
                continue
            self.target_data.at[index, self.column] = p[start:]

    def update(self):
        """Update ``column`` for the first ``number`` rows (all rows when ``number`` is None).

        A failure on one row is reported and does not prevent the remaining
        rows from being processed.
        """
        limit = len(self.target_data) if self.number is None else self.number
        for index, row in self.target_data.head(limit).iterrows():
            try:
                self._update_row(index, row[self.column])
            except Exception as ex:
                print(f"Row {index}: {ex}")

    def _update_row(self, index, current_value):
        """Resolve the file locations of one row's subject and scan each of them."""
        # Missing values (NaN) carry no subject ID
        if not isinstance(current_value, str):
            return

        # Accept both '/' and '\\' so that already-updated values can be re-processed
        subject_id = current_value.replace('\\', '/').split('/')[0]
        if not subject_id:
            return

        # No metadata for this subject simply yields an empty selection
        is_subject = self.meta_data['Subject ID'] == subject_id
        for location in self.meta_data.loc[is_subject, 'File Location']:
            if not isinstance(location, str):
                continue
            # Locations are stored with Windows separators and a leading '.\\';
            # normalise them for the current platform instead of slicing blindly
            relative_dir = os.path.normpath(location.replace('\\', os.sep))
            self.worker(index, os.path.join(self.root_path, relative_dir, '*'), subject_id)

In [67]:
# Load mass datasets
# NOTE: these frames are passed to ImagePathUpdater as target_data below and
# have their path columns overwritten in place
mass_train = pd.read_csv(mass_train_path)
mass_test = pd.read_csv(mass_test_path)

In [68]:
print(f"mass data size: {mass_train.shape[0] + mass_test.shape[0]}")

mass data size: 1696


In [69]:
# Spot check: list every file location recorded for a single subject
test = df_meta.loc[df_meta['Subject ID'].eq('Mass-Training_P_02092_LEFT_MLO_1')]
for location in test['File Location']:
    print(location)

.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\07-20-2016-DDSM-NA-20213\1.000000-cropped images-58924
.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\07-21-2016-DDSM-NA-14412\1.000000-ROI mask images-30086


### Update image paths

In [70]:
# Names of the three path columns to update, in the order they are processed below
cols = ['image file path', 'cropped image file path', 'ROI mask file path']

##### Training data

In [71]:
# Training set, 'image file path' (cols[0]): no pixel-based filter is applied for this column
train_data_updater = ImagePathUpdater(df_meta, mass_train, cols[0], data_root)
train_data_updater.update()

In [72]:
# Training set, 'cropped image file path' (cols[1]): files containing only 0/255 pixels are skipped
train_data_updater = ImagePathUpdater(df_meta, mass_train, cols[1], data_root)
train_data_updater.update()

In [73]:
# Training set, 'ROI mask file path' (cols[2]): only files containing nothing but 0/255 pixels are kept
train_data_updater = ImagePathUpdater(df_meta, mass_train, cols[2], data_root)
train_data_updater.update()

In [74]:
# Save the updated data
mass_train.to_csv(new_mass_train_path, index=False)

##### Test data

In [75]:
# Test set, 'image file path' (cols[0]): no pixel-based filter is applied for this column
test_data_updater = ImagePathUpdater(df_meta, mass_test, cols[0], data_root)
test_data_updater.update()

In [76]:
# Test set, 'cropped image file path' (cols[1]): files containing only 0/255 pixels are skipped
test_data_updater = ImagePathUpdater(df_meta, mass_test, cols[1], data_root)
test_data_updater.update()

In [77]:
# Test set, 'ROI mask file path' (cols[2]): only files containing nothing but 0/255 pixels are kept
test_data_updater = ImagePathUpdater(df_meta, mass_test, cols[2], data_root)
test_data_updater.update()

In [78]:
# Save the updated data
mass_test.to_csv(new_mass_test_path, index=False)

### Results

In [79]:
rst_cols = ['pathology'] + cols
rst_cols

['pathology',
 'image file path',
 'cropped image file path',
 'ROI mask file path']

In [80]:
mass_train[rst_cols].head(2)

Unnamed: 0,pathology,image file path,cropped image file path,ROI mask file path
0,MALIGNANT,Mass-Training_P_00001_LEFT_CC\07-20-2016-DDSM-NA-74994\1.000000-full mammogram images-24515\1-1.dcm,Mass-Training_P_00001_LEFT_CC_1\07-21-2016-DDSM-NA-39106\1.000000-ROI mask images-99650\1-2.dcm,Mass-Training_P_00001_LEFT_CC_1\07-21-2016-DDSM-NA-39106\1.000000-ROI mask images-99650\1-1.dcm
1,MALIGNANT,Mass-Training_P_00001_LEFT_MLO\07-20-2016-DDSM-NA-90988\1.000000-full mammogram images-80834\1-1.dcm,Mass-Training_P_00001_LEFT_MLO_1\07-21-2016-DDSM-NA-82526\1.000000-ROI mask images-86053\1-1.dcm,Mass-Training_P_00001_LEFT_MLO_1\07-21-2016-DDSM-NA-82526\1.000000-ROI mask images-86053\1-2.dcm


In [81]:
mass_train[rst_cols].tail(2)

Unnamed: 0,pathology,image file path,cropped image file path,ROI mask file path
1316,MALIGNANT,Mass-Training_P_02092_LEFT_CC\07-20-2016-DDSM-NA-80250\1.000000-full mammogram images-67422\1-1.dcm,Mass-Training_P_02092_LEFT_CC_1\07-20-2016-DDSM-NA-75288\1.000000-cropped images-03171\1-1.dcm,Mass-Training_P_02092_LEFT_CC_1\07-21-2016-DDSM-NA-18548\1.000000-ROI mask images-89402\1-1.dcm
1317,MALIGNANT,Mass-Training_P_02092_LEFT_MLO\07-20-2016-DDSM-NA-65216\1.000000-full mammogram images-91615\1-1.dcm,Mass-Training_P_02092_LEFT_MLO_1\07-20-2016-DDSM-NA-20213\1.000000-cropped images-58924\1-1.dcm,Mass-Training_P_02092_LEFT_MLO_1\07-21-2016-DDSM-NA-14412\1.000000-ROI mask images-30086\1-1.dcm


In [82]:
mass_test[cols].head(2)

Unnamed: 0,image file path,cropped image file path,ROI mask file path
0,Mass-Test_P_00016_LEFT_CC\10-04-2016-DDSM-NA-30104\1.000000-full mammogram images-14172\1-1.dcm,Mass-Test_P_00016_LEFT_CC_1\10-04-2016-DDSM-NA-09887\1.000000-cropped images-26184\1-2.dcm,Mass-Test_P_00016_LEFT_CC_1\10-04-2016-DDSM-NA-09887\1.000000-cropped images-26184\1-1.dcm
1,Mass-Test_P_00016_LEFT_MLO\10-04-2016-DDSM-NA-54392\1.000000-full mammogram images-35518\1-1.dcm,Mass-Test_P_00016_LEFT_MLO_1\10-04-2016-DDSM-NA-15563\1.000000-cropped images-77287\1-2.dcm,Mass-Test_P_00016_LEFT_MLO_1\10-04-2016-DDSM-NA-15563\1.000000-cropped images-77287\1-1.dcm


In [83]:
mass_test[cols].tail(2)

Unnamed: 0,image file path,cropped image file path,ROI mask file path
376,Mass-Test_P_01912_RIGHT_CC\10-04-2016-DDSM-NA-59644\1.000000-full mammogram images-23281\1-1.dcm,Mass-Test_P_01912_RIGHT_CC_1\10-04-2016-DDSM-NA-61275\1.000000-ROI mask images-45359\1-1.dcm,Mass-Test_P_01912_RIGHT_CC_1\10-04-2016-DDSM-NA-61275\1.000000-ROI mask images-45359\1-2.dcm
377,Mass-Test_P_01912_RIGHT_MLO\10-04-2016-DDSM-NA-46673\1.000000-full mammogram images-74983\1-1.dcm,Mass-Test_P_01912_RIGHT_MLO_1\10-04-2016-DDSM-NA-41042\1.000000-ROI mask images-74314\1-1.dcm,Mass-Test_P_01912_RIGHT_MLO_1\10-04-2016-DDSM-NA-41042\1.000000-ROI mask images-74314\1-2.dcm


### Evaluation

In [85]:
# Load mass datasets again from the original CSVs: mass_train / mass_test were
# modified in place above, so these fresh copies serve as the baseline for the comparison
unchanged_mass_train = pd.read_csv(mass_train_path)
unchanged_mass_test = pd.read_csv(mass_test_path)

In [96]:
unchanged_mass_train[rst_cols].head(1)

Unnamed: 0,pathology,image file path,cropped image file path,ROI mask file path
0,MALIGNANT,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.422112722213189649807611434612228974994/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515/000000.dcm,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.108268213011361124203859148071588939106/1.3.6.1.4.1.9590.100.1.2.296736403313792599626368780122205399650/000000.dcm,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.108268213011361124203859148071588939106/1.3.6.1.4.1.9590.100.1.2.296736403313792599626368780122205399650/000001.dcm\n


In [97]:
unchanged_mass_test[rst_cols].head(1)

Unnamed: 0,pathology,image file path,cropped image file path,ROI mask file path
0,MALIGNANT,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.416403281812750683720028031170500130104/1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172/000000.dcm,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.259596319110047779433501728143778409887/1.3.6.1.4.1.9590.100.1.2.30820586311062570442302321942433426184/000000.dcm,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.259596319110047779433501728143778409887/1.3.6.1.4.1.9590.100.1.2.30820586311062570442302321942433426184/000001.dcm\n


In [103]:
all_cols = list(unchanged_mass_train.columns)
print(f"columns: {all_cols} (size: {len(all_cols)})")

columns: ['patient_id', 'breast_density', 'left or right breast', 'image view', 'abnormality id', 'abnormality type', 'mass shape', 'mass margins', 'assessment', 'pathology', 'subtlety', 'image file path', 'cropped image file path', 'ROI mask file path'] (size: 14)


In [107]:
# Keep every column except the three path columns. Selecting by name (via `cols`)
# rather than by position keeps this correct even if the CSV column order changes.
sliced_cols = [c for c in unchanged_mass_train.columns if c not in cols]
print(f"columns without paths: {sliced_cols} (size: {len(sliced_cols)})")

columns without paths: ['patient_id', 'breast_density', 'left or right breast', 'image view', 'abnormality id', 'abnormality type', 'mass shape', 'mass margins', 'assessment', 'pathology', 'subtlety'] (size: 11)


##### Compare values column-wise between datasets

In [109]:
# Training data: pair each original row (suffix _x) with its updated counterpart
# (suffix _y) by joining on all non-path columns
df_train = (
    unchanged_mass_train.reset_index()
    .merge(mass_train.reset_index(), how='outer', on=sliced_cols)
)
df_train.head(3)

Unnamed: 0,index_x,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path_x,cropped image file path_x,ROI mask file path_x,index_y,image file path_y,cropped image file path_y,ROI mask file path_y
0,0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.422112722213189649807611434612228974994/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515/000000.dcm,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.108268213011361124203859148071588939106/1.3.6.1.4.1.9590.100.1.2.296736403313792599626368780122205399650/000000.dcm,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.108268213011361124203859148071588939106/1.3.6.1.4.1.9590.100.1.2.296736403313792599626368780122205399650/000001.dcm\n,0,Mass-Training_P_00001_LEFT_CC\07-20-2016-DDSM-NA-74994\1.000000-full mammogram images-24515\1-1.dcm,Mass-Training_P_00001_LEFT_CC_1\07-21-2016-DDSM-NA-39106\1.000000-ROI mask images-99650\1-2.dcm,Mass-Training_P_00001_LEFT_CC_1\07-21-2016-DDSM-NA-39106\1.000000-ROI mask images-99650\1-1.dcm
1,1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.9590.100.1.2.319478999311971442426185353560182990988/1.3.6.1.4.1.9590.100.1.2.359308329312397897125630708681441180834/000000.dcm,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9590.100.1.2.188473874511440575807446266233629582526/1.3.6.1.4.1.9590.100.1.2.227955274711225756835838775062793186053/000000.dcm,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9590.100.1.2.188473874511440575807446266233629582526/1.3.6.1.4.1.9590.100.1.2.227955274711225756835838775062793186053/000001.dcm\n,1,Mass-Training_P_00001_LEFT_MLO\07-20-2016-DDSM-NA-90988\1.000000-full mammogram images-80834\1-1.dcm,Mass-Training_P_00001_LEFT_MLO_1\07-21-2016-DDSM-NA-82526\1.000000-ROI mask images-86053\1-1.dcm,Mass-Training_P_00001_LEFT_MLO_1\07-21-2016-DDSM-NA-82526\1.000000-ROI mask images-86053\1-2.dcm
2,2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.347107867812656628709864319310977895697/1.3.6.1.4.1.9590.100.1.2.89180046211022531834352631483669346540/000000.dcm,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.414182170112396175925115449620455230167/1.3.6.1.4.1.9590.100.1.2.429120414011832984817094399141838850375/000000.dcm,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.414182170112396175925115449620455230167/1.3.6.1.4.1.9590.100.1.2.429120414011832984817094399141838850375/000001.dcm\n,2,Mass-Training_P_00004_LEFT_CC\07-20-2016-DDSM-NA-95697\1.000000-full mammogram images-46540\1-1.dcm,Mass-Training_P_00004_LEFT_CC_1\07-21-2016-DDSM-NA-30167\1.000000-ROI mask images-50375\1-2.dcm,Mass-Training_P_00004_LEFT_CC_1\07-21-2016-DDSM-NA-30167\1.000000-ROI mask images-50375\1-1.dcm


In [133]:
# Flag rows whose full-image path is identical before (_x) and after (_y) the update.
# A row that was updated is expected to be False here.
df_train['image result'] = df_train['image file path_x'].eq(df_train['image file path_y'])

In [134]:
# 'image result' is True where a path is still identical to the original CSV.
# "All updated" therefore requires that NO row is True; checking merely for the
# presence of a False would pass as soon as a single row had changed.
if not df_train['image result'].any():
    print("[ TEST PASS ] All image file paths were successfully updated")
else:
    print("[ TEST FAIL ] Some image file paths are unchanged from the original dataset")

[ TEST PASS ] All image file paths were successfully updated


In [135]:
# Flag rows whose cropped-image path is identical before (_x) and after (_y) the update.
# A row that was updated is expected to be False here.
df_train['cropped image result'] = df_train['cropped image file path_x'].eq(
    df_train['cropped image file path_y'])

In [136]:
# 'cropped image result' is True where a path is still identical to the original CSV.
# "All updated" therefore requires that NO row is True; checking merely for the
# presence of a False would pass as soon as a single row had changed.
if not df_train['cropped image result'].any():
    print("[ TEST PASS ] All cropped image file paths were successfully updated")
else:
    print("[ TEST FAIL ] Some cropped image file paths are unchanged from the original dataset")

[ TEST PASS ] All cropped image file paths were successfully updated


In [138]:
# Flag rows whose ROI-mask path is identical before (_x) and after (_y) the update.
# A row that was updated is expected to be False here.
df_train['ROI mask result'] = df_train['ROI mask file path_x'].eq(
    df_train['ROI mask file path_y'])

In [139]:
# 'ROI mask result' is True where a path is still identical to the original CSV.
# "All updated" therefore requires that NO row is True; checking merely for the
# presence of a False would pass as soon as a single row had changed.
if not df_train['ROI mask result'].any():
    print("[ TEST PASS ] All ROI mask file paths were successfully updated")
else:
    print("[ TEST FAIL ] Some ROI mask file paths are unchanged from the original dataset")

[ TEST PASS ] All ROI mask file paths were successfully updated


### Summary

The CBIS-DDSM dataset includes a Series Description column that indicates the image type, such as 'full mammogram images', 'ROI mask images', or 'cropped images'. However, some of these descriptions are incorrect. In particular, ROI mask images and cropped images are stored in the same folder, but their Series Descriptions are inconsistently labelled as either 'ROI mask images' or 'cropped images'. Each image should be accurately labelled, as this information is crucial for proper dataset preparation and effective model training.

To address this issue, a simple rule was used to distinguish between the two image types: ROI mask images contain only pixel values of 0 and 255 (i.e., black and white), whereas cropped images contain a wider range of values. This characteristic allows for reliable reclassification. Preliminary tests with a sample set of images confirmed that this method successfully corrects the labels.

To validate this approach, the notebook `Test_DataQuality.ipynb` was designed to feed the corrected data into a simple CNN model and evaluate its performance. If the model achieves high accuracy (close to 99%) despite its simplicity, this indicates that the dataset is well prepared and of good quality.