# Prepare and Categorise Different Cases of MMO Images

## Import and Prepare Neceesary Libraries

In [2]:
import os
import pandas as pd
import shutil
from pathlib import Path
from tqdm import tqdm
import pydicom
import numpy as np
from PIL import Image

## Folders and Relative Path Setup 

In [37]:
CSV_DIR = "metadata"#FOLDERS THAT STORE THE CSV FILE FROM CBIS-DDSM
IMG_DIR = "raw_images/CBIS-DDSM"#FLIES THAT WE USED TO EXTRACT MMO IMAGES OF DIFFERENT CLASSES
OUT_DIR = "dataset"#STORE DIFFERENT CASES OF MMO IMAGED

In [38]:
#Type of cases being created
categories = {
    'MALIGNANT': 'malignant',
    'BENIGN_WITHOUT_CALLBACK': 'benign_no_callback',
    'BENIGN': 'benign_callback',
    'NORMAL': 'normal'
}

In [39]:
#Ensure different tyoe of cases created 
for cat in categories:
    os.makedirs(os.path.join(OUT_DIR, cat), exist_ok=True)

## CSV Files Preparation 

In [40]:
#Combine csv files from CBIS-DDSM
cal_train_df = pd.read_csv(os.path.join(CSV_DIR, 'calc_case_description_train_set.csv'))
mass_train_df = pd.read_csv(os.path.join(CSV_DIR, 'mass_case_description_train_set.csv'))
cal_test_df = pd.read_csv(os.path.join(CSV_DIR, 'calc_case_description_test_set.csv'))
mass_test_df = pd.read_csv(os.path.join(CSV_DIR, 'mass_case_description_test_set.csv'))
df = pd.concat([cal_train_df, mass_train_df, cal_test_df, mass_test_df], ignore_index = True)

### Preprocess of Data in CSV File 

In [41]:
df.count

<bound method DataFrame.count of      patient_id  breast density left or right breast image view  \
0       P_00005             3.0                RIGHT         CC   
1       P_00005             3.0                RIGHT        MLO   
2       P_00007             4.0                 LEFT         CC   
3       P_00007             4.0                 LEFT        MLO   
4       P_00008             1.0                 LEFT         CC   
...         ...             ...                  ...        ...   
3563    P_01825             NaN                RIGHT        MLO   
3564    P_01833             NaN                RIGHT        MLO   
3565    P_01865             NaN                 LEFT        MLO   
3566    P_01912             NaN                RIGHT         CC   
3567    P_01912             NaN                RIGHT        MLO   

      abnormality id abnormality type    calc type calc distribution  \
0                  1    calcification    AMORPHOUS         CLUSTERED   
1                 

### Extract and Classify MLO-View DICOM Mammograms from CBIS-DDSM Dataset

Filters the full dataset df to only include rows where the mammogram image view is MLO (Mediolateral Oblique), and saves the result into a new DataFrame called "df_mlo".

In [42]:
#Ensure only MLO format of images bieng filtered 
df_mlo = df[df['image view'] == 'MLO']

### Index and Map All DICOM Files in CBIS-DDSM Directory

Scan through the entire IMG_DIR directory and recursiely all folders contain .dcm files.It builds a dictionary, dicom_file_map, that maps the relative path (matching the structure used in the CSV metadata) to the absolute file path on disk.

In [43]:
# Dictionary to store mapping of relative paths to full DICOM file paths
print("Indexing all DICOM files...")
dicom_file_map = {}

# Walk through the IMG_DIR directory and all its subdirectories
for root, _, files in os.walk(IMG_DIR):
    for file in files:
        # Only process files with .dcm extension (DICOM format)
        if file.endswith(".dcm"):
            # Get relative path to match the CSV entry
            rel_path = os.path.relpath(os.path.join(root, file), IMG_DIR)
            rel_path = rel_path.replace("\\", "/")  # Normalize slashes
            dicom_file_map[rel_path] = os.path.join(root, file)

Indexing all DICOM files...


### Convert of .dcm images format into PNG format

In [44]:
# Function to convert a DICOM file (.dcm) into a PNG image
def dicom_to_png(dcm_path, png_path):
    try:
        # Read the DICOM file using pydicom
        dcm = pydicom.dcmread(dcm_path)
        # Extract the pixel array and convert it to float for normalization
        img = dcm.pixel_array.astype(float)
        # Normalize the pixel values to the 0–255 range
        img = (np.maximum(img, 0) / img.max()) * 255.0
        # Convert the pixel values to unsigned 8-bit integers
        img = np.uint8(img)
        # Create a PIL image from the numpy array
        im = Image.fromarray(img)
        # Save the image to the specified PNG path
        im.save(png_path)
    except Exception as e:
        print(f"Error converting {dcm_path}: {e}")

## Process and Finding Respective Path for .dcm Format and Perform PNG Conversion

The function will process and find the respective .dcm file from IMG_DIR. The .dmc files will convert to png format and being downloaded and saved in OUT_DIR according to the 4 different cases. 

In [36]:
print("Organizing images...")
not_found = 0

for _, row in tqdm(df_mlo.iterrows(), total=len(df_mlo)):
    pathology = row['pathology']
    target_class = categories.get(pathology)
    if not target_class:
        continue

    patient_id = row['patient_id']
    image_view = row['image view']

    # Try to find a .dcm path that includes both patient_id and image view
    matches = [
        path for path in dicom_file_map.keys()
        if patient_id in path and f"{image_view}" in path and "full mammogram" in path
    ]

    if not matches:
        not_found += 1
        print(f"No match for: {patient_id}, {image_view}")
        continue

    # Take first match
    image_rel_path = matches[0]
    source_path = dicom_file_map[image_rel_path]

    # Construct path to save
    dest_file = f"{patient_id}_{image_view}.png"
    class_folder = os.path.join(OUT_DIR, target_class)
    dest_path = os.path.join(class_folder, dest_file)

    # Ensure class folder exists
    os.makedirs(class_folder, exist_ok=True)

    # Convert to PNG
    dicom_to_png(source_path, dest_path)

print(f"Finished. Total not found: {not_found}")



Organizing images...


100%|██████████| 1896/1896 [1:09:41<00:00,  2.21s/it]

Finished. Total not found: 0



