# DATASET CURATION - MASKED ROI PROJECT


**Objectives**: 

To create the following groups:
1. **Positive group**: BIRADS 0 that became BIRADS 3, 4, 5, 6 in the subsequent diagnostic study
2. **Negative group**: screening exams rated BIRADS 1 or 2, plus screening BIRADS 0 exams that became BIRADS 1 or 2 in the subsequent diagnostic study


## 1. Prep

In [None]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from IPython.display import display

# Notebook display limits: at most 50 rows and 500 columns per rendered frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 500

In [None]:
def get_stats(df, suffix=None):
    """Print a quick summary of a dataframe.

    Prints the frame's shape followed by the number of unique patients
    (``empi_anon``), cases (``acc_anon``) and images (``png_path``). Each
    column is reported independently: a column that is absent from ``df``
    is flagged as missing instead of cutting the rest of the summary short.

    Args:
        df: DataFrame to summarise.
        suffix: Currently unused; kept so existing calls remain valid.
    """
    print(f"DF shape: {df.shape}")

    # (label, column, trailing text) - the trailing newlines reproduce the
    # blank lines printed after the case and image counts.
    summary = (
        ("# Patients", "empi_anon", ""),
        ("# Cases", "acc_anon", "\n"),
        ("# Images", "png_path", "\n"),
    )
    for label, column, trailer in summary:
        if column in df.columns:
            print(f"{label}: {df[column].nunique()}{trailer}")
        else:
            print(f"{label}: n/a (no '{column}' column){trailer}")

In [None]:
# EMBED source tables; every column is read as a string.
# TODO: change back
metadata_full = pd.read_csv(
    "/fsx1/emory-mammo/tables/filtered_metadata.csv",
    dtype=str,
)
magview_full = pd.read_csv(
    "/fsx1/emory-mammo/tables/filtered_magview.csv",
    dtype=str,
)

In [None]:
# Columns kept from the image-level METADATA table.
# ("match_level" and "ROI_coords" are intentionally left out for now;
#  TODO: "ROI_coords" may be important later if there is no ROI coords.)
meta_cols = [
    "empi_anon",
    "acc_anon",
    "ImageLateralityFinal",
    "ViewPosition",
    "study_date_anon",
    "FinalImageType",
    "png_path",
    "StudyDescription",
    "num_roi",
]

# Columns kept from the exam-level MAGVIEW table.
mag_cols = [
    "empi_anon",
    "acc_anon",
    "study_date_anon",
    "desc",
    "side",
    "asses",          # BI-RADS score of the exam
    "path_severity",  # most severe pathology result of a specimen (from path1 - path10)
    "bside",
    "procdate_anon",
    "pdate_anon",
]

In [None]:
# Independent working copies restricted to the selected columns.
metadata = metadata_full.loc[:, meta_cols].copy()
magview = magview_full.loc[:, mag_cols].copy()

In [None]:
# Parse the study dates (loaded as strings) into datetimes in both tables.
metadata["study_date_anon"] = pd.to_datetime(metadata["study_date_anon"])
magview["study_date_anon"] = pd.to_datetime(magview["study_date_anon"])

In [None]:
# ROI counts were loaded as strings; convert them to integers.
metadata["num_roi"] = metadata["num_roi"].astype(int)

## 2. METADATA: 2D MLO & CC
**MLO (Mediolateral Oblique)** and **CC (Craniocaudal)** are standard mammographic views:

- **MLO:** Captures breast tissue from the upper outer to the lower inner parts, including the chest wall area. Essential for detecting abnormalities in a large portion of the breast.
- **CC:** Provides a straight-on view from above, crucial for assessing central and inner breast tissue.

These views together offer a comprehensive evaluation of the breast in mammography.


In [None]:
# Keep only 2D images acquired in one of the two standard views (MLO or CC).
meta_2d = metadata[
    metadata["ViewPosition"].isin(["MLO", "CC"])
    & (metadata["FinalImageType"] == "2D")
]

# Summary of the retained 2D images.
get_stats(meta_2d)

In [None]:
def get_image_stats(df, meta=None):
    """Print the number of unique images and ROIs linked to the exams in ``df``.

    ``df`` is left-joined to the image metadata on ``empi_anon`` and
    ``acc_anon``. Only images whose ``ImageLateralityFinal`` equals the
    exam row's ``side`` are kept, and each ``png_path`` is counted once.

    Args:
        df: Exam-level frame providing ``empi_anon``, ``acc_anon`` and ``side``.
        meta: Image-level frame providing ``empi_anon``, ``acc_anon``,
            ``ImageLateralityFinal``, ``png_path`` and ``num_roi``.
            Defaults to the notebook-level ``meta_2d``.
    """
    if meta is None:
        meta = meta_2d

    merged = pd.merge(df, meta, on=["empi_anon", "acc_anon"], how="left")

    # Exams without any matching image have a missing laterality and are
    # dropped by this comparison as well.
    same_side = merged["side"] == merged["ImageLateralityFinal"]

    # Non-inplace de-duplication: avoids mutating a slice of ``merged``.
    images = merged.loc[same_side].drop_duplicates(subset="png_path")

    print(f"# PNG PATH: {int(images.png_path.nunique())}")
    print(f"# ROI: {int(images.num_roi.sum())}")
    print(f"{images.num_roi.value_counts()}")

## 3. Screening
### 3.1 Filter the 'magview' DataFrame for exams where the 'desc' column indicates a screening procedure.

In [None]:
# Keep only exams whose 'desc' (study description, e.g. "screening" or
# "diagnostic" mammogram) contains "screen", matched case-insensitively.
# na=False treats a missing description as "not a screening exam"; without it
# a NaN in 'desc' produces a NaN in the mask and the boolean selection fails.
screening_magview = magview.loc[
    magview.desc.str.contains("screen", case=False, na=False)
].copy()

# Display statistics for the filtered screening mammogram data
get_stats(screening_magview)

### 3.2 Creating entries for the negative contralateral breast in bilateral examinations
MAGVIEW only has entries if a finding exists. 

This means that if an exam is bilateral and only one of the breasts has a finding, the contralateral (negative) breast won't have an entry. 

This would be problematic at the time when we need to merge with METADATA, because the contralateral breast would be excluded. 

Therefore, we would need to create rows for the negative contralateral breast.

In [None]:
def get_exam_laterality(row):
    """Return the exam laterality encoded in a row's ``desc`` text.

    Intended for ``DataFrame.apply(..., axis=1)`` so that rows do not have
    to be iterated manually. The match is case-insensitive and the keywords
    are tried in the order listed below, so the first one found wins.

    Args:
        row: Object exposing a ``desc`` attribute (e.g. a DataFrame row).

    Returns:
        - "B" for bilateral exams ("bilat" found in the description).
        - "L" for left-sided exams ("left" found in the description).
        - "R" for right-sided exams ("right" found in the description).
        - None if no laterality keyword is present, or if ``desc`` is not a
          string (e.g. a missing value).
    """
    desc = row.desc
    if not isinstance(desc, str):
        # Missing descriptions (NaN) have no .lower(); report "unknown".
        return None

    desc = desc.lower()
    for keyword, code in (("bilat", "B"), ("left", "L"), ("right", "R")):
        if keyword in desc:
            return code
    return None

#### 3.2a: Processing screening_magview
This section processes the `screening_magview` DataFrame to manage bilateral exams:

- **Determine Laterality**: Classifies exams as bilateral (B), left (L), or right (R).
- **Split Bilateral Entries**: Fills missing `side` values as "B" and splits into separate left and right entries.
- **Aggregate and Complete Data**: Identifies and adds missing contralateral sides.
- **Finalize DataFrame**: Merges original with new entries, sorts, and removes duplicates.
- **Image Statistics**: Displays statistics for the processed data.

In [None]:
# Determine exam laterality (B, L, R) from the study description.
screening_magview["exam_laterality"] = screening_magview.apply(get_exam_laterality, axis=1)

# A missing 'side' is treated as a bilateral ("B") entry.
screening_magview["side"] = screening_magview["side"].fillna("B")

# Split every "B" entry into a right-side row and a left-side row.
# The right-side copy contains only "B" rows, so its side is set directly
# instead of being derived from the full frame's column via index alignment.
screening_magview_r = screening_magview.loc[screening_magview.side == "B"].copy()
screening_magview_r["side"] = "R"

# In the original frame the same "B" entries become the left-side rows.
screening_magview["side"] = screening_magview["side"].str.replace("B", "L")

# Combine left and right side DataFrames
screening_magview = pd.concat([screening_magview, screening_magview_r])
print(screening_magview.side.value_counts(dropna=False))
print(screening_magview.shape)

# Sort and remove duplicates
screening_magview = screening_magview.sort_values(
    ["empi_anon", "acc_anon", "study_date_anon"]
).drop_duplicates()

# Select bilateral exams for further processing
exam_lat_b = screening_magview.loc[screening_magview.exam_laterality == "B"]

# Concatenate the sides recorded for each bilateral exam (e.g. "LR", "R")
# so that exams having entries for a single side only can be identified.
exam_lat_b_agg = exam_lat_b.groupby("acc_anon")["side"].apply("".join).reset_index()

# Bilateral exams with no left-side entry / no right-side entry.
exam_lat_b_side_r = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("L"))].copy()
exam_lat_b_side_l = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("R"))].copy()

# Rows of the exams missing the left side; used as a template for the
# negative left-side entries.
screening_magview_right_to_left = screening_magview.loc[
    screening_magview.acc_anon.isin(exam_lat_b_side_r.acc_anon)
].copy().drop_duplicates()

# Rows of the exams missing the right side; used as a template for the
# negative right-side entries.
screening_magview_left_to_right = screening_magview.loc[
    screening_magview.acc_anon.isin(exam_lat_b_side_l.acc_anon)
].copy().drop_duplicates()

# Create the negative left side: flip the side, mark the assessment as
# negative ("N") and clear the pathology result.
screening_magview_right_to_left.loc[screening_magview_right_to_left.side == "R", "side"] = "L"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side == "L", "asses"] = "N"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side == "L", "path_severity"] = np.nan

# Create the negative right side in the same way.
screening_magview_left_to_right.loc[screening_magview_left_to_right.side == "L", "side"] = "R"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side == "R", "asses"] = "N"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side == "R", "path_severity"] = np.nan

# Merge original with contralateral entries, sort, and remove duplicates
screening_magview_with_contralat = pd.concat(
    [screening_magview, screening_magview_left_to_right, screening_magview_right_to_left]
).sort_values(["empi_anon", "acc_anon", "study_date_anon"]).drop_duplicates()

# Display image statistics for the final DataFrame
get_image_stats(screening_magview_with_contralat)

### 3.3 BIRADS 0 and BIRADS 1, 2

This section filters the data by BIRADS categories:

- **BIRADS 0 (A)**: Additional evaluation needed.
- **BIRADS 1 (N)**: Negative.
- **BIRADS 2 (B)**: Benign.

The filtered data for each category is analyzed with basic statistics and image details.

In [None]:
# BIRADS 0 is stored as "A" (additional evaluation) in the 'asses' column.
b0 = screening_magview_with_contralat[
    screening_magview_with_contralat["asses"].isin(["A"])
]

# Summaries of the BIRADS 0 exams and of their images.
get_stats(b0)
get_image_stats(b0)

In [None]:
# BIRADS 1 is stored as "N" (negative) and BIRADS 2 as "B" (benign).
b12 = screening_magview_with_contralat[
    screening_magview_with_contralat["asses"].isin(["B", "N"])
]

# Summaries of the BIRADS 1/2 exams and of their images.
get_stats(b12)
get_image_stats(b12)

## 4. Diagnostic

In [None]:
# Keep the rows of magview whose 'desc' contains "diag" (case-insensitive).
# na=False treats a missing description as "not diagnostic"; without it a NaN
# in 'desc' produces a NaN in the mask and the boolean selection fails.
diag_magview = magview.loc[magview.desc.str.contains("diag", case=False, na=False)]

# Displaying basic statistics for the diagnostic magview data
get_stats(diag_magview)
print()

# Printing the counts of 'asses' (assessment) column values in the diagnostic magview data
print(f"Asses Counts:\n{diag_magview.asses.value_counts()}")

## 5. Screening BIRADS 0 and Diagnostic

In [None]:
# Pair every BIRADS 0 screening row with all diagnostic rows of the same
# patient ('empi_anon'); diagnostic columns receive the "_dx" suffix.
b0_dx = pd.merge(b0, diag_magview, on="empi_anon", suffixes=(None, "_dx"))

# Keep pairs whose diagnostic side is the same as the screening side, is
# bilateral ("B"), or is missing.
side_matches = (
    (b0_dx.side == b0_dx.side_dx)
    | (b0_dx.side_dx == "B")
    | (b0_dx.side_dx.isna())
)
# .copy() so that the new column below is added to an independent frame
# rather than to a slice of the merge result.
b0_dx = b0_dx.loc[side_matches].copy()

# Days from the screening study to the diagnostic study.
b0_dx["delta_date_dx"] = (b0_dx.study_date_anon_dx - b0_dx.study_date_anon).dt.days

# Keep diagnostic studies performed 0 to 90 days (inclusive) after screening.
b0_dx_3mo = b0_dx.loc[b0_dx.delta_date_dx.between(0, 90)]

# Show one random record; the size is capped so an empty frame does not raise.
b0_dx_3mo.sample(min(1, len(b0_dx_3mo)))

### 5.1. BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)

In [None]:
# 5.1. BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)
# Screening BIRADS 0 rows whose diagnostic assessment is "N" or "B".
b0_12dx = b0_dx_3mo[b0_dx_3mo["asses_dx"].isin(["N", "B"])].copy()

# Summaries of these exams and of their images.
get_stats(b0_12dx)
get_image_stats(b0_12dx)

### 5.2. BIRADS 0 (Screening) --> BIRADS 3, 4, 5, 6 (Diagnostic)

In [None]:
# Screening BIRADS 0 rows whose diagnostic assessment is "P", "S", "M" or "K"
# (the codes this notebook uses for BIRADS 3, 4, 5 and 6).
b0_3456dx = b0_dx_3mo[b0_dx_3mo["asses_dx"].isin(["P", "S", "M", "K"])].copy()

# Summaries of these exams and of their images.
get_stats(b0_3456dx)
get_image_stats(b0_3456dx)

## 6. Negative group
Negative group = BIRADS 1, 2 (Screening) + BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)

In [None]:
# Stack the screening BIRADS 1/2 rows on top of the BIRADS 0 --> BIRADS 1/2
# (diagnostic) rows and discard exact duplicate rows.
neg_group = pd.concat([b12, b0_12dx]).drop_duplicates()

# Summaries of the negative group and of its images.
get_stats(neg_group)
get_image_stats(neg_group)

In [None]:
# Pair every negative-group row with all screening BIRADS 1/2 rows of the
# same patient; columns of the paired (follow-up) row get the "_1yrfu" suffix.
neg_group_b12 = pd.merge(neg_group, b12, on=["empi_anon"], suffixes=(None, "_1yrfu"))

# Keep pairs on the same side. .copy() so that the new column below is added
# to an independent frame rather than to a slice of the merge result.
neg_group_b12 = neg_group_b12.loc[
    (neg_group_b12.side == neg_group_b12.side_1yrfu)
].copy()

# Days from the initial study to the paired follow-up study.
neg_group_b12["delta_date_1yrfu"] = (
    neg_group_b12.study_date_anon_1yrfu - neg_group_b12.study_date_anon
).dt.days

# Summaries of the paired rows and of their images.
get_stats(neg_group_b12)
get_image_stats(neg_group_b12)

# Show up to 2 random records; the size is capped so a small frame does not raise.
neg_group_b12.sample(min(2, len(neg_group_b12)))

In [None]:
# Keep pairs whose follow-up study is more than 360 days after the initial one.
neg_group_1yrfu = neg_group_b12[neg_group_b12["delta_date_1yrfu"] > 360]

# Summaries of the rows with a late follow-up and of their images.
get_stats(neg_group_1yrfu)
get_image_stats(neg_group_1yrfu)

In [None]:
# Order by patient, exam and follow-up date, then keep the first row for each
# (acc_anon, side) pair, i.e. the earliest follow-up study.
neg_group_1yrfu_first_study = (
    neg_group_1yrfu
    .sort_values(by=["empi_anon", "acc_anon", "study_date_anon_1yrfu"])
    .drop_duplicates(subset=["acc_anon", "side"], keep="first")
)

# Summaries of the retained rows and of their images.
get_stats(neg_group_1yrfu_first_study)
get_image_stats(neg_group_1yrfu_first_study)

In [None]:
# Distribution of the pathology severity values among the retained rows.
neg_group_1yrfu_first_study["path_severity"].value_counts()

In [None]:
# Keep only the rows that have no recorded pathology severity (no biopsy result).
neg_group_1yrfu_first_study_no_biopsy = neg_group_1yrfu_first_study[
    neg_group_1yrfu_first_study["path_severity"].isna()
].copy()

In [None]:
# Attach the 2D images of each exam (matched on patient, exam and study date).
neg_group_1yrfu_first_study_no_biopsy_images = pd.merge(
    neg_group_1yrfu_first_study_no_biopsy,
    meta_2d,
    on=["empi_anon", "acc_anon", "study_date_anon"],
)

# Keep images whose laterality equals the study side, one row per image file.
# drop_duplicates is applied non-inplace so the result is an independent frame
# (it is not a mutated slice), which keeps later column additions warning-free.
neg_group_1yrfu_first_study_no_biopsy_images = neg_group_1yrfu_first_study_no_biopsy_images.loc[
    (neg_group_1yrfu_first_study_no_biopsy_images.side == neg_group_1yrfu_first_study_no_biopsy_images.ImageLateralityFinal)
].drop_duplicates(subset="png_path")

# Summary of the negative-group images.
get_stats(neg_group_1yrfu_first_study_no_biopsy_images)

In [None]:
# Total number of ROIs across the negative-group images.
print(f"ROIs = {neg_group_1yrfu_first_study_no_biopsy_images['num_roi'].sum()}")

# How many images have each ROI count.
print(neg_group_1yrfu_first_study_no_biopsy_images["num_roi"].value_counts())

## 7. Positive Group

In [None]:
# Attach the 2D images of each BIRADS 0 --> BIRADS 3, 4, 5, 6 exam
# (matched on patient, exam and study date).
pos_group_images = pd.merge(b0_3456dx, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"])

# Keep images whose laterality equals the study side, one row per image file.
# drop_duplicates is applied non-inplace so the result is an independent frame
# instead of a mutated slice.
pos_group_images = pos_group_images.loc[
    (pos_group_images.side == pos_group_images.ImageLateralityFinal)
].drop_duplicates(subset="png_path")

# Summary of the positive-group images.
get_stats(pos_group_images)

In [None]:
# Total number of ROIs across the positive-group images.
print(f"ROIs  = {pos_group_images['num_roi'].sum()}")

# How many images have each ROI count.
print(pos_group_images["num_roi"].value_counts())

## 8. Excluding Images from the Negative Group that are found in the Positive Group using acc_anon and side

In [None]:
# Inner-join the negative and positive image tables on patient, exam and side.
# Rows in the result are exam sides present in BOTH groups; the remaining
# columns are suffixed "_neg" / "_pos" according to their origin.
neg_pos = pd.merge(
    neg_group_1yrfu_first_study_no_biopsy_images,
    pos_group_images,
    on=["empi_anon", "acc_anon", "side"],
    suffixes=["_neg", "_pos"],
)

# Show up to 2 random records. The size is capped because the overlap may be
# empty, in which case sampling 2 rows would raise a ValueError.
neg_pos.sample(min(2, len(neg_pos)))

In [None]:
# Build the 'acc_anon_side' key by concatenating 'acc_anon' and 'side'.
neg_pos["acc_anon_side"] = neg_pos["acc_anon"] + neg_pos["side"]

# Show up to 2 random records to check the new key. The size is capped because
# neg_pos may be empty, in which case sampling 2 rows would raise a ValueError.
neg_pos.sample(min(2, len(neg_pos)))

In [None]:
# Build the same 'acc_anon' + 'side' key on the negative-group images.
neg_group_1yrfu_first_study_no_biopsy_images["acc_anon_side"] = (
    neg_group_1yrfu_first_study_no_biopsy_images["acc_anon"]
    + neg_group_1yrfu_first_study_no_biopsy_images["side"]
)
# Two random records to check the new key.
neg_group_1yrfu_first_study_no_biopsy_images.sample(2)

In [None]:
# Drop every negative-group image whose 'acc_anon_side' key also occurs in the
# negative/positive overlap table.
neg_group_final = neg_group_1yrfu_first_study_no_biopsy_images[
    ~neg_group_1yrfu_first_study_no_biopsy_images["acc_anon_side"].isin(neg_pos["acc_anon_side"])
]
# Two random records from the final negative group.
neg_group_final.sample(2)

In [None]:
# Summary of the final negative group.
get_stats(neg_group_final)

# Total number of ROIs across the final negative-group images.
print(f"ROIs  = {neg_group_final['num_roi'].sum()}")

# How many images have each ROI count.
print(neg_group_final["num_roi"].value_counts())

## 9. Saving and Exporting

Columns written to both output CSV files:

    'empi_anon',               # Anonymous patient identifier
    'acc_anon',                # Anonymous accession number
    'desc',                    # Study description
    'asses',                   # Assessment result from the initial study
    'asses_dx',                # Assessment result from the diagnostic study
    'path_severity',           # Pathology severity
    'study_date_anon',         # Anonymous date of the initial study
    'study_date_anon_dx',      # Anonymous date of the diagnostic study
    'side',                    # Side of the body (e.g., left or right)
    'ImageLateralityFinal',    # Final image laterality
    'bside',                   # Biopsy side
    'ViewPosition',            # Position of the view in the image
    # 'match_level',           # (Commented out) Level of matching between studies
    'num_roi',                 # Number of regions of interest (ROIs)
    # 'ROI_coords',            # (Commented out) Coordinates of the ROIs
    'png_path',                # File path to the image

In [None]:
# Columns written to the exported CSV files, in output order.
# ("match_level" and "ROI_coords" are currently not exported.)
columns_to_save = [
    "empi_anon",
    "acc_anon",
    "desc",
    "asses",
    "asses_dx",
    "path_severity",
    "study_date_anon",
    "study_date_anon_dx",
    "side",
    "ImageLateralityFinal",
    "bside",
    "ViewPosition",
    "num_roi",
    "png_path",
]

In [None]:
# Write the selected columns of the final negative group to CSV (no index column).
neg_group_final.loc[:, columns_to_save].to_csv("NEGATIVE_GROUP.csv", index=False)

In [None]:
# Write the selected columns of the positive group to CSV (no index column).
pos_group_images.loc[:, columns_to_save].to_csv("POSITIVE_GROUP.csv", index=False)

# END