# This notebook performs the QC step to set up the image data for use with Image2Image translation model training and Evaluation. 
This entire data pre-processing step (and the repo in general) will be dependent on a local pediatric_cancer_atlas_profiling repo (https://github.com/WayScience/pediatric_cancer_atlas_profiling) that is ran up to 2.feature_extraction. The config.yml file will need to be configured with the correct path to the pediatric_cancer_atlas_profiling repo for this notebook to work.

This notebook relies on the whole_img_qc_output to obtain thesaturation and blur QC metrics and generates a collection of sites to be excluded from the training/evaluation. 

In [1]:
import pathlib
import yaml

import pandas as pd
import numpy as np
from scipy.stats import zscore

## Read config

In [2]:
with open(pathlib.Path('.').absolute().parent / "config.yml", "r") as file:
    config = yaml.safe_load(file)

## Define paths

In [3]:
## Access profiling repo path from config
PROFILING_DIR = pathlib.Path(config['paths']['pediatric_cancer_atlas_profiling_path'])

# Directory with QC CellProfiler outputs per plate
QC_DIR = PROFILING_DIR / "1.illumination_correction" / "whole_img_qc_output"
assert QC_DIR.exists()

# Output path for plate, well and site marked for exclusion
QC_OUTPUT_DIR = pathlib.Path('.') / 'preprocessing_output'
QC_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

## Collect files containing plate specific QC Metrics from the profiling repo

In [4]:
# Create an empty dictionary to store data frames for each plate
all_qc_data_frames = {}

# List all plate directories
plates = [plate.name for plate in QC_DIR.iterdir() if plate.is_dir()]

# Loop through each plate
for plate in plates:
    # Read in CSV with all image quality metrics per image for the current plate
    qc_df = pd.read_csv(QC_DIR / plate / "Image.csv")

    # Store the data frame for the current plate in the dictionary
    all_qc_data_frames[plate] = qc_df

# Print the plate names to ensure they were loaded correctly
print(all_qc_data_frames.keys())

# Select the first plate in the list
first_plate = plates[0]
print(f"Showing example for the first plate: {first_plate}")

# Access the dataframe for the first plate
example_df = all_qc_data_frames[first_plate]

# Show the shape and the first few rows of the dataframe for the first plate
print(example_df.shape)

dict_keys(['BR00143979', 'BR00143978', 'BR00143980', 'BR00143977', 'BR00143976', 'BR00143981'])
Showing example for the first plate: BR00143979
(1259, 147)


## Create concatenated data frames combining blur and saturation metrics from all channels for all plates

In [5]:
# Create an empty dictionary to store data frames for each channel
all_combined_dfs = {}

# Iterate through each channel
for channel in config['data']['target_channel_keys']: # excluding input Brightfield since the metrics are not robust to this type of channel
    # Create an empty list to store data frames for each plate
    plate_dfs = []

    # Iterate through each plate and create the specified data frame for the channel
    for plate, qc_df in all_qc_data_frames.items():
        plate_df = qc_df.filter(like="Metadata_").copy()

        # Add PowerLogLogSlope column (blur metric)
        plate_df["ImageQuality_PowerLogLogSlope"] = qc_df[
            f"ImageQuality_PowerLogLogSlope_{channel}"
        ]

        # Add PercentMaximal column (saturation metric)
        plate_df["ImageQuality_PercentMaximal"] = qc_df[
            f"ImageQuality_PercentMaximal_{channel}"
        ]

        # Add "Channel" column
        plate_df["Channel"] = channel

        # Add "Metadata_Plate" column
        plate_df["Metadata_Plate"] = plate

        # Append the data frame to the list
        plate_dfs.append(plate_df)

    # Concatenate data frames for each plate for the current channel
    all_combined_dfs[channel] = pd.concat(
        plate_dfs, keys=list(all_qc_data_frames.keys()), names=["Metadata_Plate", None]
    )

# Concatenate the channel data frames together for plotting
df = pd.concat(list(all_combined_dfs.values()), ignore_index=True)

print(df.shape)
df.head()

(51240, 16)


Unnamed: 0,Metadata_AbsPositionZ,Metadata_ChannelID,Metadata_Col,Metadata_FieldID,Metadata_PlaneID,Metadata_Plate,Metadata_PositionX,Metadata_PositionY,Metadata_PositionZ,Metadata_Reimaged,Metadata_Row,Metadata_Site,Metadata_Well,ImageQuality_PowerLogLogSlope,ImageQuality_PercentMaximal,Channel
0,0.134972,6,3,2,1,BR00143979,-0.000646,0.000646,-2e-06,True,3,2,C03,-2.383981,8.6e-05,OrigDNA
1,0.134559,6,3,3,1,BR00143979,0.0,0.000646,-6e-06,False,3,3,C03,-2.230812,8.6e-05,OrigDNA
2,0.134559,6,3,4,1,BR00143979,0.000646,0.000646,-2e-06,True,3,4,C03,-1.71693,0.002486,OrigDNA
3,0.134558,6,3,5,1,BR00143979,0.000646,0.0,-2e-06,True,3,5,C03,-1.64223,0.01809,OrigDNA
4,0.134567,6,3,6,1,BR00143979,-0.000646,0.0,-2e-06,True,3,6,C03,-1.528777,0.000429,OrigDNA


## Apply Z-scores threshold on all columns (channels) with all plates, sites with any channel that falls beyond the threshold will be marked for exclusion

In [6]:
# Calculate Z-scores for the column with all plates
metric_z_thresh_dict = {
    "ImageQuality_PowerLogLogSlope": 2.5,
    "ImageQuality_PercentMaximal": 2,
}

total_plate_well_site = df[["Metadata_Plate", "Metadata_Well", "Metadata_Site"]].drop_duplicates()
removed_plate_well_site = pd.DataFrame()

for metric, z_thresh in metric_z_thresh_dict.items():
    z_scores = zscore(df[metric])
    outliers = df[abs(z_scores) > z_thresh]
    removed_plate_well_site = pd.concat(
        [removed_plate_well_site, outliers[["Metadata_Plate", "Metadata_Well", "Metadata_Site"]].drop_duplicates()]
    )

print(f"Out of a total of {total_plate_well_site.shape[0]} plate, well and site combos, {removed_plate_well_site.shape[0]} ({removed_plate_well_site.shape[0] * 100 / total_plate_well_site.shape[0]:.2f}%) removed due to low quality.")

Out of a total of 10248 plate, well and site combos, 894 (8.72%) removed due to low quality.


## Export sites to be excluded as a csv

In [7]:
removed_plate_well_site.to_csv(QC_OUTPUT_DIR / 'qc_exclusion.csv', index=False)