In [None]:
# Imports
import os
import logging

import numpy as np
import pandas as pd

import skimage.measure as measure

import line_utils
import image_utils
import file_utils
import pca_utils

# Route log records (DEBUG and above) to 'pseudotime_run.log', timestamped
# to the second. The path is relative, so the file lands in the current
# working directory.
logging.basicConfig(
    level=logging.DEBUG,
    filename='pseudotime_run.log',
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Named logger used by every cell in this notebook
logger = logging.getLogger('pseudotime')
# Uncomment to echo log records to the notebook output as well:
# logger.addHandler(logging.StreamHandler())

In [None]:
# Load the target definitions from 'targets.yaml'. Later cells iterate over
# targets.items() and read each entry's 'workbook' and 'workbook_sheet_name'
# keys; entries without a 'workbook' key are skipped there.
targets = file_utils.load_targets('targets.yaml')

# When using the aggregate table loader (CellPose PCA), this workbook is read
# INSTEAD of the per-target workbooks listed in `targets`.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
file_path = r"C:\Users\AG Ewers\Desktop\Cytokinesis PCA Software\Final_Feature_Excel_File.xlsx"

# Name of the metrics column that holds the stage label
stage_key = "Stage"

# Name of the metrics column that receives the pseudotime bin number
time_key = "Frame"

# ---- Binning ----
# Number of pseudotime bins
n_time_bins = 35
# 'equal-width': split the PCA fit into equal chunks along the line
# 'equal-size' : split the fit so the same number of points are in each bin
binning = 'equal-size'
# Fraction of the bin size to overlap
overlap = 1

# ---- Image layout ----
# Channels per image (TODO: Auto detect)
n_ch = 4

# Wavelengths to be found in the file names.
# A sublist groups several spellings into one channel; the first element of
# each sublist is used as the group name and must be a number.
wvls = [
    488,
    [568, "orange"],
    [646, 647, 657],
]

# Length handed to the line-profile helpers
length = 500

# Pixel sizes (we assume they are constant)
dx = dy = 0.09
dz = 1

# ---- Features the PCA is performed on ----
features = [
    "areashape_maximumradius",
    "areashape_meanradius",
    "areashape_equivalentdiameter",
    "areashape_minoraxislength",
    "areashape_eccentricity",
    "septin_delta_ef",
    "diama_micron",
    "diamm_micron",
    "ratio_diam",
    "abs_key",
    "intensity_integratedintensity_input_microtubule",
    "intensity_integratedintensity_input_septin",
]

In [None]:
# Go to each target's workbook and compute necessary additional metrics,
# including distance between septin rings, septin ring diameter, and
# microtubule bundle width. The augmented table is written back into the same
# workbook on a sheet named "<workbook_sheet_name>_processed".

# Width passed as `linewidth` to every skimage profile_line call below
profile_linewidth = 25

for k, v in targets.items():
    try:
        logger.debug(f"Accessing {os.path.basename(v['workbook'])}")
    except KeyError:
        # Not a target with a workbook
        continue

    # Pre-cleaned metrics
    metrics = file_utils.load_workbooks({k: v})

    # Establish columns for septin peak locations (x_septin_1, x_septin_2) and distance between them (dx_septin)
    metrics['dx_septin'], metrics['x_septin_1'], metrics['x_septin_2'] = np.nan, np.nan, np.nan

    # Establish empty columns for new metrics. They are initialised as float
    # (0.0, not 0): the per-row assignments below store floats, and writing a
    # float into an int64 column relies on silent upcasting, which pandas has
    # deprecated.
    metrics['diamA_micron'] = 0.0
    metrics['diamM_micron'] = 0.0
    metrics['delta_diam'] = 0.0
    metrics['ratio_diam'] = 0.0

    # Names that identify the microtubule / septin channel. Looked up once per
    # workbook instead of once per channel per row; list() materialises the
    # result so it can be iterated repeatedly.
    mt_names = list(image_utils.target_names(targets, "MTs"))
    septin_names = list(image_utils.target_names(targets, "septin"))

    for i, ml in metrics.iterrows():
        logger.debug(f"  Septin ring fit for {os.path.basename(ml['filename'])}")

        # Get the image associated with this row and load it with the channels sorted from high to low
        im = image_utils.NDImage(ml["filename"], load_sorted=True)

        # get x, y, angle for this row
        x, y, angle = ml[["X", "Y", "Angle"]]

        # find wavelengths in file name
        wvls_dict, binned_wvls = image_utils.extract_channel_targets_from_filename(ml["filename"], wvls=wvls)

        # Establish target names in this data set and sort from high to low to match image load
        channel_targets = [wvls_dict[str(wvl)] for wvl in sorted(binned_wvls)[::-1]]

        # If fewer than n_ch channels were named, the last channel is taken to be DAPI
        if len(channel_targets) < n_ch:
            channel_targets.append("DAPI")

        # --------- Fit septin peaks ---------

        # Index of the first channel whose target name matches; the [0] raises
        # IndexError when no channel of this image matches.
        mt_ch = [ch for ch, t in enumerate(channel_targets) if any(t == n for n in mt_names)][0]
        septin_ch = [ch for ch, t in enumerate(channel_targets) if any(t == n for n in septin_names)][0]

        # Average over axis 1 and drop singleton axes
        # (presumably a z-projection - TODO confirm against NDImage's axis order)
        im_proj = im[:].mean(1).squeeze()
        p0, p1, dX2 = line_utils.find_septin_peaks(im_proj, x, y, angle, length,
                                                    mt_ch=mt_ch,
                                                    septin_ch=septin_ch)

        metrics.loc[i,['x_septin_1','x_septin_2','dx_septin']] = [p0, p1, dX2]

        # --------- Get the number of occupied pixels in the MT profile ---------
        # This is important for distinguishing abscission from others
        xl, xu, yl, yu = line_utils.get_line_profile_endpoints(x, y, angle, length)

        chs = measure.profile_line(im_proj, [xl, yu], [xu, yl], linewidth=profile_linewidth)

        mt, septin = chs[:,mt_ch], chs[:,septin_ch]

        # Min-max normalise the MT profile (yields NaN for a perfectly flat profile)
        mt_min, mt_max = np.min(mt), np.max(mt)
        mt_norm = (mt-mt_min)/(mt_max-mt_min)
        metrics.loc[i,'fill_microtubule'] = np.sum(mt_norm)/(length*profile_linewidth)

        # --------- Get the ratio of one side of the MT profile to the other ---------
        # Second half is reversed so both halves run from the outside inwards
        mt_mid = len(mt) // 2
        half1, half2 = mt_norm[:mt_mid], mt_norm[mt_mid:][::-1]

        # Demand the split be the same length (it should be, but in case it's odd line length)
        if len(half1) != len(half2):
            clipto = min(len(half1), len(half2))
            half1 = half1[:clipto]
            half2 = half2[:clipto]

        half1s, half2s = half1.sum(), half2.sum()
        metrics.loc[i,'balance_microtubule'] = min(half1s,half2s)/max(half1s,half2s)

        # balance_microtubule2 puts the half with more signal in the denominator,
        # balance_microtubule3 is the inverse ratio. 1e-6 guards against division by zero.
        if half1s <= half2s:
            metrics.loc[i,'balance_microtubule2'] = np.nanmean(half1/(half2+1e-6))
            metrics.loc[i,'balance_microtubule3'] = np.nanmean(half2/(half1+1e-6))
        else:
            metrics.loc[i,'balance_microtubule2'] = np.nanmean(half2/(half1+1e-6))
            metrics.loc[i,'balance_microtubule3'] = np.nanmean(half1/(half2+1e-6))

        # --------- Get the ratio of septin signal/mt signal ---------
        # septin_norm / mt_norm2 are scaled by the joint septin+MT range,
        # septin_norm2 / mt_norm by the MT range alone.
        sm_min = min(np.min(septin), np.min(mt))
        sm_max = max(np.max(septin), np.max(mt))
        septin_norm = (septin-sm_min)/(sm_max-sm_min)
        septin_norm2 = (septin-mt_min)/(mt_max-mt_min)
        mt_norm2 = (mt-sm_min)/(sm_max-sm_min)
        metrics.loc[i,'balance_septin'] = np.mean(septin_norm/(mt_norm2+1e-6))
        metrics.loc[i,'balance_septin2'] = np.mean(septin_norm**2/(mt_norm2+1e-6))
        metrics.loc[i,'balance_septin3'] = np.mean(septin_norm2/(mt_norm+1e-6))
        metrics.loc[i,'balance_septin4'] = np.mean(septin_norm2**2/(mt_norm+1e-6))

        # --------- Fit septin rings ---------
        # Convert the two peak positions along the profile into image coordinates
        p0x, p0y = line_utils.get_image_coordinate_from_distance_along_line(p0, xl, xu, yl, yu, len(septin))
        p1x, p1y = line_utils.get_image_coordinate_from_distance_along_line(p1, xl, xu, yl, yu, len(septin))

        # Orthogonal (angle-90) line profile through the first septin peak
        xl3, xu3, yl3, yu3 = line_utils.get_line_profile_endpoints(p0x, p0y, angle-90, length)
        chs = measure.profile_line(im_proj.T, [xl3, yu3], [xu3, yl3], linewidth=profile_linewidth)
        septin_ring0 = chs[:,septin_ch]

        ring0_diameter, res_lsq_ring0 = line_utils.fit_gaussian_fwhm(septin_ring0, return_dict=True)

        # Orthogonal (angle-90) line profile through the second septin peak
        xl4, xu4, yl4, yu4 = line_utils.get_line_profile_endpoints(p1x, p1y, angle-90, length)
        chs = measure.profile_line(im_proj.T, [xl4, yu4], [xu4, yl4], linewidth=profile_linewidth)
        septin_ring1 = chs[:,septin_ch]

        ring1_diameter, _ = line_utils.fit_gaussian_fwhm(septin_ring1, return_dict=True)

        # Report the mean of the two ring fits
        metrics.loc[i,'diam_septin_ring'] = 0.5*(ring0_diameter + ring1_diameter)

        # --------- Fit center MT cross-section ---------
        # Now get the orthogonal line profile at the line center
        xl2, xu2, yl2, yu2 = line_utils.get_line_profile_endpoints(x, y, angle-90, length)
        chs = measure.profile_line(im_proj.T, [xl2, yu2], [xu2, yl2], linewidth=profile_linewidth)
        mt = chs[:,mt_ch]

        # Zach original
        # outer_diameter, res_lsq = line_utils.fit_tubule_diameter(mt, return_dict=True)
        # metrics.loc[i,'diam_microtubule'] = outer_diameter

        # outer diameter modified for Expansion Factor 24.04.2025
        try:
            # outer_diameter, res_lsq = line_utils.fit_tubule_diameter(mt, return_dict=True)
            outer_diameter, res_lsq = line_utils.fit_gaussian_fwhm(mt, return_dict=True)
        except ValueError:
            # Keep going with a diameter of 0, but leave a trace in the log so
            # failed fits can be told apart from genuine zeros.
            logger.warning(f"  MT cross-section fit failed for {os.path.basename(ml['filename'])}; diam_microtubule set to 0")
            outer_diameter = 0
        # Scale by pixel size divided by the row's expansion factor (EF column)
        metrics.loc[i,'diam_microtubule'] = (outer_diameter) * (dx/metrics.loc[i,'EF'])

        # --------- prepared from manually measured data ---------
        # All four values derive from the workbook's FWHM, diamM and EF columns.

        # Get diamA_micron
        metrics.loc[i,'diamA_micron'] = (metrics.loc[i,'FWHM'] / 1000) * (dx/metrics.loc[i,'EF'])

        # Get diamM_micron
        metrics.loc[i,'diamM_micron'] = (metrics.loc[i,'diamM'] / 1000) * (dx/metrics.loc[i,'EF'])

        # Get delta_diam
        metrics.loc[i,'delta_diam'] = ((metrics.loc[i,'diamM']) - (metrics.loc[i,'FWHM'])) / 1000 * (dx/metrics.loc[i,'EF'])

        # Get ratio_diam
        metrics.loc[i,'ratio_diam'] = (metrics.loc[i,'FWHM']) / (metrics.loc[i,'diamM'])

    # Append the augmented table to the existing workbook, replacing a
    # previously written "_processed" sheet of the same name.
    # TODO: Should the behaviour be replace or new?
    with pd.ExcelWriter(v['workbook'], mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
        metrics.to_excel(writer, sheet_name=f"{v['workbook_sheet_name']}_processed")

In [None]:
# Alternative loader for aggregate metrics
# WARNING: IF YOU RUN THIS, DO NOT RUN THE CELL BELOW.

# The aggregate table names its stage column in lowercase. Set this BEFORE
# filtering so the dropna below checks the actual stage column.
stage_key = "stage"

metrics = pd.read_excel(file_path)

# Keep only rows that have every PCA feature and a stage label.
# (Pass the variable, not the literal string "stage_key", which is not a
# column of the table.)
metrics = metrics.dropna(subset=features + [stage_key])

In [None]:
# Now let's go grab the new calculations.
# Build per-target copies that point at the "<sheet>_processed" sheets. Copying
# each entry (rather than aliasing `targets`) leaves `targets` untouched, so
# this cell can be re-run without stacking "_processed" suffixes and the
# metrics cell above still reads the original sheets.
targets_processed = {k: dict(v) for k, v in targets.items()}
for k, v in targets_processed.items():
    try:
        v["workbook_sheet_name"] = f"{v['workbook_sheet_name']}_processed"
    except KeyError:
        # Not a target with a workbook sheet; leave the entry as it is
        continue
    # The processed sheet is written by DataFrame.to_excel, so its header is the first row
    v["workbook_header_row"] = 0

# Load aggregated data from workbooks
metrics = file_utils.load_workbooks(targets_processed)

In [None]:
# Compute PCA on the aggregated data and fit a polynomial through the space,
# then assign every fitted row to one or more pseudotime bins (time_key).
# NOTE: this cell appends rows to `metrics`; re-running it on the same
# `metrics` adds further duplicates, so reload `metrics` first.
metrics_features = metrics[features]
# metrics_features = metrics[metrics[stage_key] != 'A'][features]
pca, coords, fit = pca_utils.pca(metrics_features, transform=True, fit=True)

# bins is an index into coords, which 0-indexes whatever is fed as features to pca_utils.pca;
# bins_inds translates those positions back into row labels of `metrics`
bins_inds = metrics_features.index

# Now add the PCA results as columns
xx, yy = coords
metrics.loc[bins_inds,"pca0"] = xx
metrics.loc[bins_inds,"pca1"] = yy

bins = pca_utils.sort_by_point_plane_dist(xx, yy, fit, nbins=n_time_bins, binning=binning, overlap=overlap)

for i, bin_members in enumerate(bins):
    # Row labels of the points that fall into bin i
    bin_labels = bins_inds[bin_members]

    # Deal with the first instance (the time_key column does not exist yet)
    try:
        curr_bin = metrics.loc[bin_labels, time_key]
    except KeyError:
        metrics.loc[bin_labels, time_key] = int(i)
        continue

    # Anything that's empty gets set to the current value
    unassigned = curr_bin.isna()
    metrics.loc[bin_labels[unassigned], time_key] = int(i)

    # At least one of these is already in one bin, so lets make a duplicate
    for j in bin_members[~unassigned]:
        # Pick a row label that is guaranteed to be unused. len(metrics) is
        # only free when the index is a gapless 0..n-1 range; after rows have
        # been dropped it can name an existing row, which would be overwritten.
        new_label = len(metrics)
        while new_label in metrics.index:
            new_label += 1
        metrics.loc[new_label] = metrics.loc[bins_inds[j]]
        metrics.loc[new_label, time_key] = int(i)


In [None]:
# ALERT! Run this cell if one of the stages was left out of the fit,
# e.g. you set metrics_features = metrics[metrics[stage_key] != 'A'][features].
# Every row that still has no time bin is assigned to a new final time point
# (one past the largest bin number in use).
unbinned_labels = metrics.index[metrics[time_key].isna()]
metrics.loc[unbinned_labels, time_key] = metrics[time_key].max() + 1

In [None]:
# Display the PCA results: the fitted points in PC0/PC1 space coloured by
# stage, with the fitted polynomial drawn on top as a dashed black line.
import matplotlib.pyplot as plt
# import matplotlib.patches as patches

fig, ax = plt.subplots(1)

# Stage label -> running number, in order of first appearance in the table
stage_keys = {label: n for n, label in enumerate(metrics[stage_key].unique())}
# stage_colors = [stage_keys[x] for x in metrics["Stage"]]

# Hex colour used for each stage label
colors = {
    'A': '#DDCC77',
    'BA': '#999933',
    'CS': '#88CCEE',
    'RC': '#332288',
    'RS': '#44AA99',
    'SM': '#117733',
}

# One colour per fitted point; a stage label missing from `colors` raises KeyError
colors_ls = [colors[stage] for stage in metrics.loc[bins_inds, stage_key]]

# Alternative: colour by time bin instead of by stage
# colors_ls = [list(colors.values())[x % len(colors)] for x in metrics.loc[bins_inds, time_key].astype(int)]

scatter = ax.scatter(xx, yy, s=5, alpha=0.7, c=colors_ls)
# scatter = ax.scatter(xx, yy, c=stage_colors, cmap='gist_rainbow_r', s=20)

# Evaluate the fitted polynomial on 300 evenly spaced points across the PC0 range
xxx = np.linspace(np.min(xx), np.max(xx), 300)
ax.plot(xxx, np.poly1d(fit)(xxx), c='k', linestyle='--')

# Per-point result of pca_utils.point_poly_dist (presumably the distance to the
# fit and the matching point on it - confirm in pca_utils), plus the ordering
# of the points by xp. Only needed for the optional connector lines below.
dist, xp, yp = pca_utils.point_poly_dist(xx, yy, fit)
permutation = np.argsort(xp)

# Optional: draw a connector from every point to its (xp, yp)
# for i in range(len(xp)):
#     ax.plot([xx[i], xp[i]], [yy[i], yp[i]], c='b', linewidth=0.5)

# Legend attempts, currently disabled
# ax.legend(scatter.legend_elements(num=len(stage_keys)-1)[0],list(stage_keys.keys()))
# ax.legend([patches.Circle((0,0), 1, fc=colors[x]) for x in metrics[stage_key]], list(stage_keys.keys()))
ax.set_xlabel('PC0')
ax.set_ylabel('PC1')
# ax.set_xlim([-4,7])
# ax.set_ylim([-2,14])

#fig.savefig('20250508_pca_color_s5_opacity7_EF_adjusted_diamM_new_2.svg')


In [None]:
# Show, for every principal component, the features ranked by the absolute
# value of their loading. Each component contributes two columns: the ranked
# feature names ("feature<i> (explained variance ...)") and the matching
# absolute loadings ("feature<i>_importance"). Rows run from the most
# important feature (top) to the least important (bottom).
abs_loadings = np.abs(pca.components_)

# Per component: feature names ordered by descending |loading| ...
ranked_names = np.array(features)[np.argsort(abs_loadings, axis=1)[:, ::-1]]
# ... and the |loading| values themselves in the same descending order
ranked_loadings = np.sort(abs_loadings, axis=1)[:, ::-1]

name_columns = {
    f"feature{i} (explained variance {pca.explained_variance_ratio_[i]:.2f})": k
    for i, k in enumerate(ranked_names)
}
loading_columns = {
    f"feature{i}_importance": k
    for i, k in enumerate(ranked_loadings)
}

# Name columns first, then loading columns; last expression so the table is displayed
pd.DataFrame({**name_columns, **loading_columns})

In [None]:
# Alternative save for aggregate metrics
# WARNING: IF YOU RUN THIS, DO NOT RUN THE CELL BELOW.

# Discard columns that contain no data at all
metrics_merged = metrics.dropna(axis=1, how='all')

# Append to the aggregate workbook as sheet "processed", replacing any
# earlier sheet of that name
writer_options = dict(mode="a", engine="openpyxl", if_sheet_exists="replace")
with pd.ExcelWriter(file_path, **writer_options) as writer:
    metrics_merged.to_excel(writer, sheet_name="processed")

In [None]:
# Now map the features back onto their original files: for every target that
# has a workbook, reload its sheet, attach the PCA / time-bin columns and write
# the result back to that sheet.
for k, v in targets_processed.items():
    try:
        workbook_path = v['workbook']
    except KeyError:
        # Not a target with a workbook
        continue
    logger.debug(f"Accessing {os.path.basename(workbook_path)}")

    # Pre-cleaned metrics of this one target
    metrics_processed = file_utils.load_workbooks({k: v})

    # Left-merge the PCA information (no `on=` given, so pandas joins on all
    # columns the two tables share), then drop columns that are entirely empty
    metrics_merged = (
        metrics_processed
        .merge(metrics, how="left")
        .dropna(axis=1, how='all')
    )

    # Write back, replacing the sheet the data was loaded from
    with pd.ExcelWriter(workbook_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
        metrics_merged.to_excel(writer, sheet_name=v['workbook_sheet_name'])