In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import norm
from scipy.stats import poisson
import copy
import textwrap
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import TheilSenRegressor
import math

import numpy as np
def SumNormal(u, Z, weight):
    """
    Mean and variance of the scalar ``weight · X`` for a multivariate normal ``X ~ N(u, Z)``.

    Parameters:
    ----------
    u : np.ndarray, shape (n, 1)
        Mean column vector of X.

    Z : np.ndarray, shape (n, n)
        Covariance matrix of X.

    weight : np.ndarray, shape (1, n)
        Row vector holding the coefficients of the linear combination.

    Returns:
    ----------
    u_N : float
        ``weight · u``, the mean of the linear combination.

    sigma_N : float
        ``weight · Z · weight.T``, the variance (not the standard deviation)
        of the linear combination.
    """
    # With the documented shapes both products are (1, 1) arrays;
    # indexing with [0, 0] unwraps them into scalars.
    mean_1x1 = np.dot(weight, u)

    weighted_cov = np.dot(weight, Z)
    variance_1x1 = np.dot(weighted_cov, weight.transpose())

    return mean_1x1[0, 0], variance_1x1[0, 0]


In [None]:
import numpy as np

def calculate_coverage_rate(sampled_x, sampled_y, bins, lower_line_data, higher_line_data):
    """
    Count how many sampled points lie between a lower and an upper boundary
    that are piecewise-linear in x, and return the coverage statistics.

    Within each bin ``[bins[i], bins[i + 1]]`` the boundaries are the straight
    lines joining the boundary values given at the two bin edges.

    Parameters
    ----------
    sampled_x : 1D array-like
        x-coordinates of the sampled points.
    sampled_y : 1D array-like
        y-coordinates of the sampled points.
    bins : 1D array-like
        Bin edges along the x-axis.
    lower_line_data : 1D array-like
        Lower boundary y-value at each bin edge (length = len(bins)).
    higher_line_data : 1D array-like
        Upper boundary y-value at each bin edge (length = len(bins)).

    Returns
    -------
    coverage_rate : float
        Proportion of points that fall within the boundaries
        (``nan`` when there are no sampled points).
    covered_count : int
        Number of points within the boundaries.
    total_count : int
        Total number of sampled points. Points whose x lies outside
        ``[bins[0], bins[-1]]`` are included here but can never be covered.
    """
    sampled_x = np.asarray(sampled_x)
    sampled_y = np.asarray(sampled_y)

    total_count = len(sampled_x)
    if total_count == 0:
        # Coverage is undefined without points; avoid dividing by zero.
        return float("nan"), 0, 0

    n_bins = len(bins) - 1
    covered_count = 0

    for i in range(n_bins):
        x_start = bins[i]
        x_end = bins[i + 1]

        # A zero-width bin has no defined slope; skip it instead of dividing by zero.
        if x_end == x_start:
            continue

        # Line through the lower boundary values at the two bin edges
        y_lower_start = lower_line_data[i]
        y_lower_end = lower_line_data[i + 1]
        slope_lower = (y_lower_end - y_lower_start) / (x_end - x_start)
        intercept_lower = y_lower_start - slope_lower * x_start

        # Line through the upper boundary values at the two bin edges
        y_upper_start = higher_line_data[i]
        y_upper_end = higher_line_data[i + 1]
        slope_upper = (y_upper_end - y_upper_start) / (x_end - x_start)
        intercept_upper = y_upper_start - slope_upper * x_start

        # Bins are half-open [start, end) so that no point is counted twice,
        # except the last one, which is closed so that points sitting exactly
        # on the final edge are not silently dropped.
        if i == n_bins - 1:
            bin_mask = (sampled_x >= x_start) & (sampled_x <= x_end)
        else:
            bin_mask = (sampled_x >= x_start) & (sampled_x < x_end)
        x_bin = sampled_x[bin_mask]
        y_bin = sampled_y[bin_mask]

        # Boundary values evaluated at each point's x
        y_lower_pred = slope_lower * x_bin + intercept_lower
        y_upper_pred = slope_upper * x_bin + intercept_upper

        # A point is covered when it lies between the two boundaries (inclusive)
        covered_mask = (y_bin >= y_lower_pred) & (y_bin <= y_upper_pred)
        covered_count += int(np.count_nonzero(covered_mask))

    coverage_rate = covered_count / total_count
    return coverage_rate, covered_count, total_count

def mkdir_silent(directory_name):
    """
    Create ``directory_name`` without ever raising, printing the outcome instead.

    Missing parent directories are created as well, so a nested output path
    can be created on a fresh machine where the root output folder does not
    exist yet.

    Parameters
    ----------
    directory_name : str or path-like
        Directory to create.

    Returns
    -------
    None
        The result (created / already exists / permission denied / other
        error) is reported only through ``print``.
    """
    try:
        # makedirs (unlike mkdir) also creates intermediate directories;
        # it still raises FileExistsError when the target already exists.
        os.makedirs(directory_name)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        # Deliberate best-effort: report and carry on rather than abort the run.
        print(f"An error occurred: {e}")

import matplotlib as mpl
import matplotlib.pyplot as plt

# Global matplotlib styling, applied once for every figure drawn afterwards.
mpl.rcParams.update({
    # Font type 42 for PDF and PostScript output
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # Typeface and line weight
    'font.family': 'Arial',
    'axes.linewidth': 0.8,
    # Font sizes for axis labels, tick labels and legend
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
})

def plot_publication_scatter(
        sampled_x, sampled_y,
        bins, u_line_data,
        lower_beads, upper_beads,
        lower_cells, upper_cells,
        xlabel, ylabel, title_text, save_path,
        ymin=None, ymax=None):
    """
    Draw a scatter of sampled points with a mean line and two pairs of
    boundary lines (beads and cells), then save the figure to ``save_path``.

    Parameters
    ----------
    sampled_x, sampled_y : array-like
        Coordinates of the scatter points.
    bins : array-like
        x positions at which the line data are given.
    u_line_data : array-like
        Mean line values at ``bins`` (dashed black line).
    lower_beads, upper_beads : array-like
        Lower/upper boundary values at ``bins`` for the beads-based interval.
    lower_cells, upper_cells : array-like
        Lower/upper boundary values at ``bins`` for the cells-based interval.
    xlabel, ylabel : str
        Axis labels.
    title_text : str
        Figure title (placed above the legend).
    save_path : str
        Output file path passed to ``Figure.savefig``.
    ymin, ymax : float, optional
        y-axis limits. Any limit left as ``None`` is taken from the min/max
        over the scatter y-values and all plotted lines.

    Returns
    -------
    None
        The figure is written to disk and closed; nothing is displayed.
    """

    fig, ax = plt.subplots(figsize=(5, 4), dpi=300)

    ax.scatter(sampled_x, sampled_y, s=2, alpha=0.25,
               color="#6A8A82", edgecolor="none")

    # prediction interval (beads); only one of the two lines carries the legend label
    ax.plot(bins, lower_beads, color="#DD8452", lw=1.2)
    ax.plot(bins, upper_beads, color="#DD8452", lw=1.2,
            label="Beads 95% CI")

    # prediction interval (cells)
    ax.plot(bins, lower_cells, color="#4C72B0", lw=1.2)
    ax.plot(bins, upper_cells, color="#4C72B0", lw=1.2,
            label="Cells 95% CI")

    # mean line
    ax.plot(bins, u_line_data, color="black", lw=1.0, ls="--",
            label="Mean")

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Fill in whichever y-limit was not provided from the full range of
    # everything that is drawn, so neither points nor lines are clipped.
    if ymin is None or ymax is None:
        all_y = np.concatenate([
            np.array(sampled_y),
            np.array(u_line_data),
            np.array(lower_beads),
            np.array(upper_beads),
            np.array(lower_cells),
            np.array(upper_cells)
        ])

        if ymin is None:
            ymin = np.min(all_y)
        if ymax is None:
            ymax = np.max(all_y)

    ax.set_ylim(ymin, ymax)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.grid(alpha=0.15, lw=0.5)

    # Legend sits above the axes, in a single row
    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, 1.18),
        ncol=3,
        frameon=False)

    fig.suptitle(title_text, fontsize=9, y=1.02)

    fig.tight_layout()
    fig.savefig(save_path, dpi=600, bbox_inches="tight")
    # Close this specific figure: the function is called in a loop, and
    # unclosed figures would accumulate in memory.
    plt.close(fig)


In [9]:
# Input / output locations.
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running elsewhere.
Raw_Data_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_external/Data'
Raw_Folder_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_external/Output'
Unmix_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_beads/Output'
Par_beads_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_beads/Output'
Par_cells_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_cells/Output'
Output_dir = 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output4'

# Name fragments that distinguish the folder families
Data_string = "_External_"#_External_
Unmix_string = "_Bead_"
Bead_string = "_Bead_"
Cell_string = "_Cell_"

# os.listdir returns entries in arbitrary order, but later cells address the
# folders by hard-coded integer index. Sort case-insensitively so the order is
# deterministic on every platform.
# NOTE(review): this ordering matches the folder listing captured in this
# notebook's output; confirm the indices in the next cell still point at the
# intended fluorochromes.
Fluor_unmix_list = sorted(os.listdir(Unmix_dir), key=lambda name: (name.upper(), name))
# Bead / cell folder names are derived from the unmixing folder names by
# swapping the name fragment (a no-op for beads while both strings are equal).
Fluor_beads_list = [s.replace(Unmix_string, Bead_string) for s in Fluor_unmix_list]
Fluor_cells_list = [s.replace(Unmix_string, Cell_string) for s in Fluor_unmix_list]
Raw_Folder_list = sorted(os.listdir(Raw_Folder_dir), key=lambda name: (name.upper(), name))

In [11]:
# Keep the root output folder; the analysis loop rebinds Output_dir per fluorochrome.
Output_dir_root = Output_dir

# Indices into the fluorochrome folder list (31 entries: 0..30)
all_fluors = list(range(31))

# Fluorochromes to analyse. Alternatives used for other runs:
#scc_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]#internal
#scc_fluors = [2,3,4,5,6,7,8,9,11,12,14,15,18,19,20,22,26,27]#external
#scc_fluors = [26]#external
scc_fluors = [12]#'SCC_Bead_BV510_CD3'

print(all_fluors)
print(scc_fluors)
Fluor_list = Fluor_unmix_list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
[12]


In [None]:
# For every selected single-colour control: build the mixing matrix, unmix the
# raw sample, and for each unmixed dimension compare the observed spread with
# the spread predicted from the beads-based and cells-based covariance models.
for i_scc in scc_fluors:

    i_folder = i_scc
    # All other fluorochromes. pop() removes by position, which equals removing
    # the value i_folder because all_fluors is the identity list 0..N-1.
    added_fluors = all_fluors.copy()
    added_fluors.pop(i_folder)

    # Bind these before first use so the loop does not rely on Fluor_list
    # having been set by an earlier cell.
    Par_dir = Unmix_dir
    Fluor_list = Fluor_unmix_list

    Output_dir = Output_dir_root+"/"+Fluor_list[i_folder]
    mkdir_silent(Output_dir)

    print(Fluor_list[i_folder])

    #prepare A_pinv
    # 1 fluor and 1 af
    sig = pd.read_csv(Par_dir+"/"+Fluor_list[i_folder]+"/sig.csv",encoding="utf-8",index_col=0)
    neg_sig = pd.read_csv(Par_dir+"/"+Fluor_list[i_folder]+"/neg_sig.csv",encoding="utf-8",index_col=0)

    # Scale the negative (autofluorescence) signature so its maximum is 1
    neg_sig = neg_sig/max(neg_sig.iloc[:,0])
    A_df = pd.concat([sig, neg_sig], axis=1)
    A_df.columns = [Fluor_list[i_folder], 'AF']

    #add fluors to A_df
    print(pd.DataFrame([Fluor_list[f] for f in added_fluors]))
    for i_add in added_fluors:
        add_sig = pd.read_csv(Par_dir+"/"+Fluor_list[i_add]+"/sig.csv",encoding="utf-8",index_col=0)
        add_sig.columns = [Fluor_list[i_add]]
        A_df = pd.concat([A_df, add_sig], axis=1)

    #Calculate A_pinv
    A_pinv = np.linalg.pinv(A_df.values)

    #Step 1 real unmixed results
    # None of this depends on y_idx, so it is done once per fluorochrome
    # instead of being repeated for every unmixed dimension.
    fluor_token = Fluor_list[i_folder].split("_")[2]
    filtered_list = [s for s in Raw_Folder_list if "_"+fluor_token+"_" in s]
    if not filtered_list:
        raise FileNotFoundError(
            f"No entry in '{Raw_Folder_dir}' contains '_{fluor_token}_' "
            f"(needed for {Fluor_list[i_folder]})."
        )
    file_sample = filtered_list[0]+ "_sample.fcs.pkl"
    path = Raw_Data_dir+"/"+file_sample
    # NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
    data_sample = pd.read_pickle(path)
    # Keep only the detector channels (column positions 19..69)
    selected_channels = data_sample.columns[19:70]
    data_sample = data_sample[selected_channels]
    data_sample = data_sample.reset_index(drop=True)
    data_sample = np.array(data_sample).transpose()

    # Rows of unmixed_data follow the column order of A_df
    unmixed_data = np.dot(A_pinv,data_sample)

    # Fitted linear models (value = intercept + slope * B) for the channel mean
    # and the channel covariance; also independent of y_idx.
    intercepts_B_vs_mean = np.load(Par_beads_dir+"/"+Fluor_beads_list[i_folder]+"/intercepts_B_vs_mean.npy")
    slopes_B_vs_mean = np.load(Par_beads_dir+"/"+Fluor_beads_list[i_folder]+"/slopes_B_vs_mean.npy")

    intercepts_B_vs_cov_beads = np.load(Par_beads_dir+"/"+Fluor_beads_list[i_folder]+"/intercepts_B_vs_cov.npy")
    slopes_B_vs_cov_beads = np.load(Par_beads_dir+"/"+Fluor_beads_list[i_folder]+"/slopes_B_vs_cov.npy")

    intercepts_B_vs_cov_cells = np.load(Par_cells_dir+"/"+Fluor_cells_list[i_folder]+"/intercepts_B_vs_cov.npy")
    slopes_B_vs_cov_cells = np.load(Par_cells_dir+"/"+Fluor_cells_list[i_folder]+"/slopes_B_vs_cov.npy")

    coverage_results_beads_corrected = []
    coverage_results_cells_corrected = []
    coverage_results_beads_uncorrected = []
    coverage_results_cells_uncorrected = []
    x_idx = 0 #always 0
    # One pass per unmixed dimension (one row of A_pinv per column of A_df)
    for y_idx in range(A_pinv.shape[0]):
        print(A_df.columns[y_idx])

        #Step 2 prepare unmixed data for plot
        sample_size = 100
        sample_bin = 60
        # Extract x and y coordinates
        x = unmixed_data[x_idx]
        y = unmixed_data[y_idx]

        # Trim x to its 0.01-99.99 percentile range; y keeps its full range (0-100)
        x_lower, x_upper = np.percentile(x, 0.01), np.percentile(x, 99.99)
        y_lower, y_upper = np.percentile(y, 0), np.percentile(y, 100)
        #y_lower, y_upper = np.percentile(y, 1), np.percentile(y, 99)

        # Create a mask to filter points within the specified range
        mask = (x >= x_lower) & (x <= x_upper) & (y >= y_lower) & (y <= y_upper)
        if Fluor_list[i_folder] == 'SCC_Bead_BV510_CD3':
            mask = (x >= 100) & (x <= x_upper) & (y >= y_lower) & (y <= y_upper) #remove irregular shape negative population


        # apply mask
        x_filtered = x[mask]
        y_filtered = y[mask]

        # sample_bin equal-width bins spanning the filtered x range
        bins = np.linspace(x_filtered.min(), x_filtered.max(), sample_bin+1)

        # save sampled data
        sampled_x = []
        sampled_y = []

        # Sample sample_size points from each non-empty bin with replacement, so
        # every x range contributes equally regardless of event density.
        # NOTE(review): the sampling is unseeded, so coverage numbers vary
        # slightly from run to run.
        for i in range(sample_bin):
            bin_mask = (x_filtered >= bins[i]) & (x_filtered < bins[i+1])
            x_bin = x_filtered[bin_mask]
            y_bin = y_filtered[bin_mask]
            if len(x_bin) > 0:
                indices = np.random.choice(len(x_bin), size=sample_size, replace=True)
                sampled_x.extend(x_bin[indices])
                sampled_y.extend(y_bin[indices])
            #else:
                #print(i)
                #print(len(x_bin))

        #correct with compensation
        # Fit a robust line y ~ x and subtract it, leaving the residual spread
        model = TheilSenRegressor()
        model.fit(np.array(sampled_x).reshape(-1, 1), sampled_y)
        predicted_y = model.predict(np.array(sampled_x).reshape(-1, 1))
        sampled_y_corrected = sampled_y - predicted_y
        sampled_x_corrected = sampled_x.copy()

        # mask sampled_y_corrected: keep the central 1-99 percentile of the residuals
        sampled_x_corrected = np.array(sampled_x_corrected)
        sampled_y_corrected_lower, sampled_y_corrected_upper = np.percentile(sampled_y_corrected, 1), np.percentile(sampled_y_corrected, 99)
        mask_sampled_y_corrected = (sampled_y_corrected >= sampled_y_corrected_lower) & (sampled_y_corrected <= sampled_y_corrected_upper)
        sampled_y_corrected = sampled_y_corrected[mask_sampled_y_corrected]
        sampled_y_corrected = sampled_y_corrected.tolist()
        sampled_x_corrected = sampled_x_corrected[mask_sampled_y_corrected]
        sampled_x_corrected = sampled_x_corrected.tolist()


        # mask sampled_y: same 1-99 percentile trimming for the uncorrected values
        sampled_x = np.array(sampled_x)
        sampled_y = np.array(sampled_y)
        sampled_y_lower, sampled_y_upper = np.percentile(sampled_y, 1), np.percentile(sampled_y, 99)
        mask_sampled_y = (sampled_y >= sampled_y_lower) & (sampled_y <= sampled_y_upper)
        sampled_y = sampled_y[mask_sampled_y]
        sampled_y = sampled_y.tolist()
        sampled_x = sampled_x[mask_sampled_y]
        sampled_x = sampled_x.tolist()

        #Step 3 prepare predicted data from beads par

        pred_array_beads = np.zeros((A_pinv.shape[0],3,len(bins)))# 0: mean, 1: std, 2: bins
        pred_array_cells = np.zeros((A_pinv.shape[0],3,len(bins)))# 0: mean, 1: std, 2: bins

        for i_bin in range(len(bins)):
            # Evaluate the linear models at this bin edge
            tmp_B = bins[i_bin]
            tmp_mean = intercepts_B_vs_mean + slopes_B_vs_mean * tmp_B
            tmp_mean = tmp_mean.transpose()

            #cal for beads
            tmp_cov_beads = intercepts_B_vs_cov_beads + slopes_B_vs_cov_beads * tmp_B
            tmp_cov_beads = tmp_cov_beads[:,:,0]  # drop the trailing axis

            par_raw = np.empty((A_pinv.shape[0],3))# 0: mean, 1: std, 2: bins
            for i in range(A_pinv.shape[0]):
                A_pinv_oneline = A_pinv[i:(i+1),:]
                u_N, sigma_N = SumNormal(u=tmp_mean, Z=tmp_cov_beads, weight=A_pinv_oneline)
                # The modelled variance can come out negative; its magnitude is
                # used so the square root is defined.
                if sigma_N < 0:
                    sigma_N = - sigma_N
                sigma_N = math.sqrt(sigma_N)
                par_raw[i,0] = u_N
                par_raw[i,1] = sigma_N
                par_raw[i,2] = tmp_B
            pred_array_beads[:,:,i_bin] = par_raw

            #cal for cells
            tmp_cov_cells = intercepts_B_vs_cov_cells + slopes_B_vs_cov_cells * tmp_B
            tmp_cov_cells = tmp_cov_cells[:,:,0]  # drop the trailing axis

            par_raw = np.empty((A_pinv.shape[0],3))# 0: mean, 1: std, 2: bins
            for i in range(A_pinv.shape[0]):
                A_pinv_oneline = A_pinv[i:(i+1),:]
                u_N, sigma_N = SumNormal(u=tmp_mean, Z=tmp_cov_cells, weight=A_pinv_oneline)
                if sigma_N < 0:
                    sigma_N = - sigma_N
                sigma_N = math.sqrt(sigma_N)
                par_raw[i,0] = u_N
                par_raw[i,1] = sigma_N
                par_raw[i,2] = tmp_B
            pred_array_cells[:,:,i_bin] = par_raw


        # The band is centred on zero rather than on the predicted mean
        u_line_data = np.zeros((pred_array_beads[y_idx,0,:].shape)) #set mean to be 0
        sigma_data_beads = pred_array_beads[y_idx,1,:]
        lower_line_data_beads = u_line_data - 1.96 * sigma_data_beads
        higher_line_data_beads = u_line_data + 1.96 * sigma_data_beads

        sigma_data_cells = pred_array_cells[y_idx,1,:]
        lower_line_data_cells = u_line_data - 1.96 * sigma_data_cells
        higher_line_data_cells = u_line_data + 1.96 * sigma_data_cells

        #plot (uncorrected samples)

        coverage_rate_beads, covered_count_beads, total_count_beads = calculate_coverage_rate(sampled_x, sampled_y, bins, 
                                                                            lower_line_data_beads, higher_line_data_beads)
        coverage_rate_cells, covered_count_cells, total_count_cells = calculate_coverage_rate(sampled_x, sampled_y, bins, 
                                                                            lower_line_data_cells, higher_line_data_cells)    
        coverage_results_beads_uncorrected.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate_beads, 4),
                'Covered Count': covered_count_beads,
                'Total Count': total_count_beads
            })
        coverage_results_cells_uncorrected.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate_cells, 4),
                'Covered Count': covered_count_cells,
                'Total Count': total_count_cells
            })
        # NOTE(review): this title says "corrected compensated result" although
        # the figure shows the uncorrected samples - confirm the intended wording.
        title_text = (f"Scatter Plot of Unmixed Data (panel: 31 fluors + AF; corrected compensated result)\n"
                    f"(Coverage_beads: {coverage_rate_beads*100:.2f}%, {covered_count_beads}/{total_count_beads})\n"
                    f"(Coverage_cells: {coverage_rate_cells*100:.2f}%, {covered_count_cells}/{total_count_cells})\n")
        plot_publication_scatter(
            sampled_x, sampled_y,
            bins, u_line_data,
            lower_line_data_beads, higher_line_data_beads,
            lower_line_data_cells, higher_line_data_cells,
            xlabel=A_df.columns[x_idx],
            ylabel=A_df.columns[y_idx],
            title_text=title_text,
            save_path=Output_dir+"/fitting_scatter_uncorrected_"+A_df.columns[y_idx]+".pdf"
        )

        #plot (trend-corrected samples)
        coverage_rate_beads, covered_count_beads, total_count_beads = calculate_coverage_rate(sampled_x_corrected, sampled_y_corrected, bins, 
                                                                            lower_line_data_beads, higher_line_data_beads)
        coverage_rate_cells, covered_count_cells, total_count_cells = calculate_coverage_rate(sampled_x_corrected, sampled_y_corrected, bins, 
                                                                            lower_line_data_cells, higher_line_data_cells)    
        coverage_results_beads_corrected.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate_beads, 4),
                'Covered Count': covered_count_beads,
                'Total Count': total_count_beads
            })
        coverage_results_cells_corrected.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate_cells, 4),
                'Covered Count': covered_count_cells,
                'Total Count': total_count_cells
            })
        title_text = (f"Scatter Plot of Unmixed Data (panel: 31 fluors + AF; corrected compensated result)\n"
                    f"(Coverage_beads: {coverage_rate_beads*100:.2f}%, {covered_count_beads}/{total_count_beads})\n"
                    f"(Coverage_cells: {coverage_rate_cells*100:.2f}%, {covered_count_cells}/{total_count_cells})\n")
        plot_publication_scatter(
            sampled_x_corrected, sampled_y_corrected,
            bins, u_line_data,
            lower_line_data_beads, higher_line_data_beads,
            lower_line_data_cells, higher_line_data_cells,
            xlabel=A_df.columns[x_idx],
            ylabel=A_df.columns[y_idx],
            title_text=title_text,
            save_path=Output_dir+"/fitting_scatter_corrected_"+A_df.columns[y_idx]+".pdf"
        )

    # One summary table per (model, correction) combination for this fluorochrome
    coverage_table_beads_corrected = pd.DataFrame(coverage_results_beads_corrected)
    coverage_table_beads_corrected.to_csv(Output_dir+"/coverage_beads_corrected_summary.csv", index=False)
    coverage_table_cells_corrected = pd.DataFrame(coverage_results_cells_corrected)
    coverage_table_cells_corrected.to_csv(Output_dir+"/coverage_cells_corrected_summary.csv", index=False)
    coverage_table_beads_uncorrected = pd.DataFrame(coverage_results_beads_uncorrected)
    coverage_table_beads_uncorrected.to_csv(Output_dir+"/coverage_beads_uncorrected_summary.csv", index=False)
    coverage_table_cells_uncorrected = pd.DataFrame(coverage_results_cells_uncorrected)
    coverage_table_cells_uncorrected.to_csv(Output_dir+"/coverage_cells_uncorrected_summary.csv", index=False)
    

Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output4/SCC_Bead_BV510_CD3' created successfully.
SCC_Bead_BV510_CD3
                             0
0          SCC_Bead_AF532_CD45
1   SCC_Bead_APCeFluor780_CD34
2         SCC_Bead_APC_TCRVa72
3        SCC_Bead_BUV395_NKG2D
4         SCC_Bead_BUV496_CD19
5         SCC_Bead_BUV563_CD56
6        SCC_Bead_BUV615_CD161
7        SCC_Bead_BUV661_CD127
8        SCC_Bead_BUV737_NKp30
9         SCC_Bead_BUV805_CD16
10        SCC_Bead_BV421_NKG2C
11        SCC_Bead_BV480_KLRG1
12          SCC_Bead_BV570_CD8
13         SCC_Bead_BV650_CD38
14         SCC_Bead_BV750_CD69
15  SCC_Bead_BV785_TCRVa24Ja18
16     SCC_Bead_eFluor450_CD57
17           SCC_Bead_FITC_CD2
18    SCC_Bead_NFB61070S_HLADR
19         SCC_Bead_NFR700_CD4
20        SCC_Bead_PECy55_CD25
21        SCC_Bead_PECy5_CRTH2
22  SCC_Bead_PEDazzle594_NKG2A
23  SCC_Bead_PerCPVio700_TCRrd
24   SCC_Bead_PEVio770_TCRVb11
25          SCC_Bead_PE_KIRDL1
26        SCC_Bead_RB780_NKp46
