In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import norm
from scipy.stats import poisson
import copy
import textwrap
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from sklearn.linear_model import LinearRegression
import math

import numpy as np
def SumNormal(u, Z, weight):
    """
    Calculates the mean and variance of a linear combination of normally distributed variables.

    Parameters:
    ----------
    u : np.ndarray, shape (n, 1)
        Mean vector of the multivariate normal distribution.

    Z : np.ndarray, shape (n, n)
        Covariance matrix of the multivariate normal distribution.

    weight : np.ndarray, shape (1, n)
        Weight vector used to compute the linear combination.

    Returns:
    ----------
    u_N : float
        The mean of the resulting linear combination.

    sigma_N : float
        The variance of the resulting linear combination.
    """
    # Compute the weighted mean: u_N = weight · u
    u_N = np.dot(weight, u)[0, 0]

    # Compute the weighted variance: sigma_N = weight · Z · weight.T
    sigma_N = np.dot(np.dot(weight, Z), weight.transpose())[0, 0]

    return u_N, sigma_N


In [None]:
import numpy as np
import random
from matplotlib.colors import LinearSegmentedColormap
def calculate_coverage_rate(sampled_x, sampled_y, bins, lower_line_data, higher_line_data):
    """
    Determine whether sampled points fall between the upper and lower boundaries
    of each bin, and return the coverage statistics.

    Parameters
    ----------
    sampled_x : 1D array
        x-coordinates of the sampled points.
    sampled_y : 1D array
        y-coordinates of the sampled points.
    bins : 1D array
        Bin edges along the x-axis.
    lower_line_data : 1D array
        Lower boundary y-values for each bin (length = len(bins)).
    higher_line_data : 1D array
        Upper boundary y-values for each bin (length = len(bins)).

    Returns
    -------
    coverage_rate : float
        Proportion of points that fall within the boundaries.
    covered_count : int
        Number of points within the boundaries.
    total_count : int
        Total number of sampled points.
    """

    sampled_x = np.array(sampled_x)
    sampled_y = np.array(sampled_y)
    covered_count = 0

    for i in range(len(bins) - 1):
        x_start = bins[i]
        x_end = bins[i + 1]

        # Fit the lower boundary line
        y_lower_start = lower_line_data[i]
        y_lower_end = lower_line_data[i + 1]
        slope_lower = (y_lower_end - y_lower_start) / (x_end - x_start)
        intercept_lower = y_lower_start - slope_lower * x_start

        # Fit the upper boundary line
        y_upper_start = higher_line_data[i]
        y_upper_end = higher_line_data[i + 1]
        slope_upper = (y_upper_end - y_upper_start) / (x_end - x_start)
        intercept_upper = y_upper_start - slope_upper * x_start

        # Identify points belonging to the current bin
        bin_mask = (sampled_x >= x_start) & (sampled_x < x_end)
        x_bin = sampled_x[bin_mask]
        y_bin = sampled_y[bin_mask]

        # Compute the predicted lower and upper boundary values for each point
        y_lower_pred = slope_lower * x_bin + intercept_lower
        y_upper_pred = slope_upper * x_bin + intercept_upper

        # Determine whether points fall between the boundaries
        covered_mask = (y_bin >= y_lower_pred) & (y_bin <= y_upper_pred)
        covered_count += np.sum(covered_mask)

    total_count = len(sampled_x)
    coverage_rate = covered_count / total_count
    return coverage_rate, covered_count, total_count

def mkdir_silent(directory_name):
    try:
        os.mkdir(directory_name)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Global styling configuration (set once)
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'
mpl.rcParams['axes.linewidth'] = 0.8
mpl.rcParams['axes.labelsize'] = 9
mpl.rcParams['xtick.labelsize'] = 8
mpl.rcParams['ytick.labelsize'] = 8
mpl.rcParams['legend.fontsize'] = 8

def plot_publication_scatter(
        sampled_x, sampled_y,
        bins, u_line_data,
        lower_line_data, upper_line_data,
        xlabel, ylabel, title_text, save_path,
        ymin=None, ymax=None,
        scatter_color="#6A8A82",
        line_color = "#DD8452"):


    fig, ax = plt.subplots(figsize=(5, 4), dpi=300)

    ax.scatter(sampled_x, sampled_y, s=2, alpha=0.25,
               color=scatter_color, edgecolor="none")

    ax.plot(bins, u_line_data, color="black", lw=1.0, ls="--", label="Mean")

    ax.plot(bins, lower_line_data, color=line_color, lw=1.2)
    ax.plot(bins, upper_line_data, color=line_color, lw=1.2,
            label="Prediction 95% CI")

    # automatic y‑axis range
    if ymin is None or ymax is None:
        all_y = np.concatenate([
            np.array(sampled_y),
            np.array(u_line_data),
            np.array(lower_line_data),
            np.array(upper_line_data)
        ])
        ymin_auto = np.min(all_y)
        ymax_auto = np.max(all_y)
        ymin = ymin if ymin is not None else ymin_auto
        ymax = ymax if ymax is not None else ymax_auto

    ax.set_ylim(ymin, ymax)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.grid(alpha=0.15, lw=0.5)

    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, 1.18),
        ncol=3,
        frameon=False
    )

    fig.suptitle(title_text, fontsize=9, y=1.02)

    plt.tight_layout()
    fig.savefig(save_path, dpi=600, bbox_inches="tight")
    plt.close()


In [2]:
#Set Data dir and Output dir
Raw_Data_dir = 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Data'
Data_dir = 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output'
Output_dir = 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3'
Data_list =  os.listdir(Data_dir)

In [3]:
Output_dir_root = Output_dir
all_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
scc_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]#internal
#scc_fluors = [9]#internal SCC_Cell_BUV805_CD16
#all_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]#31C
#all_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]#21C
#all_fluors = [0,1,2,3,4,5,6,7,8,9,10]#11C
#scc_fluors = [2,3,4,5,6,7,8,9,11,12,14,15,18,19,20,22,26,27]#external
print(all_fluors)
print(scc_fluors)
Fluor_list = Data_list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [None]:
#internal validation Xenith
random.seed(10)
for i_scc in scc_fluors:

    i_folder = i_scc
    added_fluors = all_fluors.copy()
    added_fluors.pop(i_folder)

    Output_dir = Output_dir_root+"/"+Fluor_list[i_folder]
    mkdir_silent(Output_dir)

    print(Data_list[i_folder])
    est_pd_neg = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_neg.csv",encoding="utf-8",index_col=0)
    est_pd_pos = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_pos.csv",encoding="utf-8",index_col=0)
    est_pd_fluor = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_fluor.csv",encoding="utf-8",index_col=0)
    cov_matrices = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices.npy")
    mean_std_matrix = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix.npy") # 0: mean, 1: std, 2: bin midpoint
    cov_matrices_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices_neg.npy")
    mean_std_matrix_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix_neg.npy") # 0: mean, 1: std

    intercepts_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_cov.npy")
    intercepts_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_mean.npy")
    intercepts_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_std.npy")
    slopes_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_cov.npy")
    slopes_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_mean.npy")
    slopes_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_std.npy")

    sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/sig.csv",encoding="utf-8",index_col=0)
    neg_sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/neg_sig.csv",encoding="utf-8",index_col=0)


    file_sample = Data_list[i_folder] + "_sample.fcs.pkl"
    #get neg signature
    path = Raw_Data_dir+"/"+file_sample
    data_neg = pd.read_pickle(path)  
    selected_channels = data_neg.columns[19:70]
    channels_id = selected_channels
    
    #prepare A_pinv
    # 1 fluor and 1 af
    neg_sig = neg_sig/max(neg_sig.iloc[:,0])
    A_df = pd.concat([sig, neg_sig], axis=1)
    A_df.columns = [Data_list[i_folder], 'AF']

    for i_add in added_fluors:
        add_sig = pd.read_csv(Data_dir+"/"+Data_list[i_add]+"/sig.csv",encoding="utf-8",index_col=0)
        add_sig.columns = [Data_list[i_add]]
        A_df = pd.concat([A_df, add_sig], axis=1)


    #Calculate A_pinv
    A_pinv = np.linalg.pinv(A_df.values)

    #prepare data for plot
    x_idx = 0 #always 0
    #y_idx = 27
    coverage_results = []
    for y_idx in range(0,A_pinv.shape[0]):

        #Step 1 real unmixed results
        file_sample = Data_list[i_folder] + "_filtered_sample.fcs.pkl"
        path = Raw_Data_dir+"/"+file_sample
        data_sample = pd.read_pickle(path)  
        selected_channels = data_sample.columns[19:70]  
        data_sample = data_sample[selected_channels]
        data_sample = data_sample.reset_index(drop=True)
        data_sample = np.array(data_sample).transpose()

        unmixed_data = np.dot(A_pinv,data_sample)


        #Step 2 prepare unmixed data for plot
        sample_size = 100
        sample_bin = 60
        # Extract x and y coordinates
        x = unmixed_data[x_idx]
        y = unmixed_data[y_idx]

        # Compute the 5% and 95% quantiles for x and y
        x_lower, x_upper = np.percentile(x, 0.01), np.percentile(x, 99.99)
        y_lower, y_upper = np.percentile(y, 0.01), np.percentile(y, 99)

        # Create a mask to filter points within the specified range
        mask = (x >= x_lower) & (x <= x_upper) & (y >= y_lower) & (y <= y_upper)

        # apply mask
        x_filtered = x[mask]
        y_filtered = y[mask]

        bins = np.linspace(x_filtered.min(), x_filtered.max(), sample_bin+1)

        # save sampled data
        sampled_x = []
        sampled_y = []

        # Sample 100 points from each bin without replacement
        for i in range(sample_bin):
            bin_mask = (x_filtered >= bins[i]) & (x_filtered < bins[i+1]) #& (y_filtered < 100)
            x_bin = x_filtered[bin_mask]
            y_bin = y_filtered[bin_mask]
            if len(x_bin) > sample_size:
                indices = np.random.choice(len(x_bin), size=sample_size, replace=False)
                sampled_x.extend(x_bin[indices])
                sampled_y.extend(y_bin[indices])
            elif len(x_bin) > 0:
                    # If the number of points is less than sample_size, use all points directly
                    sampled_x.extend(x_bin)
                    sampled_y.extend(y_bin)
            else:
                print(f"Bin {i} is empty.")


        #Step 3 prepare predicted data
        pred_array = np.zeros((A_pinv.shape[0],3,len(bins)))# 0: mean, 1: std, 2: bins

        for i_bin in range(len(bins)):
            tmp_B = bins[i_bin]
            tmp_mean = intercepts_B_vs_mean + slopes_B_vs_mean * tmp_B
            tmp_mean = tmp_mean.transpose()
            tmp_cov = intercepts_B_vs_cov + slopes_B_vs_cov * tmp_B
            tmp_cov = tmp_cov[:,:,0]

            par_raw = np.empty((A_pinv.shape[0],3))# 0: mean, 1: std, 2: bins
            for i in range(A_pinv.shape[0]):
                A_pinv_oneline = A_pinv[i:(i+1),:]
                u_N, sigma_N = SumNormal(u=tmp_mean, Z=tmp_cov, weight=A_pinv_oneline)
                if sigma_N < 0:
                    sigma_N = - sigma_N
                sigma_N = math.sqrt(sigma_N)
                par_raw[i,0] = u_N
                par_raw[i,1] = sigma_N
                par_raw[i,2] = tmp_B
            pred_array[:,:,i_bin] = par_raw

        #calculate par_AF
        par_AF = np.empty((A_pinv.shape[0],2))# 0: mean, 1: std
        for i in range(A_pinv.shape[0]):
            u_AF = mean_std_matrix_neg[0:1,:].transpose()
            Z_AF = cov_matrices_neg
            A_pinv_oneline = A_pinv[i:(i+1),:]
            u_N, sigma_N = SumNormal(u=u_AF, Z=Z_AF, weight=A_pinv_oneline)
            if sigma_N < 0:
                sigma_N = - sigma_N
            sigma_N = math.sqrt(sigma_N)
            par_AF[i,0] = u_N
            par_AF[i,1] = sigma_N
            
        u_line_data = pred_array[y_idx,0,:]
        sigma_data = pred_array[y_idx,1,:]
        lower_line_data = u_line_data - 1.96 * sigma_data
        higher_line_data = u_line_data + 1.96 * sigma_data

        sigma_data_neg = np.full(sigma_data.shape, par_AF[y_idx,1])
        lower_line_data_neg = u_line_data - 1.96 * sigma_data_neg
        higher_line_data_neg = u_line_data + 1.96 * sigma_data_neg

        #lower_line_data = lower_line_data/100
        #higher_line_data = higher_line_data/100

        #plot
        coverage_rate, covered_count, total_count = calculate_coverage_rate(sampled_x, sampled_y, bins, 
                                                                            lower_line_data, higher_line_data)
            
        coverage_results.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate, 4),
                'Covered Count': covered_count,
                'Total Count': total_count
            })

        title_text = (f"Scatter Plot of Unmixed Data\n"
                    f"(Coverage: {coverage_rate*100:.2f}%, {covered_count}/{total_count})"
                    )
        
        plot_publication_scatter(
            sampled_x=sampled_x, sampled_y=sampled_y,
            bins=bins, u_line_data=u_line_data,
            lower_line_data=lower_line_data, 
            upper_line_data=higher_line_data,
            xlabel=A_df.columns[x_idx], 
            ylabel=A_df.columns[y_idx], 
            title_text=title_text, 
            save_path=Output_dir+"/fitting_scatter_"+A_df.columns[y_idx]+".pdf",
            ymin=None, ymax=None,
            scatter_color="#2D2D2D",
            line_color = "#DD8452")

    coverage_table = pd.DataFrame(coverage_results)
    coverage_table.to_csv(Output_dir+"/coverage_summary.csv", index=False)

    plt.figure(figsize=(25, 18))

    custom_cmap_slopes = LinearSegmentedColormap.from_list(
        "custom_cmap",
        [(0, "blue"),   # -1 blue
        (0.5, "white"),  #  0 white
        (1, "red")],   #  1 red
        N=256
    )
    sns.heatmap(slopes_B_vs_cov[:,:,0], 
                xticklabels=channels_id, 
                yticklabels=channels_id, 
                annot=True,         
                fmt=".1f",          
                cmap=custom_cmap_slopes,    
                cbar=True,
                vmin=-1, 
                vmax=1,
                center=0
    )          

    plt.title("Heatmap of slopes_B_vs_cov (" + Data_list[i_folder] + ")", fontsize=16)
    plt.xlabel("Channels", fontsize=12)
    plt.ylabel("Channels", fontsize=12)

    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    plt.tight_layout()
    # show image
    #plt.show()
    plt.savefig(Output_dir+"/slopes_B_vs_cov_"+Data_list[i_folder]+".pdf", format='pdf')
    plt.close()

Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_BUV805_CD16' already exists.
SCC_Cell_BUV805_CD16


In [None]:
#external slop matrix
Output_dir_root = Output_dir
all_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
#scc_fluors = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]#internal
scc_fluors = [2,3,4,5,6,7,8,9,11,12,14,15,18,19,20,22,26,27]#external
print(all_fluors)
print(scc_fluors)
Fluor_list = Data_list


Unmix_dir = 'E:/ResidualModel/extra_17_pinv/extra_17_Xenith/extra_17_flow_channel_cor_spectrum_beads/Output'
Fluor_unmix_list = os.listdir(Unmix_dir)
Fluor_list = Fluor_unmix_list


for i_data in range(len(scc_fluors)):
  
    i_scc = scc_fluors[i_data]
    i_folder = i_data
    added_fluors = all_fluors.copy()
    added_fluors.pop(i_folder)

    Output_dir = Output_dir_root+"/"+Fluor_list[i_scc]
    mkdir_silent(Output_dir)

    print(Data_list[i_folder])
    est_pd_neg = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_neg.csv",encoding="utf-8",index_col=0)
    est_pd_pos = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_pos.csv",encoding="utf-8",index_col=0)
    est_pd_fluor = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_fluor.csv",encoding="utf-8",index_col=0)
    cov_matrices = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices.npy")
    mean_std_matrix = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix.npy") # 0: mean, 1: std, 2: bin midpoint
    cov_matrices_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices_neg.npy")
    mean_std_matrix_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix_neg.npy") # 0: mean, 1: std

    intercepts_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_cov.npy")
    intercepts_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_mean.npy")
    intercepts_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_std.npy")
    slopes_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_cov.npy")
    slopes_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_mean.npy")
    slopes_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_std.npy")

    
    sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/sig.csv",encoding="utf-8",index_col=0)
    neg_sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/neg_sig.csv",encoding="utf-8",index_col=0)


    file_sample = Data_list[i_folder] + "_sample.fcs.pkl"
    #get neg signature
    path = Raw_Data_dir+"/"+file_sample
    data_neg = pd.read_pickle(path)  
    selected_channels = data_neg.columns[19:70]

    channels_id = selected_channels

    plt.figure(figsize=(25, 18))

    custom_cmap_slopes = LinearSegmentedColormap.from_list(
        "custom_cmap",
        [(0, "blue"),   # -1 blue
        (0.5, "white"),  #  0 white
        (1, "red")],   #  1 red
        N=256
    )
    sns.heatmap(slopes_B_vs_cov[:,:,0], 
                xticklabels=channels_id, 
                yticklabels=channels_id, 
                annot=True,         
                fmt=".1f",          
                cmap=custom_cmap_slopes,   
                cbar=True,
                vmin=-1, 
                vmax=1,
                center=0
    )         

    plt.title("Heatmap of slopes_B_vs_cov (" + Data_list[i_folder] + ")", fontsize=16)
    plt.xlabel("Channels", fontsize=12)
    plt.ylabel("Channels", fontsize=12)

    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    plt.tight_layout()
    # show image
    #plt.show()
    plt.savefig(Output_dir+"/slopes_B_vs_cov_"+Data_list[i_folder]+".pdf", format='pdf')
    plt.close()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
[2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 18, 19, 20, 22, 26, 27]
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_APC_TCRVa72' created successfully.
SCC_External_APC_CD4
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_BUV395_NKG2D' created successfully.
SCC_External_BUV395_CD4
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_BUV496_CD19' created successfully.
SCC_External_BUV496_CD4
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_BUV563_CD56' created successfully.
SCC_External_BUV563_CD4
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_BUV615_CD161' created successfully.
SCC_External_BUV615_CD4
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Bead_BUV661_CD127' created successfully.
SCC_Extern

In [None]:
#internal validation aurora 5L
Output_dir_root = Output_dir

all_fluors = list(range(0,62))
scc_fluors = list(range(0,62))#internal
#scc_fluors = [2,3,4,5,6,7,8,9,11,12,14,15,18,19,20,22,26,27]#external
print(all_fluors)
#print(scc_fluors)
Fluor_list = Data_list

random.seed(42)

for i_scc in scc_fluors:

    loop_credit = 10
    similarity_pass = False
    while((loop_credit > 0) & (not similarity_pass)):

        i_folder = i_scc
        filtered_fluors = [f for f in all_fluors if f != i_folder]
        added_fluors = random.sample(filtered_fluors, 10)

        Output_dir = Output_dir_root+"/"+Fluor_list[i_folder]
        mkdir_silent(Output_dir)

        print(Data_list[i_folder])
        print("loop_credit: ", loop_credit)
        est_pd_neg = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_neg.csv",encoding="utf-8",index_col=0)
        est_pd_pos = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_pos.csv",encoding="utf-8",index_col=0)
        est_pd_fluor = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/est_pd_fluor.csv",encoding="utf-8",index_col=0)
        cov_matrices = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices.npy")
        mean_std_matrix = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix.npy") # 0: mean, 1: std, 2: bin midpoint
        cov_matrices_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/cov_matrices_neg.npy")
        mean_std_matrix_neg = np.load(Data_dir+"/"+Data_list[i_folder]+"/mean_std_matrix_neg.npy") # 0: mean, 1: std

        intercepts_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_cov.npy")
        intercepts_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_mean.npy")
        intercepts_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/intercepts_B_vs_std.npy")
        slopes_B_vs_cov = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_cov.npy")
        slopes_B_vs_mean = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_mean.npy")
        slopes_B_vs_std = np.load(Data_dir+"/"+Data_list[i_folder]+"/slopes_B_vs_std.npy")

        sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/sig.csv",encoding="utf-8",index_col=0)
        neg_sig = pd.read_csv(Data_dir+"/"+Data_list[i_folder]+"/neg_sig.csv",encoding="utf-8",index_col=0)


        file_sample = Data_list[i_folder] + "_sample.fcs.pkl"
        #get neg signature
        path = Raw_Data_dir+"/"+file_sample
        data_neg = pd.read_pickle(path)  
        selected_channels = data_neg.columns[list(range(1,17))+list(range(19,35))+list(range(39,71))]

        #prepare A_pinv
        # 1 fluor and 1 af
        neg_sig = neg_sig/max(neg_sig.iloc[:,0])
        A_df = pd.concat([sig, neg_sig], axis=1)
        A_df.columns = [Data_list[i_folder], 'AF']

        for i_add in added_fluors:
            add_sig = pd.read_csv(Data_dir+"/"+Data_list[i_add]+"/sig.csv",encoding="utf-8",index_col=0)
            add_sig.columns = [Data_list[i_add]]
            A_df = pd.concat([A_df, add_sig], axis=1)
        
        #check similarity
        cos_sim = cosine_similarity(A_df.transpose())
        cos_sim_array = np.array(cos_sim)
        lower_triangle_indices = np.tril_indices_from(cos_sim_array, k=-1)
        lower_triangle_values = cos_sim_array[lower_triangle_indices]

        if np.any(lower_triangle_values > 0.9):
            loop_credit = loop_credit - 1
        else:
            loop_credit = loop_credit - 1
            similarity_pass = True

    #Calculate A_pinv
    A_pinv = np.linalg.pinv(A_df.values)

    #prepare data for plot
    x_idx = 0 #always 0
    #y_idx = 27
    coverage_results = []
    for y_idx in range(0,A_pinv.shape[0]):

        #Step 1 real unmixed results
        file_sample = Data_list[i_folder] + "_filtered_sample.fcs.pkl"
        path = Raw_Data_dir+"/"+file_sample
        data_sample = pd.read_pickle(path)  
        selected_channels = data_sample.columns[list(range(1,17))+list(range(19,35))+list(range(39,71))]  
        data_sample = data_sample[selected_channels]
        data_sample = data_sample.reset_index(drop=True)
        data_sample = np.array(data_sample).transpose()

        unmixed_data = np.dot(A_pinv,data_sample)


        #Step 2 prepare unmixed data for plot
        sample_size = 100
        sample_bin = 60
        # Extract x and y coordinates
        x = unmixed_data[x_idx]
        y = unmixed_data[y_idx]

        # Compute the 5% and 95% quantiles for x and y
        x_lower, x_upper = np.percentile(x, 0.01), np.percentile(x, 99.99)
        y_lower, y_upper = np.percentile(y, 0.01), np.percentile(y, 99)

        # Create a mask to filter points within the specified range
        mask = (x >= -1000) & (x >= x_lower) & (x <= x_upper) & (y >= y_lower) & (y <= y_upper)

        # apply mask
        x_filtered = x[mask]
        y_filtered = y[mask]

        bins = np.linspace(x_filtered.min(), x_filtered.max(), sample_bin+1)

        # save sampled data
        sampled_x = []
        sampled_y = []

        # Sample 100 points from each bin without replacement
        for i in range(sample_bin):
            bin_mask = (x_filtered >= bins[i]) & (x_filtered < bins[i+1]) #& (y_filtered < 100)
            x_bin = x_filtered[bin_mask]
            y_bin = y_filtered[bin_mask]
            if len(x_bin) > sample_size:
                indices = np.random.choice(len(x_bin), size=sample_size, replace=False)
                sampled_x.extend(x_bin[indices])
                sampled_y.extend(y_bin[indices])
            elif len(x_bin) > 0:
                    # If the number of points is less than sample_size, use all points directly
                    sampled_x.extend(x_bin)
                    sampled_y.extend(y_bin)
            else:
                print(f"Bin {i} is empty.")


        #Step 3 prepare predicted data
        pred_array = np.zeros((A_pinv.shape[0],3,len(bins)))# 0: mean, 1: std, 2: bins

        for i_bin in range(len(bins)):
            tmp_B = bins[i_bin]
            tmp_mean = intercepts_B_vs_mean + slopes_B_vs_mean * tmp_B
            tmp_mean = tmp_mean.transpose()
            tmp_cov = intercepts_B_vs_cov + slopes_B_vs_cov * tmp_B
            tmp_cov = tmp_cov[:,:,0]

            par_raw = np.empty((A_pinv.shape[0],3))# 0: mean, 1: std, 2: bins
            for i in range(A_pinv.shape[0]):
                A_pinv_oneline = A_pinv[i:(i+1),:]
                u_N, sigma_N = SumNormal(u=tmp_mean, Z=tmp_cov, weight=A_pinv_oneline)
                if sigma_N < 0:
                    sigma_N = - sigma_N
                sigma_N = math.sqrt(sigma_N)
                par_raw[i,0] = u_N
                par_raw[i,1] = sigma_N
                par_raw[i,2] = tmp_B
            pred_array[:,:,i_bin] = par_raw

        #calculate par_AF
        par_AF = np.empty((A_pinv.shape[0],2))# 0: mean, 1: std
        for i in range(A_pinv.shape[0]):
            u_AF = mean_std_matrix_neg[0:1,:].transpose()
            Z_AF = cov_matrices_neg
            A_pinv_oneline = A_pinv[i:(i+1),:]
            u_N, sigma_N = SumNormal(u=u_AF, Z=Z_AF, weight=A_pinv_oneline)
            if sigma_N < 0:
                sigma_N = - sigma_N
            sigma_N = math.sqrt(sigma_N)
            par_AF[i,0] = u_N
            par_AF[i,1] = sigma_N
            
        u_line_data = pred_array[y_idx,0,:]
        sigma_data = pred_array[y_idx,1,:]
        lower_line_data = u_line_data - 1.96 * sigma_data
        higher_line_data = u_line_data + 1.96 * sigma_data

        sigma_data_neg = np.full(sigma_data.shape, par_AF[y_idx,1])
        lower_line_data_neg = u_line_data - 1.96 * sigma_data_neg
        higher_line_data_neg = u_line_data + 1.96 * sigma_data_neg

        #lower_line_data = lower_line_data/100
        #higher_line_data = higher_line_data/100

        #plot
        coverage_rate, covered_count, total_count = calculate_coverage_rate(sampled_x, sampled_y, bins, 
                                                                            lower_line_data, higher_line_data)
            
        coverage_results.append({
                'Dim': A_df.columns[y_idx],
                'Coverage Rate': round(coverage_rate, 4),
                'Covered Count': covered_count,
                'Total Count': total_count
            })

        title_text = (f"Scatter Plot of Unmixed Data\n"
                    f"(Coverage: {coverage_rate*100:.2f}%, {covered_count}/{total_count})"
                    )
        plot_publication_scatter(
            sampled_x=sampled_x, sampled_y=sampled_y,
            bins=bins, u_line_data=u_line_data,
            lower_line_data=lower_line_data, 
            upper_line_data=higher_line_data,
            xlabel=A_df.columns[x_idx], 
            ylabel=A_df.columns[y_idx], 
            title_text=title_text, 
            save_path=Output_dir+"/fitting_scatter_"+A_df.columns[y_idx]+".pdf",
            ymin=None, ymax=None,
            scatter_color="#2D2D2D",
            line_color = "#DD8452")

    coverage_table = pd.DataFrame(coverage_results)
    coverage_table.to_csv(Output_dir+"/coverage_summary.csv", index=False)
        


    plt.figure(figsize=(25, 18))

    custom_cmap_slopes = LinearSegmentedColormap.from_list(
        "custom_cmap",
        [(0, "blue"),   # -1 blue
        (0.5, "white"),  #  0 white
        (1, "red")],   #  1 red
        N=256
    )
    sns.heatmap(slopes_B_vs_cov[:,:,0], 
                xticklabels=channels_id, 
                yticklabels=channels_id, 
                annot=True,        
                fmt=".1f",         
                cmap=custom_cmap_slopes,   
                cbar=True,
                vmin=-1, 
                vmax=1,
                center=0
    )         

    plt.title("Heatmap of slopes_B_vs_cov (" + Data_list[i_folder] + ")", fontsize=16)
    plt.xlabel("Channels", fontsize=12)
    plt.ylabel("Channels", fontsize=12)

    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    plt.tight_layout()
    # show image
    #plt.show()
    plt.savefig(Output_dir+"/slopes_B_vs_cov_"+Data_list[i_folder]+".pdf", format='pdf')
    plt.close()
    

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_PE_CD4/SCC_Cell_AF700_CD4' created successfully.
SCC_Cell_AF700_CD4
loop_credit:  10
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_PE_CD4/SCC_Cell_AF700_CD4' already exists.
SCC_Cell_AF700_CD4
loop_credit:  9
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_PE_CD4/SCC_Cell_AF700_CD4' already exists.
SCC_Cell_AF700_CD4
loop_credit:  8
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_PE_CD4/SCC_Cell_AF700_CD4' already exists.
SCC_Cell_AF700_CD4
loop_credit:  7
Directory 'E:/ResidualModel/extra_17_flow_channel_cor_spectrum/Output3/SCC_Cell_PE_CD4/SCC_Cell_APCCy7_CD4' created succes