# 1. Fusion Tissue Extraction

In [None]:
#libraries needed for this section

import tifffile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from skimage.segmentation import watershed
from skimage.transform import rescale, resize, downscale_local_mean
from scipy import ndimage as ndi
from skimage.filters import sobel
from sklearn.cluster import AgglomerativeClustering

### Loading the data

In [None]:
#folder that receives the extracted tissue images (created later if missing)
save_path = 'Z:/Yuqi/22_10_CODEX_Datasets/22_10_11_ST_56/'
#location of the .qptiff file written by fusion
qptiff_path = 'Z:/Yuqi/22_10_CODEX_Datasets/22_10_11_ST_56/11_10_22_ST_Scan1.qptiff'

In [None]:
#tunable meta parameters, grouped by the step that uses them

#image preparation
DNAslice = 0 #DNA channel for visualization/segmentation
downscale_factor = 64 #each image axis is divided by this before segmentation
sigma = 5.0 #sigma of the gaussian filter applied to the downscaled image

#watershed seeding, in intensity units of the smoothed downscaled image
lower_cutoff = 0.02 #pixels at or below this value are seeded with marker 1
upper_cutoff = 0.07 #pixels at or above this value are seeded with marker 2

#region handling
expected_number_tissues = 2 #number of clusters for the optional agglomerative clustering
padding = 50 #margin in full-resolution pixels added around each extracted region

In [None]:
#make sure the directory for saving data exists (no error if it is already there)
os.makedirs(save_path, exist_ok=True)

#read in the qptiff file
currim = tifffile.imread(qptiff_path) #if this file is too big, delete it after the next line of code and re-read it later

#select for the nuclei channel image; copied so it does not depend on currim staying in memory
nucim = currim[DNAslice].copy()

#visualize the nuclei channel
plt.imshow(nucim)
plt.title('Nuclear image')
plt.show()

print(f'Loaded nuclear image of dimension (Y,X) = {nucim.shape}')

#downscale the nuclei channel by downscale_factor along each axis, then smooth it
low_res_shape = (nucim.shape[0] // downscale_factor, nucim.shape[1] // downscale_factor)
resized_im = resize(nucim, low_res_shape, anti_aliasing=True)
resized_im = ndi.gaussian_filter(resized_im, sigma = sigma)

#histogram of the smoothed pixel intensities (useful when choosing lower_cutoff / upper_cutoff).
#The image is flattened first: given a 2D array, plt.hist treats every column as a separate dataset
#instead of drawing one distribution over all pixels.
plt.hist(resized_im.ravel(), bins=100)
plt.title('Marker expression level histogram')
plt.show()

In [None]:
#seed the watershed on the edge map: 0 = undecided, 1 = at/below lower_cutoff, 2 = at/above upper_cutoff
elevation_map = sobel(resized_im)
markers = np.zeros(resized_im.shape, dtype=np.int32) #watershed markers are integer labels, so do not inherit the float dtype of the image
markers[resized_im <= lower_cutoff] = 1
markers[resized_im >= upper_cutoff] = 2

segmentation = watershed(elevation_map, markers)
plt.imshow(segmentation)
plt.title('Segmented tissues')
plt.show()

#shift the labels down by one so that label 1 becomes 0 (background) before filling holes
segmentation = ndi.binary_fill_holes(segmentation - 1)
plt.imshow(segmentation)
plt.title('Segmented tissues with holes filled')
plt.show()

#visualize initial identified segmented masks
#ndi.label returns the number of connected pieces directly, so use it instead of counting unique labels
labeled_tissues, n_tissue_pieces = ndi.label(segmentation)
print(f'Identified {n_tissue_pieces} tissue pieces')
plt.imshow(labeled_tissues)
plt.title('Labeled tissues')
plt.show()

#######Non clustering option
#one row per foreground pixel of the downscaled image: piece label plus (y, x) position
idx = np.nonzero(labeled_tissues)
tissueframe = pd.DataFrame({
    'tissue': labeled_tissues[idx],
    'y': idx[0],
    'x': idx[1],
})

#without clustering, every tissue piece starts out as its own region
tissueframe['region1'] = tissueframe['tissue']
tissueframe
####Clustering Option
#print(f'Running agglomerative clustering to assign {len(np.unique(labeled_tissues)) - 1} tissue pieces to {expected_number_tissues} tissue regions')

# maskarr = tissueframe[['y', 'x']].to_numpy()
# clustering = AgglomerativeClustering(n_clusters = expected_number_tissues).fit(maskarr)
# tissueframe['region'] = clustering.labels_ + 1

# labeled_tissues2 = labeled_tissues
# labeled_tissues2[idx] = clustering.labels_ + 1
# plt.imshow(labeled_tissues2)
# plt.title('Final tissue masks')
# plt.show()

Optional: manually clean up automatic tissue region assignments. A tissue region often consists of multiple pieces of tissue that are not connected. Occasionally, the above algorithm will assign a piece of tissue to the incorrect region. The next two cells allow the user to manually correct such region mislabelings. 

Running the first cell generates two plots. The first plot shows the tissue piece labels. The second plot shows the final region assignments for the tissue pieces. These two plots can be used to identify the ID of the tissue piece you want to reassign and the region ID you want to assign it to. 

The second cell takes one parameter, `rename_dict`, a dictionary of `tissue_id: new_region_id` pairs:

tissue_id - (int) this is the tissue ID of the piece of tissue you want to reassign

new_region_id - (int) this is the ID of the new region you want to assign this tissue piece to

Running the second cell relabels the region assignment of the specified tissue piece.

In [None]:
#mean position (and mean of every other column) of each tissue piece
centroids = tissueframe.groupby('tissue').mean()

#two figures over the same centroids: first annotated with the tissue piece ID, then with its region ID
annotation_sets = [
    ('Tissue piece labels', list(centroids.index)),
    ('Region labels', [int(region_id) for region_id in centroids['region1']]),
]

for plot_title, point_labels in annotation_sets:
    fig, ax = plt.subplots()
    ax.scatter(centroids['x'], centroids['y'])
    ax.invert_yaxis() #image convention: y grows downwards
    ax.set_aspect('equal', adjustable='box')

    for point_label, x_pos, y_pos in zip(point_labels, centroids['x'], centroids['y']):
        ax.annotate(point_label, (x_pos, y_pos))

    plt.title(plot_title)
    plt.show()

In [None]:
#Rename the regions based on annotations: {tissue piece ID: region ID to assign to it}
rename_dict = {1:1, 2:1, 3:1, 5:2}

#assign through a single .loc call: the chained form df[col][mask] = value may write to a temporary
#copy (SettingWithCopyWarning) and has no effect at all when pandas copy-on-write is enabled
for tissue_id, new_region_id in rename_dict.items():
    tissueframe.loc[tissueframe['tissue'] == tissue_id, 'region1'] = new_region_id

#renumber the regions consecutively starting at 1, in order of first appearance
tiss_num = {region_id: i + 1 for i, region_id in enumerate(tissueframe['region1'].unique())}
tissueframe['region'] = tissueframe['region1'].map(tiss_num)
tiss_num

In [None]:
#optional: remove artifacts detected by keeping only the pixels assigned to the listed regions
regions_to_keep = [1,2]
is_kept_region = tissueframe['region'].isin(regions_to_keep)
tissueframe = tissueframe[is_kept_region]

In [None]:
#recompute the per-piece means after renaming / filtering
centroids = tissueframe.groupby('tissue').mean()

#two figures over the same centroids: first annotated with the tissue piece ID, then with its final region ID
annotation_sets = [
    ('Tissue piece labels', list(centroids.index)),
    ('Region labels', [int(region_id) for region_id in centroids['region']]),
]

for plot_title, point_labels in annotation_sets:
    fig, ax = plt.subplots()
    ax.scatter(centroids['x'], centroids['y'])
    ax.invert_yaxis() #image convention: y grows downwards
    ax.set_aspect('equal', adjustable='box')

    for point_label, x_pos, y_pos in zip(point_labels, centroids['x'], centroids['y']):
        ax.annotate(point_label, (x_pos, y_pos))

    plt.title(plot_title)
    plt.show()

Run the next cell to extract labeled tissues into subimages and save each subimage as its own tiffstack

In [None]:
#read back in currim here if it was too big and needed to be deleted in the first place
#currim = tifffile.imread(qptiff_path)

#per-region min/max of every column; the y/x entries give each region's bounding box in downscaled pixels
#(string names instead of the builtin min/max, which pandas deprecates as agg arguments)
tissueframe2 = tissueframe.groupby('region').agg(['min', 'max'])

for index, row in tissueframe2.iterrows():
    #scale the bounding box back to full resolution
    ymin = int(row[('y', 'min')]) * downscale_factor
    ymax = int(row[('y', 'max')]) * downscale_factor
    xmin = int(row[('x', 'min')]) * downscale_factor
    xmax = int(row[('x', 'max')]) * downscale_factor

    #add the padding and clip to the image bounds (currim is indexed as [channel, y, x])
    ymin = max(ymin - padding, 0)
    ymax = min(ymax + padding, currim.shape[1])
    xmin = max(xmin - padding, 0)
    xmax = min(xmax + padding, currim.shape[2])
    subim = currim[:, ymin:ymax, xmin:xmax]

    #save_path is the variable defined at the top of the notebook ('savepath' was undefined).
    #The region number is zero-padded to three digits so that regions >= 10 still give reg010, not reg0010.
    outpath = os.path.join(save_path, f'reg{int(index):03d}_X01_Y01_Z01.tif')
    plt.imshow(subim[0])
    plt.title(f'Extracting tissue {index}: ')
    plt.show()
    print(f'Saving tissue image at {outpath}')
    tifffile.imwrite(outpath, subim)

# 2. Cell segmentation

In [None]:
from multiprocessing import Process
import os
import sys
from main import main
from glob import glob

In [None]:
#output locations, one per dataset (ST_56 and ST_4_1_7)
#NOTE(review): no visible cell reads these variables - confirm where they are consumed
output_path, output_path2 = (
    'G:/Fusion/Mouse/22_10_11_ST_56',
    'G:/Fusion/Mouse/22_10_18_ST_4_1_7',
)

In [None]:
#Run the run_cellvision.py script for the cell segmentation step.
#NOTE: the output path needs to be changed before running this cell.
#NOTE(review): presumably the path is set inside run_cellvision.py itself - %run without -i executes
#the script in a fresh namespace, so notebook variables are not visible to it; confirm.
%run run_cellvision.py

# 3. Basic Image Preprocessing

In [None]:
from preprocessing import *

In [None]:
#read the data under each dataset's fcs/compensated folder, with identical settings for both
df_56 = read_data(
    path='../22_10_11_ST_56/CVcol_DAPI_3px/fcs/compensated/',
    reg_list=[],
    nuc_1=1,
)
df_17 = read_data(
    path='../22_10_18_ST_4_1_7/CVcol_DAPI_3px/fcs/compensated/',
    reg_list=[],
    nuc_1=1,
)

In [None]:
# Z normalization
# Both datasets use the same column lists; each call receives its own copy of the lists.
z_list_out = ['first_index', 'cell_id', 'tile_num', 'z', 'x_tile', 'y_tile', 'size', 'DAPI']
z_list_keep = ['region', 'x', 'y', 'region_num']

dfz_17 = z_format(data=df_17, list_out=list(z_list_out), list_keep=list(z_list_keep))

dfz_56 = z_format(data=df_56, list_out=list(z_list_out), list_keep=list(z_list_keep))

In [None]:
# XY correlation
# Each dataset gets its own X_pix / Y_pix values and is then tagged with the array it came from.
df_cor_17 = xycorr(
    data=dfz_17,
    y_rows=2,
    x_columns=1,
    X_pix=20000,
    Y_pix=23000,
)
df_cor_56 = xycorr(
    data=dfz_56,
    y_rows=2,
    x_columns=1,
    X_pix=13000,
    Y_pix=11000,
)

df_cor_17['array'] = 'samples_17'
df_cor_56['array'] = 'samples_56'

In [None]:
#remove noise, using the same column count and thresholds for both datasets
noise_settings = dict(col_num=48, z_sum_thres=38, z_count_thres=38)

df_nn_56, cc_56 = remove_noise(df=df_cor_56, **noise_settings)
df_nn_17, cc_17 = remove_noise(df=df_cor_17, **noise_settings)

# 4. Cell type classification

In [None]:
from classification import *

In [None]:
#input tables; the first CSV column is used as the index in all three

#annotated training data
train_csv = "../training_data/Day135_Markers_Dryad.csv"
df_train_full = pd.read_csv(train_csv, index_col=0)

#tables to be annotated, one per dataset
codex_56_csv = "../22_10_11_ST_56/result/df_nn_56_111822.csv"
codex_17_csv = "../22_10_18_ST_4_1_7/result/df_nn_17_111822.csv"
df_codex_56 = pd.read_csv(codex_56_csv, index_col=0)
df_codex_17 = pd.read_csv(codex_17_csv, index_col=0)

In [None]:
#maps each fine-grained cell label to the merged cell type it belongs to.
#Every key is listed exactly once: the original repeated 'CD8+ T cell PD1+' and 'DC TCF7+',
#and a repeated key in a dict literal silently overrides the earlier entry.
celltype_dict = {
    'Tumor PDL1+ MHCI+':'Tumor',
    'CD8+ T cell PD1+':'CD8+ T cell',
    'Tumor': 'Tumor',
    'DC':'DC',
    'DC TCF7+':'DC',
    'Epithelial':'Epithelial',
    'Endothelial':'Endothelial',
    'Macrophage':'Macrophage',
    'Tumor Ki67+': 'Tumor',
    'CD8+ T cell': 'CD8+ T cell',
    'CD4+ Treg':'CD4+ Treg',
    'Neutrophil':'Neutrophil',
    'NK':'NK',
    'Macrophage PDL1+':'Macrophage',
    'APC MHCIIhi':'APC MHCIIhi',
    'Macrophage CD86+':'Macrophage',
    'Lymphatic':'Lymphatic',
    'CD4+ T cell':'CD4+ T cell',
    'B cell':'B cell'
}

In [None]:
#collapse the 'Cell common' labels of the training data into the merged cell types of celltype_dict
merged_celltypes = df_train_full['Cell common'].map(celltype_dict)
df_train_full['celltype_merge'] = merged_celltypes

## 4.1 Train a XGB classifier

In [None]:
#train on the merged cell type labels; df_codex_17 is passed as df_val
(clf_xgb, X_train, y_train, X_test, y_test, label) = xgb_codex_train(
    df=df_train_full,
    df_val=df_codex_17,
    col='celltype_merge',
    n_cells=200,
    ind_features=40,
)

## 4.2 Predict cell type annotation

In [None]:
#apply the trained classifier to both datasets (ST_56 first, then ST_4_1_7)
y_codex_56, y_codex_17 = (
    xgb_codex_predict(df=codex_table, clf_xgb=clf_xgb, label=label)
    for codex_table in (df_codex_56, df_codex_17)
)

# 5. Cell Type Composition

In [None]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns