# Clean workflow to interpret data from Blair

by Pu Zheng

2022.09.20

In [1]:
%run "..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

30320


## 0.1 Folders

In [17]:
postanalysis_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920'
print(postanalysis_folder)
dark_mode = False
if dark_mode:
    figure_folder = os.path.join(postanalysis_folder, 'Figures_0920_dark')
    plt.style.use('dark_background')
else:
    figure_folder = os.path.join(postanalysis_folder, 'Figures_0920')
print(figure_folder)

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920
\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\Figures_0920


In [5]:
# specifically for the first jupyter:
if not os.path.exists(postanalysis_folder):
    print(f"create postanalysis_folder: {postanalysis_folder}")
    os.makedirs(postanalysis_folder)
else:
    print(f"use postanalysis_folder: {postanalysis_folder}")

if not os.path.exists(figure_folder):
    print(f"create figure_folder: {figure_folder}")
    os.makedirs(figure_folder)
else:
    print(f"use figure_folder: {figure_folder}")

create postanalysis_folder: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920
create figure_folder: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\Figures_0920


## 0.2 Plotting Parameters

In [6]:
# Required plotting setting
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})   

# 1. Load Data

In [7]:
# load blair's data
import pandas as pd
data_folder = r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\0814-Sample_Result_Blair'
#data_df = pd.read_hdf(os.path.join(data_folder, f'0713_SE_{fov_id}_spalign.h5'))
sel_data_files = [os.path.join(data_folder, _fl) for _fl in os.listdir(data_folder) if '.h5' in _fl]
print(sel_data_files)
data_df_list = [pd.read_hdf(_fl) for _fl in sel_data_files]

#data_df = pd.concat(data_df_list)

#data_df['x_hat'] = 2048 - data_df['x_hat']

['\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0316_SE_bulk_spalign.h5', '\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0402_SE_bulk_spalign.h5', '\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0713_SE_bulk_spalign.h5']


## 1.2 load positions

In [23]:
# positions
# modify with global coordinates
position_filenames = [
    r'\\10.245.74.158\Chromatin_NAS_4\20220316-P_brain_CTP11-12-13_from_0304\Alignment\adjusted_translated_positions_all.txt',
    r'\\10.245.74.158\Chromatin_NAS_4\20220402-P_brain_CTP11-13_from_0329\Alignment\adjusted_translated_positions_all.txt',
    r'\\10.245.74.158\Chromatin_NAS_7\20220713-P_brain_CTP11-13_from_0418\Alignment\adjusted_translated_positions_all.txt',
]
positions_list = [np.loadtxt(_fl, delimiter=',') for _fl in position_filenames]

## 1.3 load MERFISH

In [24]:
import seaborn as sns
import anndata
import scanpy as sc

save_folder = r'\\crick\SSD_0\Shiwei\RNA_MERFISH_analysis\Merged_nonclear'
merged_adata = sc.read(os.path.join(save_folder,'new_labeled_data.h5ad'))
adata_ori = merged_adata.raw.to_adata()

print(np.unique(adata_ori.obs['experiment']))

adata_ori

['20220304' '20220329' '20220415' '20220418']


AnnData object with n_obs × n_vars = 62732 × 242
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'max_x', 'min_y', 'max_y', 'experiment', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'leiden', 'leiden_subclass', 'subclass_prediction_label', 'leiden_subclass_sub', 'subclass_manual_label', 'subclass_manual_label_predict', 'subclass_label_new', 'class_label_new', 'neuron_identity'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'dendrogram_subclass_label_new', 'experiment_colors', 'leiden', 'leiden_colors', 'leiden_subclass_colors', 'leiden_subclass_sub_colors', 'neighbors', 'pca', 'scrublet', 'subclass_label_new_colors', 'subclass_manual_label_colors', 'subclass_prediction_label_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    obsp: 'connectivities', 'distances'

## 1.4 load fov-cell-exp_2_uid

In [26]:
ExpFovCell_2_uid = {}

from ImageAnalysis3.segmentation_tools.cell import Align_Segmentation
segLabel_folders = [
    r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\20220316-P_brain_CTP11-12-13_from_0304\Analysis_0706\Segmentation',
    r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\20220402-P_brain_CTP11-13_from_0329_warp\Analysis_0709\Segmentation',
    r'\\mendel\Mendel_SSD1\Pu_Temp\20220713-P_brain_CTP11-13_from_0418\Analysis_0709\Segmentation',
]
exp_names = ['20220304', '20220329', '20220418', ]
for _exp, _seg_fd in zip(exp_names, segLabel_folders):
    _seg_fls = [os.path.join(_seg_fd, _fl) for _fl in os.listdir(_seg_fd) if '_Segmentation.hdf5' in _fl]
    for _fl in _seg_fls:
        # load segmentation label matrix and uids
        with h5py.File(_fl, 'r') as _f:
            for _fov_id in _f.keys():
                _fov_group = _f[list(_f.keys())[0]]
                _uid_group = _fov_group['cell_2_uid']
                for _cell_id in _uid_group.keys():
                    ExpFovCell_2_uid[(_exp, _fov_id, _cell_id)] = _uid_group[_cell_id][:][0].decode()

len(ExpFovCell_2_uid)

73889

## 1.5 load codebook

In [30]:
codebook = pd.read_csv(r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\20220713-Export\merged_codebook.csv')

# Process

In [29]:
# save 
from tqdm import tqdm
cellDf_filename = os.path.join(postanalysis_folder, 'CellDfList_3rep.pkl')

pixel_size = 108
cellType_labelName = 'subclass_label_new'

if os.path.exists(cellDf_filename):
    cell_dfs = pickle.load(open(cellDf_filename, 'rb'))
    
else:
    
    missed_cell = []
    cell_dfs = []
    for _exp, _positions, _df in zip(exp_names, positions_list, data_df_list):
        _fovs = np.unique(_df['FOV'])
        for _fov in tqdm(_fovs):
            _fov_df = _df[_df['FOV']==_fov].copy()
            # convert to micron
            _fov_df[['z_um','x_um','y_um']] = _fov_df[['z_hat','x_hat','y_hat']] * pixel_size / 1000
            # convert to global
            _fov_df[['x_um','y_um']] = _fov_df[['x_um','y_um']] + np.flipud(_positions[_fov])
            # loop through cells to re-assign cell type
            #_fov_df['replicate'] = '2022'+np.unique(_fov_df['replicate'])[0]
            _fov_df['rna_experiment'] = _exp
            # apppend
            #fov_dfs.append(_fov_df)

            # load cell
            for _cell_name in np.unique(_fov_df['orig_cellID']):

                _cell_df = _fov_df[_fov_df['orig_cellID']==_cell_name].copy()
                #_exp = np.unique(_cell_df['replicate'])[0]
                _fov = str(np.unique(_cell_df['FOV'])[0])
                _cell = _cell_name.split('Cell-')[1]
                _uid = ExpFovCell_2_uid[(_exp,_fov,_cell)]

                if not _uid in adata_ori.obs.index:
                    continue

                _cell_df['uid'] = _uid
                _cell_df['fov_id'] = _fov
                _cell_df['cell_id'] = _cell
                _cell_df['subclass'] = adata_ori.obs.loc[adata_ori.obs.index==_uid, cellType_labelName].values[0]
                cell_dfs.append(_cell_df)
    # save
    print(f"Saving: {cellDf_filename}")
    pickle.dump(cell_dfs, open(cellDf_filename, 'wb'))
    
print(len(cell_dfs))

100%|████████████████████████████████████████████████████████████████████████████████| 163/163 [02:37<00:00,  1.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 159/159 [02:18<00:00,  1.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 174/174 [02:38<00:00,  1.10it/s]


Saving: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\CellDfList_3rep.pkl
35181


In [22]:
_cell_df

Unnamed: 0,FOV,replicate,finalcellID,celltype,chr,start,end,hyb,x_hat,y_hat,...,uindex,rowID,Score,fiberidx,numfiber,orig_cellID,z_um,x_um,y_um,rna_experiment
3,1,0316,51,Micro,chr2,8748765,8759968,5,1532.213600,601.509681,...,79,181982,4355.280491,1,1,Cell-148,10.297020,167.319069,403.613046,20220418
5,1,0316,51,Micro,chr2,11345065,11361468,7,1525.858800,605.023748,...,1095,181984,4355.280491,1,1,Cell-148,10.941555,166.632750,403.992565,20220418
7,1,0316,51,Micro,chr2,16580141,16598642,11,1533.077600,593.271464,...,1097,181986,4355.280491,1,1,Cell-148,10.414455,167.412381,402.723318,20220418
14,1,0316,51,Micro,chr2,19938554,19950067,14,1530.020250,591.382596,...,1099,181993,4355.280491,1,1,Cell-148,10.391650,167.082187,402.519320,20220418
16,1,0316,51,Micro,chr2,21273895,21283391,16,1528.527800,596.813201,...,1100,181995,4355.280491,1,1,Cell-148,10.371780,166.921002,403.105826,20220418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,1,0316,51,Micro,chr15,96249522,96259954,71,1535.725000,600.327068,...,795,181755,2833.452317,2,2,Cell-148,11.819582,167.698300,403.485323,20220418
132,1,0316,51,Micro,chr15,98747656,98759982,73,1537.322000,603.101264,...,796,181761,2833.452317,2,2,Cell-148,11.546704,167.870776,403.784937,20220418
136,1,0316,51,Micro,chr15,101851871,101860220,75,1537.428200,598.738051,...,1835,181765,2833.452317,2,2,Cell-148,11.247579,167.882246,403.313709,20220418
138,1,0316,51,Micro,chr15,103529162,103537591,76,1567.197467,607.741048,...,1836,181767,2833.452317,2,2,Cell-148,9.264577,171.097326,404.286033,20220418


## Convert to chr_2_zxys and save

In [31]:
sel_cols = ['rna_experiment', 'uid', 'fov_id', 'cell_id', 'subclass',
            'chr', 'hyb', 'fiberidx', 'numfiber',
            'center_intensity', 'z_um', 'x_um', 'y_um', 
            ]
sel_merged_df = pd.concat([_df[sel_cols] for _df in cell_dfs])


In [32]:
sel_merged_filename = os.path.join(postanalysis_folder, 'selected_all_cells.csv')
if not os.path.exists(sel_merged_filename):
    sel_merged_df.to_csv(sel_merged_filename, index=False)

# convert into chr_2_zxys_list 

In [33]:
import multiprocessing as mp
from ImageAnalysis3.io_tools.aligner import spAligner_2_chr2homologList
print(len(cell_dfs))

35181


In [34]:
%time
num_threads = 32
with mp.Pool(num_threads) as aligner_pool:
    align_results = aligner_pool.starmap(spAligner_2_chr2homologList, [(_cell_df, codebook) for _cell_df in cell_dfs])
    aligner_pool.close()
    aligner_pool.join()
    aligner_pool.terminate()
len(align_results)
chr2ZxysList = [_r[0] for _r in align_results]
cellInfoList = [_r[1] for _r in align_results]

Wall time: 0 ns


35181

In [36]:
chr2Zxys_filename = os.path.join(postanalysis_folder, 'all_chr2Zxys.pkl')
if not os.path.exists(chr2Zxys_filename):
    print(f"Saving: {chr2Zxys_filename}")
    pickle.dump(chr2ZxysList, open(chr2Zxys_filename, 'wb'))
    
cellInfo_filename = os.path.join(postanalysis_folder, 'all_cellInfo.pkl')
if not os.path.exists(cellInfo_filename):
    print(f"Saving: {cellInfo_filename}")
    pickle.dump(cellInfoList, open(cellInfo_filename, 'wb'))

Saving: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\all_chr2Zxys.pkl
Saving: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\all_cellInfo.pkl


In [37]:
cellInfoList[0]

{'rna_experiment': '20220304',
 'fov_id': '0',
 'cell_id': '12',
 'subclass': 'Oligo',
 'uid': '259202492748634617304623818845147108919'}

## partition by subclass

In [38]:
subclass_2_chr2ZxysList = {}
subclass_2_cellInfoList = {}
for _info, _chr2Zxys in zip(cellInfoList, chr2ZxysList):
    _cls = _info['subclass']
    if _cls not in subclass_2_chr2ZxysList:
        subclass_2_chr2ZxysList[_cls] = [_chr2Zxys]
        subclass_2_cellInfoList[_cls] = [_info]
    else:
        subclass_2_chr2ZxysList[_cls].append(_chr2Zxys)
        subclass_2_cellInfoList[_cls].append(_info)

In [39]:
for _cls in subclass_2_chr2ZxysList:
    print(_cls, len(subclass_2_chr2ZxysList[_cls]))

Oligo 4765
other 311
Micro 1523
Peri 738
Endo 2891
Astro 3822
OPC 1568
L6 CT 4234
L5/6 NP 596
L5 ET 1202
Pvalb 993
L6 IT 1837
Lamp5 339
Sst 746
SMC 434
L5 IT 2014
L2/3 IT 3095
Vip 357
VLMC 444
L4/5 IT 2398
L6b 741
Sncg 133


In [40]:
subclass_2_chr2Zxys_filename = os.path.join(postanalysis_folder, 'subclass_2_chr2Zxys.pkl')
if not os.path.exists(subclass_2_chr2Zxys_filename):
    print(f"Saving: {subclass_2_chr2Zxys_filename}")
    pickle.dump(subclass_2_chr2ZxysList, open(subclass_2_chr2Zxys_filename, 'wb'))
    
subclass_2_cellInfo_filename = os.path.join(postanalysis_folder, 'subclass_2_cellInfo.pkl')
if not os.path.exists(subclass_2_cellInfo_filename):
    print(f"Saving: {subclass_2_cellInfo_filename}")
    pickle.dump(subclass_2_cellInfoList, open(subclass_2_cellInfo_filename, 'wb'))

Saving: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\subclass_2_chr2Zxys.pkl
Saving: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\subclass_2_cellInfo.pkl


### subclass to median

In [41]:
subclass_2_median_filename = os.path.join(postanalysis_folder, 'subclass_2_medianDict.pkl')
print(subclass_2_median_filename)

if os.path.exists(subclass_2_median_filename):
    print("Loading")
    subclass_2_medianDict = np.load(subclass_2_median_filename, allow_pickle=True)
else:
    from ImageAnalysis3.structure_tools.distance import Chr2ZxysList_2_summaryDist_by_key,Chr2ZxysList_2_summaryDict
    # calculate prob
    subclass_2_medianDict = {}
    for _subclass in subclass_2_chr2ZxysList:
        subclass_2_medianDict[_subclass] = Chr2ZxysList_2_summaryDict(
            subclass_2_chr2ZxysList[_subclass], 
            total_codebook=codebook, 
            num_threads=num_threads, verbose=True)
    
    print(f"Saving to: {subclass_2_median_filename}")
    pickle.dump(subclass_2_medianDict, open(subclass_2_median_filename, 'wb'))

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\subclass_2_medianDict.pkl
-- preparing chr_2_zxys from 4765 cells in 3.219s.
-- summarize 231 inter-chr distances with 44 threads in 773.563s.
-- preparing chr_2_zxys from 311 cells in 0.031s.
-- summarize 231 inter-chr distances with 44 threads in 63.182s.
-- preparing chr_2_zxys from 1523 cells in 0.199s.
-- summarize 231 inter-chr distances with 44 threads in 214.205s.
-- preparing chr_2_zxys from 738 cells in 0.798s.
-- summarize 231 inter-chr distances with 44 threads in 148.343s.
-- preparing chr_2_zxys from 2891 cells in 1.468s.
-- summarize 231 inter-chr distances with 44 threads in 306.252s.
-- preparing chr_2_zxys from 3822 cells in 2.686s.
-- summarize 231 inter-chr distances with 44 threads in 346.031s.
-- preparing chr_2_zxys from 1568 cells in 0.187s.
-- summarize 231 inter-chr distances with 44 threads in 196.203s.
-- preparing chr_2_zxys from 4234 cells in 2.891s.
-- summarize 231 inter-chr dista

## partition by major class

In [42]:
class_2_subclass = {
    'Gluta':['L6b', 'L6 CT', 'L6 IT', 'L5 IT', 'L2/3 IT', 'L5/6 NP', 'L5 ET', ],
    'GABA':['Sncg', 'Sst', 'Vip', 'Pvalb', 'Lamp5', 'L4/5 IT', ],
    'Astro':['Astro', ],
    'Endo':['Endo', ],
    'Micro':['Micro', ],
    'Oligo':['Oligo', 'OPC', ],
}
subclass_2_class = {}
for _cls in class_2_subclass:
    for _subcls in class_2_subclass[_cls]:
        subclass_2_class[_subcls] = _cls

In [44]:
# save if not exists
class_2_chr2ZxysList_filename = os.path.join(postanalysis_folder, 'class_2_chr2Zxys.pkl')
class_2_cellInfoList_filename = os.path.join(postanalysis_folder, 'class_2_cellInfo.pkl')

if not os.path.exists(class_2_chr2ZxysList_filename):
    # calculate
    class_2_chr2ZxysList = {_cls:[] for _cls in class_2_subclass.keys()}
    class_2_cellInfoList = {_cls:[] for _cls in class_2_subclass.keys()}

    for _subcls, _chr2ZxysList in subclass_2_chr2ZxysList.items():
        _cellInfoList = subclass_2_cellInfoList[_subcls]
        if _subcls in subclass_2_class:
            class_2_chr2ZxysList[subclass_2_class[_subcls]].extend(_chr2ZxysList)
            class_2_cellInfoList[subclass_2_class[_subcls]].extend(_cellInfoList)
    # plot stats
    for _cls in class_2_chr2ZxysList:
        print(_cls, len(class_2_chr2ZxysList[_cls]))
    # save
    print(f"Writing to file: {class_2_chr2ZxysList_filename}")
    pickle.dump(class_2_chr2ZxysList, open(class_2_chr2ZxysList_filename, 'wb'))
    print(f"Writing to file: {class_2_cellInfoList_filename}")
    pickle.dump(class_2_cellInfoList, open(class_2_cellInfoList_filename, 'wb'))
else:
    print("Loading")
    class_2_chr2ZxysList = pickle.load(open(class_2_chr2ZxysList_filename, 'rb'))
    class_2_cellInfoList = pickle.load(open(class_2_cellInfoList_filename, 'rb'))

Gluta 13719
GABA 4966
Astro 3822
Endo 2891
Micro 1523
Oligo 6333
Writing to file: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_chr2Zxys.pkl
Writing to file: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_cellInfo.pkl


### class to median

In [45]:
class_2_median_filename = os.path.join(postanalysis_folder, 'class_2_medianDict.pkl')
print(class_2_median_filename)

if os.path.exists(class_2_median_filename):
    print("Loading")
    class_2_medianDict = np.load(class_2_median_filename, allow_pickle=True)
else:
    from ImageAnalysis3.structure_tools.distance import Chr2ZxysList_2_summaryDist_by_key,Chr2ZxysList_2_summaryDict
    # calculate prob
    class_2_medianDict = {}
    for _class in class_2_chr2ZxysList:
        class_2_medianDict[_class] = Chr2ZxysList_2_summaryDict(
            class_2_chr2ZxysList[_class], 
            total_codebook=codebook, 
            num_threads=num_threads, verbose=True)
    
    print(f"Saving to: {class_2_median_filename}")
    pickle.dump(class_2_medianDict, open(class_2_median_filename, 'wb'))

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_medianDict.pkl
-- preparing chr_2_zxys from 13719 cells in 13.284s.
-- summarize 231 inter-chr distances with 32 threads in 1152.696s.
-- preparing chr_2_zxys from 4966 cells in 1.857s.
-- summarize 231 inter-chr distances with 32 threads in 430.333s.
-- preparing chr_2_zxys from 3822 cells in 2.096s.
-- summarize 231 inter-chr distances with 32 threads in 297.629s.
-- preparing chr_2_zxys from 2891 cells in 1.539s.
-- summarize 231 inter-chr distances with 32 threads in 256.843s.
-- preparing chr_2_zxys from 1523 cells in 1.018s.
-- summarize 231 inter-chr distances with 32 threads in 174.712s.
-- preparing chr_2_zxys from 6333 cells in 4.469s.
-- summarize 231 inter-chr distances with 32 threads in 828.681s.
Saving to: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_medianDict.pkl
