# Clean workflow to interpret data from Blair

by Pu Zheng

2022.09.20

In [1]:
%run "..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

26144


## 0.1 Folders

In [4]:
analysis_date = '0211'
postanalysis_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis'+f'_{analysis_date}'
print(postanalysis_folder)
dark_mode = False
if dark_mode:
    figure_folder = os.path.join(postanalysis_folder, f'Figures_{analysis_date}_dark')
    plt.style.use('dark_background')
else:
    figure_folder = os.path.join(postanalysis_folder, f'Figures_{analysis_date}')
print(figure_folder)

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0211
\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0211\Figures_0211


In [5]:
# specifically for the first jupyter:
if not os.path.exists(postanalysis_folder):
    print(f"create postanalysis_folder: {postanalysis_folder}")
    os.makedirs(postanalysis_folder)
else:
    print(f"use postanalysis_folder: {postanalysis_folder}")

if not os.path.exists(figure_folder):
    print(f"create figure_folder: {figure_folder}")
    os.makedirs(figure_folder)
else:
    print(f"use figure_folder: {figure_folder}")

create postanalysis_folder: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0211
create figure_folder: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0211\Figures_0211


## 0.2 Plotting Parameters

In [6]:
# Required plotting setting
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})   

# 1. Load Data

In [5]:
# load blair's data
import pandas as pd
data_folder = r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\0814-Sample_Result_Blair'
#data_df = pd.read_hdf(os.path.join(data_folder, f'0713_SE_{fov_id}_spalign.h5'))
sel_data_files = [os.path.join(data_folder, _fl) for _fl in os.listdir(data_folder) if '.h5' in _fl]
print(sel_data_files)
data_df_list = [pd.read_hdf(_fl) for _fl in sel_data_files]

#data_df = pd.concat(data_df_list)

#data_df['x_hat'] = 2048 - data_df['x_hat']

['\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0316_SE_bulk_spalign.h5', '\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0402_SE_bulk_spalign.h5', '\\\\10.245.74.158\\Chromatin_NAS_8\\Exported_data\\0814-Sample_Result_Blair\\0713_SE_bulk_spalign.h5']


## 1.2 load positions

In [6]:
# positions
# modify with global coordinates
position_filenames = [
    r'\\10.245.74.158\Chromatin_NAS_4\20220316-P_brain_CTP11-12-13_from_0304\Alignment\adjusted_translated_positions_all.txt',
    r'\\10.245.74.158\Chromatin_NAS_4\20220402-P_brain_CTP11-13_from_0329\Alignment\adjusted_translated_positions_all.txt',
    r'\\10.245.74.158\Chromatin_NAS_7\20220713-P_brain_CTP11-13_from_0418\Alignment\adjusted_translated_positions_all.txt',
]
positions_list = [np.loadtxt(_fl, delimiter=',') for _fl in position_filenames]

## 1.3 load MERFISH

In [7]:
import seaborn as sns
import anndata
import scanpy as sc

save_folder = r'\\crick\SSD_0\Shiwei\RNA_MERFISH_analysis\Merged_nonclear'
merged_adata = sc.read(os.path.join(save_folder,'new_labeled_data.h5ad'))
adata_ori = merged_adata.raw.to_adata()

print(np.unique(adata_ori.obs['experiment']))

adata_ori

['20220304' '20220329' '20220415' '20220418']


AnnData object with n_obs × n_vars = 62732 × 242
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'max_x', 'min_y', 'max_y', 'experiment', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'leiden', 'leiden_subclass', 'subclass_prediction_label', 'leiden_subclass_sub', 'subclass_manual_label', 'subclass_manual_label_predict', 'subclass_label_new', 'class_label_new', 'neuron_identity'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'dendrogram_subclass_label_new', 'experiment_colors', 'leiden', 'leiden_colors', 'leiden_subclass_colors', 'leiden_subclass_sub_colors', 'neighbors', 'pca', 'scrublet', 'subclass_label_new_colors', 'subclass_manual_label_colors', 'subclass_prediction_label_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    obsp: 'connectivities', 'distances'

## 1.4 load fov-cell-exp_2_uid

In [8]:
ExpFovCell_2_uid = {}

from ImageAnalysis3.segmentation_tools.cell import Align_Segmentation
segLabel_folders = [
    r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\20220316-P_brain_CTP11-12-13_from_0304\Analysis_0706\Segmentation',
    r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\20220402-P_brain_CTP11-13_from_0329_warp\Analysis_0709\Segmentation',
    r'\\mendel\Mendel_SSD1\Pu_Temp\20220713-P_brain_CTP11-13_from_0418\Analysis_0709\Segmentation',
]
exp_names = ['20220304', '20220329', '20220418', ]
for _exp, _seg_fd in zip(exp_names, segLabel_folders):
    _seg_fls = [os.path.join(_seg_fd, _fl) for _fl in os.listdir(_seg_fd) if '_Segmentation.hdf5' in _fl]
    for _fl in _seg_fls:
        # load segmentation label matrix and uids
        with h5py.File(_fl, 'r') as _f:
            for _fov_id in _f.keys():
                _fov_group = _f[list(_f.keys())[0]]
                _uid_group = _fov_group['cell_2_uid']
                for _cell_id in _uid_group.keys():
                    ExpFovCell_2_uid[(_exp, _fov_id, _cell_id)] = _uid_group[_cell_id][:][0].decode()

len(ExpFovCell_2_uid)

73889

## 1.5 load codebook

In [9]:
codebook = pd.read_csv(r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\20220713-Export\merged_codebook.csv')

# Process

In [10]:
# save 
from tqdm import tqdm
cellDf_filename = os.path.join(postanalysis_folder, 'CellDfList_3rep.pkl')

pixel_size = 108
cellType_labelName = 'subclass_label_new'

if os.path.exists(cellDf_filename):
    print(f"Directly load cell_dfs")
    cell_dfs = pickle.load(open(cellDf_filename, 'rb'))
else:
    missed_cell = []
    cell_dfs = []
    for _exp, _positions, _df in zip(exp_names, positions_list, data_df_list):
        _fovs = np.unique(_df['FOV'])
        for _fov in tqdm(_fovs):
            _fov_df = _df[_df['FOV']==_fov].copy()
            # convert to micron
            _fov_df[['z_um','x_um','y_um']] = _fov_df[['z_hat','x_hat','y_hat']] * pixel_size / 1000
            # convert to global
            _fov_df[['x_um','y_um']] = _fov_df[['x_um','y_um']] + np.flipud(_positions[_fov])
            # loop through cells to re-assign cell type
            #_fov_df['replicate'] = '2022'+np.unique(_fov_df['replicate'])[0]
            _fov_df['rna_experiment'] = _exp
            # apppend
            #fov_dfs.append(_fov_df)

            # load cell
            for _cell_name in np.unique(_fov_df['orig_cellID']):

                _cell_df = _fov_df[_fov_df['orig_cellID']==_cell_name].copy()
                #_exp = np.unique(_cell_df['replicate'])[0]
                _fov = str(np.unique(_cell_df['FOV'])[0])
                _cell = _cell_name.split('Cell-')[1]
                _uid = ExpFovCell_2_uid[(_exp,_fov,_cell)]

                if not _uid in adata_ori.obs.index:
                    continue

                _cell_df['uid'] = _uid
                _cell_df['fov_id'] = _fov
                _cell_df['cell_id'] = _cell
                _cell_df['subclass'] = adata_ori.obs.loc[adata_ori.obs.index==_uid, cellType_labelName].values[0]
                cell_dfs.append(_cell_df)
    # save
    print(f"Saving: {cellDf_filename}")
    pickle.dump(cell_dfs, open(cellDf_filename, 'wb'))
    
print(len(cell_dfs))

Directly load cell_dfs
35181


In [12]:
cell_dfs[-1]

Unnamed: 0,FOV,replicate,finalcellID,celltype,chr,start,end,hyb,x_hat,y_hat,...,numfiber,orig_cellID,z_um,x_um,y_um,rna_experiment,uid,fov_id,cell_id,subclass
0,198,0713,13001,Oligo,chr1,6245958,6258969,1,1560.207033,1116.855194,...,2,Cell-97,7.747461,-2126.227640,6633.960361,20220418,93369905293795937987124665142143186883,198,97,Oligo
9,198,0713,13001,Oligo,chr1,11247744,11257616,5,1563.791267,1117.521861,...,2,Cell-97,8.961289,-2125.840543,6634.032361,20220418,93369905293795937987124665142143186883,198,97,Oligo
14,198,0713,13001,Oligo,chr1,16251322,16259969,7,1557.206050,1119.858944,...,2,Cell-97,8.556781,-2126.551747,6634.284766,20220418,93369905293795937987124665142143186883,198,97,Oligo
19,198,0713,13001,Oligo,chr1,21522568,21534512,11,1554.720600,1107.058894,...,2,Cell-97,8.013567,-2126.820175,6632.902361,20220418,93369905293795937987124665142143186883,198,97,Oligo
22,198,0713,13001,Oligo,chr1,21732182,21745770,12,1555.962800,1117.035094,...,2,Cell-97,8.597788,-2126.686018,6633.979790,20220418,93369905293795937987124665142143186883,198,97,Oligo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,198,0713,13001,Oligo,chrX,138748283,138759979,49,1524.725100,1105.860144,...,1,Cell-97,7.625791,-2130.059689,6632.772896,20220418,93369905293795937987124665142143186883,198,97,Oligo
83,198,0713,13001,Oligo,chrX,143745316,143759954,52,1551.575900,1114.482994,...,1,Cell-97,8.456378,-2127.159803,6633.704163,20220418,93369905293795937987124665142143186883,198,97,Oligo
86,198,0713,13001,Oligo,chrX,156242675,156257265,56,1521.990800,1113.989928,...,1,Cell-97,8.396263,-2130.354994,6633.650912,20220418,93369905293795937987124665142143186883,198,97,Oligo
88,198,0713,13001,Oligo,chrX,166247682,166259932,60,1522.844050,1112.413194,...,1,Cell-97,9.063507,-2130.262843,6633.480625,20220418,93369905293795937987124665142143186883,198,97,Oligo


## Convert to chr_2_zxys and save

In [None]:
sel_cols = ['rna_experiment', 'uid', 'fov_id', 'cell_id', 'subclass',
            'chr', 'hyb', 'fiberidx', 'numfiber',
            'center_intensity', 'z_um', 'x_um', 'y_um', 
            ]
sel_merged_df = pd.concat([_df[sel_cols] for _df in cell_dfs])


In [None]:
sel_merged_filename = os.path.join(postanalysis_folder, 'selected_all_cells.csv')
if not os.path.exists(sel_merged_filename):
    sel_merged_df.to_csv(sel_merged_filename, index=False)

# convert into chr_2_zxys_list 

In [14]:
chr2Zxys_filename = os.path.join(postanalysis_folder, 'all_chr2Zxys.pkl')

cellInfo_filename = os.path.join(postanalysis_folder, 'all_cellInfo.pkl')

if os.path.exists(chr2Zxys_filename):
    print(f"Loading: {chr2Zxys_filename}")
    chr2ZxysList = pickle.load(open(chr2Zxys_filename, 'rb'))
if os.path.exists(cellInfo_filename):
    print(f"Loading: {cellInfo_filename}")
    cellInfoList = pickle.load(open(cellInfo_filename, 'rb'))

Loading: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\all_chr2Zxys.pkl
Loading: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\all_cellInfo.pkl


In [16]:
import multiprocessing as mp
from ImageAnalysis3.io_tools.aligner import spAligner_2_chr2homologList
print(len(cell_dfs))
parallel = True

35181


In [17]:
if 'chr2ZxysList' in locals() and 'cellInfoList' in locals():
    pass
else:
    if parallel:
        num_threads = 32
        with mp.Pool(num_threads) as aligner_pool:
            align_results = aligner_pool.starmap(spAligner_2_chr2homologList, [(_cell_df, codebook) for _cell_df in cell_dfs])
            aligner_pool.close()
            aligner_pool.join()
            aligner_pool.terminate()
        len(align_results)
        chr2ZxysList = [_r[0] for _r in align_results]
        cellInfoList = [_r[1] for _r in align_results]
    else:
        # sequential version
        chr2ZxysList = []
        cellInfoList = []

        for _cell_df in tqdm(cell_dfs):
            _sel_cell_df = _cell_df[sel_cols]
            chr_2_zxys, info_dict = spAligner_2_chr2homologList(_sel_cell_df, codebook)
            chr2ZxysList.append(chr_2_zxys)
            cellInfoList.append(info_dict)

    if not os.path.exists(chr2Zxys_filename):
        print(f"Saving: {chr2Zxys_filename}")
        pickle.dump(chr2ZxysList, open(chr2Zxys_filename, 'wb'))

    if not os.path.exists(cellInfo_filename):
        print(f"Saving: {cellInfo_filename}")
        pickle.dump(cellInfoList, open(cellInfo_filename, 'wb'))

In [18]:
cellInfoList[0]

{'rna_experiment': '20220304',
 'fov_id': '0',
 'cell_id': '12',
 'subclass': 'Oligo',
 'uid': '259202492748634617304623818845147108919'}

## partition by subclass

In [19]:
subclass_2_chr2ZxysList = {}
subclass_2_cellInfoList = {}
for _info, _chr2Zxys in zip(cellInfoList, chr2ZxysList):
    _cls = _info['subclass']
    if _cls not in subclass_2_chr2ZxysList:
        subclass_2_chr2ZxysList[_cls] = [_chr2Zxys]
        subclass_2_cellInfoList[_cls] = [_info]
    else:
        subclass_2_chr2ZxysList[_cls].append(_chr2Zxys)
        subclass_2_cellInfoList[_cls].append(_info)

In [20]:
for _cls in subclass_2_chr2ZxysList:
    print(_cls, len(subclass_2_chr2ZxysList[_cls]))

Oligo 4765
other 311
Micro 1523
Peri 738
Endo 2891
Astro 3822
OPC 1568
L6 CT 4234
L5/6 NP 596
L5 ET 1202
Pvalb 993
L6 IT 1837
Lamp5 339
Sst 746
SMC 434
L5 IT 2014
L2/3 IT 3095
Vip 357
VLMC 444
L4/5 IT 2398
L6b 741
Sncg 133


In [21]:
subclass_2_chr2Zxys_filename = os.path.join(postanalysis_folder, 'subclass_2_chr2Zxys.pkl')
if not os.path.exists(subclass_2_chr2Zxys_filename):
    print(f"Saving: {subclass_2_chr2Zxys_filename}")
    pickle.dump(subclass_2_chr2ZxysList, open(subclass_2_chr2Zxys_filename, 'wb'))
    
subclass_2_cellInfo_filename = os.path.join(postanalysis_folder, 'subclass_2_cellInfo.pkl')
if not os.path.exists(subclass_2_cellInfo_filename):
    print(f"Saving: {subclass_2_cellInfo_filename}")
    pickle.dump(subclass_2_cellInfoList, open(subclass_2_cellInfo_filename, 'wb'))

### subclass to median

In [41]:
subclass_2_median_filename = os.path.join(postanalysis_folder, 'subclass_2_medianDict.pkl')
print(subclass_2_median_filename)

if os.path.exists(subclass_2_median_filename):
    print("Loading")
    subclass_2_medianDict = np.load(subclass_2_median_filename, allow_pickle=True)
else:
    from ImageAnalysis3.structure_tools.distance import Chr2ZxysList_2_summaryDist_by_key,Chr2ZxysList_2_summaryDict
    # calculate prob
    subclass_2_medianDict = {}
    for _subclass in subclass_2_chr2ZxysList:
        subclass_2_medianDict[_subclass] = Chr2ZxysList_2_summaryDict(
            subclass_2_chr2ZxysList[_subclass], 
            total_codebook=codebook, 
            num_threads=num_threads, verbose=True)
    
    print(f"Saving to: {subclass_2_median_filename}")
    pickle.dump(subclass_2_medianDict, open(subclass_2_median_filename, 'wb'))

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\subclass_2_medianDict.pkl
-- preparing chr_2_zxys from 4765 cells in 3.219s.
-- summarize 231 inter-chr distances with 44 threads in 773.563s.
-- preparing chr_2_zxys from 311 cells in 0.031s.
-- summarize 231 inter-chr distances with 44 threads in 63.182s.
-- preparing chr_2_zxys from 1523 cells in 0.199s.
-- summarize 231 inter-chr distances with 44 threads in 214.205s.
-- preparing chr_2_zxys from 738 cells in 0.798s.
-- summarize 231 inter-chr distances with 44 threads in 148.343s.
-- preparing chr_2_zxys from 2891 cells in 1.468s.
-- summarize 231 inter-chr distances with 44 threads in 306.252s.
-- preparing chr_2_zxys from 3822 cells in 2.686s.
-- summarize 231 inter-chr distances with 44 threads in 346.031s.
-- preparing chr_2_zxys from 1568 cells in 0.187s.
-- summarize 231 inter-chr distances with 44 threads in 196.203s.
-- preparing chr_2_zxys from 4234 cells in 2.891s.
-- summarize 231 inter-chr dista

## partition by major class

In [23]:
class_2_subclass = {
    'Gluta':['L6b', 'L6 CT', 'L6 IT', 'L5 IT', 'L4/5 IT', 'L2/3 IT', 'L5/6 NP', 'L5 ET', ],
    'GABA':['Sncg', 'Sst', 'Vip', 'Pvalb', 'Lamp5', ],
    'Astro':['Astro', ],
    'Endo':['Endo', ],
    'Micro':['Micro', ],
    'Oligo':['Oligo', 'OPC', ],
}
subclass_2_class = {}
for _cls in class_2_subclass:
    for _subcls in class_2_subclass[_cls]:
        subclass_2_class[_subcls] = _cls

In [25]:
# save if not exists
class_2_chr2ZxysList_filename = os.path.join(postanalysis_folder, 'class_2_chr2Zxys.pkl')
class_2_cellInfoList_filename = os.path.join(postanalysis_folder, 'class_2_cellInfo.pkl')

if not os.path.exists(class_2_chr2ZxysList_filename):
    # calculate
    class_2_chr2ZxysList = {_cls:[] for _cls in class_2_subclass.keys()}
    class_2_cellInfoList = {_cls:[] for _cls in class_2_subclass.keys()}

    for _subcls, _chr2ZxysList in subclass_2_chr2ZxysList.items():
        _cellInfoList = subclass_2_cellInfoList[_subcls]
        if _subcls in subclass_2_class:
            class_2_chr2ZxysList[subclass_2_class[_subcls]].extend(_chr2ZxysList)
            class_2_cellInfoList[subclass_2_class[_subcls]].extend(_cellInfoList)
    # plot stats
    for _cls in class_2_chr2ZxysList:
        print(_cls, len(class_2_chr2ZxysList[_cls]))
    # save
    print(f"Writing to file: {class_2_chr2ZxysList_filename}")
    pickle.dump(class_2_chr2ZxysList, open(class_2_chr2ZxysList_filename, 'wb'))
    print(f"Writing to file: {class_2_cellInfoList_filename}")
    pickle.dump(class_2_cellInfoList, open(class_2_cellInfoList_filename, 'wb'))
else:
    print("Loading")
    class_2_chr2ZxysList = pickle.load(open(class_2_chr2ZxysList_filename, 'rb'))
    class_2_cellInfoList = pickle.load(open(class_2_cellInfoList_filename, 'rb'))

Gluta 16117
GABA 2568
Astro 3822
Endo 2891
Micro 1523
Oligo 6333
Writing to file: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_chr2Zxys.pkl
Writing to file: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_cellInfo.pkl


### class to median

In [27]:
num_threads = 32

In [28]:
class_2_median_filename = os.path.join(postanalysis_folder, 'class_2_medianDict.pkl')
print(class_2_median_filename)

if os.path.exists(class_2_median_filename):
    print("Loading")
    class_2_medianDict = np.load(class_2_median_filename, allow_pickle=True)
else:
    from ImageAnalysis3.structure_tools.distance import Chr2ZxysList_2_summaryDist_by_key,Chr2ZxysList_2_summaryDict
    # calculate prob
    class_2_medianDict = {}
    for _class in class_2_chr2ZxysList:
        class_2_medianDict[_class] = Chr2ZxysList_2_summaryDict(
            class_2_chr2ZxysList[_class], 
            total_codebook=codebook, 
            num_threads=num_threads, verbose=True)
    
    print(f"Saving to: {class_2_median_filename}")
    pickle.dump(class_2_medianDict, open(class_2_median_filename, 'wb'))

\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_medianDict.pkl
-- preparing chr_2_zxys from 16117 cells in 18.798s.
-- summarize 231 inter-chr distances with 32 threads in 1337.561s.
-- preparing chr_2_zxys from 2568 cells in 1.556s.
-- summarize 231 inter-chr distances with 32 threads in 205.721s.
-- preparing chr_2_zxys from 3822 cells in 2.175s.
-- summarize 231 inter-chr distances with 32 threads in 280.441s.
-- preparing chr_2_zxys from 2891 cells in 2.679s.
-- summarize 231 inter-chr distances with 32 threads in 231.762s.
-- preparing chr_2_zxys from 1523 cells in 0.251s.
-- summarize 231 inter-chr distances with 32 threads in 165.021s.
-- preparing chr_2_zxys from 6333 cells in 5.160s.
-- summarize 231 inter-chr distances with 32 threads in 759.657s.
Saving to: \\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_0920\class_2_medianDict.pkl
