This notebook calculates normalized distance-ratio value (aka DR, d1/(d1+d2)) for all reads in each sample.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
from tqdm import tqdm

import scanpy as sc
from scipy.ndimage import distance_transform_edt as dt
from skimage import measure
from scipy.stats import *
from tifffile import imread
from scipy.io import loadmat
from starmap.sequencing import *

In [2]:
ppath = 'Z:/Data/Processed/2022-10-15-Rena-SkinCulture254-gene/output'
area_list = list(range(1,7))
area2sample = {
    1: '2h pulse 0h chase',
    2: '2h pulse 2h chase', 
    3: '2h pulse 4h chase',
    4: '20h pulse',
    5: '2h pulse 6h chase',
    6: 'STARmap'
}

In [3]:
adata = sc.read_h5ad(os.path.join(ppath, '2022-11-12-Rena-Foreskin254gene-preflt.h5ad'))
adata

AnnData object with n_obs × n_vars = 12574 × 254
    obs: 'orig_index', 'sample', 'area', 'x', 'y', 'z', 'nuclues_volume', 'AF546', 'AF546_nucleus', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'plate_number'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'max_counts'
    layers: 'nucleus'

In [4]:
genes = adata.var.index
genes_ = list(genes)

In [5]:
def load_reads_3D(fpath, reads_file):
    S = loadmat(os.path.join(fpath, reads_file))
    bases = [str(i[0][0]) for i in S["merged_reads"]]
    points = S["merged_points"]
    temp = np.zeros(points.shape)
    temp[:, 0] = np.round(points[:, 2]-1)
    temp[:, 1] = np.round(points[:, 1]-1)
    temp[:, 2] = np.round(points[:, 0]-1)

    print(f"Number of reads: {len(bases)}")
    return bases, temp

Voxel size: 0.0901x0.0901x0.2930 micron^3 

In [6]:
def cell_iter_cont_all(rp, curr_cell, cell_index, nuclei_seg, points, bases, reads_assign_cell, reads_assign_nucleus, cell_by_barcode, genes_list, sampling=[1,1,1]):
    '''record into cell_by_barcode (collector), each entry (cell_x_gene) has a list of DR values'''
    # mask and bbox of current cell & nuclei (this ensures distance calculated is wrt the current cell not neigboring ones)
    cb = rp[curr_cell].bbox
    cell_mask = rp[curr_cell].image
    nuc_mask = nuclei_seg[cb[0]:cb[3], cb[1]:cb[4], cb[2]:cb[5]]
    nuc_mask[nuc_mask != curr_cell+1] = 0

    # dt
    cdt = dt(cell_mask, sampling=sampling)
    ndt = dt(np.logical_not(nuc_mask), sampling=sampling)
    area = np.logical_xor(cell_mask, nuc_mask)
    rdt = np.zeros(cdt.shape)
    rdt[area] = ndt[area] / cdt[area] 

    # get cytosol points & corr. bases of current cell
    cell_pts = points[np.logical_and(reads_assign_cell==curr_cell+1, reads_assign_nucleus==0)]
    cell_bs = bases[np.logical_and(reads_assign_cell==curr_cell+1, reads_assign_nucleus==0)]

    # bases to gene index
    bases_idx = np.array([genes_list.index(g) for g in cell_bs])
    
    for i, bs in enumerate(cell_bs):
        pt = cell_pts[i]
        r = rdt[pt[0]-cb[0], pt[1]-cb[1], pt[2]-cb[2]]
        if type(cell_by_barcode[cell_index, bases_idx[i]]) == list:
            cell_by_barcode[cell_index, bases_idx[i]].append(r/(r+1))
        else: cell_by_barcode[cell_index, bases_idx[i]] = [r/(r+1)]
    return

2D

In [None]:
for area in area_list: 
    # sample specific inputs
    _adata = adata[adata.obs['sample'] == area2sample[area]]
    sample_cells = adata.obs['orig_index']
    cell_by_barcode = np.zeros((adata.n_obs, adata.n_var), dtype=object)
    cell_seg = imread(os.path.join(ppath, '2022-10-16-2D-segmentation', 'flamingo_backscaled', f'Area{area}_cell_2D.tif'))
    nucleus_seg = imread(os.path.join(ppath, '2022-10-16-2D-segmentation', 'dapi_backscaled', f'Area{area}_nuclei_2D.tif'))
    mat_bases, mat_points = load_reads_3D(os.path.join(ppath, '2022-10-15-merged-points', f'Area_{area}'), 'merged_goodPoints.mat')
    points = mat_points.astype(int)
    bases = np.array(mat_bases)
    genes = adata.var.index
    rp = measure.regionprops(cell_seg)
    reads_assign_cell = cell_seg[points[:,0], points[:,1], points[:,2]]
    reads_assign_nucleus = nucleus_seg[points[:,0], points[:,1], points[:,2]]
    print('======= Input for sample {} is ready. Now start processing ========'.format(area))

    for cell_index, cell in enumerate(tqdm(sample_cells)):
        cell_iter_cont_all(rp, cell, cell_index, nucleus_seg, points, bases, reads_assign_cell, reads_assign_nucleus, cell_by_barcode, genesToIndex)
    np.save(area+'_DR.npy', cell_by_barcode)
    break

3D

In [18]:
for area in area_list[2:]: 
    # sample specific inputs
    _adata = adata[adata.obs['plate_number'] == area]
    sample_cells = _adata.obs['orig_index']
    cell_by_barcode = np.zeros((_adata.n_obs, _adata.n_vars), dtype=object)
    cell_seg = imread(os.path.join(ppath, '2022-11-11-3D-segmentation', 'cell', f'Area{area}_cell_3Dseg.tif'))
    nucleus_seg = imread(os.path.join(ppath, '2022-11-11-3D-segmentation', 'nucleus', f'Area{area}_nucleus_3Dseg.tif'))
    genes2seqs, seqs2genes = load_genes('Z:/Data/Processed/2022-10-15-Rena-SkinCulture254-gene')
    mat_bases, mat_points = load_reads_3D(os.path.join(ppath, '2022-11-07-merged-points', f'Area{area}'), 'merged_goodPoints.mat')
    points = mat_points.astype(int)
    bases = np.array([seqs2genes[x] for x in mat_bases])
    rp = measure.regionprops(cell_seg)
    reads_assign_cell = cell_seg[points[:,0], points[:,1], points[:,2]]
    reads_assign_nucleus = nucleus_seg[points[:,0], points[:,1], points[:,2]]
    print('======= Input for sample {} is ready. Now start processing ========'.format(area))

    for cell_index, cell in enumerate(tqdm(sample_cells)):
        cell_iter_cont_all(rp, cell, cell_index, nucleus_seg, points, bases, reads_assign_cell, reads_assign_nucleus, cell_by_barcode, genes_, sampling=[0.2930, 0.0901, 0.0901])
    np.save(f'Z:/Connie/02.TEMPOmap/04.revisionForeskin254Gene/output/DR_area{area}.npy', cell_by_barcode)
    # break

Number of reads: 628307


  rdt[area] = ndt[area] / cdt[area]
100%|██████████| 2731/2731 [36:00<00:00,  1.26it/s] 


Number of reads: 1404691


100%|██████████| 2310/2310 [33:03<00:00,  1.16it/s] 


Number of reads: 480138


100%|██████████| 1665/1665 [23:40<00:00,  1.17it/s]


Number of reads: 2299965


100%|██████████| 2077/2077 [42:43<00:00,  1.23s/it] 
