# Reads assignment example

In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from starmap.sequencing import *
from natsort import natsorted
from scipy.io import loadmat, savemat
from skimage.filters import threshold_otsu
from skimage.color import label2rgb
from tqdm import tqdm

### Functions

In [2]:
from functools import wraps
from time import time

# Timer
def timer(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value (or exception) is passed through
    unchanged; the elapsed time is printed in a ``finally`` clause, so it is
    reported even when ``func`` raises.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    callable
        Wrapper with the same metadata as ``func`` (via ``functools.wraps``).
    """
    def _now_ms():
        # Current clock reading, rounded to whole milliseconds
        return int(round(time() * 1000))

    @wraps(func)
    def _time_it(*args, **kwargs):
        started_ms = _now_ms()
        try:
            return func(*args, **kwargs)
        finally:
            elapsed_s = round((_now_ms() - started_ms) / 1000, 4)
            # Non-positive durations are reported as a plain 0
            shown = elapsed_s if elapsed_s > 0 else 0
            print(f"Total execution time: {shown} s")
    return _time_it


# Load reads and their positions from mat file
@timer
def load_reads(fpath, reads_file):
    """Read the merged reads and their coordinates from a ``.mat`` file.

    Parameters
    ----------
    fpath : str
        Directory containing the file.
    reads_file : str
        File name passed to ``scipy.io.loadmat`` (joined onto ``fpath``).

    Returns
    -------
    bases : list of str
        ``str(entry[0][0])`` for every entry of the ``merged_reads`` variable.
    temp : numpy.ndarray
        Float array with the same shape as ``merged_points``. Its first three
        columns hold the rounded, minus-one values of source columns 1, 0 and
        2 respectively (i.e. the first two columns are swapped).
        NOTE(review): the ``- 1`` presumably converts 1-based MATLAB
        coordinates to 0-based indices -- confirm.
    """
    mat_contents = loadmat(os.path.join(fpath, reads_file))

    bases = []
    for entry in mat_contents["merged_reads"]:
        bases.append(str(entry[0][0]))

    points = mat_contents["merged_points"]
    temp = np.zeros(points.shape)
    # destination column <- source column: 0 <- 1, 1 <- 0, 2 <- 2
    for dst_col, src_col in enumerate((1, 0, 2)):
        temp[:, dst_col] = np.round(points[:, src_col]-1)

    print(f"Number of reads: {len(bases)}")

    return bases, temp


# Load gene table from genes.csv
def load_genes(fpath):
    """Build gene <-> encoded-sequence lookup tables from ``genes.csv``.

    Every non-blank line is split on commas; the first field is used as the
    gene name and the second as its sequence. The sequence is reversed, passed
    through ``encode_SOLID``, and each returned value is incremented by one
    and concatenated into a single string.

    Parameters
    ----------
    fpath : str
        Directory that contains ``genes.csv``.

    Returns
    -------
    genes2seq : dict
        Gene name -> encoded string.
    seq2genes : dict
        Encoded string -> gene name. If two genes produce the same encoded
        string, the one appearing later in the file wins.
    """
    genes2seq = {}
    seq2genes = {}
    # utf-8-sig so that a leading byte-order mark does not end up in the first gene name
    with open(os.path.join(fpath, "genes.csv"), encoding='utf-8-sig') as f:
        for line in f:
            line = line.rstrip()
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise raise IndexError on fields[1]
            if not line:
                continue
            fields = line.split(",")
            gene = fields[0]
            encoded = "".join([str(s+1) for s in encode_SOLID(fields[1][::-1])])
            genes2seq[gene] = encoded
            seq2genes[encoded] = gene
    return genes2seq, seq2genes

## Input

In [16]:
# IO path
# base_path = 'Z:/Data/Analyzed/2021-08-25-Hu-HelaSTARmap'
base_path = 'Z:/Data/Analyzed/2021-07-09-Hu-HelaRIBOmap'

# Output directory for the current sample. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: it has no check-then-create race and also creates
# missing parent directories.
out_path = os.path.join(base_path, 'output')
os.makedirs(out_path, exist_ok=True)

## Generate reads pattern plot 

In [17]:
# Load the gene cluster table (the displayed output shows `cluster` and `gene` columns)
gene_cluster_df = pd.read_csv('../other-datasets/gene_cluster_label.csv')
# Bare expression so the notebook renders the DataFrame as the cell output
gene_cluster_df

Unnamed: 0,cluster,gene
0,0,AARS
1,0,ASXL1
2,0,COX7B
3,1,CTGF
4,0,DLX4
...,...,...
976,0,ZRANB1
977,3,ZSCAN12
978,1,ZSCAN29
979,3,ZXDC


In [22]:
# Marker color used for each gene cluster label when plotting reads
color_dict = {0:'green', 1:'yellow', 2:'white', 3:'magenta'}

print(f"Current sample: {base_path}")

# Load genes
genes2seqs, seqs2genes = load_genes(base_path)

# Load reads
bases, points = load_reads(base_path, "2021-10-03-merged_goodPoints_max3d")
# Translate every read's encoded sequence into its gene name
# (raises KeyError if a read's sequence is missing from the gene table)
bases = np.array([seqs2genes[seq] for seq in bases])
points = np.array(points)

# Directory that receives the generated figures
fig_out_path = os.path.join(base_path, 'figures')
os.makedirs(fig_out_path, exist_ok=True)

# Load raw image
# current_img = load_label_image(os.path.join(base_path, 'images'), fname='overlay_max.tif')
current_img = load_label_image(os.path.join(base_path, 'segmentation'), fname='cell_bnd.tif')

# Load segmentation
current_seg = load_label_image(os.path.join(base_path, 'segmentation'), fname='cell.tif')

# Look up the segmentation label under each read; the label volume is indexed
# with the read's third coordinate first, then the first and second.
points = points.astype(int)
reads_assignment = current_seg[points[:, 2], points[:, 0], points[:, 1]]

# One row per read: integer coordinates, cell label and gene name
reads_info = pd.DataFrame({'x':points[:, 0], 'y':points[:, 1], 'z':points[:, 2], 'cell_label':reads_assignment})
reads_info = reads_info.astype(np.int32)
reads_info['orig_index'] = reads_info['cell_label'] - 1
reads_info['gene'] = bases

# Figure size in inches. Matplotlib expects (width, height) whereas the image
# shape is (rows, columns, ...), so the two axes are swapped here; passing
# (shape[0], shape[1]) would transpose the canvas for non-square images.
fig_size = (current_img.shape[1]/1000, current_img.shape[0]/1000)

# remove unassigned reads (cell_label == 0) once, before splitting by cluster
assigned_reads_df = reads_info.loc[reads_info['cell_label'] != 0, :]

# Reads of each cluster, kept so the combined figure below can reuse them
cluster_reads = {}

# One figure per cluster: reads of that cluster drawn over the image
for current_cluster in sorted(gene_cluster_df['cluster'].unique()):
    print(f"Current cluster: {current_cluster}")
    current_gene_list = gene_cluster_df.loc[gene_cluster_df['cluster'] == current_cluster, 'gene'].to_list()
    current_reads_df = assigned_reads_df.loc[assigned_reads_df['gene'].isin(current_gene_list), :]
    cluster_reads[current_cluster] = current_reads_df

    plt.figure(figsize=fig_size, dpi=1000)
    plt.imshow(current_img, cmap='gray')
    # 'y' is plotted on the horizontal axis and 'x' on the vertical axis
    plt.plot(current_reads_df['y'], current_reads_df['x'], '.', color=color_dict[current_cluster], markersize=.3, markeredgewidth=0.0)
    plt.axis('off')
    plt.tight_layout(pad=0)
    # plt.show()
    current_fig_path = f"{fig_out_path}/cluster_{current_cluster}.tiff"
    plt.savefig(current_fig_path, dpi=1000, pil_kwargs={"compression": "tiff_lzw"}, bbox_inches='tight', pad_inches=0)
    plt.clf()
    plt.close()

# Combined figure: all clusters drawn over the same image, in sorted cluster order
plt.figure(figsize=fig_size, dpi=1000)
plt.imshow(current_img, cmap='gray')
for current_cluster, current_reads_df in cluster_reads.items():
    plt.plot(current_reads_df['y'], current_reads_df['x'], '.', color=color_dict[current_cluster], markersize=.3, markeredgewidth=0.0)
plt.axis('off')
plt.tight_layout(pad=0)
# plt.show()
current_fig_path = f"{fig_out_path}/cluster_all.tiff"
plt.savefig(current_fig_path, dpi=1000, pil_kwargs={"compression": "tiff_lzw"}, bbox_inches='tight', pad_inches=0)
plt.clf()
plt.close()

Current sample: Z:/Data/Analyzed/2021-07-09-Hu-HelaRIBOmap
Number of reads: 5189148
Total execution time: 36.272 s
Current cluster: 0
Current cluster: 1
Current cluster: 2
Current cluster: 3
