In [None]:
# imports and chamber selection
%load_ext autoreload
%autoreload 2
import iss_preprocess as iss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from flexiznam.config import PARAMETERS
from pathlib import Path
from itertools import cycle
from matplotlib.animation import FuncAnimation

import scanpy as sc
import re
import seaborn as sns
import bg_atlasapi as bga

# Chamber to analyse; this relative path is used both to load metadata/ops
# and to locate files under the processed data root.
data_path = 'becalia_rabies_barseq/BRYC65.1d/chamber_13/'

# Metadata and preprocessing options are both looked up from the chamber path.
metadata = iss.io.load_metadata(data_path)
ops = iss.io.load_ops(data_path)

# Root folder of the processed data, read from the flexiznam configuration.
processed_root = PARAMETERS['data_root']['processed']
processed_path = Path(processed_root)

## Filter detected spots (skip to the "(Load spot and mask location dfs) - ROI5" section to use the prefiltered ROI 5 spots)

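Load the detected gene, barcode and hybridisation spots and filter each type by its score threshold (dot product score for barcodes, OMP score for genes, hybridisation score for hybridisation spots).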

In [None]:
# ROI whose spots are analysed in the rest of the notebook
roi = 5
# NOTE(review): not used in the cells visible here - confirm it is still needed
gaussian_width_um = 10

# Load the detected spots of every acquisition type for this ROI
spot_list = ['gene', 'barcode_round', 'hybridisation_1_1', 'hybridisation_2_1']
raw_spots = {}
for spot_type in spot_list:
    print(f"Loading {spot_type}", flush=True)
    spots_file = processed_path / data_path / f"{spot_type}_spots_{roi}.pkl"
    raw_spots[spot_type] = pd.read_pickle(spots_file)
raw_spots['gene'].head()

In [None]:
# Filter spots: plot the score distribution of each spot type together with
# its threshold, then keep only the spots scoring strictly above the threshold.
barcode_dot_threshold = 0.2
omp_score_threshold = 0.1
hyb_score_threshold = 0.8

# --- Score histograms, one panel per spot type; vertical line = threshold ---
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(7, 5)
kw = dict(histtype='step', color='k', lw=2)
axes[0, 0].hist(raw_spots['barcode_round'].dot_product_score, bins=np.arange(-0.5, 1.1, 0.05), **kw)
axes[0, 0].axvline(barcode_dot_threshold, color='k')
axes[0, 0].set_xlabel('Barcode dot score')
axes[0, 0].set_ylabel('# barcode rolonies')

axes[0, 1].hist(raw_spots['gene'].spot_score, bins=np.arange(0, 1.2, 0.05), **kw)
axes[0, 1].axvline(omp_score_threshold, color='k')
axes[0, 1].set_xlabel('OMP score')
axes[0, 1].set_ylabel('# genes rolonies')

for i in range(2):
    axes[1, i].hist(raw_spots[f'hybridisation_{i+1}_1'].score, bins=np.arange(-0.50, 1.2, 0.05), **kw)
    axes[1, i].axvline(hyb_score_threshold, color='k')
    axes[1, i].set_xlabel('Hybridisation score')
    axes[1, i].set_ylabel(f'# hyb {i+1} rolonies')

plt.tight_layout()
# (A stray mid-cell `raw_spots['gene'].head()` was removed here: an expression
# that is not the last statement of a cell is never displayed.)

# --- Apply the thresholds; copies keep `raw_spots` untouched ---
spots = dict()
ok_barcode = raw_spots['barcode_round'].dot_product_score > barcode_dot_threshold
spots['barcode_round'] = raw_spots['barcode_round'][ok_barcode].copy()
print(f'Keeping {np.sum(ok_barcode)} barcode rolonies out of {len(ok_barcode)}.')
ok_genes = raw_spots['gene'].spot_score > omp_score_threshold
spots['gene'] = raw_spots['gene'][ok_genes].copy()
print(f'Keeping {np.sum(ok_genes)} genes rolonies out of {len(ok_genes)}.')
for i in range(2):
    ok_hyb = raw_spots[f'hybridisation_{i +1}_1'].score > hyb_score_threshold
    spots[f'hybridisation_{i +1}_1'] = raw_spots[f'hybridisation_{i +1}_1'][ok_hyb].copy()
    print(f'Keeping {np.sum(ok_hyb)} hybridisation rolonies out of {len(ok_hyb)} for round {i+1}.')

In [None]:
# Load the segmentation masks saved for this ROI.
# (Expansion could be done inside segment_rolonies, but it is done in the
# notebook so that a reference to the big masks is kept.)
masks_file = processed_path / data_path / f"masks_{roi}.npy"
masks = np.load(masks_file)

In [None]:
from skimage.segmentation import expand_labels

# NOTE(review): pixel_size is presumably in um/pixel, making this a 3 um
# expansion - confirm the units.
pixel_size = 0.18
expansion_px = int(3 / pixel_size)  # truncated to a whole number of pixels
big_mask = expand_labels(masks, distance=expansion_px)

In [None]:
# Alternative to the expansion cell: load the premade big masks saved for this
# ROI (available for ROI 5) to save time.
big_mask_file = processed_path / data_path / f"big_masks_{roi}.npy"
big_mask = np.load(big_mask_file)

# Find barcodes and genes inside cells

In [None]:
from iss_preprocess.pipeline.segment import segment_rolonies

# Find which barcode/gene rolonies fall inside which cell. The already-expanded
# masks are passed in directly, so no further expansion is requested.
score_thresholds = dict(
    barcode_dot_threshold=barcode_dot_threshold,
    spot_score_threshold=omp_score_threshold,
    hyb_score_threshold=hyb_score_threshold,
)
barcode_df, genes_df = segment_rolonies(
    data_path,
    roi=roi,
    mask_expansion=None,
    masks=big_mask,
    **score_thresholds,
)
barcode_df.head()

# Create cell mask location df

In [None]:
import warnings

# Hide DeprecationWarnings before importing/calling the segmentation helper
warnings.simplefilter('ignore', DeprecationWarning)
from iss_preprocess.pipeline.segment import make_cell_dataframe

# One row per mask label, built from the expanded masks
cell_df = make_cell_dataframe(
    data_path,
    roi,
    masks=big_mask,
    atlas_size=10,
)
cell_df.head()

# (Load spot and mask location dfs) - ROI5

In [None]:
# Load the previously saved gene, barcode and cell dataframes.
# NOTE: pickle files can execute arbitrary code on load - only read trusted files.
genes_df, barcode_df, cell_df = (
    pd.read_pickle(pickle_file)
    for pickle_file in (
        '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/genes_df.pkl',
        '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/barcode_df.pkl',
        '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/cell_df.pkl',
    )
)

# Create adata object for cell analysis

In [None]:
# Build the AnnData object: gene counts as the data matrix, cell_df as per-cell
# metadata (obs).

# Keep only cells present in both dataframes. cell_df is then selected with the
# index of genes_df_all so that both have the SAME row order: assigning
# `adata.obs` does not align on the index, so mismatched orders would silently
# attach metadata to the wrong cells.
# (assumes the cell labels in both indexes are unique - TODO confirm)
genes_df_all = genes_df[genes_df.index.isin(cell_df.index)]
cell_df_all = cell_df.loc[genes_df_all.index]

adata = sc.AnnData(genes_df_all)
# Copy so that changing the obs index below cannot touch cell_df
adata.obs = cell_df_all.copy()
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)

# Add fine area info: keep only the digits of each area acronym as the layer
areas = adata.obs.area_acronym.to_list()
layer = [re.sub(r'\D', '', s) for s in areas]
adata.obs['layer'] = layer

# Total barcode counts per cell, aligned to the cells of adata. The labels are
# converted to str to match adata.obs.index; cells absent from barcode_df get NaN.
barcode_sum_df = barcode_df.sum(axis=1).to_frame(name='sum_barcode_counts')
barcode_sum_df.index = barcode_sum_df.index.astype(str)
# The result of reindex must be kept: it returns a new object.
barcode_sum_df = barcode_sum_df.reindex(adata.obs.index)
barcode_sum_df.index.names = ['label']
adata.obs['sum_barcode_counts'] = barcode_sum_df['sum_barcode_counts']

# Create subsets for isocortex and visual cortex (VISp)

In [None]:
# Atlas used to look up structure hierarchies, loaded from the shared
# brainglobe directory.
atlas_resolution = 10
atlas_name = "allen_mouse_%dum" % atlas_resolution
bg_atlas = bga.bg_atlas.BrainGlobeAtlas(
    atlas_name,
    '/nemo/lab/znamenskiyp/home/shared/resources/.brainglobe/',
)

# Isocortex subset: cells whose area acronym is a descendant of 'Isocortex'
isocortex_acronyms = bg_atlas.get_structure_descendants('Isocortex')
in_isocortex = adata.obs.area_acronym.isin(isocortex_acronyms)
adata_iso = adata[in_isocortex].copy()

# Visual cortex subset: cells whose area acronym is a descendant of 'VISp'
visual_cortex_acronyms = bg_atlas.get_structure_descendants('VISp')
in_visual_cortex = adata.obs.area_acronym.isin(visual_cortex_acronyms)
adata_vis = adata[in_visual_cortex].copy()

In [None]:
# Top 20 expressed genes, one panel per dataset (all cells, isocortex, VISp)
fig, ax = plt.subplots(1, 3, figsize=(15,6))
for panel, dataset in zip(ax, (adata, adata_iso, adata_vis)):
    sc.pl.highest_expr_genes(dataset, n_top=20, ax=panel, show=False)
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
# QC, normalisation, dimensionality reduction and clustering of each dataset.
# NOTE(review): min_genes and max_counts are defined but not applied by any
# call in this cell - only min_counts is used for filtering.
min_genes = 4
min_counts = 10
max_counts = 150

# The loop variable must NOT be called `adata`: that would rebind the global
# `adata` to the last element (adata_vis), so every later cell using `adata`
# (plots, saving adata.h5ad) would silently operate on the VISp subset.
for dataset in (adata, adata_iso, adata_vis):
    sc.pp.calculate_qc_metrics(dataset, percent_top=None, log1p=False, inplace=True)
    sc.pp.filter_cells(dataset, min_counts=min_counts)
    # Stored before normalisation; used later by plots called with use_raw=True
    dataset.raw = dataset
    sc.pp.normalize_total(dataset, target_sum=10)
    sc.pp.log1p(dataset)
    sc.tl.pca(dataset, svd_solver='arpack')
    sc.pl.pca_variance_ratio(dataset, log=True, show=False)
    sc.pp.neighbors(dataset, n_neighbors=10, n_pcs=30)
    sc.tl.umap(dataset, min_dist=0.1)
    sc.tl.leiden(dataset, resolution=0.5)
    sc.tl.louvain(dataset, resolution=0.5)

In [None]:
# QC violin plots (genes detected and total counts per cell) for each dataset,
# shown in order: all cells, isocortex, VISp.
# The previous version passed `ax[...]` taken from a figure created in an
# earlier cell (its own subplots line was commented out), so it depended on
# hidden state.
# NOTE(review): with multi_panel=True scanpy appears to draw each call in its
# own seaborn grid, so no axes are passed here - confirm against the scanpy docs.
qc_keys = ['n_genes_by_counts', 'total_counts']
for dataset in (adata, adata_iso, adata_vis):
    sc.pl.violin(dataset, qc_keys, jitter=0.4, multi_panel=True, show=False)
    plt.tight_layout(pad=3.0)
    plt.show()

In [None]:
# PCA embedding coloured by Slc17a7, one panel per dataset
fig, ax = plt.subplots(1, 3, figsize=(15,6))
for panel, dataset in zip(ax, (adata, adata_iso, adata_vis)):
    sc.pl.pca(dataset, color='Slc17a7', ax=panel, show=False)
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
# Recompute the UMAP and redo the leiden clustering at resolution 1 for each
# dataset (this overwrites the 'leiden' result of any earlier run).
# The loop variable is deliberately not named `adata`: reusing that name would
# leave the global `adata` pointing at adata_vis once the loop ends.
for dataset in (adata, adata_iso, adata_vis):
    sc.tl.umap(dataset, min_dist=0.1)
    sc.tl.leiden(dataset, resolution=1)

# Plot UMAPs

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
sc.set_figure_params(figsize=(9,9))
# UMAP of all cells. `vmax` needs exactly one entry per panel in `color`:
# the list previously had 6 entries for 7 panels, so the cap of 30 meant for
# 'sum_barcode_counts' was applied to 'louvain' instead.
sc.pl.umap(
    adata,
    use_raw=True,
    ncols=2,
    color=['Slc17a7', 'Gad1', 'Sst', 'Vip', 'leiden', 'louvain', 'sum_barcode_counts'],
    frameon=False,
    size=30,
    # genes: 99.9th percentile; clusterings: no cap; barcode counts: capped at 30
    vmax=['p99.9', 'p99.9', 'p99.9', 'p99.9', None, None, 30],
    legend_loc='on data',
    legend_fontsize=20,
    legend_fontoutline=2
)

In [None]:
# UMAP of the isocortex cells: four marker genes, the leiden clusters and the
# per-cell barcode counts.
sc.set_figure_params(figsize=(9,9))
iso_panels = ['Slc17a7', 'Gad1', 'Sst', 'Vip', 'leiden', 'sum_barcode_counts']
# One colour-scale maximum per panel, in the same order as iso_panels
iso_vmax = ['p99.9', 'p99.9', 'p99.9', 'p99.9', None, 30]
sc.pl.umap(
    adata_iso,
    color=iso_panels,
    vmax=iso_vmax,
    use_raw=True,
    ncols=2,
    frameon=False,
    size=30,
    legend_loc='on data',
    legend_fontsize=20,
    legend_fontoutline=2,
)

In [None]:

# Flag VISp cells whose summed barcode counts exceed the threshold.
barcode_threshold = 10
is_barcoded = adata_vis.obs['sum_barcode_counts'] > barcode_threshold
# 'barcoded' is 1 for barcoded cells and NaN for all others. It is written
# with .loc rather than chained indexing (obs['barcoded'][mask] = ...), which
# triggers SettingWithCopyWarning and does nothing under pandas copy-on-write.
adata_vis.obs['barcoded'] = np.nan
adata_vis.obs.loc[is_barcoded, 'barcoded'] = 1

sc.set_figure_params(figsize=(9,9))
sc.pl.umap(adata_vis,
    use_raw=True,
    ncols=2,
    color=['Slc17a7', 'Gad1', 'Sst', 'Vip', 'leiden', 'barcoded'],
    frameon=False,
    size=30,
    # one entry per panel in `color`
    vmax=['p99.9', 'p99.9', 'p99.9', 'p99.9', None, 1],
    legend_loc='on data',
    legend_fontsize=20,
    legend_fontoutline=2
)

In [None]:
# Spatial map of all cells coloured by leiden cluster.
plt.figure(figsize=(20,20))
# x and y are passed as keywords, like the other scatter cells: recent seaborn
# versions no longer accept them positionally.
sns.scatterplot(x=adata.obs.x, y=adata.obs.y, hue=adata.obs.leiden, s=3)
plt.gca().set_aspect('equal')
# Flip y so the plot uses image orientation (origin at the top)
plt.gca().invert_yaxis()
plt.axis('off')

In [None]:
# Spatial map of the isocortex cells coloured by leiden cluster
iso_fig, iso_ax = plt.subplots(figsize=(20,20))
sns.scatterplot(
    x=adata_iso.obs.x,
    y=adata_iso.obs.y,
    hue=adata_iso.obs.leiden,
    s=12,
    ax=iso_ax,
)
iso_ax.set_xlim(1000, 25000)
iso_ax.set_aspect('equal')
# Flip y so the plot uses image orientation (origin at the top)
iso_ax.invert_yaxis()
iso_ax.axis('off')

In [None]:
# Spatial map of the VISp cells coloured by leiden cluster
vis_fig, vis_ax = plt.subplots(figsize=(20,20))
sns.scatterplot(
    x=adata_vis.obs.x,
    y=adata_vis.obs.y,
    hue=adata_vis.obs.leiden,
    ax=vis_ax,
)
vis_ax.set_xlim(5000, 20000)
# Larger value first: the y axis is drawn inverted
vis_ax.set_ylim(15000, 4000)
vis_ax.set_aspect('equal')
vis_ax.axis('off')

In [None]:
# Save each AnnData object, including all analysis results stored in it
h5ad_outputs = [
    (adata, '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/adata.h5ad'),
    (adata_iso, '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/adata_iso.h5ad'),
    (adata_vis, '/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/adata_vis.h5ad'),
]
for dataset, h5ad_file in h5ad_outputs:
    dataset.write(h5ad_file)

In [None]:
# Any single cluster can be extracted as its own AnnData object, after which
# PCA, UMAP etc. can be run again on the subset.
# Here: the isocortex cells assigned to leiden cluster '0'.
in_cluster_0 = adata_iso.obs.leiden == '0'
adata_exc_cortex = adata_iso[in_cluster_0].copy()
adata_exc_cortex