# 2. Generate h5ad Input

2023-03-23

In [None]:
# Load libraries

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData

# Load starmap python library 
import starmap.sc_util as su

# test()

## Input

In [None]:
# IO path
# Root folder of the preprocessed dataset. The value below looks like a
# placeholder — presumably it is replaced with the real location before use.
base_path = 'path_to_preprocessed_dataset'

# Sub-folders used by the rest of the notebook: inputs/results and figures.
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')

# makedirs(..., exist_ok=True) also creates missing parent folders and does
# nothing when the folder is already there, so re-running the cell is safe.
os.makedirs(fig_path, exist_ok=True)

# Load whole cell data
primary_dataset = 'whole_cell'
dataset_dir = os.path.join(out_path, primary_dataset)
expr_path = os.path.join(dataset_dir, 'complete_cell_barcode_count.csv')
var_path = os.path.join(dataset_dir, 'cell_barcode_names.csv')
obs_path = os.path.join(dataset_dir, 'complete_meta.csv')

# Count matrix: comma-separated numeric text file
expr_x = np.loadtxt(expr_path, delimiter=',')

# Variable annotation: the barcode table has no header row and its third
# column supplies the variable index (presumably gene names — confirm).
barcode_table = pd.read_csv(var_path, header=None)
var = pd.DataFrame(index=barcode_table.iloc[:, 2].to_list())

# Per-cell metadata, indexed by the first CSV column
obs = pd.read_csv(obs_path, index_col=0)

# Assemble the AnnData object and display its summary
adata = AnnData(X=expr_x, var=var, obs=obs)
adata

In [None]:
# Load nucleus data
primary_dataset = 'nucleus'
dataset_dir = os.path.join(out_path, primary_dataset)
expr_path = os.path.join(dataset_dir, 'complete_cell_barcode_count.csv')
var_path = os.path.join(dataset_dir, 'cell_barcode_names.csv')
obs_path = os.path.join(dataset_dir, 'complete_meta.csv')

# Count matrix: comma-separated numeric text file
expr_x = np.loadtxt(expr_path, delimiter=',')

# Variable annotation: the barcode table has no header row and its third
# column supplies the variable index (presumably gene names — confirm).
barcode_table = pd.read_csv(var_path, header=None)
var = pd.DataFrame(index=barcode_table.iloc[:, 2].to_list())

# Per-cell metadata, indexed by the first CSV column
obs = pd.read_csv(obs_path, index_col=0)

# Assemble the nucleus AnnData object
nucdata = AnnData(X=expr_x, var=var, obs=obs)

In [None]:
# Load cytoplasm data
primary_dataset = 'cytoplasm'
dataset_dir = os.path.join(out_path, primary_dataset)
expr_path = os.path.join(dataset_dir, 'complete_cell_barcode_count.csv')
var_path = os.path.join(dataset_dir, 'cell_barcode_names.csv')
obs_path = os.path.join(dataset_dir, 'complete_meta.csv')

# Count matrix: comma-separated numeric text file
expr_x = np.loadtxt(expr_path, delimiter=',')

# Variable annotation: the barcode table has no header row and its third
# column supplies the variable index (presumably gene names — confirm).
barcode_table = pd.read_csv(var_path, header=None)
var = pd.DataFrame(index=barcode_table.iloc[:, 2].to_list())

# Per-cell metadata, indexed by the first CSV column
obs = pd.read_csv(obs_path, index_col=0)

# Assemble the cytoplasm AnnData object
cytodata = AnnData(X=expr_x, var=var, obs=obs)

In [None]:
# Load outer_cytoplasm data
primary_dataset = 'outer_cytoplasm'
dataset_dir = os.path.join(out_path, primary_dataset)
expr_path = os.path.join(dataset_dir, 'complete_cell_barcode_count.csv')
var_path = os.path.join(dataset_dir, 'cell_barcode_names.csv')
obs_path = os.path.join(dataset_dir, 'complete_meta.csv')

# Count matrix: comma-separated numeric text file
expr_x = np.loadtxt(expr_path, delimiter=',')

# Variable annotation: the barcode table has no header row and its third
# column supplies the variable index (presumably gene names — confirm).
barcode_table = pd.read_csv(var_path, header=None)
var = pd.DataFrame(index=barcode_table.iloc[:, 2].to_list())

# Per-cell metadata, indexed by the first CSV column
obs = pd.read_csv(obs_path, index_col=0)

# Assemble the outer-cytoplasm AnnData object
out_cytodata = AnnData(X=expr_x, var=var, obs=obs)

In [None]:
# Load ER data
primary_dataset = 'er'
dataset_dir = os.path.join(out_path, primary_dataset)
expr_path = os.path.join(dataset_dir, 'complete_cell_barcode_count.csv')
var_path = os.path.join(dataset_dir, 'cell_barcode_names.csv')
obs_path = os.path.join(dataset_dir, 'complete_meta.csv')

# Count matrix: comma-separated numeric text file
expr_x = np.loadtxt(expr_path, delimiter=',')

# Variable annotation: the barcode table has no header row and its third
# column supplies the variable index (presumably gene names — confirm).
barcode_table = pd.read_csv(var_path, header=None)
var = pd.DataFrame(index=barcode_table.iloc[:, 2].to_list())

# Per-cell metadata, indexed by the first CSV column
obs = pd.read_csv(obs_path, index_col=0)

# Assemble the ER AnnData object
erdata = AnnData(X=expr_x, var=var, obs=obs)

## QC

In [None]:
# Remove cells without ER structure
# Boolean mask over cells: True where the ER object reports a non-zero 'area'.
cells_to_keep = erdata.obs['area'] != 0

# Subset
# NOTE(review): the mask is built from erdata.obs and reused on every
# compartment object, which assumes they all list the same cells in the same
# order — confirm against the preprocessing output.
#
# Slicing an AnnData returns a view. Take an explicit copy so the layer
# assignments below write into a real object instead of relying on an
# implicit view-to-copy conversion.
adata = adata[cells_to_keep, ].copy()

# Store each compartment's count matrix as a layer of the whole-cell object.
adata.layers['nucleus'] = nucdata[cells_to_keep, ].X
adata.layers['cytoplasm'] = cytodata[cells_to_keep, ].X
adata.layers['er'] = erdata[cells_to_keep, ].X
adata.layers['outer_cytoplasm'] = out_cytodata[cells_to_keep, ].X

In [None]:
# Remove out-of-focus cells in the HeLa STARmap sample.
# Region as recorded in the original note: (tile_51, y=27, x=9121) (2075, 11169).
# NOTE(review): the filter below applies the 27–2075 range to `x` and the
# 9121–11169 range to `y`, i.e. the opposite axis assignment from that note.
# Confirm which one is intended before changing either.
temp_df = adata.obs.copy()
x_in_window = (temp_df['x'] > 27) & (temp_df['x'] < 2075)
y_in_window = (temp_df['y'] > 9121) & (temp_df['y'] < 11169)
cells_to_remove = temp_df[x_in_window & y_in_window].index

# Keep every cell whose obs label is not in the removal set.
adata = adata[~adata.obs.index.isin(cells_to_remove), ]

In [None]:
# Display the AnnData summary after the QC subsetting steps above
adata

In [None]:
# Plot top 20 most expressed genes
# (n_top=20 sets how many genes are drawn)
sc.pl.highest_expr_genes(adata, n_top=20)

## Regular Filtration

In [None]:
# Calculate QC metrics
# inplace=True writes the metrics onto adata instead of returning them;
# percent_top=None is passed so no top-gene percentage columns are requested.
sc.pp.calculate_qc_metrics(adata, percent_top=None, inplace=True)

# Calculate max count for each gene
# Reducing over axis 0 (cells) leaves one maximum per variable.
per_gene_max = adata.X.max(axis=0)
adata.var['max_counts'] = per_gene_max

In [None]:
# Filter cell by area 
# adata = su.filter_cells_by_area(adata, min_area=1000, max_area=100000, save=False)

In [None]:
# Per-cell stats plot
# `su` is the project's starmap.sc_util module. save=False presumably keeps
# the figure from being written to disk — confirm in plot_stats_per_cell.
su.plot_stats_per_cell(adata, save=False)

In [None]:
# Per-cell stats violin plot
# One panel per obs column listed here.
qc_columns = ['total_counts', 'n_genes_by_counts', 'area']
sc.pl.violin(adata, qc_columns, jitter=0.4, multi_panel=True)

In [None]:
# Check reads per cell interactively
# pandas_bokeh is imported for the `.plot_bokeh` accessor used below.
import pandas_bokeh

# 500 evenly spaced bin edges between 0 and 8000 total counts
read_bins = np.linspace(0, 8000, 500)

# Single-column DataFrame holding the total counts of each cell
total_counts_df = adata.obs[['total_counts']]
total_counts_df.plot_bokeh.hist(
    bins=read_bins,
    vertical_xlabel=True,
    hovertool=True,
    line_color="black",
)

In [None]:
# Get quantiles of reads
# NOTE(review): what is reported and how is defined by su.show_reads_quantile
# (starmap.sc_util), which is not visible in this notebook.
su.show_reads_quantile(adata)

In [None]:
# Filtration
# Both calls modify adata in place. Cells are filtered first (min_genes=10),
# then genes (min_cells=10), so the gene filter only sees the remaining cells.
sc.pp.filter_cells(adata, min_genes=10)
sc.pp.filter_genes(adata, min_cells=10)

# Filter gene by max counts
# (disabled alternative based on the 'max_counts' column computed earlier)
# adata = adata[:, adata.var['max_counts'] > 2]
# adata.var['detected'] = adata.var['max_counts'] > 2
# adata.var['highly_variable'] = adata.var['max_counts'] > 2

# Disabled alternative: filter cells by total count range
# sc.pp.filter_cells(adata, min_counts=300)
# sc.pp.filter_cells(adata, max_counts=4000)

# Display the AnnData summary after filtering
adata

## Output

In [None]:
from datetime import datetime

# Stamp the file name with today's date (YYYY-MM-DD).
date = datetime.today().strftime('%Y-%m-%d')

# Join the path components instead of concatenating strings: with a base_path
# that has no trailing separator, f"{base_path}output/..." would point to a
# sibling folder named "<base_path>output" rather than "<base_path>/output".
adata.write_h5ad(os.path.join(base_path, 'output', f"{date}-FUCCI-raw.h5ad"))

## FUCCI protein quantification

In [None]:
# Optional: reload a previously saved object instead of reusing the one in memory
# adata = sc.read_h5ad(os.path.join(out_path, '2022-02-05-Hu-FUCCI-raw.h5ad'))

# Start both fluorescence columns at zero; the quantification cell below
# overwrites the entries of the cells it matches.
for channel in ('Fluo', 'mKO2'):
    adata.obs[channel] = 0
adata

In [None]:
from starmap.sequencing import *
from tqdm.notebook import tqdm

current_sample = 'starmap'

# Rows of adata.obs that belong to the STARmap sample. The 'sample' column is
# not modified below, so this mask is computed once and reused in the loop.
starmap_rows = adata.obs['sample'] == 'STARmap'
index_list = adata.obs.loc[starmap_rows, 'orig_index'].to_list()
# Set copy of index_list so the per-region membership test is O(1).
index_lookup = set(index_list)

# Path
seg_path = os.path.join(base_path, 'segmentation', current_sample)
img_path = os.path.join(base_path, 'images', current_sample)

# Load segmentation
current_seg = load_label_image(seg_path, fname='nuclei.tif')

# Load fluorescence images
fluro_ch01 = load_label_image(img_path, fname='fluro_ch01.tif')
fluro_ch02 = load_label_image(img_path, fname='fluro_ch02.tif')

# Get region information
print('Getting region information...')
# These lists are not filled in this cell; they are kept in case later cells
# reference them.
intensity_ch01 = []
intensity_ch02 = []
for region in tqdm(regionprops(current_seg)):
    # Region labels are matched to 'orig_index' with an offset of one.
    orig_index = region.label - 1
    if orig_index not in index_lookup:
        continue

    # Coordinates of the region, one array per image axis (three axes are indexed).
    coords = region.coords
    ax0, ax1, ax2 = coords[:, 0], coords[:, 1], coords[:, 2]

    # Rows to update: same orig_index within the STARmap sample.
    target_rows = (adata.obs['orig_index'] == orig_index) & starmap_rows

    # Summed intensity of each fluorescence channel over the region.
    adata.obs.loc[target_rows, 'Fluo'] = fluro_ch01[ax0, ax1, ax2].sum()
    adata.obs.loc[target_rows, 'mKO2'] = fluro_ch02[ax0, ax1, ax2].sum()

## Output

In [None]:
from datetime import datetime

# Stamp the file name with today's date (YYYY-MM-DD).
date = datetime.today().strftime('%Y-%m-%d')

# Join the path components instead of concatenating strings: with a base_path
# that has no trailing separator, f"{base_path}output/..." would point to a
# sibling folder named "<base_path>output" rather than "<base_path>/output".
# NOTE(review): the file name is the same as in the earlier "Output" cell, so
# when both cells run on the same date this write replaces that file — confirm
# that is intended.
adata.write_h5ad(os.path.join(base_path, 'output', f"{date}-FUCCI-raw.h5ad"))