In [1]:
import scanpy as sc
import scanpy.external as sce
import anndata
import logging
import os
import numpy as np
import pandas as pd
import json
logging.basicConfig(level=logging.INFO)
import scvi

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
import umap

# from PySpots.fish_helpers import *
from dredFISH.Analysis import basicu
from dredFISH.Analysis import regu 
from dredFISH.Analysis import celltypeu
from dredFISH.Analysis import TissueGraph_basics as tgh
# from dredFISH.Analysis import powerplots

import importlib
# Re-import the dredFISH helper modules (same order as before) so that the
# in-kernel versions are refreshed without restarting.
for _module in (tgh, celltypeu, basicu):
    importlib.reload(_module)

# scanpy plotting defaults with a 7x7 figure size
sc.set_figure_params(figsize=(7, 7))

INFO:pytorch_lightning.utilities.seed:Global seed set to 0


In [2]:
# smoke test: confirm that INFO-level messages from the root logger are shown
logging.info('hi')

INFO:root:hi


# file paths and load data

In [3]:
# Project directory layout (absolute paths on the lab file server).
prj_dir = '/bigstore/GeneralStorage/fangming/projects/dredfish/'
dat_dir = prj_dir + 'data/'
res_dir = prj_dir + 'data_dump/'
fig_dir = prj_dir + 'figures/'

# Table read by the loading cell; only its `Level_<n>...` columns are kept there.
input_csv = f'{res_dir}noHarmony_March30_v4_isocortex_hpf.csv'
print(input_csv)

# Output tables for the joint UMAP.
# NOTE: `output2_csv` is reassigned to a different file name in the harmony/UMAP cell further down.
output_csv = f'{res_dir}Joint_UMAP_isocortex_hpf_April14.csv'
output2_csv = f'{res_dir}Joint_UMAP_isocortex_hpf_subsamp100_level1_April14.csv'
print(output_csv)
print(output2_csv)

# dredFISH data
dataset = 'DPNMF_PolyA_2021Nov19' # dataset tag (used as the TissueMultiGraph name)
base_path = '/bigstore/Images2021/gaby/dredFISH/DPNMF_PolyA_2021Nov19/'
fish_path = f'{base_path}fishdata_2022Mar23/' # Zach's new version
# Alternative AnnData location (disabled):
# anndata_path = f'{base_path}results/raw_fishdata_Mar30.h5ad'
# If this file exists the loading cell reads it directly; otherwise the data are
# loaded from `fish_path` and this path is passed as `output_path`.
anndata_path = '/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/DPNMF_PolyA_2021Nov19_v2_results_anndata.h5ad'
# Setting an empty path (disabled) would force the FishData branch:
# anndata_path = ''

# allen data (scRNA-seq reference stored as an .h5ad file)
scrna_path = '/bigstore/GeneralStorage/fangming/projects/dredfish/data/rna/scrna_ss_ctxhippo_a_exon_DPNMF_matrix.h5ad'

# allen tree (structures.json, parsed by regu.load_allen_tree)
allen_tree_path='/bigstore/GeneralStorage/fangming/reference/allen_ccf/structures.json'

# analysis metadata (JSON, loaded into `meta`)
meta_path = '/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/analysis_meta_Mar31.json'


/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/noHarmony_March30_v4_isocortex_hpf.csv
/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/Joint_UMAP_isocortex_hpf_April14.csv
/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/Joint_UMAP_isocortex_hpf_subsamp100_level1_April14.csv


In [4]:
%%time
# results
dftypes = pd.read_csv(input_csv, index_col=0)
dftypes = dftypes.filter(regex=r'^Level_[0-9]', axis=1)
levels = dftypes.columns.values

# load dredFISH
TMG = tgh.TissueMultiGraph(name=dataset)
# load raw counts data
if os.path.isfile(anndata_path):
    print('from AnnData')
    TMG.load_from_anndata(anndata_path)
else:
    print('from FishData')
    TMG.load_from_fishdata(fish_path, dataset, output_path=anndata_path)
# normalize (optional for spatial registration)
TMG.normalize_data(norm_cell=True, norm_bit=False)

### for now
data = TMG.data
data.obs = data.obs.join(dftypes)

# print(data)
    
# allen scrna matrix
ref_data = anndata.read_h5ad(scrna_path)
# print(ref_data)

# allen tree
allen_tree, allen_maps = regu.load_allen_tree(allen_tree_path)

# analysis
with open(meta_path, 'r') as fh:
    meta = json.load(fh)


from AnnData


INFO:root:113758 cells, minimum counts = 1248.0


CPU times: user 889 ms, sys: 148 ms, total: 1.04 s
Wall time: 1.04 s


In [5]:
# Give the Allen label columns the same `Level_<n>_..._label` names that the
# dredFISH annotations use, so both tables can be addressed by the same keys.
ref_data.obs = ref_data.obs.rename(
    columns={
        'class_label': 'Level_1_class_label',
        'neighborhood_label': 'Level_2_neighborhood_label',
        'subclass_label': 'Level_3_subclass_label',
        'cluster_label': 'Level_5_cluster_label',
    }
)
ref_data

AnnData object with n_obs × n_vars = 73347 × 24
    obs: 'donor_sex_id', 'donor_sex_label', 'donor_sex_color', 'region_id', 'region_label', 'region_color', 'platform_label', 'cluster_order', 'Level_5_cluster_label', 'cluster_color', 'subclass_order', 'Level_3_subclass_label', 'subclass_color', 'neighborhood_id', 'Level_2_neighborhood_label', 'neighborhood_color', 'class_order', 'Level_1_class_label', 'class_color', 'exp_component_name', 'external_donor_name_label', 'full_genotype_label', 'facs_population_plan_label', 'injection_roi_label', 'injection_materials_label', 'injection_method_label', 'injection_type_label', 'full_genotype_id', 'full_genotype_color', 'external_donor_name_id', 'external_donor_name_color', 'facs_population_plan_id', 'facs_population_plan_color', 'injection_materials_id', 'injection_materials_color', 'injection_method_id', 'injection_method_color', 'injection_roi_id', 'injection_roi_color', 'injection_type_id', 'injection_type_color', 'cell_type_accession_label',

In [6]:
# select specific regions
selected_regions = ['Isocortex', 'HPF']
selected_all_sids = regu.expand_regions(allen_tree, selected_regions, 'acronym')

# selected cells
cond_cells = data.obs['region_id'].isin(selected_all_sids)
data = data[cond_cells]
data

View of AnnData object with n_obs × n_vars = 44075 × 24
    obs: 'label', 'pixel_x', 'pixel_y', 'nuclei_size', 'nuclei_signal', 'cytoplasm_size', 'cytoplasm_signal', 'total_size', 'total_signal', 'posname', 'posname_stage_x', 'posname_stage_y', 'cell_name', 'stage_x', 'stage_y', 'coord_x', 'coord_y', 'region_id', 'region_color', 'region_acronym', 'Level_1_class_label', 'Level_2_neighborhood_label', 'Level_3_subclass_label'
    obsm: 'stage'
    layers: 'cytoplasm_vectors', 'nuclei_vectors', 'total_vectors', 'norm_cell'

In [7]:
def get_umap(Xcell, Ycell,
             Xidx, Yidx,
             Xname, Yname,
             **kwargs):
    """Embed two datasets jointly with UMAP and return one labelled table.

    The rows of `Xcell` and `Ycell` are stacked (X first, then Y) and passed
    to a single `umap.UMAP(**kwargs).fit_transform` call.

    Parameters
    ----------
    Xcell, Ycell : array-like, (n_x, n_features) and (n_y, n_features)
        Feature matrices of the two datasets; they must be stackable with
        `np.vstack`.
    Xidx, Yidx : sequence
        Row identifiers for `Xcell` and `Ycell`; they become the index of the
        returned frame.
    Xname, Yname : str
        Labels written to the `dataset` column for rows of X and Y.
    **kwargs
        Forwarded unchanged to `umap.UMAP` (e.g. `metric`, `n_components`,
        `random_state`, `verbose`).

    Returns
    -------
    pandas.DataFrame
        Indexed by the concatenated ids, with a `dataset` column followed by
        one `embed_<i>` column (1-based) per embedding dimension.

    Raises
    ------
    ValueError
        If the number of rows of a matrix differs from the number of ids
        supplied for it.
    """
    n_x, n_y = len(Xidx), len(Yidx)
    # Validate before fitting: UMAP is the expensive step, and a mismatch
    # would otherwise only surface afterwards when the columns are assigned.
    if np.shape(Xcell)[0] != n_x or np.shape(Ycell)[0] != n_y:
        raise ValueError(
            "row/id length mismatch: "
            f"Xcell has {np.shape(Xcell)[0]} rows for {n_x} ids, "
            f"Ycell has {np.shape(Ycell)[0]} rows for {n_y} ids"
        )

    # run UMAP on both datasets at once so they share one embedding space
    embed = np.asarray(
        umap.UMAP(**kwargs).fit_transform(np.vstack([Xcell, Ycell]))
    )

    dfembed = pd.DataFrame(index=np.hstack([Xidx, Yidx]))
    dfembed['dataset'] = [Xname]*n_x + [Yname]*n_y
    # One column per embedding dimension, so `n_components` passed through
    # kwargs is honoured instead of being truncated to two columns.
    for i in range(embed.shape[1]):
        dfembed[f'embed_{i+1}'] = embed[:, i]

    return dfembed

In [8]:

# Dataset labels and raw matrices: X is the scRNA-seq reference, Y the dredFISH data.
Xname = 'scRNA-seq'
Yname = 'dredFISH'
X = ref_data.X
Y = data.layers['norm_cell']  # layer listed on the dredFISH AnnData after normalization
# Disabled alternative: stratified subsample of the reference by a label level.
# level = 'Level_1_class_label'
# n = 100

# dfsub, xidx = basicu.stratified_sample(ref_data.obs, level, n, return_idx=True)
# Xcell = basicu.zscore(X[xidx], axis=0)
# z-score each dataset separately along axis 0
# NOTE(review): presumably this standardizes each of the 24 features across cells — confirm in basicu.zscore.
Xcell = basicu.zscore(X, axis=0)
Ycell = basicu.zscore(Y, axis=0)

# sanity check: both matrices must have the same number of columns
Xcell.shape, Ycell.shape

((73347, 24), (44075, 24))

In [9]:
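# run scVI (trial run on the lung atlas dataset read in the next cell, not on the dredFISH / scRNA-seq data above)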

In [10]:
# Read the lung atlas AnnData from the local path; `backup_url` is the
# download source scanpy falls back to when the file is not present.
adata = sc.read(
    "/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/lung_atlas.h5ad",
    backup_url="https://figshare.com/ndownloader/files/24539942",
)


In [11]:
adata.raw = adata  # keep full dimension safe
# Restrict `adata` to the top 2000 highly variable genes, selected per batch
# from the "counts" layer (`subset=True` drops the other genes in place).
# NOTE(review): the recorded run warned that `seurat_v3` expects raw counts but
# found non-integers — verify what the "counts" layer actually contains.
sc.pp.highly_variable_genes(
    adata,
    flavor="seurat_v3",
    n_top_genes=2000,
    layer="counts",
    batch_key="batch",
    subset=True
)


`flavor='seurat_v3'` expects raw count data, but non-integers were found.



In [12]:
# Register the "counts" layer and the "batch" column with scVI.
# NOTE(review): the recorded run warned that this layer does not contain unnormalized counts.
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

INFO:absl:Unable to initialize backend 'tpu_driver': NOT_FOUND: Unable to find driver in registry given worker: 
INFO:absl:Unable to initialize backend 'gpu': NOT_FOUND: Could not find registered platform with name: "cuda". Available platform names are: Interpreter Host
INFO:absl:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.

adata.layers[counts] does not contain unnormalized count data. Are you sure this is what you want?



In [13]:
# scVI model: 2 layers, 30 latent dimensions, negative-binomial ("nb") gene likelihood
vae = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood="nb")

In [14]:
%%time
# Train with scVI defaults. In the recorded run no GPU was available, training
# ran on CPU (~18 s per epoch, 246 epochs planned) and was interrupted manually.
vae.train()
# need to use python 3.9

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmponbnace2
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmponbnace2/_remote_module_non_sriptable.py

CUDA initialization: The NVIDIA driver on your system is too old (found version 10000). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  /opt/conda/conda-bld/pytorch_1646755849709/work/c10/cuda/CUDAFunctions.cpp:112.)

INFO:pytorch_lightning.utilities.distributed:GPU available: False, used: False
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs


Epoch 5/246:   2%|▏         | 4/246 [01:11<1:11:15, 17.67s/it, loss=623, v_num=1]CPU times: user 36min 33s, sys: 1min 32s, total: 38min 6s
Wall time: 1min 14s



Detected KeyboardInterrupt, attempting graceful shutdown...



In [16]:
# try GPU pytorch
import torch
torch.cuda.is_available()

False

In [None]:
# Per-cell table for the stacked data: reference cells first, dredFISH cells
# second, each row tagged with the dataset it came from.
obs_df = pd.DataFrame(
    {'dataset': [Xname]*len(Xcell) + [Yname]*len(Ycell)},
    index=np.hstack([ref_data.obs.index, data.obs.index]),
)

# One AnnData holding both z-scored matrices, stacked in the same order.
adata_merged = anndata.AnnData(X=np.vstack([Xcell, Ycell]), obs=obs_df)
# The same stacked matrix is also stored in obsm; the Harmony cell uses it as `basis`.
adata_merged.obsm['norm_bit'] = np.vstack([Xcell, Ycell])
adata_merged

In [None]:
%%time

# Harmony integration across the two datasets, via scanpy.external (`sce`).
# Reads the stacked matrix from obsm['norm_bit'], groups cells by the
# 'dataset' column and stores the corrected coordinates in obsm['harmony'].
kwargs = dict(verbose=True)  # extra keyword arguments forwarded by harmony_integrate
sce.pp.harmony_integrate(adata_merged, 'dataset', 
                         basis='norm_bit', 
                         adjusted_basis='harmony',
                         **kwargs,
                        )



In [None]:
%%time

Xcell_adj = adata_merged[adata_merged.obs['dataset']=='scRNA-seq'].obsm['harmony']
Ycell_adj = adata_merged[adata_merged.obs['dataset']=='dredFISH'].obsm['harmony']
print(Xcell_adj.shape, Ycell_adj.shape)

dfembed = get_umap(Xcell_adj, Ycell_adj, 
                   ref_data.obs.index.values,
                   data.obs.index.values,
                   'scRNA-seq', 'dredFISH',
                   metric='correlation',
                   verbose=True,
                 )
output2_csv = f'{res_dir}Joint_UMAP_harmony_correlation_April14.csv'
print(output2_csv)
dfembed.to_csv(output2_csv, index=True, header=True)
dfembed

In [None]:
# display the joint embedding table (index = cell id; columns: dataset, embed_1, embed_2)
dfembed

In [None]:
# Joint UMAP scatter, coloured by dataset of origin.
fig, ax = plt.subplots(figsize=(8,6))
# Rows are shuffled before plotting (presumably so that neither dataset is
# systematically drawn on top); the fixed seed makes the draw order, and hence
# the figure, reproducible. `ax=ax` draws on the axes created above instead of
# relying on matplotlib's implicit current axes.
sns.scatterplot(data=dfembed.sample(frac=1, random_state=0),
                x='embed_1', y='embed_2', hue='dataset',
                s=1,
                edgecolor="none",
                ax=ax,
               )
ax.axis('off')
ax.set_aspect('equal')
plt.show()