This notebook is primarily to make sure your environment installed properly. If anything in here does not run, certainly nothing else will.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import scvi
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np

sc.set_figure_params(dpi=100, frameon=False, color_map='Reds', facecolor=None)
sc.logging.print_header()
assert(scvi.__version__=='0.16.3')

from gradients import run_expected_jacobian_scanvi, run_integrated_jacobian_scanvi

Global seed set to 0
2022-09-21 16:30:54.226666: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.9.1 pandas==1.4.4 scikit-learn==1.1.2 statsmodels==0.13.2 python-igraph==0.10.1 pynndescent==0.5.7


In [3]:
base_path = '/home/icb/yuge.ji/projects/feature-attribution-sc'  # should be changed to shared dir when I can find one

## scGen

Training notebook: `train_scgen.ipynb`

In [4]:
import os
import scgen

adata = sc.read(f'{base_path}/datasets/scgen_norman19.h5ad')
models = {}
for file in os.listdir(f'{base_path}/models'):
    if 'scgen' in file:
        print('loading', file)
        models['_'.join(file.split('_')[1:])] = scgen.SCGEN.load(f'{base_path}/models/{file}', adata=adata)

loading scgen_norman19_model1_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model1_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloaded                                                
loading scgen_norman19_model1_random
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model1_[0m
         [35mrandom/[0m[95mmodel.pt[0m already downloaded                                                  
loading scgen_norman19_model0_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model0_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloaded                                                
loading scgen_norman19_model4_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model4_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloade

In [5]:
models

{'norman19_model1_shuffled': ,
 'norman19_model1_random': ,
 'norman19_model0_shuffled': ,
 'norman19_model4_shuffled': ,
 'norman19_model4_random': ,
 'norman19_model2_random': ,
 'norman19_model2_shuffled': ,
 'norman19_model0_random.pt': ,
 'norman19_model3_shuffled': ,
 'norman19_model0_random': ,
 'norman19_model3_random': }

## scVI

## scANVI (HLCA)

In [6]:
hlca_path = '/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data/HLCA_core_h5ads/HLCA_v1_integration/HLCA_v1_scANVI_input.h5ad'
adata = sc.read(hlca_path)
adata

AnnData object with n_obs × n_vars = 587218 × 2000
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI', 'subject_ID', 'subject_ID_as_published', 'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'ethnicity', 'mixed_ethnicity', 'smoking_status', 'smoking_history', 'BMI', 'known_lung_disease', 'condition', 'subject_type', 'cause_of_death', 'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed', 'tissue_dissociation_protocol', 'cells_or_nuclei', 'single_cell_platform', "3'_or_5'", 'enrichment', 'sequencing_platform', 'reference_genome_coarse', 'ensembl_release_reference_genome', 'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured', 'cell_viability_%', 'comments', 'Processing_site', 'dataset', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'anatomical_region_highest_res', 'age', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_ne

In [7]:
model = scvi.model.SCANVI.load('/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/scanvi_model/', adata)
model

[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/sca[0m
         [35mnvi_model/[0m[95mmodel.pt[0m already downloaded                                               






In [8]:
# parse marker gene dictionary from csv for ground truth
marker_df = pd.read_csv('/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/markergenes.csv', index_col=0)

marker_dict = {}
for i in range(0, marker_df.shape[1], 3):
    ct = marker_df.columns[i].split('_')[0]
    l = list(marker_df[
        marker_df[f'{ct}_marker_for'].isin([ct, f'{ct} (poss. lowly expressed, non-unique)'])
    ][f'{ct}_marker'].values)
    # some markers are skipped due to uncertainty in annotation
    if len(l) > 0:
        marker_dict[ct] = l
marker_dict

{'AT0': ['SFTPB', 'SCGB3A2', 'SFTA2'],
 'AT1': ['CLIC5', 'SPOCK2', 'TIMP3'],
 'AT2 proliferating': ['CENPW', 'CDKN3', 'BIRC5'],
 'AT2': ['MFSD2A', 'C8orf4', 'C11orf96'],
 'Adventitial fibroblasts': ['MFAP5', 'SCARA5', 'PI16'],
 'Alveolar Mph CCL3+': ['CCL20', 'FAM89A'],
 'Alveolar Mph MT-positive': ['MT1M', 'CCL18', 'MT1E'],
 'Alveolar Mph proliferating': ['TYMS', 'CENPW', 'MND1'],
 'Alveolar fibroblasts': ['SPINT2', 'LIMCH1', 'FGFR4'],
 'Alveolar macrophages': ['CYP27A1', 'MARCO', 'FABP4'],
 'B cells': ['MS4A1'],
 'Basal resting': ['KRT15', 'KRT17', 'TP63'],
 'CD8 T cells': ['CD8A', 'CD8B', 'TRGC2'],
 'Classical monocytes': ['S100A12', 'FCN1', 'RNASE2'],
 'Club (nasal)': ['SERPINB3', 'TCN1', 'ASRGL1'],
 'Club (non-nasal)': ['CYP2F1', 'SCGB3A1', 'BPIFB1'],
 'DC1': ['XCR1', 'CLEC9A', 'CLNK'],
 'DC2': ['PKIB', 'CLEC10A', 'CD1E'],
 'Deuterosomal': ['CDC20B', 'KDELC2', 'E2F7'],
 'EC aerocyte capillary': ['SOSTDC1'],
 'EC arterial': ['DKK2', 'IGFBP3'],
 'EC general capillary': ['IL7R', 'FCN

In [9]:
#scdl = model._make_data_loader(adata=adata, shuffle=True, batch_size=100)

In [10]:
#inpt_dict = batch_to_dict_scanvi(next(scdl.__iter__()))
#inpt_dict["x"].requires_grad = True

In [11]:
#expected_jacobian(model.module.classify, inpt_dict, "x", prime_inpt)

In [12]:
adata.obs["dataset"].value_counts()

Banovich_Kropski_2020        122319
Barbry_Leroy_2020             74728
Nawijn_2021                   70591
Misharin_2021                 65326
Krasnow_2020                  60990
Misharin_Budinger_2018        41265
Meyer_2019                    35674
Jain_Misharin_2021_10Xv2      33144
Seibold_2020_10Xv3            21466
Lafyatis_Rojas_2019_10Xv2     21271
Teichmann_Meyer_2019          12971
Jain_Misharin_2021_10Xv1      12422
Seibold_2020_10Xv2            12127
Lafyatis_Rojas_2019_10Xv1      2924
Name: dataset, dtype: int64

In [13]:
indices = np.where(adata.obs["dataset"] == "Meyer_2019")[0]

In [14]:
dl_base = model._make_data_loader(adata=adata, shuffle=False, indices=indices, batch_size=1000)
dl_prime = model._make_data_loader(adata=adata, shuffle=True, indices=indices, batch_size=1000)

In [24]:
avg_abs_exp_jac = None
avg_exp_jac = None

In [25]:
# sum abs values
for ds in adata.obs["dataset"].unique():
    print(ds)
    
    indices = np.where(adata.obs["dataset"] == ds)[0]
    batch_size = min(2000, int(len(indices) / 11))
    print(batch_size)
    
    dl_base = model._make_data_loader(adata=adata, shuffle=False, indices=indices, batch_size=batch_size)
    dl_prime = model._make_data_loader(adata=adata, shuffle=True, indices=indices, batch_size=batch_size)
    
    exp_jac_ds = run_expected_jacobian_scanvi(model.module.classify, dl_base, dl_prime, apply_abs=True, sum_obs=True)
    if avg_abs_exp_jac is None:
        avg_abs_exp_jac = exp_jac_ds
    else:
        avg_abs_exp_jac += exp_jac_ds
        
# just sum
for ds in adata.obs["dataset"].unique():
    print(ds)
    
    indices = np.where(adata.obs["dataset"] == ds)[0]
    batch_size = min(2000, int(len(indices) / 11))
    print(batch_size)
    
    dl_base = model._make_data_loader(adata=adata, shuffle=False, indices=indices, batch_size=batch_size)
    dl_prime = model._make_data_loader(adata=adata, shuffle=True, indices=indices, batch_size=batch_size)
    
    exp_jac_ds = run_expected_jacobian_scanvi(model.module.classify, dl_base, dl_prime, apply_abs=False, sum_obs=True)
    if avg_exp_jac is None:
        avg_exp_jac = exp_jac_ds
    else:
        avg_exp_jac += exp_jac_ds

Teichmann_Meyer_2019
1179
Misharin_Budinger_2018
2000
Krasnow_2020
2000
Seibold_2020_10Xv3
1951
Banovich_Kropski_2020
2000
Nawijn_2021
2000
Misharin_2021
2000
Seibold_2020_10Xv2
1102
Barbry_Leroy_2020
2000
Lafyatis_Rojas_2019_10Xv1
265
Lafyatis_Rojas_2019_10Xv2
1933
Meyer_2019
2000
Jain_Misharin_2021_10Xv2
2000
Jain_Misharin_2021_10Xv1
1129
Teichmann_Meyer_2019
1179
Misharin_Budinger_2018
2000
Krasnow_2020
2000
Seibold_2020_10Xv3
1951
Banovich_Kropski_2020
2000
Nawijn_2021
2000
Misharin_2021
2000
Seibold_2020_10Xv2
1102
Barbry_Leroy_2020
2000
Lafyatis_Rojas_2019_10Xv1
265
Lafyatis_Rojas_2019_10Xv2
1933
Meyer_2019
2000
Jain_Misharin_2021_10Xv2
2000
Jain_Misharin_2021_10Xv1
1129


In [34]:
avg_abs_integr_jac = None
avg_integr_jac = None

In [35]:
# sum abs values
for ds in adata.obs["dataset"].unique():
    print(ds)
    
    indices = np.where(adata.obs["dataset"] == ds)[0]
    dl_base = model._make_data_loader(adata=adata, shuffle=False, indices=indices, batch_size=2000)
    
    integr_jac_ds = run_integrated_jacobian_scanvi(model.module.classify, dl_base, apply_abs=True, sum_obs=True)
    if avg_abs_integr_jac is None:
        avg_abs_integr_jac = integr_jac_ds
    else:
        avg_abs_integr_jac += integr_jac_ds
        
# just sum
for ds in adata.obs["dataset"].unique():
    print(ds)
    
    indices = np.where(adata.obs["dataset"] == ds)[0]
    dl_base = model._make_data_loader(adata=adata, shuffle=False, indices=indices, batch_size=2000)
    
    integr_jac_ds = run_integrated_jacobian_scanvi(model.module.classify, dl_base, apply_abs=False, sum_obs=True)
    if avg_integr_jac is None:
        avg_integr_jac = integr_jac_ds
    else:
        avg_integr_jac += integr_jac_ds

Teichmann_Meyer_2019
Misharin_Budinger_2018
Krasnow_2020
Seibold_2020_10Xv3
Banovich_Kropski_2020
Nawijn_2021
Misharin_2021
Seibold_2020_10Xv2
Barbry_Leroy_2020
Lafyatis_Rojas_2019_10Xv1
Lafyatis_Rojas_2019_10Xv2
Meyer_2019
Jain_Misharin_2021_10Xv2
Jain_Misharin_2021_10Xv1
Teichmann_Meyer_2019
Misharin_Budinger_2018
Krasnow_2020
Seibold_2020_10Xv3
Banovich_Kropski_2020
Nawijn_2021
Misharin_2021
Seibold_2020_10Xv2
Barbry_Leroy_2020
Lafyatis_Rojas_2019_10Xv1
Lafyatis_Rojas_2019_10Xv2
Meyer_2019
Jain_Misharin_2021_10Xv2
Jain_Misharin_2021_10Xv1


In [47]:
from scvi import REGISTRY_KEYS
labels = model.adata_manager.get_state_registry(REGISTRY_KEYS.LABELS_KEY)['categorical_mapping']

In [49]:
pd.DataFrame(avg_abs_exp_jac.numpy(), columns=labels[:-1], index=adata.var_names.tolist()).to_csv("absolute_sum_expected_grads.csv", index_label="Gene")

In [50]:
pd.DataFrame(avg_exp_jac.numpy(), columns=labels[:-1], index=adata.var_names.tolist()).to_csv("sum_expected_grads.csv", index_label="Gene")

In [51]:
pd.DataFrame(avg_abs_integr_jac.numpy(), columns=labels[:-1], index=adata.var_names.tolist()).to_csv("absolute_sum_integrated_grads.csv", index_label="Gene")

In [52]:
pd.DataFrame(avg_integr_jac.numpy(), columns=labels[:-1], index=adata.var_names.tolist()).to_csv("sum_integrated_grads.csv", index_label="Gene")