This notebook is primarily to make sure your environment installed properly. If anything in here does not run, certainly nothing else will.

In [1]:
import torch
import scvi
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

sc.set_figure_params(dpi=100, frameon=False, color_map='Reds', facecolor=None)
sc.logging.print_header()
assert(scvi.__version__=='0.16.3')

Global seed set to 0
  IPython.display.set_matplotlib_formats(*ipython_format)


scanpy==1.9.1 anndata==0.7.6 umap==0.5.3 numpy==1.19.4 scipy==1.6.2 pandas==1.3.3 scikit-learn==0.23.2 statsmodels==0.11.1 pynndescent==0.5.5




In [2]:
base_path = '/home/icb/yuge.ji/projects/feature-attribution-sc'  # should be changed to shared dir when I can find one

## scGen

Training notebook: `train_scgen.ipynb`

In [3]:
import os
import scgen

adata = sc.read(f'{base_path}/datasets/scgen_norman19.h5ad')
models = {}
for file in os.listdir(f'{base_path}/models'):
    if 'scgen' in file:
        print('loading', file)
        models['_'.join(file.split('_')[1:])] = scgen.SCGEN.load(f'{base_path}/models/{file}', adata=adata)

loading scgen_norman19_model1_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model1_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloaded                                                
loading scgen_norman19_model1_random
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model1_[0m
         [35mrandom/[0m[95mmodel.pt[0m already downloaded                                                  
loading scgen_norman19_model0_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model0_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloaded                                                
loading scgen_norman19_model4_shuffled
[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/feature-attribution-sc/models/scgen_norman19_model4_[0m
         [35mshuffled/[0m[95mmodel.pt[0m already downloade

In [4]:
models

{'norman19_model1_shuffled': ,
 'norman19_model1_random': ,
 'norman19_model0_shuffled': ,
 'norman19_model4_shuffled': ,
 'norman19_model4_random': ,
 'norman19_model2_random': ,
 'norman19_model2_shuffled': ,
 'norman19_model0_random.pt': ,
 'norman19_model3_shuffled': ,
 'norman19_model0_random': ,
 'norman19_model3_random': }

## scVI

## scANVI (HLCA)

In [7]:
hlca_path = '/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data/HLCA_core_h5ads/HLCA_v1_integration/HLCA_v1_scANVI_input.h5ad'
adata = sc.read(hlca_path)
adata

AnnData object with n_obs × n_vars = 587218 × 2000
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI', 'subject_ID', 'subject_ID_as_published', 'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'ethnicity', 'mixed_ethnicity', 'smoking_status', 'smoking_history', 'BMI', 'known_lung_disease', 'condition', 'subject_type', 'cause_of_death', 'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed', 'tissue_dissociation_protocol', 'cells_or_nuclei', 'single_cell_platform', "3'_or_5'", 'enrichment', 'sequencing_platform', 'reference_genome_coarse', 'ensembl_release_reference_genome', 'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured', 'cell_viability_%', 'comments', 'Processing_site', 'dataset', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'anatomical_region_highest_res', 'age', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_ne

In [8]:
model = scvi.model.SCANVI.load('/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/scanvi_model/', adata)
model

[34mINFO    [0m File [35m/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/sca[0m
         [35mnvi_model/[0m[95mmodel.pt[0m already downloaded                                               




In [9]:
# parse marker gene dictionary from csv for ground truth
marker_df = pd.read_csv('/home/icb/yuge.ji/projects/HLCA_reproducibility/notebooks/3_atlas_extension/markergenes.csv', index_col=0)

marker_dict = {}
for i in range(0, marker_df.shape[1], 3):
    ct = marker_df.columns[i].split('_')[0]
    l = list(marker_df[
        marker_df[f'{ct}_marker_for'].isin([ct, f'{ct} (poss. lowly expressed, non-unique)'])
    ][f'{ct}_marker'].values)
    # some markers are skipped due to uncertainty in annotation
    if len(l) > 0:
        marker_dict[ct] = l
marker_dict

{'AT0': ['SFTPB', 'SCGB3A2', 'SFTA2'],
 'AT1': ['CLIC5', 'SPOCK2', 'TIMP3'],
 'AT2 proliferating': ['CENPW', 'CDKN3', 'BIRC5'],
 'AT2': ['MFSD2A', 'C8orf4', 'C11orf96'],
 'Adventitial fibroblasts': ['MFAP5', 'SCARA5', 'PI16'],
 'Alveolar Mph CCL3+': ['CCL20', 'FAM89A'],
 'Alveolar Mph MT-positive': ['MT1M', 'CCL18', 'MT1E'],
 'Alveolar Mph proliferating': ['TYMS', 'CENPW', 'MND1'],
 'Alveolar fibroblasts': ['SPINT2', 'LIMCH1', 'FGFR4'],
 'Alveolar macrophages': ['CYP27A1', 'MARCO', 'FABP4'],
 'B cells': ['MS4A1'],
 'Basal resting': ['KRT15', 'KRT17', 'TP63'],
 'CD8 T cells': ['CD8A', 'CD8B', 'TRGC2'],
 'Classical monocytes': ['S100A12', 'FCN1', 'RNASE2'],
 'Club (nasal)': ['SERPINB3', 'TCN1', 'ASRGL1'],
 'Club (non-nasal)': ['CYP2F1', 'SCGB3A1', 'BPIFB1'],
 'DC1': ['XCR1', 'CLEC9A', 'CLNK'],
 'DC2': ['PKIB', 'CLEC10A', 'CD1E'],
 'Deuterosomal': ['CDC20B', 'KDELC2', 'E2F7'],
 'EC aerocyte capillary': ['SOSTDC1'],
 'EC arterial': ['DKK2', 'IGFBP3'],
 'EC general capillary': ['IL7R', 'FCN