In [1]:
import scanpy as sc

# Dataset: downloaded from the Drive link provided in the scVIDR GitHub repository.
# The path is relative, so it is resolved against the kernel's working directory.
data = "../data/scvidr/nault2021_multiDose.h5ad"
adata = sc.read_h5ad(data)  # load the .h5ad file into `adata`

In [2]:
# Display the summary repr of the loaded object (dimensions and annotation columns).
adata

AnnData object with n_obs × n_vars = 131613 × 22213
    obs: 'Dose', 'batch', 'celltype'

In [4]:
# Quality-control filtering (these calls modify `adata` in place):
# drop cells with fewer than 500 total counts or fewer than 720 detected genes,
# then drop genes detected in fewer than 100 cells.
# NOTE: because the filtering is in place, re-running this cell filters the
# already-filtered object again; reload the data first for a clean re-run.
sc.pp.filter_cells(adata, min_counts=500)
sc.pp.filter_cells(adata, min_genes=720)
sc.pp.filter_genes(adata, min_cells=100)

# Normalise total counts per cell. The CPA sciplex3 preprocessing notebook
# (https://github.com/facebookresearch/CPA/blob/main/preprocessing/sciplex3.ipynb)
# uses a "per cell" normalisation instead; that variant is deprecated, so
# normalize_total is used here.
sc.pp.normalize_total(adata)

sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=5000)
# Keep only the highly variable genes. `.copy()` turns the sliced view into a
# standalone AnnData, so later cells do not operate on a view that still
# references the full matrix (and do not trigger implicit-copy warnings on write).
adata = adata[:, adata.var.highly_variable].copy()

In [3]:
# Twenty largest (cell type, dose) groups by number of cells.
# `observed=False` is passed explicitly: it is the behaviour this cell already
# relied on, and spelling it out avoids pandas' FutureWarning about the
# changing default for categorical groupers.
(
    adata.obs
    .groupby(["celltype", "Dose"], observed=False)
    .size()
    .sort_values(ascending=False)
    .head(20)
)

  adata.obs.groupby(['celltype', 'Dose']).size().sort_values(ascending=False).head(20)


celltype               Dose 
Hepatocytes - portal   0.10     9094
                       0.30     8953
                       0.00     7512
                       10.00    6914
                       1.00     6600
                       0.03     6286
Macrophage             30.00    6072
Hepatocytes - portal   3.00     5880
                       0.01     4971
Hepatocytes - central  10.00    3731
                       0.10     2907
Endothelial Cells      30.00    2852
Hepatocytes - central  1.00     2610
                       0.00     2492
                       3.00     2431
Macrophage             10.00    2322
Hepatocytes - central  0.30     2309
Endothelial Cells      1.00     2150
                       0.10     2017
                       0.30     1930
dtype: int64

In [6]:
# Restrict to the control cells (Dose == 0) plus one chosen dose level, then
# count cells per (cell type, dose).
dosage = 30
single_dosage = adata[adata.obs["Dose"].isin([0, dosage])]
# `observed=False` keeps the existing grouping behaviour and avoids pandas'
# FutureWarning about the changing default for categorical groupers.
single_dosage.obs.groupby(["celltype", "Dose"], observed=False).size()

  single_dosage.obs.groupby(['celltype', 'Dose']).size()


celltype               Dose
B Cells                0.0      208
                       30.0    1565
Cholangiocytes         0.0      109
                       30.0     600
Endothelial Cells      0.0     1495
                       30.0    2852
Hepatocytes - central  0.0     2492
                       30.0    1325
Hepatocytes - portal   0.0     7512
                       30.0    1074
Macrophage             0.0     1508
                       30.0    6072
Neutrophils            0.0       61
                       30.0     614
Portal Fibroblasts     0.0       90
                       30.0      76
Stellate Cells         0.0      849
                       30.0     391
Subtype 1              0.0       67
                       30.0     280
T Cells                0.0      242
                       30.0    1445
dtype: int64

In [7]:
# Split the cells into the Dose == 0 (control) group and the Dose == 30 group.
control, perturb = (adata[adata.obs["Dose"] == level] for level in (0, 30))

(View of AnnData object with n_obs × n_vars = 14633 × 22213
     obs: 'Dose', 'batch', 'celltype',
 View of AnnData object with n_obs × n_vars = 16294 × 22213
     obs: 'Dose', 'batch', 'celltype')

In [8]:
# Cells per (cell type, dose) within the control subset.
# `observed=False` keeps the existing grouping behaviour and avoids pandas'
# FutureWarning about the changing default for categorical groupers.
control.obs.groupby(["celltype", "Dose"], observed=False).size()

  control.obs.groupby(['celltype', 'Dose']).size(


celltype               Dose
B Cells                0.0      208
Cholangiocytes         0.0      109
Endothelial Cells      0.0     1495
Hepatocytes - central  0.0     2492
Hepatocytes - portal   0.0     7512
Macrophage             0.0     1508
Neutrophils            0.0       61
Portal Fibroblasts     0.0       90
Stellate Cells         0.0      849
Subtype 1              0.0       67
T Cells                0.0      242
dtype: int64

In [9]:
# Cells per (cell type, dose) within the perturbed subset.
# `observed=False` keeps the existing grouping behaviour and avoids pandas'
# FutureWarning about the changing default for categorical groupers.
perturb.obs.groupby(["celltype", "Dose"], observed=False).size()

  perturb.obs.groupby(['celltype', 'Dose']).size(


celltype               Dose
B Cells                30.0    1565
Cholangiocytes         30.0     600
Endothelial Cells      30.0    2852
Hepatocytes - central  30.0    1325
Hepatocytes - portal   30.0    1074
Macrophage             30.0    6072
Neutrophils            30.0     614
Portal Fibroblasts     30.0      76
Stellate Cells         30.0     391
Subtype 1              30.0     280
T Cells                30.0    1445
dtype: int64

In [7]:
# List the distinct dose levels present in the dataset.
adata.obs["Dose"].unique()

array([0.e+00, 1.e-02, 3.e-02, 1.e-01, 3.e-01, 1.e+00, 3.e+00, 1.e+01,
       3.e+01])

In [25]:
control = adata[adata.obs["Dose"] == 0]

# Sanity check: the control group must contain every cell type that occurs in
# the dataset. (The previous assertion compared the control cell types with
# themselves and therefore could never fail.)
assert set(control.obs["celltype"].unique()) == set(adata.obs["celltype"].unique()), (
    "control (Dose == 0) is missing cell types that occur elsewhere in the dataset"
)

In [5]:
# For every dose level, print the shape of that dose's subset, the number of
# distinct cell types in it, and the number of cells of each cell type.
# NOTE(review): this loop rebinds the notebook-level name `perturb` (another
# cell assigns it to the Dose == 30 subset); after the loop it holds the subset
# for the last dose visited — confirm no other cell relies on the earlier value.
for drug_dose in adata.obs["Dose"].unique():
    perturb = adata[adata.obs["Dose"] == drug_dose]
    print(drug_dose, perturb.shape)
    
    print("Num cell types", len(perturb.obs["celltype"].unique()))
    # Cell types are visited in sorted order so the listing is stable across doses.
    for cell_type in sorted(perturb.obs["celltype"].unique()):
        perturb_cell_type = perturb[perturb.obs["celltype"] == cell_type]
        print(cell_type, perturb_cell_type.shape[0])
    print()  # blank line between dose levels

0.0 (14245, 5000)
Num cell types 11
B Cells 120
Cholangiocytes 101
Endothelial Cells 1445
Hepatocytes - central 2486
Hepatocytes - portal 7492
Macrophage 1411
Neutrophils 42
Portal Fibroblasts 84
Stellate Cells 826
Subtype 1 60
T Cells 178

0.01 (8524, 5000)
Num cell types 11
B Cells 88
Cholangiocytes 84
Endothelial Cells 812
Hepatocytes - central 1482
Hepatocytes - portal 4955
Macrophage 284
Neutrophils 7
Portal Fibroblasts 42
Stellate Cells 608
Subtype 1 27
T Cells 135

0.03 (11741, 5000)
Num cell types 11
B Cells 185
Cholangiocytes 89
Endothelial Cells 1494
Hepatocytes - central 1775
Hepatocytes - portal 6271
Macrophage 587
Neutrophils 38
Portal Fibroblasts 89
Stellate Cells 940
Subtype 1 34
T Cells 239

0.1 (17199, 5000)
Num cell types 11
B Cells 232
Cholangiocytes 131
Endothelial Cells 1991
Hepatocytes - central 2902
Hepatocytes - portal 9056
Macrophage 1329
Neutrophils 34
Portal Fibroblasts 90
Stellate Cells 1037
Subtype 1 51
T Cells 346

0.3 (15667, 5000)
Num cell types 11
B Cel

In [6]:
# Display the repr of the data matrix stored in `adata.X`.
adata.X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 246137687 stored elements and shape (131613, 22213)>