**Requires**
* `'lincs_full_smiles.h5ad'`
* `'sciplex_raw_chunk_{i}.h5ad'` with $i \in \{0,1,2,3,4\}$

**Output**
* `'sciplex3_matched_genes_lincs.h5ad'`
* Only with genes that are shared with `lincs`: `'sciplex3_lincs_genes.h5ad'`
* Only with genes that are shared with `sciplex`: `'lincs_full_smiles_sciplex_genes.h5ad'`

## Imports

In [1]:
import os 
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import sfaira

sc.set_figure_params(dpi=80, frameon=False)
sc.logging.print_header()
os.getcwd()

from compert.paths import DATA_DIR, PROJECT_DIR

pd.set_option('display.max_columns', 100)

scanpy==1.9.0.dev41+g58f4904c anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2


In [2]:
%load_ext autoreload
%autoreload 2

## Load data

Load lincs

In [3]:
adata_lincs = sc.read(PROJECT_DIR/'datasets'/'lincs_full_smiles.h5ad' )

Load trapnell

In [4]:
adatas = []
for i in range(5):
    adatas.append(sc.read(PROJECT_DIR/'datasets'/f'sciplex_raw_chunk_{i}.h5ad'))
adata = adatas[0].concatenate(adatas[1:])

Add gene_id to trapnell

In [5]:
adata.var['gene_id'] = adata.var.id.str.split('.').str[0]

### Get gene ids from symbols via sfaira

Load genome container with sfaira

In [6]:
genome_container = sfaira.versions.genomes.GenomeContainer(organism="homo_sapiens", release="82")

Extend symbols dict with unknown symbol

In [7]:
symbols_dict = genome_container.symbol_to_id_dict
symbols_dict.update({'PLSCR3':'ENSG00000187838'})

Identify genes that are shared between lincs and trapnell

In [8]:
# For lincs
adata_lincs.var['gene_id'] = adata_lincs.var_names.map(symbols_dict)
adata_lincs.var['in_sciplex'] = adata_lincs.var.gene_id.isin(adata.var.gene_id)

In [9]:
# For trapnell
adata.var['in_lincs'] = adata.var.gene_id.isin(adata_lincs.var.gene_id)

## Preprocess sciplex dataset

See `sciplex3.ipynb`

The original CPA implementation required to subset the data due to scaling limitations.   
In this version we expect to be able to handle the full sciplex dataset.

In [10]:
SUBSET = False

if SUBSET: 
    sc.pp.subsample(adata, fraction=0.5)

In [11]:
sc.pp.normalize_per_cell(adata)

In [12]:
sc.pp.log1p(adata)

In [13]:
sc.pp.highly_variable_genes(adata, n_top_genes=1032, subset=False)

### Combine HVG with lincs genes

Union of genes that are considered highly variable and those that are shared with lincs

In [14]:
((adata.var.in_lincs) | (adata.var.highly_variable)).sum()

2000

Subset to that union of genes

In [15]:
adata = adata[:, (adata.var.in_lincs) | (adata.var.highly_variable)].copy()

### Create additional meta data 

Normalise dose values

In [16]:
adata.obs['dose_val'] = adata.obs.dose.astype(float) / np.max(adata.obs.dose.astype(float))
adata.obs.loc[adata.obs['product_name'].str.contains('Vehicle'), 'dose_val'] = 1.0

In [17]:
adata.obs['dose_val'].value_counts()

0.001    153013
0.010    147670
0.100    141828
1.000    139266
Name: dose_val, dtype: int64

Change `product_name`

In [18]:
adata.obs['product_name'] = [x.split(' ')[0] for x in adata.obs['product_name']]
adata.obs.loc[adata.obs['product_name'].str.contains('Vehicle'), 'product_name'] = 'control'

Create copy of `product_name` with column name `control`

In [19]:
adata.obs['condition'] = adata.obs.product_name.copy()

Add combinations of drug (`condition`), dose (`dose_val`), and cell_type (`cell_type`)

In [20]:
adata.obs['drug_dose_name'] = adata.obs.condition.astype(str) + '_' + adata.obs.dose_val.astype(str)
adata.obs['cov_drug_dose_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.drug_dose_name.astype(str)
adata.obs['cov_drug'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str)

Add `control` columns with vale `1` where only the vehicle was used

In [21]:
adata.obs['control'] = [1 if x == 'control_1.0' else 0 for x in adata.obs.drug_dose_name.values]

## Compute DE genes

In [22]:
from compert.helper import rank_genes_groups_by_cov
rank_genes_groups_by_cov(adata, groupby='cov_drug', covariate='cell_type', control_group='control', key_added='all_DEGs')

Using backend: pytorch


A549


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


MCF7


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


K562


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


In [23]:
adata_subset = adata[:, adata.var.in_lincs].copy()
rank_genes_groups_by_cov(adata_subset, groupby='cov_drug', covariate='cell_type', control_group='control', key_added='lincs_DEGs')
adata.uns['lincs_DEGs'] = adata_subset.uns['lincs_DEGs']

A549


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


MCF7


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


K562


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'pathway' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'product_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'target' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug' as categorical


### Map all unique `cov_drug_dose_name` to the computed DEGs, independent of the dose value

Create mapping between names with dose and without dose

In [24]:
cov_drug_dose_unique = adata.obs.cov_drug_dose_name.unique()

In [25]:
remove_dose = lambda s: '_'.join(s.split('_')[:-1])
cov_drug = pd.Series(cov_drug_dose_unique).apply(remove_dose)
dose_no_dose_dict = dict(zip(cov_drug_dose_unique, cov_drug))

### Compute new dicts for DEGs

In [26]:
uns_keys = ['all_DEGs', 'lincs_DEGs']

In [27]:
for uns_key in uns_keys:
    new_DEGs_dict = {}

    df_DEGs = pd.Series(adata.uns[uns_key])

    for key, value in dose_no_dose_dict.items():
        if 'control' in key:
            continue
        new_DEGs_dict[key] = df_DEGs.loc[value]
    adata.uns[uns_key] = new_DEGs_dict

In [28]:
adata

AnnData object with n_obs × n_vars = 581777 × 2000
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'all_DEGs', 'lincs_DEGs'

## Create sciplex splits

This is not the right configuration fot the experiments we want but for the moment this is okay

### OOD in Pathways

In [29]:
adata.obs['split_ho_pathway'] = 'train'  # reset

ho_drugs = [
    # selection of drugs from various pathways
    "Azacitidine",
    "Carmofur",
    "Pracinostat",
    "Cediranib",
    "Luminespib",
    "Crizotinib",
    "SNS-314",
    "Obatoclax",
    "Momelotinib",
    "AG-14361",
    "Entacapone",
    "Fulvestrant",
    "Mesna",
    "Zileuton",
    "Enzastaurin",
    "IOX2",
    "Alvespimycin",
    "XAV-939",
    "Fasudil",
]

ho_drug_pathway = adata.obs['condition'].isin(ho_drugs)
adata.obs.loc[ho_drug_pathway, 'pathway_level_1'].value_counts()

DNA damage & DNA repair                  6640
Epigenetic regulation                    6093
Tyrosine kinase signaling                5846
Protein folding & Protein degradation    3863
Neuronal signaling                       3635
Antioxidant                              3616
HIF signaling                            3501
Metabolic regulation                     3470
Focal adhesion signaling                 3450
Nuclear receptor signaling               3420
JAK/STAT signaling                       3155
Apoptotic regulation                     3141
TGF/BMP signaling                        2794
PKC signaling                            2778
Cell cycle regulation                    2237
Other                                       0
Vehicle                                     0
Name: pathway_level_1, dtype: int64

In [30]:
ho_drug_pathway.sum()

57639

In [31]:
adata.obs.loc[ho_drug_pathway & (adata.obs['dose_val'] == 1.0), 'split_ho_pathway'] = 'ood'

test_idx = sc.pp.subsample(adata[adata.obs['split_ho_pathway'] != 'ood'], .15, copy=True).obs.index
adata.obs.loc[test_idx, 'split_ho_pathway'] = 'test'

In [32]:
pd.crosstab(adata.obs.pathway_level_1, adata.obs['condition'][adata.obs.condition.isin(ho_drugs)])

condition,AG-14361,Alvespimycin,Azacitidine,Carmofur,Cediranib,Crizotinib,Entacapone,Enzastaurin,Fasudil,Fulvestrant,IOX2,Luminespib,Mesna,Momelotinib,Obatoclax,Pracinostat,SNS-314,XAV-939,Zileuton
pathway_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Antioxidant,0,0,0,0,0,0,0,0,0,0,0,0,3616,0,0,0,0,0,0
Apoptotic regulation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3141,0,0,0,0
Cell cycle regulation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2237,0,0
DNA damage & DNA repair,3401,0,0,3239,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Epigenetic regulation,0,0,3151,0,0,0,0,0,0,0,0,0,0,0,0,2942,0,0,0
Focal adhesion signaling,0,0,0,0,0,0,0,0,3450,0,0,0,0,0,0,0,0,0,0
HIF signaling,0,0,0,0,0,0,0,0,0,0,3501,0,0,0,0,0,0,0,0
JAK/STAT signaling,0,0,0,0,0,0,0,0,0,0,0,0,0,3155,0,0,0,0,0
Metabolic regulation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3470
Neuronal signaling,0,0,0,0,0,0,3635,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
adata.obs['split_ho_pathway'].value_counts()

train    483951
test      85403
ood       12423
Name: split_ho_pathway, dtype: int64

In [34]:
adata[adata.obs.split_ho_pathway == 'ood'].obs.condition.value_counts()

Fasudil         966
IOX2            913
Mesna           884
Entacapone      868
Fulvestrant     836
Zileuton        822
Carmofur        767
AG-14361        759
Azacitidine     736
Enzastaurin     694
Pracinostat     658
SNS-314         547
Cediranib       528
Momelotinib     487
XAV-939         479
Crizotinib      464
Luminespib      405
Obatoclax       404
Alvespimycin    206
Name: condition, dtype: int64

In [35]:
adata[adata.obs.split_ho_pathway == 'test'].obs.condition.value_counts()

control         1964
ENMD-2076        914
RG108            604
GSK-LSD1         596
Altretamine      573
                ... 
Luminespib       236
Patupilone       228
Flavopiridol     207
Epothilone       181
YM155            112
Name: condition, Length: 188, dtype: int64

### OOD drugs in epigenetic regulation, Tyrosine kinase signaling, cell cycle regulation

In [36]:
adata.obs['pathway_level_1'].value_counts()

Epigenetic regulation                    147875
Tyrosine kinase signaling                 85503
JAK/STAT signaling                        70922
DNA damage & DNA repair                   60042
Cell cycle regulation                     53952
Other                                     19980
Nuclear receptor signaling                19940
Protein folding & Protein degradation     19191
Metabolic regulation                      17989
Neuronal signaling                        14071
Antioxidant                               13414
Apoptotic regulation                      13141
Vehicle                                   13004
HIF signaling                              9279
PKC signaling                              8804
TGF/BMP signaling                          8774
Focal adhesion signaling                   5896
Name: pathway_level_1, dtype: int64

___

#### Tyrosine signaling

In [37]:
adata.obs.loc[adata.obs.pathway_level_1.isin(["Tyrosine kinase signaling"]),'condition'].value_counts()

PD98059                 3763
AG-490                  3533
Motesanib               3363
TGX-221                 3358
Ki8751                  3347
PD173074                3290
Tie2                    3263
AC480                   3242
SL-327                  3199
Glesatinib?(MGCD265)    3183
Linifanib               3165
Nilotinib               3087
Sorafenib               3071
Cediranib               3060
Lapatinib               3038
KW-2449                 3032
Nintedanib              2995
Pelitinib               2926
BMS-536924              2900
Vandetanib              2854
Crizotinib              2786
Regorafenib             2684
BMS-754807              2676
Dasatinib               2538
Trametinib              2474
Bosutinib               2436
Temsirolimus            2233
TAK-901                 2068
Rigosertib              1939
Name: condition, dtype: int64

In [38]:
tyrosine_drugs = adata.obs.loc[adata.obs.pathway_level_1.isin(["Tyrosine kinase signaling"]),'condition'].unique()

In [39]:
adata.obs['split_tyrosine_ood'] = 'train'  

test_idx = sc.pp.subsample(adata[adata.obs.pathway_level_1.isin(["Tyrosine kinase signaling"])], .20, copy=True).obs.index
adata.obs.loc[test_idx, 'split_tyrosine_ood'] = 'test'

adata.obs.loc[adata.obs.condition.isin(["Cediranib", "Crizotinib", "Motesanib", "BMS-754807", "Nintedanib"]), 'split_tyrosine_ood'] = 'ood'  

In [40]:
adata.obs.split_tyrosine_ood.value_counts()

train    552761
ood       14880
test      14136
Name: split_tyrosine_ood, dtype: int64

In [41]:
pd.crosstab(adata.obs.split_tyrosine_ood, adata.obs['condition'][adata.obs.condition.isin(tyrosine_drugs)])

condition,AC480,AG-490,BMS-536924,BMS-754807,Bosutinib,Cediranib,Crizotinib,Dasatinib,Glesatinib?(MGCD265),KW-2449,Ki8751,Lapatinib,Linifanib,Motesanib,Nilotinib,Nintedanib,PD173074,PD98059,Pelitinib,Regorafenib,Rigosertib,SL-327,Sorafenib,TAK-901,TGX-221,Temsirolimus,Tie2,Trametinib,Vandetanib
split_tyrosine_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
ood,0,0,0,2676,0,3060,2786,0,0,0,0,0,0,3363,0,2995,0,0,0,0,0,0,0,0,0,0,0,0,0
test,645,728,582,0,491,0,0,491,656,580,641,603,678,0,639,0,702,723,620,502,377,678,658,419,620,453,647,443,560
train,2597,2805,2318,0,1945,0,0,2047,2527,2452,2706,2435,2487,0,2448,0,2588,3040,2306,2182,1562,2521,2413,1649,2738,1780,2616,2031,2294


In [42]:
pd.crosstab(adata.obs.split_tyrosine_ood, adata.obs.dose_val)

dose_val,0.001,0.010,0.100,1.000
split_tyrosine_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ood,4226,4118,3822,2714
test,3928,3930,3590,2688
train,144859,139622,134416,133864


____

#### Epigenetic regulation

In [43]:
adata.obs.loc[adata.obs.pathway_level_1.isin(["Epigenetic regulation"]),'condition'].value_counts()

RG108           3715
Tubastatin      3710
GSK-LSD1        3688
SRT2104         3687
Tacedinaline    3664
Tazemetostat    3639
BRD4770         3629
Anacardic       3604
GSK             3601
UNC0631         3554
Sirtinol        3541
Valproic        3540
Sodium          3497
MC1568          3416
A-366           3396
Entinostat      3385
Selisistat      3374
UNC1999         3369
Resminostat     3319
UNC0379         3281
EED226          3269
Divalproex      3228
CUDC-101        3209
PFI-1           3207
Droxinostat     3197
M344            3154
Azacitidine     3151
Givinostat      3105
Trichostatin    3083
SRT1720         3070
(+)-JQ1         3037
Belinostat      3025
SRT3025         3010
Resveratrol     2972
Pracinostat     2942
PCI-34051       2941
Abexinostat     2913
AR-42           2901
ITSA-1          2826
TMP195          2578
Panobinostat    2573
Dacinostat      2516
CUDC-907        2417
Decitabine      2357
Quisinostat     2354
Tucidinostat    2253
Mocetinostat    1978
Name: conditi

In [44]:
epigenetic_drugs = adata.obs.loc[adata.obs.pathway_level_1.isin(["Epigenetic regulation"]),'condition'].unique()

In [45]:
adata.obs['split_epigenetic_ood'] = 'train'  

test_idx = sc.pp.subsample(adata[adata.obs.pathway_level_1.isin(["Epigenetic regulation"])], .20, copy=True).obs.index
adata.obs.loc[test_idx, 'split_epigenetic_ood'] = 'test'

adata.obs.loc[adata.obs.condition.isin(["Azacitidine", "Pracinostat", "Trichostatin", "Quisinostat", "Tazemetostat"]), 'split_epigenetic_ood'] = 'ood'  

In [46]:
adata.obs.split_epigenetic_ood.value_counts()

train    540070
test      26538
ood       15169
Name: split_epigenetic_ood, dtype: int64

In [47]:
pd.crosstab(adata.obs.split_epigenetic_ood, adata.obs['condition'][adata.obs.condition.isin(epigenetic_drugs)])

condition,(+)-JQ1,A-366,AR-42,Abexinostat,Anacardic,Azacitidine,BRD4770,Belinostat,CUDC-101,CUDC-907,Dacinostat,Decitabine,Divalproex,Droxinostat,EED226,Entinostat,GSK,GSK-LSD1,Givinostat,ITSA-1,M344,MC1568,Mocetinostat,PCI-34051,PFI-1,Panobinostat,Pracinostat,Quisinostat,RG108,Resminostat,Resveratrol,SRT1720,SRT2104,SRT3025,Selisistat,Sirtinol,Sodium,TMP195,Tacedinaline,Tazemetostat,Trichostatin,Tubastatin,Tucidinostat,UNC0379,UNC0631,UNC1999,Valproic
split_epigenetic_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
ood,0,0,0,0,0,3151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2942,2354,0,0,0,0,0,0,0,0,0,0,0,3639,3083,0,0,0,0,0,0
test,625,645,623,582,728,0,743,581,661,519,518,491,647,652,645,716,690,686,631,544,611,655,385,591,618,517,0,0,701,649,655,583,779,605,690,669,710,511,747,0,0,718,453,686,664,686,728
train,2412,2751,2278,2331,2876,0,2886,2444,2548,1898,1998,1866,2581,2545,2624,2669,2911,3002,2474,2282,2543,2761,1593,2350,2589,2056,0,0,3014,2670,2317,2487,2908,2405,2684,2872,2787,2067,2917,0,0,2992,1800,2595,2890,2683,2812


In [48]:
pd.crosstab(adata.obs.split_tyrosine_ood, adata.obs.dose_val)

dose_val,0.001,0.010,0.100,1.000
split_tyrosine_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ood,4226,4118,3822,2714
test,3928,3930,3590,2688
train,144859,139622,134416,133864


__________

#### Cell cycle regulation

In [49]:
adata.obs.loc[adata.obs.pathway_level_1.isin(["Cell cycle regulation"]),'condition'].value_counts()

ENMD-2076       5757
BMS-265246      3274
Roscovitine     3254
Aurora          3036
MK-5108         3006
JNJ-7706621     2988
CYC116          2951
ZM              2716
AMG-900         2710
GSK1070916      2502
Barasertib      2421
Danusertib      2396
MLN8054         2344
SNS-314         2237
PHA-680632      2181
Alisertib       2101
Tozasertib      2020
Hesperadin      1949
Patupilone      1481
Flavopiridol    1407
Epothilone      1221
Name: condition, dtype: int64

In [50]:
cell_cycle_drugs = adata.obs.loc[adata.obs.pathway_level_1.isin(["Cell cycle regulation"]),'condition'].unique()

In [51]:
adata.obs['split_cellcycle_ood'] = 'train'  

test_idx = sc.pp.subsample(adata[adata.obs.pathway_level_1.isin(["Cell cycle regulation"])], .20, copy=True).obs.index
adata.obs.loc[test_idx, 'split_cellcycle_ood'] = 'test'

adata.obs.loc[adata.obs.condition.isin(["SNS-314", "Flavopiridol", "Roscovitine"]), 'split_cellcycle_ood'] = 'ood'  

In [52]:
adata.obs.split_cellcycle_ood.value_counts()

train    565503
test       9376
ood        6898
Name: split_cellcycle_ood, dtype: int64

In [53]:
pd.crosstab(adata.obs.split_cellcycle_ood, adata.obs['condition'][adata.obs.condition.isin(cell_cycle_drugs)])

condition,AMG-900,Alisertib,Aurora,BMS-265246,Barasertib,CYC116,Danusertib,ENMD-2076,Epothilone,Flavopiridol,GSK1070916,Hesperadin,JNJ-7706621,MK-5108,MLN8054,PHA-680632,Patupilone,Roscovitine,SNS-314,Tozasertib,ZM
split_cellcycle_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ood,0,0,0,0,0,0,0,0,0,1407,0,0,0,0,0,0,0,3254,2237,0,0
test,545,428,616,679,463,570,469,1140,230,0,512,356,590,590,478,450,290,0,0,424,546
train,2165,1673,2420,2595,1958,2381,1927,4617,991,0,1990,1593,2398,2416,1866,1731,1191,0,0,1596,2170


In [54]:
pd.crosstab(adata.obs.split_cellcycle_ood, adata.obs.dose_val)

dose_val,0.001,0.010,0.100,1.000
split_cellcycle_ood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ood,2165,1774,1457,1502
test,2673,2429,2329,1945
train,148175,143467,138042,135819


In [55]:
[c for c in adata.obs.columns if 'split' in c]

['split_ho_pathway',
 'split_tyrosine_ood',
 'split_epigenetic_ood',
 'split_cellcycle_ood']

### Further splits

**We omit these split as we design our own splits - for referece this is commented out for the moment**

Also a split which sees all data:

In [56]:
# adata.obs['split_all'] = 'train'
# test_idx = sc.pp.subsample(adata, .10, copy=True).obs.index
# adata.obs.loc[test_idx, 'split_all'] = 'test'

In [57]:
# adata.obs['ct_dose'] = adata.obs.cell_type.astype('str') + '_' + adata.obs.dose_val.astype('str')

Round robin splits: dose and cell line combinations will be held out in turn.

In [58]:
# i = 0
# split_dict = {}

In [59]:
# # single ct holdout
# for ct in adata.obs.cell_type.unique():
#     for dose in adata.obs.dose_val.unique():
#         i += 1
#         split_name = f'split{i}'
#         split_dict[split_name] = f'{ct}_{dose}'
        
#         adata.obs[split_name] = 'train'
#         adata.obs.loc[adata.obs.ct_dose == f'{ct}_{dose}', split_name] = 'ood'
        
#         test_idx = sc.pp.subsample(adata[adata.obs[split_name] != 'ood'], .16, copy=True).obs.index
#         adata.obs.loc[test_idx, split_name] = 'test'
        
#         display(adata.obs[split_name].value_counts())

In [60]:
# # double ct holdout
# for cts in [('A549', 'MCF7'), ('A549', 'K562'), ('MCF7', 'K562')]:
#     for dose in adata.obs.dose_val.unique():
#         i += 1
#         split_name = f'split{i}'
#         split_dict[split_name] = f'{cts[0]}+{cts[1]}_{dose}'
        
#         adata.obs[split_name] = 'train'
#         adata.obs.loc[adata.obs.ct_dose == f'{cts[0]}_{dose}', split_name] = 'ood'
#         adata.obs.loc[adata.obs.ct_dose == f'{cts[1]}_{dose}', split_name] = 'ood'
        
#         test_idx = sc.pp.subsample(adata[adata.obs[split_name] != 'ood'], .16, copy=True).obs.index
#         adata.obs.loc[test_idx, split_name] = 'test'
        
#         display(adata.obs[split_name].value_counts())

In [61]:
# # triple ct holdout
# for dose in adata.obs.dose_val.unique():
#     i += 1
#     split_name = f'split{i}'

#     split_dict[split_name] = f'all_{dose}'
#     adata.obs[split_name] = 'train'
#     adata.obs.loc[adata.obs.dose_val == dose, split_name] = 'ood'

#     test_idx = sc.pp.subsample(adata[adata.obs[split_name] != 'ood'], .16, copy=True).obs.index
#     adata.obs.loc[test_idx, split_name] = 'test'

#     display(adata.obs[split_name].value_counts())

In [62]:
# adata.uns['all_DEGs']

## Save adata

Reindex the lincs dataset

In [63]:
sciplex_ids = pd.Index(adata.var.gene_id)

lincs_idx = [sciplex_ids.get_loc(_id) for _id in adata_lincs.var.gene_id[adata_lincs.var.in_sciplex]]

In [64]:
non_lincs_idx = [sciplex_ids.get_loc(_id) for _id in adata.var.gene_id if not adata_lincs.var.gene_id.isin([_id]).any()]

lincs_idx.extend(non_lincs_idx)

In [65]:
adata = adata[:, lincs_idx].copy()

In [66]:
fname = PROJECT_DIR/'datasets'/'sciplex3_matched_genes_lincs.h5ad'

sc.write(fname, adata)

... storing 'cell_type' as categorical
... storing 'pathway' as categorical
... storing 'product_name' as categorical
... storing 'target' as categorical
... storing 'condition' as categorical
... storing 'drug_dose_name' as categorical
... storing 'cov_drug_dose_name' as categorical
... storing 'cov_drug' as categorical
... storing 'split_ho_pathway' as categorical
... storing 'split_tyrosine_ood' as categorical
... storing 'split_epigenetic_ood' as categorical
... storing 'split_cellcycle_ood' as categorical


Check that it worked

In [67]:
sc.read(fname)

AnnData object with n_obs × n_vars = 581777 × 2000
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split_ho_pathway', 'split_tyrosine_ood', 'split_epigenetic_ood', 'split_cellcycle_ood'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'all_DEGs', 'hvg', 'lincs_DEGs'

## Subselect to shared only shared genes

Subset to shared genes

In [68]:
adata_lincs = adata_lincs[:, adata_lincs.var.in_sciplex].copy() 

In [69]:
adata = adata[:, adata.var.in_lincs].copy()

In [70]:
adata_lincs.var_names

Index(['DDR1', 'PAX8', 'RPS5', 'ABCF1', 'SPAG7', 'RHOA', 'RNPS1', 'SMNDC1',
       'ATP6V0B', 'RPS6',
       ...
       'P4HTM', 'SLC27A3', 'TBXA2R', 'RTN2', 'TSTA3', 'PPARD', 'GNA11',
       'WDTC1', 'PLSCR3', 'NPEPL1'],
      dtype='object', length=977)

In [71]:
adata.var_names

Index(['DDR1', 'PAX8', 'RPS5', 'ABCF1', 'SPAG7', 'RHOA', 'RNPS1', 'SMNDC1',
       'ATP6V0B', 'RPS6',
       ...
       'P4HTM', 'SLC27A3', 'TBXA2R', 'RTN2', 'TSTA3', 'PPARD', 'GNA11',
       'WDTC1', 'PLSCR3', 'NPEPL1'],
      dtype='object', name='index', length=977)

## Save adata objects with shared genes only
Index of lincs has also been reordered accordingly

In [72]:
fname = PROJECT_DIR/'datasets'/'sciplex3_lincs_genes.h5ad'

sc.write(fname, adata)

____

In [73]:
# fname_lincs = PROJECT_DIR/'datasets'/'lincs_full_smiles_sciplex_genes.h5ad'

# sc.write(fname_lincs, adata_lincs)