In [1]:
import os
import numpy as np
import scanpy as sc
import anndata as ad
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
plt.rcParams['figure.figsize']=(5, 5)
sc.settings.verbosity = 3
sc.logging.print_header()
os.chdir('./../')


%load_ext autoreload
%autoreload 2 

scanpy==1.7.1 anndata==0.7.5 umap==0.5.1 numpy==1.19.2 scipy==1.6.1 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.8.3 leidenalg==0.8.3


In [33]:
sc.set_figure_params(dpi=100)

## loading the raw data

In [36]:
import os

In [37]:
os.getcwd()

'/home/mohammad'

In [38]:
adata = sc.read("./Desktop/test_AdvAE/datasets/Norman2019_raw.h5ad")

In [39]:
needed_obs = adata.obs[["guide_identity","read_count", "UMI_count","gemgroup","good_coverage","number_of_cells","guide_ids"]].copy()

In [40]:
adata_new = sc.AnnData(adata.X.copy(), obs=needed_obs, var=adata.var.copy())

In [41]:
adata_new

AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_identity', 'read_count', 'UMI_count', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_ids'
    var: 'gene_symbols'

In [42]:
adata_new.obs

Unnamed: 0_level_0,guide_identity,read_count,UMI_count,gemgroup,good_coverage,number_of_cells,guide_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCTGAGAAGAAGC-1,NegCtrl0_NegCtrl0__NegCtrl0_NegCtrl0,1252,67,1,True,2,
AAACCTGAGGCATGTG-1,TSC22D1_NegCtrl0__TSC22D1_NegCtrl0,2151,104,1,True,1,TSC22D1
AAACCTGAGGCCCTTG-1,KLF1_MAP2K6__KLF1_MAP2K6,1037,59,1,True,1,"KLF1,MAP2K6"
AAACCTGCACGAAGCA-1,NegCtrl10_NegCtrl0__NegCtrl10_NegCtrl0,958,39,1,True,1,
AAACCTGCAGACGTAG-1,CEBPE_RUNX1T1__CEBPE_RUNX1T1,244,14,1,True,1,"CEBPE,RUNX1T1"
...,...,...,...,...,...,...,...
TTTGTCATCAGTACGT-8,FOXA3_NegCtrl0__FOXA3_NegCtrl0,2068,95,8,True,1,FOXA3
TTTGTCATCCACTCCA-8,CELF2_NegCtrl0__CELF2_NegCtrl0,829,33,8,True,1,CELF2
TTTGTCATCCCAACGG-8,BCORL1_NegCtrl0__BCORL1_NegCtrl0,136,9,8,True,1,BCORL1
TTTGTCATCCTCCTAG-8,ZBTB10_PTPN12__ZBTB10_PTPN12,1254,59,8,True,3,"PTPN12,ZBTB10"


In [43]:
#check all ctrl guides

In [44]:
list_control = []
for i in np.unique(adata_new.obs["guide_identity"]):
   m = re.match(r"NegCtrl(.*)_NegCtrl(.*)+NegCtrl(.*)_NegCtrl(.*)", i)
   if m :
    list_control.append(m.group())

remove "NegCtrl1_NegCtrl0__NegCtrl1_NegCtrl0" suggested by authors

In [45]:
adata_new = adata_new[adata_new.obs["guide_identity"] != "NegCtrl1_NegCtrl0__NegCtrl1_NegCtrl0"] 

In [46]:
adata_new

View of AnnData object with n_obs × n_vars = 108497 × 33694
    obs: 'guide_identity', 'read_count', 'UMI_count', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_ids'
    var: 'gene_symbols'

merge all controls 

In [47]:
adata_new.obs["guide_merged"] = adata_new.obs["guide_identity"]

Trying to set attribute `.obs` of view, copying.


In [48]:
for i in np.unique(adata_new.obs["guide_merged"]):
   m = re.match(r"NegCtrl(.*)_NegCtrl(.*)+NegCtrl(.*)_NegCtrl(.*)", i)
   if m :
        adata_new.obs["guide_merged"].replace(i,"ctrl",inplace=True)

relabeling

In [49]:
old_pool = []
for i in np.unique(adata_new.obs["guide_merged"]):
    if i == "ctrl":
        old_pool.append(i)
        continue
    split = i.split("__")[1]
    split = split.split("_")
    for j, string in enumerate(split):
        if "NegCtrl" in split[j]:
            split[j] = "ctrl"
    if len(split) == 1:
        if split[0] in old_pool:
            print("old:",i, "new:",split[0])
        adata_new.obs["guide_merged"].replace(i,split[0],inplace=True)
        old_pool.append(split[0])
    else:
        if f"{split[0]}+{split[1]}" in old_pool:
            print("old:",i, "new:",f"{split[0]}+{split[1]}")
        adata_new.obs["guide_merged"].replace(i, f"{split[0]}+{split[1]}",inplace=True)
        old_pool.append(f"{split[0]}+{split[1]}")

old: HOXC13_NegCtrl0__HOXC13_NegCtrl0_2 new: HOXC13+ctrl
old: TGFBR2_IGDCC3__TGFBR2_IGDCC3_2 new: TGFBR2+IGDCC3
old: ZBTB10_NegCtrl0__ZBTB10_NegCtrl0_2 new: ZBTB10+ctrl


In [50]:
adata_new.obs["guide_merged"]

index
AAACCTGAGAAGAAGC-1             ctrl
AAACCTGAGGCATGTG-1     TSC22D1+ctrl
AAACCTGAGGCCCTTG-1      KLF1+MAP2K6
AAACCTGCACGAAGCA-1             ctrl
AAACCTGCAGACGTAG-1    CEBPE+RUNX1T1
                          ...      
TTTGTCATCAGTACGT-8       FOXA3+ctrl
TTTGTCATCCACTCCA-8       CELF2+ctrl
TTTGTCATCCCAACGG-8      BCORL1+ctrl
TTTGTCATCCTCCTAG-8    ZBTB10+PTPN12
TTTGTCATCTGGCGAC-8      MAP4K3+ctrl
Name: guide_merged, Length: 108497, dtype: category
Categories (284, object): ['AHR+FEV', 'AHR+KLF1', 'AHR+ctrl', 'ARID1A+ctrl', ..., 'ZC3HAV1+HOXC13', 'ZC3HAV1+ctrl', 'ZNF318+FOXL2', 'ZNF318+ctrl']

# preprocessing 

Keep the count data in a counts layer

In [51]:
adata_new.layers["counts"] = adata_new.X.copy()

Normalization and HVG selection

In [52]:
sc.pp.normalize_total(adata_new)
sc.pp.log1p(adata_new)
sc.pp.highly_variable_genes(adata_new,n_top_genes=5000, subset=True)

normalizing counts per cell
    finished (0:00:01)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


In [53]:
adata_new

AnnData object with n_obs × n_vars = 108497 × 5000
    obs: 'guide_identity', 'read_count', 'UMI_count', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_ids', 'guide_merged'
    var: 'gene_symbols', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    layers: 'counts'

# Prepare for the model

In [78]:
adata_new.obs['dose_val'] = 'nan'

In [79]:
adata_new.obs['dose_val'].loc[
    adata_new.obs['guide_merged']=="ctrl"
] = '1'

adata_new.obs['dose_val'].loc[
    adata_new.obs['guide_merged']!="ctrl"
] = "1+1"

In [82]:
adata_new.obs["condition"] = adata_new.obs["guide_merged"]
adata_new.obs['cell_type'] = 'A549'
adata_new.obs['control'] = [1 if x == 'ctrl' else 0 for x in adata_new.obs.condition.values]

In [87]:
adata_new.obs['drug_dose_name'] = adata_new.obs.condition.astype(str) + '_' + adata_new.obs.dose_val.astype(str)

In [88]:
adata_new.obs['cov_drug_dose_name'] = adata_new.obs.cell_type.astype(str) + '_' + adata_new.obs.drug_dose_name.astype(str)

DE test

In [90]:
os.getcwd()

'/home/mohammad'

In [91]:
os.chdir('./Desktop/test_AdvAE/')


In [92]:
from cpa.helper import rank_genes_groups_by_cov

In [94]:
rank_genes_groups_by_cov(adata_new, groupby='cov_drug_dose_name', covariate='cell_type', control_group='ctrl_1', n_genes=20)

ranking genes
Trying to set attribute `.obs` of view, copying.


A549


... storing 'split' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'dose_val' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'drug_dose_name' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cov_drug_dose_name' as categorical
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:02)


saving to new object

In [126]:
to_pick = adata_new.obs[["cov_drug_dose_name","dose_val","control","condition","split","guide_identity","drug_dose_name","cell_type"]]

In [127]:
adata_new_small = sc.AnnData(adata_new.X, obs=to_pick,
                             var=adata_new.var, uns=adata_new.uns)

In [132]:
adata_new_small.layers = adata_new.layers

# splits

visualization split 

In [311]:
ood_set = []
adata_new_small.obs['split'] = 'nan'
adata_idx = adata_new_small.obs_names
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.1, random_state=42)
adata_idx_test, adata_idx_ood = train_test_split(adata_idx_test, test_size=0.0001, random_state=42)
adata_idx = adata_new_small.obs_names[adata_new_small.obs.split!='ood']
adata_new_small.obs['split'].loc[adata_idx_train] = 'train'
adata_new_small.obs['split'].loc[adata_idx_test] = 'test'
adata_new_small.obs['split'].loc[adata_idx_ood] = 'ood'
adata_new_small.obs.groupby('split').size()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


split
ood          2
test     10848
train    97647
dtype: int64

First ood set

In [270]:
ood_set = ['DUSP9+MAPK1',
 'ETS2+MAPK1',
 'DUSP9+ETS2',
 'CBL+CNN1',
 'MAPK1+DUSP9',
 'MAPK1+ETS2',
 'ETS2+DUSP9',
 'CNN1+CBL']
adata_new_small.obs['split1'] = 'nan'
adata_new_small.obs['split1'].loc[
    adata_new_small.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new_small.obs_names[adata_new_small.obs.split1!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_new_small.obs['split1'].loc[adata_idx_train] = 'train'
adata_new_small.obs['split1'].loc[adata_idx_test] = 'test'
adata_new_small.obs.groupby('split1').size()

split1
ood       1884
test     21323
train    85290
dtype: int64

In [271]:
condition_key = "condition"

split for leave one 10 out and predict

In [272]:
double_ko_list = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" not in i]
single = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" in i]
np.random.shuffle(double_ko_list)

#drug split into splits of 10
ood_list = []
i = 0
while(i<len(double_ko_list)):
    ood_list.append(double_ko_list[i:min(i+10,len(double_ko_list))])
    i+=10
ood_list[len(ood_list)-2] = ood_list[len(ood_list)-2] + ood_list[len(ood_list)-1]
del(ood_list[len(ood_list)-1])

In [273]:
for idx, splits in enumerate(ood_list):
    ood_set = splits
    print(splits, idx)
    adata_new_small.obs[f'split{idx+2}'] = 'nan'
    adata_new_small.obs[f'split{idx+2}'].loc[
        adata_new_small.obs['condition'].isin(ood_set)
    ] = 'ood'
    adata_idx = adata_new_small.obs_names[adata_new_small.obs[f'split{idx+2}']!='ood']
    adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
    adata_new_small.obs[f'split{idx+2}'].loc[adata_idx_train] = 'train'
    adata_new_small.obs[f'split{idx+2}'].loc[adata_idx_test] = 'test'
    adata_new_small.obs.groupby(f'split{idx+2}').size()

['PTPN12+OSR2', 'DUSP9+PRTG', 'POU3F2+FOXL2', 'UBASH3B+CNN1', 'MAP2K6+SPI1', 'SAMD1+PTPN12', 'MAPK1+TGFBR2', 'FOXF1+FOXL2', 'IGDCC3+PRTG', 'FOXA3+FOXL2'] 0
['CEBPE+CEBPA', 'UBASH3B+ZBTB25', 'UBASH3B+PTPN9', 'BPGM+ZBTB1', 'KLF1+BAK1', 'SNAI1+DLX2', 'KIF18B+KIF2C', 'UBASH3B+UBASH3A', 'PLK4+STIL', 'SAMD1+UBASH3B'] 1
['FOXA3+FOXA1', 'LYL1+IER5L', 'ETS2+IGDCC3', 'PTPN12+ZBTB25', 'MAP2K3+ELMSAN1', 'KLF1+MAP2K6', 'BCL2L11+TGFBR2', 'SET+CEBPE', 'ETS2+IKZF3', 'CBL+PTPN9'] 2
['FOXA3+FOXF1', 'JUN+CEBPA', 'DUSP9+KLF1', 'CNN1+MAPK1', 'FOSB+PTPN12', 'ETS2+CNN1', 'UBASH3B+PTPN12', 'LHX1+ELMSAN1', 'ZC3HAV1+CEBPA', 'KLF1+CLDN6'] 3
['FOSB+CEBPE', 'SET+KLF1', 'CBL+UBASH3A', 'AHR+KLF1', 'LYL1+CEBPB', 'RHOXF2+SET', 'IGDCC3+MAPK1', 'FOXA1+HOXB9', 'FEV+ISL2', 'C3orf72+FOXL2'] 4
['CEBPE+RUNX1T1', 'MAP2K6+ELMSAN1', 'CEBPE+SPI1', 'CDKN1C+CDKN1A', 'TGFBR2+PRTG', 'BCL2L11+BAK1', 'KLF1+CEBPA', 'MAP2K3+SLC38A2', 'SNAI1+UBASH3B', 'FOSB+OSR2'] 5
['KLF1+COL2A1', 'CDKN1B+CDKN1A', 'CBL+UBASH3B', 'CEBPE+PTPN12', 'ZBTB10+

In [274]:
adata_new_small

AnnData object with n_obs × n_vars = 108497 × 5000
    obs: 'cov_drug_dose_name', 'dose_val', 'control', 'condition', 'guide_identity', 'drug_dose_name', 'cell_type', 'split', 'split1', 'split2', 'split3', 'split4', 'split5', 'split6', 'split7', 'split8', 'split9', 'split10', 'split11', 'split12', 'split13', 'split14'
    var: 'gene_symbols', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'rank_genes_groups_cov'
    layers: 'counts'

robustness split

In [275]:
double_ko_list = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" not in i]
single = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" in i]
np.random.shuffle(double_ko_list)

#split into splits of 10
ood_list = []
i = 120
while(i>10):
    ood_list.append(double_ko_list[0:min(i,len(double_ko_list))])
    print(i)
    i-= 20

120
100
80
60
40
20


In [276]:
for idx, splits in enumerate(ood_list):
    ood_set = splits
    print(splits, idx)
    adata_new_small.obs[f'split{idx+15}'] = 'nan'
    adata_new_small.obs[f'split{idx+15}'].loc[
        adata_new_small.obs['condition'].isin(ood_set)
    ] = 'ood'
    adata_idx = adata_new_small.obs_names[adata_new_small.obs[f'split{idx+15}']!='ood']
    adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
    adata_new_small.obs[f'split{idx+15}'].loc[adata_idx_train] = 'train'
    adata_new_small.obs[f'split{idx+15}'].loc[adata_idx_test] = 'test'
    adata_new_small.obs.groupby(f'split{idx+15}').size()

['TMSB4X+BAK1', 'BPGM+SAMD1', 'TBX3+TBX2', 'CDKN1C+CDKN1B', 'SAMD1+UBASH3B', 'CEBPB+CEBPA', 'CBL+PTPN12', 'DUSP9+ETS2', 'CEBPE+CNN1', 'PTPN12+OSR2', 'KLF1+MAP2K6', 'FOXA3+HOXB9', 'UBASH3B+CNN1', 'FOSB+CEBPB', 'MAP2K6+ELMSAN1', 'CEBPB+PTPN12', 'DUSP9+IGDCC3', 'SAMD1+ZBTB1', 'LYL1+IER5L', 'ZBTB10+SNAI1', 'FOXF1+HOXB9', 'IGDCC3+MAPK1', 'MAP2K6+IKZF3', 'MAPK1+PRTG', 'FOXA3+FOXF1', 'RHOXF2+SET', 'SAMD1+PTPN12', 'MAPK1+IKZF3', 'PTPN12+ZBTB25', 'FEV+CBFA2T3', 'MAP2K6+SPI1', 'SNAI1+UBASH3B', 'CEBPE+PTPN12', 'BCL2L11+TGFBR2', 'ETS2+CEBPE', 'FEV+ISL2', 'JUN+CEBPB', 'ZNF318+FOXL2', 'TGFBR2+ETS2', 'LHX1+ELMSAN1', 'MAP2K3+ELMSAN1', 'FOSB+UBASH3B', 'SET+KLF1', 'FOSB+OSR2', 'PTPN12+SNAI1', 'CEBPE+CEBPA', 'ZBTB10+DLX2', 'DUSP9+SNAI1', 'ETS2+MAPK1', 'CEBPE+KLF1', 'CNN1+UBASH3A', 'ZBTB10+PTPN12', 'ZC3HAV1+CEBPE', 'MAPK1+TGFBR2', 'PRDM1+CBFA2T3', 'KLF1+BAK1', 'UBASH3B+PTPN12', 'ETS2+IKZF3', 'FOSB+PTPN12', 'UBASH3B+ZBTB25', 'TGFBR2+PRTG', 'FOXA3+FOXL2', 'ETS2+IGDCC3', 'KLF1+TGFBR2', 'CEBPB+OSR2', 'ZC3HAV1

epistasis

In [329]:
epistasis = ["AHR+KLF1","MAPK1+TGFBR2","TGFBR2+IGDCC3","TGFBR2+PRTG",
             "UBASH3B+OSR2","DUSP9+ETS2","KLF1+CEBPA","MAP2K6+IKZF3","ZC3HAV1+CEBPA"]
ood_set = epistasis
adata_new_small.obs['split21'] = 'nan'
adata_new_small.obs['split21'].loc[
    adata_new_small.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new_small.obs_names[adata_new_small.obs.split21!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_new_small.obs['split21'].loc[adata_idx_train] = 'train'
adata_new_small.obs['split21'].loc[adata_idx_test] = 'test'
adata_new_small.obs.groupby('split21').size()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


split21
ood       3824
test     20935
train    83738
dtype: int64

In [339]:
check = adata_new_small[adata_new_small.obs["split21"] == "ood"]

In [341]:
check.obs[condition_key].value_counts()

UBASH3B+OSR2     796
DUSP9+ETS2       787
MAPK1+TGFBR2     497
AHR+KLF1         481
KLF1+CEBPA       311
TGFBR2+IGDCC3    301
MAP2K6+IKZF3     300
TGFBR2+PRTG      265
ZC3HAV1+CEBPA     86
Name: condition, dtype: int64

neomorphic interactions

In [344]:
nemorphic = ["CBL+TGFBR2","KLF1+TGFBR2","MAP2K6+SPI1",
            "SAMD1+TGFBR2","TGFBR2+ETS2","CBL+UBASH3A",
            "CEBPE+KLF1","PTPN12+OSR2","ZC3HAV1+CEBPE","PLK4+STIL","FOSB+PTPN12","FEV+CBFA2T3"]

In [345]:
ood_set = nemorphic
adata_new_small.obs['split22'] = 'nan'
adata_new_small.obs['split22'].loc[
    adata_new_small.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new_small.obs_names[adata_new_small.obs.split22!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_new_small.obs['split22'].loc[adata_idx_train] = 'train'
adata_new_small.obs['split22'].loc[adata_idx_test] = 'test'
adata_new_small.obs.groupby('split22').size()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


split22
ood       3083
test     21083
train    84331
dtype: int64

In [347]:
check = adata_new_small[adata_new_small.obs["split22"] == "ood"]

In [348]:
check.obs[condition_key].value_counts()

CEBPE+KLF1       468
ZC3HAV1+CEBPE    410
FOSB+PTPN12      345
PTPN12+OSR2      339
KLF1+TGFBR2      337
TGFBR2+ETS2      318
MAP2K6+SPI1      302
CBL+TGFBR2       188
FEV+CBFA2T3      159
PLK4+STIL         81
SAMD1+TGFBR2      72
CBL+UBASH3A       64
Name: condition, dtype: int64

In [356]:
set(set_orig)

set()

In [359]:
set_orig = []
for i in adata_new_small.obs[condition_key].unique():
    for j in nemorphic:
        split = j.split("+")
        if (split[0] in i) and (split[1] in i):
            set_orig.append(i)


In [361]:
set(set_orig) == set(nemorphic)

True

leave all doubles out  out

In [364]:
double_ko_list = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" not in i]
single = [i for i in adata_new_small.obs[condition_key].unique() if "ctrl" in i]
np.random.shuffle(double_ko_list)

In [368]:
ood_set = double_ko_list
adata_new_small.obs['split23'] = 'nan'
adata_new_small.obs['split23'].loc[
    adata_new_small.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new_small.obs_names[adata_new_small.obs.split23!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_new_small.obs['split23'].loc[adata_idx_train] = 'train'
adata_new_small.obs['split23'].loc[adata_idx_test] = 'test'
adata_new_small.obs.groupby('split23').size()

split23
ood      41759
test     13348
train    53390
dtype: int64

saving final object

In [369]:
adata_new_small

AnnData object with n_obs × n_vars = 108497 × 5000
    obs: 'cov_drug_dose_name', 'dose_val', 'control', 'condition', 'guide_identity', 'drug_dose_name', 'cell_type', 'split', 'split1', 'split2', 'split3', 'split4', 'split5', 'split6', 'split7', 'split8', 'split9', 'split10', 'split11', 'split12', 'split13', 'split14', 'split15', 'split16', 'split17', 'split18', 'split19', 'split20', 'split21', 'split22', 'split23'
    var: 'gene_symbols', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'rank_genes_groups_cov'
    layers: 'counts'

In [370]:
adata_new_small.write("./datasets/Norman2019_prep_new.h5ad")

... storing 'split23' as categorical


add two splits 

split DUSP9+MAPK1

In [4]:
adata_new = sc.read("./datasets/Norman2019_prep_new.h5ad")

In [8]:
ood_set = ['DUSP9+MAPK1']
adata_new.obs['split24'] = 'nan'
adata_new.obs['split24'].loc[
    adata_new.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new.obs_names[adata_new.obs.split24!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_idx = adata_new.obs_names[adata_new.obs.split24!='ood']
train_test = adata_new[~adata_new.obs["condition"].isin(ood_set)].copy()
adata_new.obs['split24'].loc[adata_idx_train] = 'train'
adata_new.obs['split24'].loc[adata_idx_test] = 'test'
adata_new.obs.groupby('split24').size()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  res = method(*args, **kwargs)


split24
ood        290
test     21642
train    86565
dtype: int64

split DUSP9+ETS2

In [9]:
ood_set = ["DUSP9+ETS2"]
adata_new.obs['split25'] = 'nan'
adata_new.obs['split25'].loc[
    adata_new.obs['condition'].isin(ood_set)
] = 'ood'
adata_idx = adata_new.obs_names[adata_new.obs.split25!='ood']
adata_idx_train, adata_idx_test = train_test_split(adata_idx, test_size=0.2, random_state=42)
adata_idx = adata_new.obs_names[adata_new.obs.split25!='ood']
train_test = adata_new[~adata_new.obs["condition"].isin(ood_set)].copy()
adata_new.obs['split25'].loc[adata_idx_train] = 'train'
adata_new.obs['split25'].loc[adata_idx_test] = 'test'
adata_new.obs.groupby('split25').size()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  res = method(*args, **kwargs)


split25
ood        787
test     21542
train    86168
dtype: int64

In [11]:
adata_new.write("./datasets/Norman2019_prep_new.h5ad")

... storing 'split24' as categorical
... storing 'split25' as categorical
