## Data preprocessing for the Adamson et al. 2016 dataset


In [1]:
import os
import numpy as np
import scanpy as sc
import anndata as ad
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Default matplotlib figure size: 5 x 5.
plt.rcParams['figure.figsize']=(5, 5)
# Scanpy verbosity 3: errors, warnings, info and hints are logged.
sc.settings.verbosity = 3
# Print the versions of scanpy and its main dependencies (captured in the cell output).
sc.logging.print_header()


# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2 

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.21.1 scipy==1.7.1 pandas==1.3.1 scikit-learn==0.24.2 statsmodels==0.12.2 pynndescent==0.5.4


In [2]:

import sys
# Put the parent directory on the module search path; presumably this is what
# makes the local `compert` package importable later on — confirm against the repo layout.
sys.path.append('../')

## Loading the raw data

In [3]:
# Read the complete Adamson et al. 2016 AnnData object from the shared project store.
raw_h5ad_path = "/dfs/project/perturb-gnn/datasets/Adamson2016/Adamson2016_all.h5ad"
adata = sc.read(raw_h5ad_path)

In [4]:
# Display the AnnData summary (dimensions and the available `obs` columns).
adata

AnnData object with n_obs × n_vars = 69249 × 16528
    obs: 'UMI_count', 'gem_group', 'good_coverage', 'guide_UMI_count', 'guide_coverage', 'guide_identity', 'guide_read_count', 'guide_target', 'guide_target_1', 'guide_target_2', 'num_targets', 'number_of_cells', 'perturbed', 'single_cell', 'G1-S', 'S', 'G2-M', 'M', 'M-G1', 'cell_cycle_phase', 'cell_cycle_progress', 'cell_cycle_order'

In [5]:
# Copy the subset of per-cell annotations that is carried over to the new object.
obs_columns_to_keep = [
    "guide_identity",
    "UMI_count",
    "gem_group",
    "good_coverage",
    "number_of_cells",
    "perturbed",
]
needed_obs = adata.obs[obs_columns_to_keep].copy()

In [6]:
# Assemble a fresh AnnData from copies of the matrix and gene table plus the
# reduced `obs`, so the loaded `adata` is not modified by later steps.
adata_new = sc.AnnData(
    X=adata.X.copy(),
    obs=needed_obs,
    var=adata.var.copy(),
)

In [7]:
# Build the perturbation label "<target>+ctrl", where <target> is the text before
# the first underscore of the `perturbed` value. The special value 'control' is
# overridden afterwards so unperturbed cells are labelled plain 'ctrl'.
perturbed_labels = adata_new.obs['perturbed']
mapper = {
    label: label.split('_')[0] + '+' + 'ctrl'
    for label in perturbed_labels
}
mapper.update({'control': 'ctrl'})
adata_new.obs['guide_merged'] = perturbed_labels.map(mapper)

# Preprocessing

Keep the raw count data in a `counts` layer before `X` is normalized.

In [8]:
# Snapshot the un-normalised matrix; the following cells transform `X` in place.
raw_counts = adata_new.X.copy()
adata_new.layers["counts"] = raw_counts

Normalization, log-transformation, and highly variable gene (HVG) selection

In [9]:
# Normalise the counts per cell, then log1p-transform `X` in place.
sc.pp.normalize_total(adata_new)
sc.pp.log1p(adata_new)

# Flag the top 5000 highly variable genes in `var['highly_variable']`;
# subset=False keeps every gene in the object for now.
n_hvg = 5000
sc.pp.highly_variable_genes(
    adata_new,
    n_top_genes=n_hvg,
    subset=False,
)

normalizing counts per cell
    finished (0:00:01)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:21)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


In [10]:
# Display the AnnData summary to confirm the new `obs`/`var` columns and the `counts` layer.
adata_new

AnnData object with n_obs × n_vars = 69249 × 16528
    obs: 'guide_identity', 'UMI_count', 'gem_group', 'good_coverage', 'number_of_cells', 'perturbed', 'guide_merged'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    layers: 'counts'

# Prepare for the model

In [12]:
# Initialise the dose column with the placeholder string 'nan' for every cell;
# the actual dose labels are assigned in the next cell.
adata_new.obs['dose_val'] = 'nan'

In [13]:
# Assign dose labels with a single `.loc[row_mask, column]` call on the DataFrame.
# The previous chained form (`obs['dose_val'].loc[mask] = ...`) wrote through an
# intermediate Series, which triggers pandas' SettingWithCopyWarning and is not
# guaranteed to update `obs` (it has no effect under pandas copy-on-write).
is_control = adata_new.obs['guide_merged'] == "ctrl"
is_perturbed = adata_new.obs['guide_merged'] != "ctrl"

# Control cells get a single dose token.
adata_new.obs.loc[is_control, 'dose_val'] = '1'

# All remaining cells get the two-part dose "1+1", matching the two-part
# "<target>+ctrl" labels stored in `guide_merged`.
adata_new.obs.loc[is_perturbed, 'dose_val'] = "1+1"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [14]:
# `condition` mirrors the merged guide label.
adata_new.obs["condition"] = adata_new.obs["guide_merged"]

# Constant cell-type label for every cell (the '(?)' is kept from the original annotation).
adata_new.obs['cell_type'] = 'K562(?)'

# Integer indicator: 1 for control cells, 0 for every other condition.
condition_values = adata_new.obs.condition.values
adata_new.obs['control'] = [int(value == 'ctrl') for value in condition_values]

In [15]:
# Combined "<condition>_<dose>" label, e.g. 'ctrl_1' for control cells.
condition_text = adata_new.obs.condition.astype(str)
dose_text = adata_new.obs.dose_val.astype(str)
adata_new.obs['drug_dose_name'] = condition_text + '_' + dose_text

In [17]:
# Prefix the condition/dose label with the cell type: "<cell_type>_<condition>_<dose>".
cell_type_text = adata_new.obs.cell_type.astype(str)
drug_dose_text = adata_new.obs.drug_dose_name.astype(str)
adata_new.obs['cov_drug_dose_name'] = cell_type_text + '_' + drug_dose_text

In [18]:
from scipy.sparse import csr_matrix

# Store the expression matrix in SciPy CSR (compressed sparse row) format.
sparse_expression = csr_matrix(adata_new.X)
adata_new.X = sparse_expression

## Differential expression (DE) test

In [20]:
from compert.helper import rank_genes_groups_by_cov

In [21]:
# Rank the top 20 genes for each `cov_drug_dose_name` group within each
# `cell_type`, using the control cells as the reference.
# NOTE(review): 'ctrl_1' is the `drug_dose_name` of control cells; presumably the
# helper prepends the covariate value to form the full group name — confirm in compert.helper.
rank_genes_groups_by_cov(
    adata_new,
    groupby='cov_drug_dose_name',
    covariate='cell_type',
    control_group='ctrl_1',
    n_genes=20,
)

K562(?)
ranking genes


Trying to set attribute `.uns` of view, copying.
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  foldchanges[global_indices]


    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:24)


## Write h5ad file with all genes

In [5]:
# Persist the annotated object (all genes retained) to the shared project store.
all_genes_h5ad_path = '/dfs/project/perturb-gnn/datasets/Adamson2016/Adamson2016_all_genes.h5ad'
adata_new.write_h5ad(all_genes_h5ad_path)

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'gene_name' as categorical


In [2]:
# Optional shortcut: uncomment to reload the saved all-genes object instead of
# re-running the cells above.
#adata_new = sc.read_h5ad('/dfs/project/perturb-gnn/datasets/Adamson2016/Adamson2016_all_genes.h5ad')

## Subset genes

In [13]:
# Collect the first two '+'-separated parts of every combined label
# (labels without a '+', i.e. plain 'ctrl', are skipped), flattened into one list.
conditions = [
    part
    for label in adata_new.obs['guide_merged']
    if '+' in label
    for part in label.split('+')[:2]
]

# Sorted, de-duplicated names; note that the literal 'ctrl' token is part of this set.
genes_to_keep = np.unique(conditions)

In [25]:
# Number of distinct names collected from the perturbation labels
# (the count includes the literal 'ctrl' token).
len(genes_to_keep)

88

In [4]:
# Add gene name
#
# Read the two-column, header-less genes.tsv (column 0 becomes the index, the
# remaining column is named `gene_name`) and attach it to `adata_new.var` by index.

gene_name_df = pd.read_csv('/dfs/project/perturb-gnn/datasets/Adamson2016/GSM2406675_10X001/outs/raw_gene_bc_matrices_mex/GRCh38/genes.tsv', 
            delimiter='\t', header=None)
gene_name_df = gene_name_df.set_index(0)
gene_name_df.columns = ['gene_name']

# Drop any `gene_name` column left by a previous run (or already present in a
# reloaded object) before merging. Without this, re-running the cell yields
# `gene_name_x` / `gene_name_y` and the following cell fails on `var['gene_name']`.
var_without_names = adata_new.var.drop(columns=['gene_name'], errors='ignore')
adata_new.var = var_without_names.merge(gene_name_df, left_index=True, right_index=True)


In [28]:
# Start from an identity mapping over the current gene names ...
current_gene_names = adata_new.var['gene_name'].values
map_dict = dict(zip(current_gene_names, current_gene_names))

# ... then rename four specific entries.
# NOTE(review): presumably these align the annotation names with the names used
# in the perturbation labels — confirm against `genes_to_keep`.
map_dict.update({
    'Gal4-4(mod)': 'Gal4',
    'FOXL2NB': 'C3orf72',
    'RP5-862P8.2': 'KIAA1804',
    'RHOXF2B': 'RHOXF2BB',
})

adata_new.var['gene_name'] = adata_new.var.gene_name.map(map_dict)

### Keep highly variable genes (HVG) plus perturbed genes

In [35]:
# Keep a gene if it is highly variable OR its name is one of the perturbation names.
# `|` is the explicit element-wise OR for boolean Series; the previous `+` only
# worked because adding booleans happens to behave like OR.
is_perturbation_gene = adata_new.var['gene_name'].isin(genes_to_keep)
adata_new.var['highly_variable'] = adata_new.var['highly_variable'] | is_perturbation_gene

# Subset the columns (genes) with the boolean mask directly. `.copy()` turns the
# result into a standalone AnnData instead of a view of the full object, so later
# modifications do not trigger implicit "view -> copy" conversions.
adata_new = adata_new[:, adata_new.var['highly_variable']].copy()