# 1 LINCS

**Requires**
lincs_full.h5ad / lincs.h5ad

**Outputs**
lincs_full_pp.h5ad / lincs_pp.h5ad

## Description

This notebook processes gene expression data from the LINCS dataset:

1. **Data Cleaning**: Loads LINCS data, cleans columns, and renames key fields.
2. **Filtering Insufficient Conditions**: Filters out conditions with fewer than 5 samples.
3. **Calculating Differentially Expressed Genes (DEGs)**: Identifies the top 50 genes most differentially expressed for each condition compared to the control (`DMSO`).
4. **Creating Data Splits**: Defines `'train'`, `'ood'`, and `'test'` splits for model training and evaluation:
   - **OOD**: A random 10% selection from the samples with top occurring conditions, assigned to `'ood'`.
   - **Test**: 16% of the remaining observations assigned to `'test'`.
   - **Train**: The rest of the observations assigned to `'train'`.







In [1]:
import os
import warnings

import numpy as np
import pandas as pd

from scipy import sparse
from tqdm.auto import tqdm

from chemCPA.helper import rank_genes_groups_by_cov
from chemCPA.paths import DATA_DIR
from pathlib import Path
import sys
import logging
from notebook_utils import suppress_output
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
import raw_data.datasets as datasets


import scanpy as sc
with suppress_output():
    sc.set_figure_params(dpi=100, frameon=False)
    sc.logging.print_header()
    warnings.filterwarnings('ignore')

# logging.info is visible when running as python script 
if not any('ipykernel' in arg for arg in sys.argv):
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

2023-08-18 17:11:42.233290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-18 17:11:48.390383: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-08-18 17:11:48.390575: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.2 pynndescent==0.5.6


## Load data


In [1]:
# Get the absolute path to the chemCPA root directory
full = True 
load_adata = True
# Ensure adata_path is a Path object
adata_path = Path(datasets.lincs_full()) if full else DATA_DIR / datasets.lincs()
logging.info(f"Starting to load in data from {adata_path}")
adata = sc.read(adata_path) if load_adata else None
logging.info(f"Data loaded from {adata_path}")


NameError: name '__file__' is not defined

# Rename columns & clean up columns

In [3]:
logging.info("Renaming and cleaning up columns")
import re

def remove_non_alphanumeric(input_string):
    return re.sub(r'[^a-zA-Z0-9]', '', input_string)

adata.obs['condition'] = adata.obs['pert_iname'].apply(remove_non_alphanumeric)
adata.obs['cell_type'] = adata.obs['cell_id']
adata.obs['dose_val'] = adata.obs['pert_dose'].astype(float) / np.max(adata.obs['pert_dose'].astype(float))
adata.obs['cov_drug_dose_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str) + '_' + adata.obs.dose_val.astype(str)
adata.obs['cov_drug_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str)
adata.obs['eval_category'] = adata.obs['cov_drug_name']
adata.obs['control'] = (adata.obs['condition'] == 'DMSO').astype(int)

# adata.obs['cov_drug_dose_name'] = adata.obs['cov_drug_dose_name'].str.replace('/','|')

In [4]:
pd.crosstab(adata.obs.condition, adata.obs.cell_type)

cell_type,A375,A549,A673,AGS,ASC,ASC.C,BT20,CD34,CL34,CORL23,...,SW620,SW948,T3M10,THP1,TYKNU,U266,U937,VCAP,WSUDLCL2,YAPC
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10DEBC,30,7,2,2,0,0,0,0,2,2,...,4,4,2,3,2,0,2,9,2,18
10Hphenothiazin10ylptolylmethanone,6,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11,0,0
10hydroxycamptothecin,5,5,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
123456hexabromocyclohexane,5,5,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1234tetrahydroisoquinoline,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zopiclone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zosuquidar,20,6,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,17
zoxazolamine,6,5,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,0
zprolylprolinal,3,6,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,0


In [5]:
drug_abundance = adata.obs.condition.value_counts()
suff_drug_abundance = drug_abundance.index[drug_abundance>5]

In [6]:
# Delete conditions isufficient # of observations
adata = adata[adata.obs.condition.isin(suff_drug_abundance)].copy()
adata 
logging.info("Finished cleaning up columns")

AnnData object with n_obs × n_vars = 1023036 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'cov_drug_name', 'eval_category', 'control'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'cydata_pull'

Calculate differential genes manually, such that the genes are the same per condition.

In [7]:
logging.info("Processing DEGs")
%%time

de_genes = {}
de_genes_quick = {}

adata_df = adata.to_df()
adata_df = adata_df.join(adata.obs['condition'])  # Ensures correct alignment
dmso = adata_df[adata_df.condition == "DMSO"].mean(numeric_only=True)


for cond, df in tqdm(adata_df.groupby('condition')): 
    if cond != 'DMSO':
        drug_mean = df.mean(numeric_only=True)
        de_50_idx = np.argsort(abs(drug_mean - dmso))[-50:]
        de_genes_quick[cond] = drug_mean.index[de_50_idx].values

if full: 
    de_genes = de_genes_quick
else:
    sc.tl.rank_genes_groups(
        adata,
        groupby='condition', 
        reference='DMSO',
        rankby_abs=True,
        n_genes=50
    )
    for cond in tqdm(np.unique(adata.obs['condition'])):
        if cond != 'DMSO':
            df = sc.get.rank_genes_groups_df(adata, group=cond)
            de_genes[cond] = df['names'][:50].values

logging.info("Completed processing DEGs")

  0%|          | 0/17983 [00:00<?, ?it/s]

CPU times: user 55.2 s, sys: 1.32 s, total: 56.6 s
Wall time: 56.2 s


Mapping from `rank_genes_groups_cov` might cause problems when drug contains '_'

In [8]:
def extract_drug(cond): 
    split = cond.split('_')
    if len(split) == 2: 
        return split[-1]
    return '_'.join(split[1:-1])

adata.obs['cov_drug_dose_name'].apply(lambda s: len(s.split('_'))).value_counts()
adata.obs['eval_category'].apply(lambda s: len(s.split('_'))).value_counts()

2    1023036
Name: eval_category, dtype: int64

In [9]:
adata.uns['rank_genes_groups_cov'] = {cat: de_genes_quick[extract_drug(cat)] for cat in adata.obs.eval_category.unique() if extract_drug(cat) != 'DMSO'}

In [10]:
adata.uns['rank_genes_groups_cov']

{'A375_aminoguanidine': array(['NARFL', 'DYNLT3', 'KIAA0100', 'NPEPL1', 'HN1L', 'FUT1', 'ARNT2',
        'CDCA4', 'APOE', 'TEX10', 'POLR2I', 'AURKB', 'SCARB1', 'TESK1',
        'CSK', 'CD58', 'USP22', 'LRRC41', 'CHIC2', 'NR2F6', 'MMP2',
        'NPDC1', 'NOSIP', 'ECH1', 'PCNA', 'RTN2', 'CCND1', 'PAPD7',
        'MRPL12', 'BNIP3L', 'MYBL2', 'AKT1', 'METRN', 'ZNF586', 'SNCA',
        'CCNB1', 'S100A4', 'HSPA8', 'ACD', 'PAF1', 'CIRBP', 'SCAND1',
        'CHAC1', 'KIF2C', 'COG7', 'SUPV3L1', 'VPS28', 'ZNF274', 'CLTB',
        'GAPDH'], dtype=object),
 'A375_pritelivir': array(['SLC25A46', 'ARNT2', 'AURKB', 'EPN2', 'CCNB1', 'NPEPL1',
        'KIAA0100', 'ASAH1', 'HN1L', 'CD58', 'SCARB1', 'NR2F6', 'TEX10',
        'NOSIP', 'TESK1', 'LRRC41', 'USP22', 'CSK', 'APBB2', 'MMP2',
        'CHIC2', 'ECH1', 'NPDC1', 'CDCA4', 'CCND1', 'S100A4', 'MYBL2',
        'ZNF586', 'RTN2', 'HSPA8', 'MRPL12', 'BNIP3L', 'SNCA', 'METRN',
        'NARFL', 'AKT1', 'PCNA', 'PAPD7', 'ACD', 'CIRBP', 'PAF1', 'SCAND1',
   

In [11]:
adata.obs['split'] = 'train'

# take ood from top occurring perturbations to avoid losing data on low occ ones
ood_idx = sc.pp.subsample(
    adata[adata.obs.condition.isin(list(adata.obs.condition.value_counts().index[1:50]))],
    .1,
    copy=True
).obs.index
adata.obs['split'].loc[ood_idx] = 'ood'

# take test from a random subsampling of the rest
test_idx = sc.pp.subsample(
    adata[adata.obs.split != 'ood'],
    .16,
    copy=True
).obs.index
adata.obs['split'].loc[test_idx] = 'test'

In [12]:
pd.crosstab(adata.obs['split'], adata.obs['condition'])

condition,10DEBC,10Hphenothiazin10ylptolylmethanone,10hydroxycamptothecin,123456hexabromocyclohexane,1234tetrahydroisoquinoline,1271738625,12dichlorobenzene,12propyleneglycol,15deltaprostaglandinj2,1616dimethylprostaglandine2,...,zofenoprilcalcium,zolantidine,zolmitriptan,zolpidem,zonisamide,zopiclone,zosuquidar,zoxazolamine,zprolylprolinal,zuclopenthixol
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ood,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
test,47,12,6,8,4,0,13,5,61,46,...,23,12,25,30,38,2,31,13,14,15
train,257,44,31,29,14,15,58,22,301,250,...,103,52,87,175,185,10,151,79,53,68


In [13]:
try: 
    del(adata.uns['rank_genes_groups'])  # too large
except: 
    print('All good.')

All good.


In [14]:
logging.info("Converting to sparse matrix")
# code compatibility
adata.X = sparse.csr_matrix(adata.X)
logging.info("Finished converting to sparse matrix")

In [15]:
output_path = adata_path.with_name(adata_path.stem + "_pp.h5ad")
logging.info(f"Writing file to disk at {output_path}")
output_path.parent.mkdir(parents=True, exist_ok=True)
sc.write(output_path, adata)
logging.info(f"File was written successfully at {output_path}.")

### Check that `adata.uns[rank_genes_groups_cov]` has all entries in `adata.obs.cov_drug_name` as keys

In [17]:
for i, k in enumerate(adata.obs.eval_category.unique()):
    try: 
        adata.uns['rank_genes_groups_cov'][k]
    except: 
        print(f"{i}: {k}") if 'DMSO' not in k else None

### Checking the same for the stored adata object

In [18]:
adata_2 = sc.read(output_path)

In [19]:
for i, k in enumerate(adata_2.obs.eval_category.unique()):
    try: 
        adata_2.uns['rank_genes_groups_cov'][k]
    except: 
        print(f"{i}: {k}") if 'DMSO' not in k else None

In [20]:
set(list(adata.uns['rank_genes_groups_cov'])) - set((list(adata_2.uns['rank_genes_groups_cov'])))

set()

In [21]:
set((list(adata_2.uns['rank_genes_groups_cov']))) - set(list(adata.uns['rank_genes_groups_cov']))

set()

In [22]:
len(list(adata_2.uns["rank_genes_groups_cov"].keys()))

119502

In [23]:
adata.obs["dose_val"].value_counts()

 0.056306    441932
-3.750000     43907
 0.028153     36233
 0.006256     32694
 0.018769     32677
              ...  
 0.690034         2
 0.001858         1
 0.006306         1
 0.056811         1
 0.056512         1
Name: dose_val, Length: 2934, dtype: int64