In [1]:
import scanpy as sc
import numpy as np
import functools
import jax
from cfp.metrics import compute_metrics, compute_mean_metrics, compute_metrics_fast
import cfp.preprocessing as cfpp
import scanpy as sc
import gseapy as gp
import matplotlib.pyplot as plt
import gseapy as gp
import anndata as ad
import decoupler
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
split = 3

In [3]:
adata_train_path = f"/lustre/groups/ml01/workspace/ot_perturbation/data/combosciplex/adata_train_{split}.h5ad"
adata_test_path = f"/lustre/groups/ml01/workspace/ot_perturbation/data/combosciplex/adata_test_{split}.h5ad"
adata_ood_path = f"/lustre/groups/ml01/workspace/ot_perturbation/data/combosciplex/adata_ood_{split}.h5ad"

In [4]:
adata_train = sc.read(adata_train_path)
adata_test = sc.read(adata_test_path)
adata_ood = sc.read(adata_ood_path)


In [5]:
adata_ood.obs.head()

Unnamed: 0_level_0,sample,Size_Factor,n.umi,RT_well,Drug1,Drug2,Well,n_genes,n_genes_by_counts,total_counts,...,split,control,cell_type,cell_line,smiles_drug_1,smiles_drug_2,ood_1,ood_2,ood_3,ood_4
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01_A02_RT_BC_22_Lig_BC_17,sciPlex_theis,0.899999,2416,RT_22,control,Dacinostat,B10,1572,1569,2412.0,...,ood,0,A549,A549,,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)/C=...,not ood,not ood,control+Dacinostat,not ood
A01_A02_RT_BC_22_Lig_BC_26,sciPlex_theis,0.470861,1264,RT_22,control,Dacinostat,B10,902,901,1263.0,...,ood,0,A549,A549,,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)/C=...,not ood,not ood,control+Dacinostat,not ood
A01_A02_RT_BC_22_Lig_BC_27,sciPlex_theis,0.508485,1365,RT_22,control,Dacinostat,B10,934,934,1365.0,...,ood,0,A549,A549,,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)/C=...,not ood,not ood,control+Dacinostat,not ood
A01_A02_RT_BC_22_Lig_BC_64,sciPlex_theis,1.300827,3492,RT_22,control,Dacinostat,B10,2104,2102,3490.0,...,ood,0,A549,A549,,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)/C=...,not ood,not ood,control+Dacinostat,not ood
A01_A02_RT_BC_22_Lig_BC_84,sciPlex_theis,0.790852,2123,RT_22,control,Dacinostat,B10,1434,1432,2121.0,...,ood,0,A549,A549,,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)/C=...,not ood,not ood,control+Dacinostat,not ood


In [6]:
adata_ood.obs["condition"].cat.categories

Index(['Panobinostat+Crizotinib', 'Panobinostat+Curcumin',
       'Panobinostat+SRT1720', 'Panobinostat+Sorafenib',
       'SRT2104+Alvespimycin', 'control', 'control+Alvespimycin',
       'control+Dacinostat'],
      dtype='object')

In [7]:
adata_train.obs["condition"].cat.categories

Index(['Alvespimycin+Pirarubicin', 'Cediranib+PCI-34051',
       'Dacinostat+Danusertib', 'Dacinostat+Dasatinib', 'Dacinostat+PCI-34051',
       'Givinostat+Carmofur', 'Givinostat+Cediranib', 'Givinostat+Crizotinib',
       'Givinostat+Curcumin', 'Givinostat+Dasatinib', 'Givinostat+SRT1720',
       'Givinostat+SRT2104', 'Givinostat+Sorafenib', 'Givinostat+Tanespimycin',
       'Panobinostat+Alvespimycin', 'Panobinostat+Dasatinib',
       'Panobinostat+PCI-34051', 'Panobinostat+SRT2104',
       'Panobinostat+SRT3025', 'SRT3025+Cediranib', 'control',
       'control+Dasatinib', 'control+Givinostat', 'control+Panobinostat',
       'control+SRT2104'],
      dtype='object')

In [8]:
control = adata_ood[adata_ood.obs["condition"]=="control"].X.toarray()
rng = np.random.default_rng(0)
drug_effect = {}
for cond in adata_train.obs["condition"].cat.categories:
    drug_effect[cond] = rng.choice(adata_train[adata_train.obs["condition"]==cond].X.toarray(), 500) - control

drug_effect["control"] = control # note this is not a displacement vector, but the control vector
    

In [9]:
drug_effect_ood = {}

drug_effect_ood['Panobinostat+Crizotinib'] = (
    drug_effect['control'] 
    + drug_effect['Givinostat+Crizotinib'] 
    - drug_effect['control+Givinostat']
    + drug_effect['control+Panobinostat']
)

drug_effect_ood['Panobinostat+Curcumin'] = (
    drug_effect['control'] 
    + drug_effect['Givinostat+Curcumin'] 
    - drug_effect['control+Givinostat']
    + drug_effect['control+Panobinostat']
)

drug_effect_ood['Panobinostat+SRT1720'] = (
    drug_effect['control'] 
    + drug_effect['Givinostat+SRT1720'] 
    - drug_effect['control+Givinostat']
    + drug_effect['control+Panobinostat']
)

drug_effect_ood['Panobinostat+Sorafenib'] = (
    drug_effect['control'] 
    + drug_effect['Givinostat+Sorafenib'] 
    - drug_effect['control+Givinostat']
    + drug_effect['control+Panobinostat']
)

drug_effect_ood['SRT2104+Alvespimycin'] = (
    drug_effect['control'] 
    + drug_effect['Givinostat+SRT2104'] 
    - drug_effect['control+Givinostat']
    + drug_effect['Panobinostat+Alvespimycin']
    - drug_effect['control+Panobinostat']
)

drug_effect_ood['control+Alvespimycin'] = (
    drug_effect['control'] 
    + drug_effect['Panobinostat+Alvespimycin']
    - drug_effect['control+Panobinostat']
)

drug_effect_ood['control+Dacinostat'] = (
    drug_effect['control'] 
    + drug_effect['Dacinostat+Dasatinib']
    - drug_effect['control+Dasatinib']
)


In [10]:
import anndata as ad
import pandas as pd
all_data = []
conditions = []

for condition, array in drug_effect_ood.items():
    all_data.append(array)
    conditions.extend([condition] * array.shape[0])

# Stack all data vertically to create a single array
all_data_array = np.vstack(all_data)

# Create a DataFrame for the .obs attribute
obs_data = pd.DataFrame({
    'condition': conditions
})

# Create the Anndata object
adata_pred_ood = ad.AnnData(X=all_data_array, obs=obs_data)



In [11]:
adata_pred_ood = adata_pred_ood[adata_pred_ood.obs["condition"]!="control"]
adata_pred_ood.var_names = adata_ood.var_names
adata_pred_ood.obs["condition"] = adata_pred_ood.obs.apply(lambda x: x["condition"] + "_pred", axis=1)
adata_full = ad.concat((adata_train, adata_ood, adata_pred_ood))
if not Path("c2.cp.reactome.v7.5.1.symbols.gmt").is_file():
    !wget -O 'c2.cp.reactome.v7.5.1.symbols.gmt' https://figshare.com/ndownloader/files/35233771



In [12]:
import pandas as pd
def gmt_to_decoupler(pth: Path) -> pd.DataFrame:
    """
    Parse a gmt file to a decoupler pathway dataframe.
    """
    from itertools import chain, repeat

    pathways = {}

    with Path(pth).open("r") as f:
        for line in f:
            name, _, *genes = line.strip().split("\t")
            pathways[name] = genes

    return pd.DataFrame.from_records(
        chain.from_iterable(zip(repeat(k), v) for k, v in pathways.items()),
        columns=["geneset", "genesymbol"],
    )

reactome = gmt_to_decoupler("c2.cp.reactome.v7.5.1.symbols.gmt")
# Retrieving via python
msigdb = decoupler.get_resource("MSigDB")

# Get reactome pathways
reactome = msigdb.query("collection == 'reactome_pathways'")
# Filter duplicates
reactome = reactome[~reactome.duplicated(("geneset", "genesymbol"))]


  File "/home/icb/dominik.klein/mambaforge/envs/cfp/lib/python3.11/site-packages/urllib3/connectionpool.py", line 466, in _make_request
    self._validate_conn(conn)
  File "/home/icb/dominik.klein/mambaforge/envs/cfp/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1095, in _validate_conn
    conn.connect()
  File "/home/icb/dominik.klein/mambaforge/envs/cfp/lib/python3.11/site-packages/urllib3/connection.py", line 652, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/icb/dominik.klein/mambaforge/envs/cfp/lib/python3.11/site-packages/urllib3/connection.py", line 805, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "/home/icb/dominik.klein/mambaforge/envs/cfp/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 465, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostnam

In [13]:
sc.tl.rank_genes_groups(adata_full, "condition", method="t-test", key_added="t-test", reference="control")

ood_conditions = adata_ood.obs["condition"].cat.categories

pred_conds = [el+"_pred" for el in ood_conditions if el!="control"]
all_conds = list(ood_conditions) + pred_conds

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfo

In [14]:
import numpy as np
dict_t_stats = {}

for condition in all_conds:
    if condition=="control":
        continue
    t_stats = (
        # Get dataframe of DE results for condition vs. rest
        sc.get.rank_genes_groups_df(adata_full, group=condition,  key="t-test")
        .set_index("names")
        # Sort by absolute score
        .sort_values("scores", key=np.abs, ascending=False)
        # Format for decoupler
        [["scores"]]
    )
    dict_t_stats[condition] = t_stats

In [15]:
def get_mode(x):
    if "pred" in x["condition"]:
        return "ood_pred"
    if x["condition"] in ood_conditions:
        return "ood_true"
    return "seen"

In [16]:
pathways = ["REACTOME_APOPTOSIS",
            "REACTOME_ONCOGENIC_MAPK_SIGNALING",
            "REACTOME_CASPASE_ACTIVATION_VIA_EXTRINSIC_APOPTOTIC_SIGNALLING_PATHWAY",
            "REACTOME_CELL_CYCLE_MITOTIC",
             "REACTOME_CELL_CYCLE",
             "REACTOME_ANTIGEN_PROCESSING_CROSS_PRESENTATION",
             "REACTOME_INTERFERON_SIGNALING",
             "REACTOME_CLASS_I_MHC_MEDIATED_ANTIGEN_PROCESSING_PRESENTATION"
            ]

In [17]:
pathway_corrs = {}

for pathway in pathways:
    dict_gsea_results = {}

    gene_set = reactome[reactome.geneset==pathway]
    
    for condition, t_stats in dict_t_stats.items():
        scores, norm, pvals = decoupler.run_gsea(
            t_stats.T,
            gene_set,
            source="geneset",
            target="genesymbol",
            min_n=1
        )
        gsea_results = (
            pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
            .droplevel(level=1, axis=1)
            .sort_values("pval")
        )
        dict_gsea_results[condition] = gsea_results
    res_score = {}

    for cond, df in dict_gsea_results.items():
        res_score[cond] = df.loc[pathway]["score"]
    df = pd.DataFrame(res_score, index=[0]).T
    df["condition"] = df.index
    df["mode"] = df.apply(get_mode, axis=1)
    df["score"] = df[0]
    df_true = df[df["mode"]=="ood_true"]
    df_pred = df[df["mode"]=="ood_pred"]
    df_true["rank_true"] = df_true["score"].rank()
    df_pred["rank_pred"] = df_pred["score"].rank()
    df_pred["condition"] = df_pred.index
    df_pred["condition"] = df_pred.apply(lambda x: x["condition"].split('_', 1)[0], axis=1)
    df_pred = df_pred.set_index("condition")
    df_res = pd.concat((df_true, df_pred), axis=1)
    pathway_corrs[pathway] = df_res["rank_true"].corr(df_res["rank_pred"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true["rank_true"] = df_true["score"].rank()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred["rank_pred"] = df_pred["score"].rank()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred["condition"] = df_pred.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

In [18]:
pathway_corrs

{'REACTOME_APOPTOSIS': 0.7142857142857144,
 'REACTOME_ONCOGENIC_MAPK_SIGNALING': 0.8214285714285715,
 'REACTOME_CASPASE_ACTIVATION_VIA_EXTRINSIC_APOPTOTIC_SIGNALLING_PATHWAY': -0.25,
 'REACTOME_CELL_CYCLE_MITOTIC': -0.1785714285714286,
 'REACTOME_CELL_CYCLE': 0.3214285714285715,
 'REACTOME_ANTIGEN_PROCESSING_CROSS_PRESENTATION': 0.5714285714285715,
 'REACTOME_INTERFERON_SIGNALING': 0.5,
 'REACTOME_CLASS_I_MHC_MEDIATED_ANTIGEN_PROCESSING_PRESENTATION': -0.5357142857142858}

In [19]:
pd.DataFrame(pathway_corrs,index=["spearman_correlation"]).T.to_csv(f"add_spearman_corrs_{split}.csv")