In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded automatically and edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

**Additional dependencies to use gene splitting implemented in gears:** \
`pip install torch_geometric` \
`pip install cell-gears`

**Instructions to download dataset:**\
(mostly following [biolord documentation](https://github.com/nitzanlab/biolord_reproducibility))
1. Download the data from [here](https://dataverse.harvard.edu/api/access/datafile/6894431) to obtain `norman2019_gears.zip`
2. Run `unzip norman2019_gears.zip` to obtain the directory `norman2019` with files `essential_norman.pkl`, `go_essential_norman.csv` and `norman2019.tar.gz`
3. Run `tar -xzvf norman2019.tar.gz` to uncompress the tar file. This creates a directory `norman2019` with files `perturb_processed.h5ad`, `data_pyg/`, `data_pyg/cell_graphs.pkl`, `splits/`
4. Move (or copy) the extracted contents of the inner `norman2019` directory up one level into its parent directory, e.g. `<path>/norman2019/norman2019/data_pyg` should instead be `<path>/norman2019/data_pyg`

In [2]:
import os
import glob
import torch
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

from sklearn.model_selection import train_test_split
from gears import PertData
from cfp import preprocessing as cfpp

In [3]:
# Number of principal components: passed as `n_comps` to the PCA fitted on each
# training split and embedded in the names of the written .h5ad files.
N_PCA_COMPONENTS = 100

In [4]:
def rank_genes_groups_by_cov(
    adata,
    groupby,
    control_group,
    covariate,
    n_genes=50,
    rankby_abs=True,
    key_added="rank_genes_groups_cov",
    return_dict=False,
):
    """Rank differentially expressed genes per group, separately for each covariate category.

    For every unique value of ``adata.obs[covariate]`` the matching cells are
    extracted and ``sc.tl.rank_genes_groups`` is run on them with
    ``control_group`` as the reference. The ranked gene names of all groups are
    collected in one dictionary, which is stored in ``adata.uns[key_added]``.

    Parameters
    ----------
    adata
        Annotated data matrix; ``adata.uns[key_added]`` is written in place.
    groupby
        Name of the ``obs`` column that defines the groups to compare.
    control_group
        Value of ``groupby`` used as the reference group.
    covariate
        Name of the ``obs`` column whose categories are processed one by one.
    n_genes
        Number of top-ranked genes kept per group.
    rankby_abs
        Forwarded to ``sc.tl.rank_genes_groups`` (rank by absolute score).
    key_added
        Key under which the resulting dictionary is stored in ``adata.uns``.
    return_dict
        If ``True`` the dictionary is also returned.

    Returns
    -------
    dict or None
        Mapping ``group -> list of ranked gene names`` if ``return_dict`` is
        ``True``, otherwise ``None``.
    """
    gene_dict = {}
    for cov_cat in adata.obs[covariate].unique():
        # Work on an explicit copy of the subset: rank_genes_groups stores its
        # results in `.uns`, and writing to an AnnData *view* would otherwise
        # trigger an implicit copy together with a warning.
        adata_cov = adata[adata.obs[covariate] == cov_cat].copy()
        # compute DEGs of every group against the reference group
        sc.tl.rank_genes_groups(
            adata_cov,
            groupby=groupby,
            reference=control_group,
            rankby_abs=rankby_abs,
            n_genes=n_genes,
            use_raw=False,
        )
        # one column of ranked gene names per group
        de_genes = pd.DataFrame(adata_cov.uns["rank_genes_groups"]["names"])
        for group in de_genes:
            # NOTE(review): keys are the bare group names (no covariate prefix),
            # so with more than one covariate category a later category
            # overwrites the entry of an earlier one for the same group.
            gene_dict[group] = de_genes[group].tolist()
    adata.uns[key_added] = gene_dict
    if return_dict:
        return gene_dict


def get_DE_genes(adata, control_group="ctrl"):
    """Compute per-condition DE genes against the control condition and store them in ``adata.uns``.

    The ``obs`` table is converted to categorical columns (the ``groupby`` column
    used for the ranking has to be categorical), a boolean ``control`` flag is
    (re)written, and ``rank_genes_groups_by_cov`` is called with
    ``groupby="condition"`` and ``covariate="cell_line"``. The result ends up in
    ``adata.uns["rank_genes_groups_cov_all"]``.

    Parameters
    ----------
    adata
        Annotated data matrix with ``condition`` and ``cell_line`` columns in
        ``obs``. It is modified in place.
    control_group
        Label of the control condition in ``obs["condition"]``.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    # Control cells carry the label `control_group` ("ctrl") in `condition`. The
    # flag used to be derived by comparing against the literal "control", which
    # never matched and therefore marked every cell as non-control.
    is_control = (adata.obs["condition"] == control_group).to_numpy()

    obs = adata.obs.astype("category")
    # Keep the flag boolean (instead of casting it to a category as well) so that
    # it can be used directly as a row mask.
    obs["control"] = is_control
    adata.obs = obs

    rank_genes_groups_by_cov(
        adata,
        groupby="condition",
        covariate="cell_line",
        control_group=control_group,
        n_genes=50,
        key_added="rank_genes_groups_cov_all",
    )
    return adata

#### Data loading via GEARS

In [5]:
# NOTE: machine-specific location -- point this at the directory prepared
# according to the download instructions above.
data_dir = "/home/haicu/soeren.becker/repos/ot_pert_reproducibility/norman2019"
pert_data = PertData(data_dir, gene_set_path=os.path.join(data_dir, "essential_norman.pkl"))
pert_data.load(data_path=data_dir)

# Constant covariate column; it is used as the `covariate` when ranking DE genes.
pert_data.adata.obs.loc[:, "cell_line"] = "A549"

# "A+B" -> gene_1 = "A", gene_2 = "B". The plain "ctrl" condition contains no "+",
# so its gene_2 entry is missing until it is filled below.
pert_data.adata.obs.loc[:, ["gene_1", "gene_2"]] = pert_data.adata.obs.condition.str.split("+", expand=True).values
print(f"Found nans before fillna(): {pert_data.adata.obs.isna().any()}")
# Assign the filled column back explicitly: an inplace fillna on a column selected
# via attribute access is chained assignment, which pandas deprecates and which
# has no effect once copy-on-write is enabled.
pert_data.adata.obs["gene_2"] = pert_data.adata.obs["gene_2"].fillna("ctrl")
print(f"Found nans after fillna(): {pert_data.adata.obs.isna().any()}")

# Number of "ctrl" slots per cell -> perturbation category.
kategories = {2: "ctrl", 1: "single", 0: "double"}
pert_data.adata.obs.loc[:, "num_control"] = \
    (pert_data.adata.obs.loc[:, "gene_1"] == "ctrl").astype(int) + \
    (pert_data.adata.obs.loc[:, "gene_2"] == "ctrl").astype(int)
pert_data.adata.obs.loc[:, "kategory"] = pert_data.adata.obs.loc[:, "num_control"].map(kategories)
# A cell is a control cell iff both gene slots are "ctrl".
pert_data.adata.obs.loc[:, "control"] = (pert_data.adata.obs.loc[:, "gene_1"] == "ctrl") & (pert_data.adata.obs.loc[:, "gene_2"] == "ctrl")

display(pert_data.adata)
display(pert_data.adata.obs)

# are all gene targets of double knock-ins also contained in single knock-in conditions?
# (note that "ctrl" itself is part of the single-condition targets)
single_targets = np.unique(pert_data.adata.obs.loc[pert_data.adata.obs.kategory == "single", ["gene_1", "gene_2"]].values)
double_targets = np.unique(pert_data.adata.obs.loc[pert_data.adata.obs.kategory == "double", ["gene_1", "gene_2"]].values)

print(
    "Fraction of targets in single condition that also appear as combo target condition: " +\
    f"{np.isin(single_targets, double_targets).sum()}/{len(single_targets)}"
)

print(
    "Fraction of targets in combo condition that also appear as single target condition: " +\
    f"{np.isin(double_targets, single_targets).sum()}/{len(double_targets)}"
)

Found local copy...
These perturbations are not in the GO graph and their perturbation can thus not be predicted
['RHOXF2BB+ctrl' 'LYL1+IER5L' 'ctrl+IER5L' 'KIAA1804+ctrl' 'IER5L+ctrl'
 'RHOXF2BB+ZBTB25' 'RHOXF2BB+SET']
Local copy of pyg dataset is detected. Loading...
Done!


Found nans before fillna(): condition         False
cell_type         False
dose_val          False
control           False
condition_name    False
cell_line         False
gene_1            False
gene_2             True
dtype: bool
Found nans before fillna(): condition         False
cell_type         False
dose_val          False
control           False
condition_name    False
cell_line         False
gene_1            False
gene_2            False
dtype: bool


AnnData object with n_obs × n_vars = 89357 × 5045
    obs: 'condition', 'cell_type', 'dose_val', 'control', 'condition_name', 'cell_line', 'gene_1', 'gene_2', 'num_control', 'kategory'
    var: 'gene_name'
    uns: 'non_dropout_gene_idx', 'non_zeros_gene_idx', 'rank_genes_groups_cov_all', 'top_non_dropout_de_20', 'top_non_zero_de_20'
    layers: 'counts'

Unnamed: 0_level_0,condition,cell_type,dose_val,control,condition_name,cell_line,gene_1,gene_2,num_control,kategory
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAACCTGAGGCATGTG-1,TSC22D1+ctrl,A549,1+1,False,A549_TSC22D1+ctrl_1+1,A549,TSC22D1,ctrl,1,single
AAACCTGAGGCCCTTG-1,KLF1+MAP2K6,A549,1+1,False,A549_KLF1+MAP2K6_1+1,A549,KLF1,MAP2K6,0,double
AAACCTGCACGAAGCA-1,ctrl,A549,1,True,A549_ctrl_1,A549,ctrl,ctrl,2,ctrl
AAACCTGCAGACGTAG-1,CEBPE+RUNX1T1,A549,1+1,False,A549_CEBPE+RUNX1T1_1+1,A549,CEBPE,RUNX1T1,0,double
AAACCTGCAGCCTTGG-1,MAML2+ctrl,A549,1+1,False,A549_MAML2+ctrl_1+1,A549,MAML2,ctrl,1,single
...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTCAGAATA-8,ctrl,A549,1,True,A549_ctrl_1,A549,ctrl,ctrl,2,ctrl
TTTGTCATCAGTACGT-8,FOXA3+ctrl,A549,1+1,False,A549_FOXA3+ctrl_1+1,A549,FOXA3,ctrl,1,single
TTTGTCATCCACTCCA-8,CELF2+ctrl,A549,1+1,False,A549_CELF2+ctrl_1+1,A549,CELF2,ctrl,1,single
TTTGTCATCCCAACGG-8,BCORL1+ctrl,A549,1+1,False,A549_BCORL1+ctrl_1+1,A549,BCORL1,ctrl,1,single


Fraction of targets in single condition that also appear as combo target condition: 71/103
Fraction of targets in combo condition that also appear as single target condition: 71/71


#### Add ESM2 embeddings to adata.uns

In [6]:
# Specify the folder path
folder_path = '/lustre/groups/ml01/workspace/ot_perturbation/pert_embeddings/norman/norman/'

# Use glob to get all .pt files in the folder
pt_files = glob.glob(folder_path + '*.pt')
print(f"Found {len(pt_files)} files.")

ensembleID_to_esm2_embedding = {}

for path in pt_files:
    # The Ensembl ID is the second "_"-separated token of the file name. Only the
    # base name is parsed, so underscores in the directory part of the path
    # cannot shift the token index.
    gene = os.path.basename(path).split('_')[1]
    # NOTE(review): torch.load unpickles the file -- only load trusted embedding files.
    # The tensors are converted with .numpy() below, so they are mapped to the CPU here.
    loaded = torch.load(path, map_location="cpu")
    ensembleID_to_esm2_embedding[gene] = loaded['mean_representations'][36]

adata = pert_data.adata
perturbed_genes = np.unique(adata.obs.loc[:, ["gene_1", "gene_2"]].values)
# adata.var is indexed by the IDs used as keys of the embedding dict and carries
# the gene symbol in `gene_name`.
_mapping = adata.var.loc[adata.var.gene_name.isin(perturbed_genes)]
gene_name_to_ensemble_id = dict(zip(_mapping.gene_name, _mapping.index))

adata.obs.loc[:, ["gene_1", "gene_2"]] = adata.obs.condition.str.split("+", expand=True).values
# Explicit assignment instead of a chained inplace fillna (deprecated in pandas
# and without effect under copy-on-write).
adata.obs["gene_1"] = adata.obs["gene_1"].fillna("ctrl")
adata.obs["gene_2"] = adata.obs["gene_2"].fillna("ctrl")

gene_name_to_embedding = {}

for gene_name in perturbed_genes:
    if gene_name in ["ctrl", "control"]:
        # add after loop when we know embedding dim
        continue
    ensemble_id = gene_name_to_ensemble_id[gene_name]
    esm2_embedding = ensembleID_to_esm2_embedding[ensemble_id]
    gene_name_to_embedding[gene_name] = esm2_embedding.numpy()

if not gene_name_to_embedding:
    raise ValueError("No ESM2 embedding was assigned, cannot infer the embedding dimension for 'ctrl'.")
# The control gets an all-zero vector with the shape and dtype of the real
# embeddings. The template is taken from the dictionary rather than from the
# variable left over by the loop above.
embedding_template = next(iter(gene_name_to_embedding.values()))
gene_name_to_embedding["ctrl"] = np.zeros_like(embedding_template)

print(f"Added esm2 embeddings for {len(gene_name_to_embedding.keys())} genes.")
# Single quotes inside the f-string: reusing the outer quote character is a
# SyntaxError before Python 3.12.
print(f"esm2 embeddings have shape {gene_name_to_embedding['ctrl'].shape}")

adata.uns['esm2'] = gene_name_to_embedding

# Copy the counts before normalising: X would otherwise be the very same object as
# layers["counts"], and in-place normalisation / log1p of X could alter the raw
# counts layer as well.
adata.X = adata.layers["counts"].copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, inplace=True, n_top_genes=2000, subset=True)

get_DE_genes(adata) # updates adata in-place
assert "rank_genes_groups_cov_all" in adata.uns.keys()

Found 105 files.
Added esm2 embeddings for 103 genes.
esm2 embeddings have shape (2560,)


In [7]:
# Disabled sanity check (flip the condition to run it): compares the set of
# perturbation targets in the GEARS-loaded data with the Norman 2019 dataset
# shipped with pertpy.
if False:
    # Optionally check dataset against version available in pertpy
    import pertpy as pt
    adata_pertpy = pt.dt.norman_2019()
    # NOTE(review): a bare expression inside an `if` block is not displayed by the notebook.
    adata_pertpy
    
    # NOTE(review): changes the pandas display option for the rest of the session.
    pd.options.display.max_columns = None
    display(adata_pertpy.obs)
    
    # Unique target names in pertpy; a missing second target is named "control".
    conditions_pertpy = np.unique(adata_pertpy.obs.loc[:, "perturbation_name"].str.split("+", expand=True).fillna("control").values)
    print(len(conditions_pertpy))
    print(conditions_pertpy)
    
    # Same for the GEARS data, with "ctrl" renamed to "control" so both sets are comparable.
    adata_biolord = pert_data.adata.obs.loc[:, "condition"].str.split("+", expand=True).fillna("control")
    adata_biolord.loc[:, 0] = adata_biolord.loc[:, 0].apply(lambda x: "control" if x == "ctrl" else x)
    adata_biolord.loc[:, 1] = adata_biolord.loc[:, 1].apply(lambda x: "control" if x == "ctrl" else x)
    
    conditions_biolord = np.unique(adata_biolord.values)
    print(len(conditions_biolord))
    print(conditions_biolord)
    
    # How many pertpy targets also occur in the GEARS data ...
    pertpy_condition_is_in_biolord = np.isin(conditions_pertpy, conditions_biolord)
    print(f"{pertpy_condition_is_in_biolord.sum()} / {len(conditions_pertpy)}")
    
    # ... and vice versa.
    biolord_condition_is_in_pertpy = np.isin(conditions_biolord, conditions_pertpy)
    print(f"{biolord_condition_is_in_pertpy.sum()} / {len(conditions_biolord)}")
    
    # Targets that exist only in the pertpy version.
    print(sorted(conditions_pertpy[~pertpy_condition_is_in_biolord]))

### Create data splits

In [8]:
def verify_split_integrity(pert_data: PertData, split_dict: dict):
    """Assert that the test subgroups of a GEARS split are consistent with its train/val genes.

    The genes occurring in ``split_dict["train"]`` and ``split_dict["val"]`` are
    collected (``"ctrl"`` counts as a gene here), and every test condition listed
    in ``pert_data.subgroup["test_subgroup"]`` is checked against them:

    * ``combo_seen0``: neither gene occurs in train/val,
    * ``combo_seen1``: exactly one of the two genes occurs in train/val,
    * ``combo_seen2``: both genes occur in train/val,
    * ``unseen_single``: the perturbed gene does not occur in train/val.

    The per-category tables are displayed, gene counts are printed, and an
    ``AssertionError`` is raised by the first check that fails.
    """

    def _genes_in(conditions) -> np.ndarray:
        # Sorted unique gene names of a list of "A+B" conditions; conditions
        # without a second gene get "ctrl" as the second entry.
        frame = pd.DataFrame(conditions).rename({0: "target_genes"}, axis=1)
        frame.loc[:, ["gene1", "gene2"]] = frame.target_genes.str.split("+", expand=True).values
        frame.fillna("ctrl", inplace=True)
        return np.unique(frame.gene1.unique().tolist() + frame.gene2.unique().tolist())

    def _subgroup_table(subgroup: str) -> pd.DataFrame:
        # One row per condition of the subgroup, labelled with its category and
        # split into gene1/gene2, with "ctrl" always moved into the gene2 slot.
        members = pert_data.subgroup[subgroup]
        parts = []
        for key, label in (
            ("combo_seen0", "combo_seen0"),
            ("combo_seen1", "combo_seen1"),
            ("combo_seen2", "combo_seen2"),
            ("unseen_single", "single1"),
        ):
            part = pd.DataFrame.from_dict(members[key])
            part.loc[:, "kategory"] = label
            parts.append(part)
        table = pd.concat(parts, axis=0, ignore_index=True)
        table.rename({0: "target_genes"}, axis=1, inplace=True)
        table.loc[:, ["gene1", "gene2"]] = table.target_genes.str.split("+", expand=True).loc[:, [0, 1]].values
        ctrl_first = (table.gene1 == "ctrl") & (table.gene2 != "ctrl")
        table.loc[ctrl_first, "gene1"] = table.loc[ctrl_first, "gene2"]
        table.loc[ctrl_first, "gene2"] = "ctrl"
        return table

    train_genes = _genes_in(split_dict["train"])
    print(f"Found {len(train_genes)} unique genes in train split")

    val_genes = _genes_in(split_dict["val"])
    print(f"Found {len(val_genes)} unique genes in val split")

    train_and_val_genes = np.union1d(train_genes, val_genes)
    print(f"Found {len(train_and_val_genes)} unique genes in train + val split")

    df_test = _subgroup_table("test_subgroup")
    df_test.loc[:, "gene1_is_in_train_or_val"] = np.isin(df_test.gene1.values, train_and_val_genes)
    df_test.loc[:, "gene2_is_in_train_or_val"] = np.isin(df_test.gene2.values, train_and_val_genes)

    def _rows(kategory: str) -> pd.DataFrame:
        # Rows of the test table that belong to one category.
        return df_test.loc[df_test.kategory == kategory]

    # combo_seen0: neither of the two genes is known from train or val
    # (neither from single perturbations nor from combinations).
    print("combo_seen0")
    seen0 = _rows("combo_seen0")
    display(seen0)
    assert not (seen0.gene1_is_in_train_or_val | seen0.gene2_is_in_train_or_val).any()

    # combo_seen1: exactly one of the two genes is known from train or val.
    seen1 = _rows("combo_seen1")
    display(seen1)
    assert (seen1.gene1_is_in_train_or_val ^ seen1.gene2_is_in_train_or_val).all()

    # combo_seen2: both genes are known from train or val.
    seen2 = _rows("combo_seen2")
    display(seen2)
    assert (seen2.gene1_is_in_train_or_val & seen2.gene2_is_in_train_or_val).all()

    # single perturbations: the perturbed gene must not be known from train or val.
    singles = _rows("single1")
    display(singles)
    assert not singles.gene1_is_in_train_or_val.any()
    print("All assertions passed.")

In [9]:
# Rebuild the boolean `control` flag from `num_control` before the control cells
# are selected for splitting, as a guard against the flag having been altered by
# the preceding preprocessing. A cell is a control cell iff both gene slots are
# "ctrl", i.e. num_control == 2.
pert_data.adata.obs.loc[:, "control"] = pert_data.adata.obs.loc[:, "num_control"] == 2

In [10]:
# Everything produced by this cell is written below the dataset directory.
OUTPUT_DIR = os.path.join(data_dir, "./norman_preprocessed_adata")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist the complete, unsplit dataset.
pert_data.adata.write_h5ad(os.path.join(OUTPUT_DIR, "adata_all.h5ad"))

# Control cells are divided once and reused for every split: one part goes to
# train, the held-out part is shared by val and test.
num_control_cells_in_test = 500
train_idcs_ctrl, test_and_val_idcs_ctrl = train_test_split(
    pert_data.adata.obs.loc[pert_data.adata.obs.control].index,
    test_size=num_control_cells_in_test,
    random_state=2024,
    shuffle=True,
)

# Five splits, generated with seeds 1..5 and stored under split indices 0..4.
for split_idx in range(5):
    seed = split_idx + 1

    # The splitting follows the biolord reproducibility notebook:
    # https://github.com/nitzanlab/biolord_reproducibility?tab=readme-ov-file
    pert_data.prepare_split(split="simulation", seed=seed)

    # Location of the split file; adjust it according to the output of
    # pert_data.prepare_split() if necessary.
    path_to_splits = os.path.join(data_dir, f"splits/norman2019_simulation_{seed}_0.75.pkl")
    split_dict = pd.read_pickle(path_to_splits)
    verify_split_integrity(pert_data, split_dict)

    # Perturbed conditions per split; "ctrl" is dropped because control cells
    # are assigned through the index split above.
    train_conditions = [c for c in split_dict["train"] if c != "ctrl"]
    val_conditions = [c for c in split_dict["val"] if c != "ctrl"]
    test_conditions = [c for c in split_dict["test"] if c != "ctrl"]

    # No condition may belong to more than one split.
    assert not set(train_conditions) & set(val_conditions)
    assert not set(train_conditions) & set(test_conditions)
    assert not set(val_conditions) & set(test_conditions)

    _obs = pert_data.adata.obs
    _is_train_ctrl = _obs.index.isin(train_idcs_ctrl)
    _is_heldout_ctrl = _obs.index.isin(test_and_val_idcs_ctrl)
    train_mask = _obs.condition.isin(train_conditions) | _is_train_ctrl
    val_mask = _obs.condition.isin(val_conditions) | _is_heldout_ctrl
    test_mask = _obs.condition.isin(test_conditions) | _is_heldout_ctrl

    adata_train = pert_data.adata[train_mask].to_memory()
    adata_val = pert_data.adata[val_mask].to_memory()
    adata_test = pert_data.adata[test_mask].to_memory()

    for _label, _subset in (
        ("all", pert_data.adata),
        ("train", adata_train),
        ("val", adata_val),
        ("test", adata_test),
    ):
        print(_label, _subset.shape)

    # The PCA is computed on the training cells only; val and test are projected onto it.
    cfpp.centered_pca(adata_train, n_comps=N_PCA_COMPONENTS)
    for _query in (adata_val, adata_test):
        cfpp.project_pca(query_adata=_query, ref_adata=adata_train)

    adata_train.write_h5ad(os.path.join(OUTPUT_DIR, f"adata_train_pca_{N_PCA_COMPONENTS}_split_{split_idx}.h5ad"))
    adata_val.write_h5ad(os.path.join(OUTPUT_DIR, f"adata_val_pca_{N_PCA_COMPONENTS}_split_{split_idx}.h5ad"))
    adata_test.write_h5ad(os.path.join(OUTPUT_DIR, f"adata_test_pca_{N_PCA_COMPONENTS}_split_{split_idx}.h5ad"))

Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:9
combo_seen1:43
combo_seen2:19
unseen_single:36
Done!


here1
Found 69 unique genes in train split
Found 27 unique genes in val split
Found 77 unique genes in train + val split
combo_seen0


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
0,CBL+PTPN9,combo_seen0,CBL,PTPN9,False,False
1,DUSP9+PRTG,combo_seen0,DUSP9,PRTG,False,False
2,FOXL2+MEIS1,combo_seen0,FOXL2,MEIS1,False,False
3,CEBPB+OSR2,combo_seen0,CEBPB,OSR2,False,False
4,JUN+CEBPB,combo_seen0,JUN,CEBPB,False,False
5,CDKN1C+CDKN1B,combo_seen0,CDKN1C,CDKN1B,False,False
6,CDKN1C+CDKN1A,combo_seen0,CDKN1C,CDKN1A,False,False
7,CDKN1B+CDKN1A,combo_seen0,CDKN1B,CDKN1A,False,False
8,C3orf72+FOXL2,combo_seen0,C3orf72,FOXL2,False,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
9,MAP2K6+SPI1,combo_seen1,MAP2K6,SPI1,True,False
10,DUSP9+MAPK1,combo_seen1,DUSP9,MAPK1,False,True
11,UBASH3B+OSR2,combo_seen1,UBASH3B,OSR2,True,False
12,DUSP9+ETS2,combo_seen1,DUSP9,ETS2,False,True
13,ZNF318+FOXL2,combo_seen1,ZNF318,FOXL2,True,False
14,UBASH3B+PTPN9,combo_seen1,UBASH3B,PTPN9,True,False
15,JUN+CEBPA,combo_seen1,JUN,CEBPA,False,True
16,MAPK1+PRTG,combo_seen1,MAPK1,PRTG,True,False
17,KLF1+COL2A1,combo_seen1,KLF1,COL2A1,True,False
18,PTPN12+OSR2,combo_seen1,PTPN12,OSR2,True,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
52,AHR+KLF1,combo_seen2,AHR,KLF1,True,True
53,CEBPE+CNN1,combo_seen2,CEBPE,CNN1,True,True
54,CEBPE+KLF1,combo_seen2,CEBPE,KLF1,True,True
55,CNN1+MAPK1,combo_seen2,CNN1,MAPK1,True,True
56,ETS2+CEBPE,combo_seen2,ETS2,CEBPE,True,True
57,ETS2+CNN1,combo_seen2,ETS2,CNN1,True,True
58,ETS2+MAPK1,combo_seen2,ETS2,MAPK1,True,True
59,FEV+ISL2,combo_seen2,FEV,ISL2,True,True
60,FOSB+CEBPE,combo_seen2,FOSB,CEBPE,True,True
61,FOXA1+HOXB9,combo_seen2,FOXA1,HOXB9,True,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
71,DUSP9+ctrl,single1,DUSP9,ctrl,False,True
72,BCORL1+ctrl,single1,BCORL1,ctrl,False,True
73,MEIS1+ctrl,single1,MEIS1,ctrl,False,True
74,CBL+ctrl,single1,CBL,ctrl,False,True
75,SLC4A1+ctrl,single1,SLC4A1,ctrl,False,True
76,COL2A1+ctrl,single1,COL2A1,ctrl,False,True
77,S1PR2+ctrl,single1,S1PR2,ctrl,False,True
78,CELF2+ctrl,single1,CELF2,ctrl,False,True
79,CDKN1A+ctrl,single1,CDKN1A,ctrl,False,True
80,ctrl+MEIS1,single1,MEIS1,ctrl,False,True


All assertions passed.
all (89357, 2000)
train (49349, 2000)
val (11254, 2000)
test (29254, 2000)


Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:12
combo_seen1:52
combo_seen2:16
unseen_single:37
Done!


here1
Found 69 unique genes in train split
Found 22 unique genes in val split
Found 77 unique genes in train + val split
combo_seen0


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
0,MAP2K3+SLC38A2,combo_seen0,MAP2K3,SLC38A2,False,False
1,MAPK1+PRTG,combo_seen0,MAPK1,PRTG,False,False
2,MAP2K3+IKZF3,combo_seen0,MAP2K3,IKZF3,False,False
3,MAPK1+IKZF3,combo_seen0,MAPK1,IKZF3,False,False
4,ETS2+CNN1,combo_seen0,ETS2,CNN1,False,False
5,CNN1+MAPK1,combo_seen0,CNN1,MAPK1,False,False
6,CNN1+UBASH3A,combo_seen0,CNN1,UBASH3A,False,False
7,ETS2+MAPK1,combo_seen0,ETS2,MAPK1,False,False
8,FOXA1+FOXL2,combo_seen0,FOXA1,FOXL2,False,False
9,ETS2+PRTG,combo_seen0,ETS2,PRTG,False,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
12,TGFBR2+ETS2,combo_seen1,TGFBR2,ETS2,True,False
13,SGK1+TBX3,combo_seen1,SGK1,TBX3,False,True
14,FOXA3+FOXA1,combo_seen1,FOXA3,FOXA1,True,False
15,ETS2+IGDCC3,combo_seen1,ETS2,IGDCC3,False,True
16,MAP2K6+IKZF3,combo_seen1,MAP2K6,IKZF3,True,False
17,KLF1+FOXA1,combo_seen1,KLF1,FOXA1,True,False
18,DUSP9+MAPK1,combo_seen1,DUSP9,MAPK1,True,False
19,DUSP9+ETS2,combo_seen1,DUSP9,ETS2,True,False
20,ZNF318+FOXL2,combo_seen1,ZNF318,FOXL2,True,False
21,KLF1+BAK1,combo_seen1,KLF1,BAK1,True,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
64,CDKN1B+CDKN1A,combo_seen2,CDKN1B,CDKN1A,True,True
65,CDKN1C+CDKN1A,combo_seen2,CDKN1C,CDKN1A,True,True
66,CEBPB+PTPN12,combo_seen2,CEBPB,PTPN12,True,True
67,CEBPE+RUNX1T1,combo_seen2,CEBPE,RUNX1T1,True,True
68,DUSP9+KLF1,combo_seen2,DUSP9,KLF1,True,True
69,FEV+MAP7D1,combo_seen2,FEV,MAP7D1,True,True
70,FOXA3+FOXF1,combo_seen2,FOXA3,FOXF1,True,True
71,KLF1+TGFBR2,combo_seen2,KLF1,TGFBR2,True,True
72,MAP2K6+ELMSAN1,combo_seen2,MAP2K6,ELMSAN1,True,True
73,PTPN12+PTPN9,combo_seen2,PTPN12,PTPN9,True,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
80,ctrl+FOXA1,single1,FOXA1,ctrl,False,True
81,GLB1L2+ctrl,single1,GLB1L2,ctrl,False,True
82,BAK1+ctrl,single1,BAK1,ctrl,False,True
83,ctrl+ETS2,single1,ETS2,ctrl,False,True
84,SLC6A9+ctrl,single1,SLC6A9,ctrl,False,True
85,HES7+ctrl,single1,HES7,ctrl,False,True
86,FOXO4+ctrl,single1,FOXO4,ctrl,False,True
87,ctrl+CNN1,single1,CNN1,ctrl,False,True
88,MAP2K3+ctrl,single1,MAP2K3,ctrl,False,True
89,FOSB+ctrl,single1,FOSB,ctrl,False,True


All assertions passed.
all (89357, 2000)
train (50535, 2000)
val (4550, 2000)
test (34772, 2000)


Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:4
combo_seen1:51
combo_seen2:19
unseen_single:35
Done!


here1
Found 69 unique genes in train split
Found 27 unique genes in val split
Found 77 unique genes in train + val split
combo_seen0


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
0,UBASH3B+PTPN12,combo_seen0,UBASH3B,PTPN12,False,False
1,MAPK1+PRTG,combo_seen0,MAPK1,PRTG,False,False
2,CNN1+MAPK1,combo_seen0,CNN1,MAPK1,False,False
3,UBASH3B+CNN1,combo_seen0,UBASH3B,CNN1,False,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
4,DUSP9+MAPK1,combo_seen1,DUSP9,MAPK1,True,False
5,UBASH3B+OSR2,combo_seen1,UBASH3B,OSR2,False,True
6,UBASH3B+ZBTB25,combo_seen1,UBASH3B,ZBTB25,False,True
7,SET+CEBPE,combo_seen1,SET,CEBPE,False,True
8,UBASH3B+PTPN9,combo_seen1,UBASH3B,PTPN9,False,True
9,IGDCC3+MAPK1,combo_seen1,IGDCC3,MAPK1,True,False
10,UBASH3B+UBASH3A,combo_seen1,UBASH3B,UBASH3A,False,True
11,PTPN12+OSR2,combo_seen1,PTPN12,OSR2,False,True
12,DUSP9+PRTG,combo_seen1,DUSP9,PRTG,True,False
13,MAPK1+IKZF3,combo_seen1,MAPK1,IKZF3,False,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
55,BCL2L11+BAK1,combo_seen2,BCL2L11,BAK1,True,True
56,BPGM+ZBTB1,combo_seen2,BPGM,ZBTB1,True,True
57,CEBPB+OSR2,combo_seen2,CEBPB,OSR2,True,True
58,CEBPE+RUNX1T1,combo_seen2,CEBPE,RUNX1T1,True,True
59,DUSP9+IGDCC3,combo_seen2,DUSP9,IGDCC3,True,True
60,DUSP9+SNAI1,combo_seen2,DUSP9,SNAI1,True,True
61,ETS2+CEBPE,combo_seen2,ETS2,CEBPE,True,True
62,ETS2+IGDCC3,combo_seen2,ETS2,IGDCC3,True,True
63,FEV+ISL2,combo_seen2,FEV,ISL2,True,True
64,FOSB+CEBPB,combo_seen2,FOSB,CEBPB,True,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
74,GLB1L2+ctrl,single1,GLB1L2,ctrl,False,True
75,ctrl+SET,single1,SET,ctrl,False,True
76,UBASH3B+ctrl,single1,UBASH3B,ctrl,False,True
77,CBFA2T3+ctrl,single1,CBFA2T3,ctrl,False,True
78,AHR+ctrl,single1,AHR,ctrl,False,True
79,FOXO4+ctrl,single1,FOXO4,ctrl,False,True
80,ctrl+CBFA2T3,single1,CBFA2T3,ctrl,False,True
81,ctrl+CNN1,single1,CNN1,ctrl,False,True
82,MAP4K3+ctrl,single1,MAP4K3,ctrl,False,True
83,PTPN12+ctrl,single1,PTPN12,ctrl,False,True


All assertions passed.
all (89357, 2000)
train (47615, 2000)
val (11472, 2000)
test (30770, 2000)


Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:4
combo_seen1:52
combo_seen2:18
unseen_single:37
Done!


here1
Found 69 unique genes in train split
Found 23 unique genes in val split
Found 77 unique genes in train + val split
combo_seen0


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
0,TGFBR2+IGDCC3,combo_seen0,TGFBR2,IGDCC3,False,False
1,FOXF1+HOXB9,combo_seen0,FOXF1,HOXB9,False,False
2,TGFBR2+PRTG,combo_seen0,TGFBR2,PRTG,False,False
3,IGDCC3+PRTG,combo_seen0,IGDCC3,PRTG,False,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
4,TGFBR2+ETS2,combo_seen1,TGFBR2,ETS2,False,True
5,MAP2K6+SPI1,combo_seen1,MAP2K6,SPI1,True,False
6,ETS2+IGDCC3,combo_seen1,ETS2,IGDCC3,True,False
7,MAP2K3+SLC38A2,combo_seen1,MAP2K3,SLC38A2,False,True
8,SET+CEBPE,combo_seen1,SET,CEBPE,False,True
9,IGDCC3+ZBTB25,combo_seen1,IGDCC3,ZBTB25,False,True
10,IGDCC3+MAPK1,combo_seen1,IGDCC3,MAPK1,False,True
11,JUN+CEBPA,combo_seen1,JUN,CEBPA,False,True
12,ZC3HAV1+CEBPE,combo_seen1,ZC3HAV1,CEBPE,False,True
13,UBASH3B+UBASH3A,combo_seen1,UBASH3B,UBASH3A,True,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
56,BPGM+SAMD1,combo_seen2,BPGM,SAMD1,True,True
57,CBL+CNN1,combo_seen2,CBL,CNN1,True,True
58,CDKN1C+CDKN1B,combo_seen2,CDKN1C,CDKN1B,True,True
59,FOSB+PTPN12,combo_seen2,FOSB,PTPN12,True,True
60,FOXA1+FOXL2,combo_seen2,FOXA1,FOXL2,True,True
61,FOXA3+FOXL2,combo_seen2,FOXA3,FOXL2,True,True
62,KLF1+BAK1,combo_seen2,KLF1,BAK1,True,True
63,KLF1+COL2A1,combo_seen2,KLF1,COL2A1,True,True
64,KLF1+MAP2K6,combo_seen2,KLF1,MAP2K6,True,True
65,POU3F2+CBFA2T3,combo_seen2,POU3F2,CBFA2T3,True,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
74,MAML2+ctrl,single1,MAML2,ctrl,False,True
75,ctrl+SET,single1,SET,ctrl,False,True
76,LHX1+ctrl,single1,LHX1,ctrl,False,True
77,AHR+ctrl,single1,AHR,ctrl,False,True
78,FOXO4+ctrl,single1,FOXO4,ctrl,False,True
79,MAP2K3+ctrl,single1,MAP2K3,ctrl,False,True
80,PTPN1+ctrl,single1,PTPN1,ctrl,False,True
81,HOXB9+ctrl,single1,HOXB9,ctrl,False,True
82,ctrl+HOXB9,single1,HOXB9,ctrl,False,True
83,CLDN6+ctrl,single1,CLDN6,ctrl,False,True


All assertions passed.
all (89357, 2000)
train (51692, 2000)
val (8205, 2000)
test (29960, 2000)


Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:3
combo_seen1:56
combo_seen2:18
unseen_single:40
Done!


here1
Found 69 unique genes in train split
Found 26 unique genes in val split
Found 77 unique genes in train + val split
combo_seen0


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
0,BPGM+SAMD1,combo_seen0,BPGM,SAMD1,False,False
1,CEBPB+CEBPA,combo_seen0,CEBPB,CEBPA,False,False
2,KLF1+CEBPA,combo_seen0,KLF1,CEBPA,False,False


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
3,KLF1+MAP2K6,combo_seen1,KLF1,MAP2K6,False,True
4,CEBPE+RUNX1T1,combo_seen1,CEBPE,RUNX1T1,True,False
5,CBL+PTPN9,combo_seen1,CBL,PTPN9,True,False
6,MAP2K6+IKZF3,combo_seen1,MAP2K6,IKZF3,True,False
7,KLF1+FOXA1,combo_seen1,KLF1,FOXA1,False,True
8,CEBPE+KLF1,combo_seen1,CEBPE,KLF1,True,False
9,UBASH3B+ZBTB25,combo_seen1,UBASH3B,ZBTB25,True,False
10,UBASH3B+PTPN9,combo_seen1,UBASH3B,PTPN9,True,False
11,IGDCC3+ZBTB25,combo_seen1,IGDCC3,ZBTB25,True,False
12,KLF1+BAK1,combo_seen1,KLF1,BAK1,False,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
59,CDKN1B+CDKN1A,combo_seen2,CDKN1B,CDKN1A,True,True
60,CDKN1C+CDKN1A,combo_seen2,CDKN1C,CDKN1A,True,True
61,CEBPE+SPI1,combo_seen2,CEBPE,SPI1,True,True
62,CNN1+MAPK1,combo_seen2,CNN1,MAPK1,True,True
63,DUSP9+IGDCC3,combo_seen2,DUSP9,IGDCC3,True,True
64,DUSP9+MAPK1,combo_seen2,DUSP9,MAPK1,True,True
65,FOSB+OSR2,combo_seen2,FOSB,OSR2,True,True
66,FOXA1+FOXF1,combo_seen2,FOXA1,FOXF1,True,True
67,IRF1+SET,combo_seen2,IRF1,SET,True,True
68,MAPK1+PRTG,combo_seen2,MAPK1,PRTG,True,True


Unnamed: 0,target_genes,kategory,gene1,gene2,gene1_is_in_train_or_val,gene2_is_in_train_or_val
77,TSC22D1+ctrl,single1,TSC22D1,ctrl,False,True
78,ctrl+ELMSAN1,single1,ELMSAN1,ctrl,False,True
79,KLF1+ctrl,single1,KLF1,ctrl,False,True
80,ctrl+KLF1,single1,KLF1,ctrl,False,True
81,RREB1+ctrl,single1,RREB1,ctrl,False,True
82,ctrl+ZBTB25,single1,ZBTB25,ctrl,False,True
83,MAP4K5+ctrl,single1,MAP4K5,ctrl,False,True
84,MIDN+ctrl,single1,MIDN,ctrl,False,True
85,DLX2+ctrl,single1,DLX2,ctrl,False,True
86,ctrl+RUNX1T1,single1,RUNX1T1,ctrl,False,True


All assertions passed.
all (89357, 2000)
train (47694, 2000)
val (8568, 2000)
test (33595, 2000)
