In [None]:
import random
import pandas as pd
import pickle
from math import ceil

# Synthetic Lethality

In [None]:

# Shared configuration; the later dataset cells of this notebook read these names too.
NEG_MULT    = 10      # negative pairs sampled per positive pair
OUTER_FOLDS = 3       # number of gene-level outer (train/test) folds
INNER_FOLDS = 3       # number of gene-level inner (train/validation) folds per outer fold
SEED        = 42      # seed for Python's `random` module

# NOTE(review): the sampling / splitting logic in this cell is repeated almost verbatim
# in the other dataset cells of this notebook; a shared helper would remove the duplication.

random.seed(SEED)

# Load the BioGRID Homo_sapiens table and keep genetic interactions whose
# interactors both carry Organism ID 9606.
data = pd.read_csv(
    '/data/paired/BIOGRID-ORGANISM-Homo_sapiens-4.4.240.tab3.txt',
    sep='\t'
)
data = data[
    (data['Organism ID Interactor A'] == 9606) &
    (data['Organism ID Interactor B'] == 9606) &
    (data['Experimental System Type'] == 'genetic')
]
synthetic = data[data['Experimental System'] == 'Synthetic Lethality']

# Positive pairs: unordered (sorted) tuples of Entrez IDs rendered as strings.
# Column-wise zip avoids building a Series per row as `iterrows()` does.
positive_pairs = {
    tuple(sorted((str(a), str(b))))
    for a, b in zip(synthetic['Entrez Gene Interactor A'],
                    synthetic['Entrez Gene Interactor B'])
}
print(f"Total positive pairs: {len(positive_pairs)}")

genes       = sorted({g for pair in positive_pairs for g in pair})
neg_pairs   = set()
target_neg  = NEG_MULT * len(positive_pairs)

# Rejection sampling below can only finish if enough non-positive pairs exist;
# fail loudly instead of spinning forever. Self-pairs (a == a) can never be
# drawn by `random.sample(genes, 2)`, so they do not reduce the pool.
max_neg = len(genes) * (len(genes) - 1) // 2 - sum(1 for a, b in positive_pairs if a != b)
if target_neg > max_neg:
    raise ValueError(
        f"Cannot sample {target_neg} negative pairs: only {max_neg} "
        f"non-positive pairs exist among {len(genes)} genes"
    )

# Negatives: random gene pairs (drawn from genes seen in positives) that are not positives.
while len(neg_pairs) < target_neg:
    g1, g2 = sorted(random.sample(genes, 2))
    if (g1, g2) not in positive_pairs:
        neg_pairs.add((g1, g2))
print(f"Total negative pairs sampled: {len(neg_pairs)}")

# Sets of string tuples iterate in an order that depends on per-process hash
# randomisation, so sort them: the saved pair order (and therefore every index
# list below) is then reproducible for a given SEED.
pairs  = sorted(positive_pairs) + sorted(neg_pairs)
labels = [1]*len(positive_pairs) + [0]*len(neg_pairs)

# Assign every gene to exactly one outer fold (contiguous chunks of a shuffled list).
random.shuffle(genes)
outer_fold_size = ceil(len(genes) / OUTER_FOLDS)
gene_to_outer = {}
for fold in range(OUTER_FOLDS):
    start = fold * outer_fold_size
    end   = start + outer_fold_size
    for g in genes[start:end]:
        gene_to_outer[g] = fold

all_genes = set(genes)
cv_splits = {}
for fold in range(OUTER_FOLDS):
    test_genes  = {g for g, f in gene_to_outer.items() if f == fold}
    train_genes = all_genes - test_genes

    # A pair is used only when BOTH genes fall on the same side of the split;
    # pairs straddling train and test genes are left out of this fold.
    train_idx = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in train_genes and g2 in train_genes]
    test_idx  = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in test_genes and g2 in test_genes]

    # Inner folds partition the outer-train genes the same way.
    train_genes_list = sorted(train_genes)
    random.shuffle(train_genes_list)
    inner_fold_size = ceil(len(train_genes_list) / INNER_FOLDS)
    gene_to_inner = {}
    for j in range(INNER_FOLDS):
        s = j * inner_fold_size
        e = s + inner_fold_size
        for g in train_genes_list[s:e]:
            gene_to_inner[g] = j

    inner_splits = []
    for j in range(INNER_FOLDS):
        val_genes = {g for g, f in gene_to_inner.items() if f == j}
        # Computed once per inner fold rather than once per pair index.
        fit_genes = train_genes - val_genes
        inner_train = [i for i in train_idx
                       if pairs[i][0] in fit_genes and pairs[i][1] in fit_genes]
        inner_val   = [i for i in train_idx
                       if pairs[i][0] in val_genes and pairs[i][1] in val_genes]
        # Inner folds with an empty side are skipped, so fewer than
        # INNER_FOLDS splits may be stored.
        if inner_train and inner_val:
            inner_splits.append((inner_train, inner_val))

    cv_splits[fold] = {
        'train_idx':   train_idx,
        'test_idx':    test_idx,
        'inner_splits': inner_splits
    }

print(f"Generated gene‐level nested CV: {OUTER_FOLDS} outer × {INNER_FOLDS} inner folds")

# Persist pairs, labels and index-based splits together so the indices stay valid.
out_path = "sl_nested_cv_splits.pkl"
with open(out_path, 'wb') as f:
    pickle.dump({
        'pairs':     pairs,
        'labels':    labels,
        'cv_splits': cv_splits
    }, f)
print(f"Saved all splits to {out_path}")


# Negative Genetic 

In [None]:
# Builds gene-level nested CV splits for BioGRID 'Negative Genetic' interactions.
# Relies on NEG_MULT, OUTER_FOLDS, INNER_FOLDS and SEED from the configuration cell.
# NOTE(review): the sampling / splitting logic in this cell is repeated almost verbatim
# in the other dataset cells of this notebook; a shared helper would remove the duplication.
random.seed(SEED)

# Load the BioGRID Homo_sapiens table and keep genetic interactions whose
# interactors both carry Organism ID 9606.
data = pd.read_csv(
    '/data/paired/BIOGRID-ORGANISM-Homo_sapiens-4.4.240.tab3.txt',
    sep='\t'
)
data = data[
    (data['Organism ID Interactor A'] == 9606) &
    (data['Organism ID Interactor B'] == 9606) &
    (data['Experimental System Type'] == 'genetic')
]
synthetic = data[data['Experimental System'] == 'Negative Genetic']

# Positive pairs: unordered (sorted) tuples of Entrez IDs rendered as strings.
# Column-wise zip avoids building a Series per row as `iterrows()` does.
positive_pairs = {
    tuple(sorted((str(a), str(b))))
    for a, b in zip(synthetic['Entrez Gene Interactor A'],
                    synthetic['Entrez Gene Interactor B'])
}
print(f"Total positive pairs: {len(positive_pairs)}")

genes       = sorted({g for pair in positive_pairs for g in pair})
neg_pairs   = set()
target_neg  = NEG_MULT * len(positive_pairs)

# Rejection sampling below can only finish if enough non-positive pairs exist;
# fail loudly instead of spinning forever. Self-pairs (a == a) can never be
# drawn by `random.sample(genes, 2)`, so they do not reduce the pool.
max_neg = len(genes) * (len(genes) - 1) // 2 - sum(1 for a, b in positive_pairs if a != b)
if target_neg > max_neg:
    raise ValueError(
        f"Cannot sample {target_neg} negative pairs: only {max_neg} "
        f"non-positive pairs exist among {len(genes)} genes"
    )

# Negatives: random gene pairs (drawn from genes seen in positives) that are not positives.
while len(neg_pairs) < target_neg:
    g1, g2 = sorted(random.sample(genes, 2))
    if (g1, g2) not in positive_pairs:
        neg_pairs.add((g1, g2))
print(f"Total negative pairs sampled: {len(neg_pairs)}")

# Sets of string tuples iterate in an order that depends on per-process hash
# randomisation, so sort them: the saved pair order (and therefore every index
# list below) is then reproducible for a given SEED.
pairs  = sorted(positive_pairs) + sorted(neg_pairs)
labels = [1]*len(positive_pairs) + [0]*len(neg_pairs)

# Assign every gene to exactly one outer fold (contiguous chunks of a shuffled list).
random.shuffle(genes)
outer_fold_size = ceil(len(genes) / OUTER_FOLDS)
gene_to_outer = {}
for fold in range(OUTER_FOLDS):
    start = fold * outer_fold_size
    end   = start + outer_fold_size
    for g in genes[start:end]:
        gene_to_outer[g] = fold

all_genes = set(genes)
cv_splits = {}
for fold in range(OUTER_FOLDS):
    test_genes  = {g for g, f in gene_to_outer.items() if f == fold}
    train_genes = all_genes - test_genes

    # A pair is used only when BOTH genes fall on the same side of the split;
    # pairs straddling train and test genes are left out of this fold.
    train_idx = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in train_genes and g2 in train_genes]
    test_idx  = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in test_genes and g2 in test_genes]

    # Inner folds partition the outer-train genes the same way.
    train_genes_list = sorted(train_genes)
    random.shuffle(train_genes_list)
    inner_fold_size = ceil(len(train_genes_list) / INNER_FOLDS)
    gene_to_inner = {}
    for j in range(INNER_FOLDS):
        s = j * inner_fold_size
        e = s + inner_fold_size
        for g in train_genes_list[s:e]:
            gene_to_inner[g] = j

    inner_splits = []
    for j in range(INNER_FOLDS):
        val_genes = {g for g, f in gene_to_inner.items() if f == j}
        # Computed once per inner fold rather than once per pair index.
        fit_genes = train_genes - val_genes
        inner_train = [i for i in train_idx
                       if pairs[i][0] in fit_genes and pairs[i][1] in fit_genes]
        inner_val   = [i for i in train_idx
                       if pairs[i][0] in val_genes and pairs[i][1] in val_genes]
        # Inner folds with an empty side are skipped, so fewer than
        # INNER_FOLDS splits may be stored.
        if inner_train and inner_val:
            inner_splits.append((inner_train, inner_val))

    cv_splits[fold] = {
        'train_idx':   train_idx,
        'test_idx':    test_idx,
        'inner_splits': inner_splits
    }

print(f"Generated gene‐level nested CV: {OUTER_FOLDS} outer × {INNER_FOLDS} inner folds")

# Persist pairs, labels and index-based splits together so the indices stay valid.
out_path = "ng_nested_cv_splits.pkl"
with open(out_path, 'wb') as f:
    pickle.dump({
        'pairs':     pairs,
        'labels':    labels,
        'cv_splits': cv_splits
    }, f)
print(f"Saved all splits to {out_path}")


# Transcription Factor

In [None]:
# Builds gene-level nested CV splits for TF -> target pairs.
# Relies on NEG_MULT, OUTER_FOLDS, INNER_FOLDS and SEED from the configuration cell.
# NOTE(review): the sampling / splitting logic in this cell is repeated almost verbatim
# in the other dataset cells of this notebook; a shared helper would remove the duplication.
random.seed(SEED)

data = pd.read_csv('/data/paired/tf_target.txt', sep='\t')

# Keep only TFs with strictly more than 500 and strictly fewer than 1000 rows in the table.
tf_target_counts = data.groupby("TF").size()
filtered_tfs = tf_target_counts[(tf_target_counts > 500) & (tf_target_counts < 1000)].index
print(len(filtered_tfs))

synthetic = data[data["TF"].isin(filtered_tfs)]

# Positive pairs: sorted (TF, Target) string tuples, so direction is not retained.
# Column-wise zip replaces the per-row `iloc[i]` lookups of the original loop.
positive_pairs = {
    tuple(sorted((str(a), str(b))))
    for a, b in zip(synthetic['TF'], synthetic['Target'])
}
print(f"Total positive pairs: {len(positive_pairs)}")

# Keep the full positive set so that sub-sampled-away positives are never
# emitted as negatives below.
all_positive_pairs = positive_pairs.copy()

# Cap the positives at 10,000. `random.sample` needs a sequence: passing a set
# raises TypeError on Python >= 3.11, and on older versions the result depended
# on hash-randomised set order. Sorting first makes the draw valid and seed-stable.
if len(positive_pairs) > 10_000:
    positive_pairs = set(random.sample(sorted(positive_pairs), 10_000))

print(f"Total post sub-sample positive pairs: {len(positive_pairs)}")

genes       = sorted({g for pair in positive_pairs for g in pair})
neg_pairs   = set()
target_neg  = NEG_MULT * len(positive_pairs)

# Rejection sampling below can only finish if enough non-positive pairs exist
# among the retained genes; fail loudly instead of spinning forever.
all_genes = set(genes)
max_neg = len(genes) * (len(genes) - 1) // 2 - sum(
    1 for a, b in all_positive_pairs
    if a != b and a in all_genes and b in all_genes
)
if target_neg > max_neg:
    raise ValueError(
        f"Cannot sample {target_neg} negative pairs: only {max_neg} "
        f"non-positive pairs exist among {len(genes)} genes"
    )

# Negatives: random gene pairs (drawn from genes in the retained positives)
# that are not positives in the full, un-sampled set.
while len(neg_pairs) < target_neg:
    g1, g2 = sorted(random.sample(genes, 2))
    if (g1, g2) not in all_positive_pairs:
        neg_pairs.add((g1, g2))
print(f"Total negative pairs sampled: {len(neg_pairs)}")

# Sets of string tuples iterate in an order that depends on per-process hash
# randomisation, so sort them: the saved pair order (and therefore every index
# list below) is then reproducible for a given SEED.
pairs  = sorted(positive_pairs) + sorted(neg_pairs)
labels = [1]*len(positive_pairs) + [0]*len(neg_pairs)

# Assign every gene to exactly one outer fold (contiguous chunks of a shuffled list).
random.shuffle(genes)
outer_fold_size = ceil(len(genes) / OUTER_FOLDS)
gene_to_outer = {}
for fold in range(OUTER_FOLDS):
    start = fold * outer_fold_size
    end   = start + outer_fold_size
    for g in genes[start:end]:
        gene_to_outer[g] = fold

cv_splits = {}
for fold in range(OUTER_FOLDS):
    test_genes  = {g for g, f in gene_to_outer.items() if f == fold}
    train_genes = all_genes - test_genes

    # A pair is used only when BOTH genes fall on the same side of the split;
    # pairs straddling train and test genes are left out of this fold.
    train_idx = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in train_genes and g2 in train_genes]
    test_idx  = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in test_genes and g2 in test_genes]

    # Inner folds partition the outer-train genes the same way.
    train_genes_list = sorted(train_genes)
    random.shuffle(train_genes_list)
    inner_fold_size = ceil(len(train_genes_list) / INNER_FOLDS)
    gene_to_inner = {}
    for j in range(INNER_FOLDS):
        s = j * inner_fold_size
        e = s + inner_fold_size
        for g in train_genes_list[s:e]:
            gene_to_inner[g] = j

    inner_splits = []
    for j in range(INNER_FOLDS):
        val_genes = {g for g, f in gene_to_inner.items() if f == j}
        # Computed once per inner fold rather than once per pair index.
        fit_genes = train_genes - val_genes
        inner_train = [i for i in train_idx
                       if pairs[i][0] in fit_genes and pairs[i][1] in fit_genes]
        inner_val   = [i for i in train_idx
                       if pairs[i][0] in val_genes and pairs[i][1] in val_genes]
        # Inner folds with an empty side are skipped, so fewer than
        # INNER_FOLDS splits may be stored.
        if inner_train and inner_val:
            inner_splits.append((inner_train, inner_val))

    cv_splits[fold] = {
        'train_idx':   train_idx,
        'test_idx':    test_idx,
        'inner_splits': inner_splits
    }

print(f"Generated gene‐level nested CV: {OUTER_FOLDS} outer × {INNER_FOLDS} inner folds")

# Persist pairs, labels and index-based splits together so the indices stay valid.
out_path = "tf_nested_cv_splits.pkl"
with open(out_path, 'wb') as f:
    pickle.dump({
        'pairs':     pairs,
        'labels':    labels,
        'cv_splits': cv_splits
    }, f)
print(f"Saved all splits to {out_path}")


# S. pombe Negative Genetic (mapped to human orthologs)

In [None]:
# Builds gene-level nested CV splits for S. pombe 'Negative Genetic' interactions
# after mapping each yeast gene to a human Entrez ID via the OrthoMCL table.
# Relies on NEG_MULT, OUTER_FOLDS, INNER_FOLDS and SEED from the configuration cell.
# NOTE(review): the sampling / splitting logic in this cell is repeated almost verbatim
# in the other dataset cells of this notebook; a shared helper would remove the duplication.
random.seed(SEED)

data = pd.read_csv('/data/paired/BIOGRID-ORGANISM-Schizosaccharomyces_pombe_972h-4.4.240.tab3.txt', sep='\t')

# Keep genetic interactions whose interactors both carry Organism ID 284812.
data = data[
    (data['Organism ID Interactor A'] == 284812) &
    (data['Organism ID Interactor B'] == 284812) &
    (data['Experimental System Type'] == 'genetic')
]

# Free up the plain 'Entrez Gene Interactor A/B' names for the mapped human IDs.
data = data.rename(columns={
    'Entrez Gene Interactor A': 'Yeast Entrez Gene Interactor A',
    'Entrez Gene Interactor B': 'Yeast Entrez Gene Interactor B'
})

orth = pd.read_csv(
    '/data/paired/orthomcl/6_1/hsa_spo_orthomcl.txt',
    sep='\t',
    dtype={'hsa': int, 'spo': int, 'score': float}
)

# One human ID per yeast gene: after sorting by descending score,
# drop_duplicates keeps the first (highest-scoring) row for each 'spo'.
best = (
    orth
    .sort_values('score', ascending=False)
    .drop_duplicates(subset='spo')
    .loc[:, ['spo', 'hsa']]
    .rename(columns={
        'spo': 'Yeast Entrez Gene Interactor',
        'hsa': 'Human Entrez Gene Interactor'
    })
)

# Attach the human ID for interactor A, then for interactor B.
# NOTE(review): assumes the BioGRID yeast Entrez columns are integer-typed so they
# match the int 'spo' key — confirm against the input file.
data = (
    data
    .merge(best,
           left_on='Yeast Entrez Gene Interactor A',
           right_on='Yeast Entrez Gene Interactor',
           how='left')
    .rename(columns={'Human Entrez Gene Interactor': 'Entrez Gene Interactor A'})
    .drop(columns=['Yeast Entrez Gene Interactor'])
)

data = (
    data
    .merge(best,
           left_on='Yeast Entrez Gene Interactor B',
           right_on='Yeast Entrez Gene Interactor',
           how='left')
    .rename(columns={'Human Entrez Gene Interactor': 'Entrez Gene Interactor B'})
    .drop(columns=['Yeast Entrez Gene Interactor'])
)

# Left merges leave NaN where no ortholog was found; drop those rows, then cast
# back to int so the string form of an ID is '123' rather than '123.0'.
data = data.dropna(subset=[
    'Entrez Gene Interactor A',
    'Entrez Gene Interactor B'
])

data['Entrez Gene Interactor A'] = data['Entrez Gene Interactor A'].astype(int)
data['Entrez Gene Interactor B'] = data['Entrez Gene Interactor B'].astype(int)

synthetic = data[data['Experimental System'] == 'Negative Genetic']

# Positive pairs: unordered (sorted) tuples of mapped human Entrez IDs as strings.
# Column-wise zip avoids building a Series per row as `iterrows()` does.
positive_pairs = {
    tuple(sorted((str(a), str(b))))
    for a, b in zip(synthetic['Entrez Gene Interactor A'],
                    synthetic['Entrez Gene Interactor B'])
}
print(f"Total positive pairs: {len(positive_pairs)}")

genes       = sorted({g for pair in positive_pairs for g in pair})
neg_pairs   = set()
target_neg  = NEG_MULT * len(positive_pairs)

# Rejection sampling below can only finish if enough non-positive pairs exist;
# fail loudly instead of spinning forever. Self-pairs (two yeast genes mapped to
# the same human ID) can never be drawn by `random.sample(genes, 2)`, so they do
# not reduce the pool.
max_neg = len(genes) * (len(genes) - 1) // 2 - sum(1 for a, b in positive_pairs if a != b)
if target_neg > max_neg:
    raise ValueError(
        f"Cannot sample {target_neg} negative pairs: only {max_neg} "
        f"non-positive pairs exist among {len(genes)} genes"
    )

# Negatives: random gene pairs (drawn from genes seen in positives) that are not positives.
while len(neg_pairs) < target_neg:
    g1, g2 = sorted(random.sample(genes, 2))
    if (g1, g2) not in positive_pairs:
        neg_pairs.add((g1, g2))
print(f"Total negative pairs sampled: {len(neg_pairs)}")

# Sets of string tuples iterate in an order that depends on per-process hash
# randomisation, so sort them: the saved pair order (and therefore every index
# list below) is then reproducible for a given SEED.
pairs  = sorted(positive_pairs) + sorted(neg_pairs)
labels = [1]*len(positive_pairs) + [0]*len(neg_pairs)

# Assign every gene to exactly one outer fold (contiguous chunks of a shuffled list).
random.shuffle(genes)
outer_fold_size = ceil(len(genes) / OUTER_FOLDS)
gene_to_outer = {}
for fold in range(OUTER_FOLDS):
    start = fold * outer_fold_size
    end   = start + outer_fold_size
    for g in genes[start:end]:
        gene_to_outer[g] = fold

all_genes = set(genes)
cv_splits = {}
for fold in range(OUTER_FOLDS):
    test_genes  = {g for g, f in gene_to_outer.items() if f == fold}
    train_genes = all_genes - test_genes

    # A pair is used only when BOTH genes fall on the same side of the split;
    # pairs straddling train and test genes are left out of this fold.
    train_idx = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in train_genes and g2 in train_genes]
    test_idx  = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in test_genes and g2 in test_genes]

    # Inner folds partition the outer-train genes the same way.
    train_genes_list = sorted(train_genes)
    random.shuffle(train_genes_list)
    inner_fold_size = ceil(len(train_genes_list) / INNER_FOLDS)
    gene_to_inner = {}
    for j in range(INNER_FOLDS):
        s = j * inner_fold_size
        e = s + inner_fold_size
        for g in train_genes_list[s:e]:
            gene_to_inner[g] = j

    inner_splits = []
    for j in range(INNER_FOLDS):
        val_genes = {g for g, f in gene_to_inner.items() if f == j}
        # Computed once per inner fold rather than once per pair index.
        fit_genes = train_genes - val_genes
        inner_train = [i for i in train_idx
                       if pairs[i][0] in fit_genes and pairs[i][1] in fit_genes]
        inner_val   = [i for i in train_idx
                       if pairs[i][0] in val_genes and pairs[i][1] in val_genes]
        # Inner folds with an empty side are skipped, so fewer than
        # INNER_FOLDS splits may be stored.
        if inner_train and inner_val:
            inner_splits.append((inner_train, inner_val))

    cv_splits[fold] = {
        'train_idx':   train_idx,
        'test_idx':    test_idx,
        'inner_splits': inner_splits
    }

print(f"Generated gene‐level nested CV: {OUTER_FOLDS} outer × {INNER_FOLDS} inner folds")

# Persist pairs, labels and index-based splits together so the indices stay valid.
out_path = "pombe_nested_cv_splits.pkl"
with open(out_path, 'wb') as f:
    pickle.dump({
        'pairs':     pairs,
        'labels':    labels,
        'cv_splits': cv_splits
    }, f)
print(f"Saved all splits to {out_path}")
