In [None]:
import random
import pandas as pd
import pickle
from math import ceil

# Sampling and cross-validation settings.
NEG_MULT    = 10      # negatives targeted per positive pair (target = NEG_MULT * #positives)
OUTER_FOLDS = 3       # number of gene-level outer folds
INNER_FOLDS = 3       # number of gene-level inner folds built inside each outer training set
SEED        = 42      # seed for Python's global `random` generator

# Seed the global RNG that drives the negative sampling and gene shuffles below.
random.seed(SEED)

# Load the tab-separated BioGRID export.
data = pd.read_csv(
    '/PATH/BIOGRID-ORGANISM-Homo_sapiens-4.4.240.tab3.txt',
    sep='\t',
)

# Keep only rows where both interactors carry organism ID 9606 (presumably
# human, matching the Homo sapiens export -- confirm) and the experimental
# system type is 'genetic'.
same_organism_a = data['Organism ID Interactor A'].eq(9606)
same_organism_b = data['Organism ID Interactor B'].eq(9606)
genetic_evidence = data['Experimental System Type'].eq('genetic')
data = data.loc[same_organism_a & same_organism_b & genetic_evidence]

# Of those, the rows reported as 'Synthetic Lethality' are the positives.
synthetic = data.loc[data['Experimental System'].eq('Synthetic Lethality')]

# Collect every synthetic-lethal interaction as an unordered gene pair.
# IDs are stringified and each pair is sorted, so (A, B) and (B, A) collapse
# into one entry of the set.
positive_pairs = set()
# Walk the two ID columns directly instead of `iterrows()`: iterrows builds a
# full Series for every row, which is slow and may coerce values to a common
# per-row dtype before they are stringified.
for a, b in zip(synthetic['Entrez Gene Interactor A'],
                synthetic['Entrez Gene Interactor B']):
    positive_pairs.add(tuple(sorted((str(a), str(b)))))
print(f"Total positive pairs: {len(positive_pairs)}")

# Negative sampling: draw random unordered gene pairs (genes are taken from
# the positive set only) that are not known positives, until the target
# count of distinct negatives is reached.
genes       = sorted({g for pair in positive_pairs for g in pair})
neg_pairs   = set()
target_neg  = NEG_MULT * len(positive_pairs)

# Cap the target at the number of distinct negatives that actually exist.
# Without the cap the rejection loop below never terminates when the gene
# set is too small, and `random.sample` raises with fewer than two genes.
# Self-pairs (a == b) can never be drawn by `random.sample`, so they do not
# occupy a candidate slot.
n_candidates = len(genes) * (len(genes) - 1) // 2
n_available  = n_candidates - sum(1 for a, b in positive_pairs if a != b)
if target_neg > n_available:
    print(f"Warning: requested {target_neg} negative pairs but only "
          f"{n_available} distinct non-positive pairs exist; "
          f"sampling {n_available} instead")
    target_neg = n_available

while len(neg_pairs) < target_neg:
    # Sort the draw so it uses the same canonical order as positive_pairs.
    g1, g2 = sorted(random.sample(genes, 2))
    if (g1, g2) not in positive_pairs:
        neg_pairs.add((g1, g2))
print(f"Total negative pairs sampled: {len(neg_pairs)}")

# Positives first, then negatives; `labels` is aligned index-by-index.
# The sets are sorted before concatenation because set iteration order for
# string tuples depends on per-process hash randomization: without sorting,
# the pair order (and every index stored in the splits) changes from run to
# run even though the RNG is seeded.
pairs  = sorted(positive_pairs) + sorted(neg_pairs)
labels = [1]*len(positive_pairs) + [0]*len(neg_pairs)

# Shuffle the genes in place, then cut the shuffled list into consecutive
# chunks of `outer_fold_size`; a gene's chunk number is its outer fold.
random.shuffle(genes)
outer_fold_size = ceil(len(genes) / OUTER_FOLDS)
gene_to_outer = {
    gene: position // outer_fold_size
    for position, gene in enumerate(genes)
}

# Build gene-disjoint nested CV splits. For each outer fold a pair is kept
# only when BOTH of its genes lie on the same side (train or test); pairs
# that straddle the boundary are dropped, so no gene is shared between sides.
# The same rule is applied again inside the outer training set to produce
# the inner (train, validation) index lists.
all_genes = set(genes)  # identical for every outer fold, so build it once
cv_splits = {}
for fold in range(OUTER_FOLDS):
    test_genes  = {g for g, f in gene_to_outer.items() if f == fold}
    train_genes = all_genes - test_genes

    train_idx = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in train_genes and g2 in train_genes]
    test_idx  = [i for i, (g1, g2) in enumerate(pairs)
                 if g1 in test_genes and g2 in test_genes]

    # Partition the outer-train genes into inner folds. The set is sorted
    # first so the seeded shuffle starts from a deterministic order.
    train_genes_list = sorted(train_genes)
    random.shuffle(train_genes_list)
    inner_fold_size = ceil(len(train_genes_list) / INNER_FOLDS)
    gene_to_inner = {}
    for j in range(INNER_FOLDS):
        s = j * inner_fold_size
        e = s + inner_fold_size
        for g in train_genes_list[s:e]:
            gene_to_inner[g] = j

    inner_splits = []
    for j in range(INNER_FOLDS):
        val_genes = {g for g, f in gene_to_inner.items() if f == j}
        # Computed once per inner fold; rebuilding this set difference for
        # every pair made the filter quadratic in practice.
        fit_genes = train_genes - val_genes
        inner_train = [i for i in train_idx
                       if pairs[i][0] in fit_genes
                       and pairs[i][1] in fit_genes]
        inner_val   = [i for i in train_idx
                       if pairs[i][0] in val_genes
                       and pairs[i][1] in val_genes]
        if inner_train and inner_val:
            inner_splits.append((inner_train, inner_val))
        else:
            # Make the omission visible: this outer fold ends up with fewer
            # than INNER_FOLDS inner splits.
            print(f"Warning: outer fold {fold}, inner fold {j} skipped "
                  f"(empty inner train or validation set)")

    cv_splits[fold] = {
        'train_idx':   train_idx,
        'test_idx':    test_idx,
        'inner_splits': inner_splits
    }

print(f"Generated gene‐level nested CV: {OUTER_FOLDS} outer × {INNER_FOLDS} inner folds")

# Bundle the pair list, its aligned labels and the split indices into one
# dict and pickle it, so the indices always travel with the list they index.
out_path = "sl_nested_cv_splits.pkl"
payload = {
    'pairs':     pairs,
    'labels':    labels,
    'cv_splits': cv_splits,
}
with open(out_path, 'wb') as handle:
    pickle.dump(payload, handle)
print(f"Saved all splits to {out_path}")
