In [1]:
import pandas as pd
import pickle
import random
import os
import itertools
from tqdm import tqdm
import numpy as np
from scipy.stats import pearsonr
import statsmodels.api as sm
import math
import warnings
warnings.filterwarnings("ignore")

In [2]:
def setup_seed(seed):
    """Reset the global random number generators to a known state.

    Seeds both Python's ``random`` module and NumPy's legacy global
    generator with ``seed`` and exports ``PYTHONHASHSEED`` with the same
    value.

    Note: the interpreter reads ``PYTHONHASHSEED`` at start-up, so the
    assignment below is only seen by processes launched afterwards; it does
    not alter string hashing (and therefore set iteration order) in the
    kernel that is already running.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

In [6]:
# read datasets
# CORUM complex table, re-indexed by its 'complex_id' column so that later
# cells can look rows up by complex id.
corum_human = pd.read_table('../general_datasets/corum_humanComplexes.txt')
corum_human.index = corum_human['complex_id']

# Load the pickled matrices through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): pickle can execute arbitrary code while loading -- only load
# files that come from a trusted source.
with open('../general_datasets/ge_matrix.pkl', 'rb') as ge_matrix_file:
    ge_matrix = pickle.load(ge_matrix_file)

# Column labels (gene names) available in each screen type.
genes_all_key = {}
genes_all_key['CRISPR'] = list(ge_matrix['CRISPR'].columns)
genes_all_key['RNAi'] = list(ge_matrix['RNAi'].columns)

# ge_cut: per-screen cut-off; the feature-extraction cells keep only values
# strictly below this threshold.
ge_cut = {}
ge_cut['CRISPR'] = -0.237
ge_cut['RNAi'] = -0.325

# Random seeds; every sampling step below is repeated once per seed.
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

##### 1. Benchmark datasets for functional ternary protein interactions

In [None]:
# constructing sets of positive and negative gene triplets

# Map every complex id to the sorted tuple of its distinct subunit gene names
# (the 'subunits_gene_name' field is a ';'-separated string).
subunit_id = {
    complex_id: tuple(sorted(set(corum_human['subunits_gene_name'][complex_id].split(';'))))
    for complex_id in corum_human.index
}

# Every 3-gene combination found inside any single complex. Because each
# subunit tuple is already sorted, itertools.combinations yields each triplet
# in sorted order, so no per-triplet re-sorting is required.
positpis_all = set()
for members in subunit_id.values():
    if len(members) < 3:
        continue
    positpis_all.update(itertools.combinations(members, 3))

# sampling positive gene triplets
#
# For every complex with at least three subunits and for every seed:
#   * complexes with <= 5 subunits contribute all of their triplets;
#   * larger complexes contribute a seeded random sample of
#     3 * (number of subunits) triplets.
#
# Results, each keyed by seed:
#   allposit_seed[s]           -> set of all sampled triplets
#   posit_id_seed[s][id]       -> set of triplets sampled from complex `id`
#   posisubunit_id_seed[s][id] -> set of genes appearing in those triplets
#
# The per-seed containers are created with dict.setdefault rather than the
# previous try / bare-except pattern, which would also have swallowed
# unrelated errors (including KeyboardInterrupt).
allposit_seed = {}
posit_id_seed = {}
posisubunit_id_seed = {}
for i in tqdm(subunit_id.keys()):
    subunits = subunit_id[i]
    if len(subunits) < 3:
        continue

    # Sorted so that random.sample sees the same population order every run.
    combinations = sorted(itertools.combinations(subunits, 3))

    for s in seeds:
        if len(subunits) <= 5:
            chosen = combinations
        else:
            setup_seed(s)
            chosen = random.sample(combinations, len(subunits) * 3)

        triangles = set()
        posisubunits = set()
        seed_positives = allposit_seed.setdefault(s, set())
        for combination in chosen:
            t = tuple(sorted(combination))
            triangles.add(t)
            posisubunits.update(combination)
            seed_positives.add(t)

        posit_id_seed.setdefault(s, {})[i] = triangles
        posisubunit_id_seed.setdefault(s, {})[i] = posisubunits

# sampling negative gene triplets --- RanNeg
#
# Per seed: shuffle the complexes that produced positives, cut them into three
# groups, sample a fixed number of complexes from each group, and for every
# (group1, group2, group3) combination draw one gene from each complex. A
# triplet is kept only if its three genes are distinct and no complex
# contains two or more of them.

# Number of complexes sampled from each of the three groups.
# random.sample raises ValueError if a group holds fewer complexes than this.
N_COMPLEXES_PER_GROUP = 70

# Gene sets of all complexes, built once instead of once per candidate triplet.
complex_gene_sets = [set(members) for members in subunit_id.values()]

negat_RanNeg_seed = {}
for s in seeds:
    posisubunit_id = posisubunit_id_seed[s]
    ids = sorted(posisubunit_id.keys())

    # Sorted candidate genes per complex, computed once per seed rather than
    # for every id triple.
    sorted_subunits = {c: sorted(posisubunit_id[c]) for c in ids}

    setup_seed(s)
    random.shuffle(ids)

    n = len(ids)
    size1 = n // 3
    size2 = n // 3

    list1 = ids[:size1]
    list2 = ids[size1:size1 + size2]
    list3 = ids[size1 + size2:]  # the remaining complexes

    setup_seed(s)
    g1 = random.sample(list1, N_COMPLEXES_PER_GROUP)
    g2 = random.sample(list2, N_COMPLEXES_PER_GROUP)
    g3 = random.sample(list3, N_COMPLEXES_PER_GROUP)

    ids_ = sorted(itertools.product(g1, g2, g3))

    negat = set()
    for i in ids_:
        c1, c2, c3 = sorted(i)

        # NOTE(review): the generator is reseeded before every draw, so the
        # three picks are a deterministic function of the three candidate-list
        # lengths. Kept as-is so the generated benchmark does not change.
        setup_seed(s)
        s_1 = random.choice(sorted_subunits[c1])
        s_2 = random.choice(sorted_subunits[c2])
        s_3 = random.choice(sorted_subunits[c3])

        t = {s_1, s_2, s_3}
        if len(t) != 3:
            continue

        # Reject the triplet as soon as one complex shares two or more of its
        # genes (any() stops at the first hit; the result is unchanged).
        if any(len(genes & t) > 1 for genes in complex_gene_sets):
            continue

        negat.add(tuple(sorted(t)))

    negat = sorted(negat)
    setup_seed(s)
    random.shuffle(negat)
    negat_RanNeg_seed[s] = tuple(negat)

# sampling negative gene triplets --- DecoyNeg
#
# Per seed: for every positive triplet, pick decoy genes from sampled
# complexes that share no gene with the triplet, then replace each member of
# the triplet in turn with a decoy. Triplets that are known positives
# (positpis_all) are removed afterwards.

# Number of complexes sampled as decoy sources per seed.
# random.sample raises ValueError if fewer complexes are available.
N_DECOY_COMPLEXES = 200
# Maximum number of decoy genes used per positive triplet.
N_DECOYS_PER_TRIPLET = 5

negat_DecoyNeg_seed = {}
for s in seeds:
    negat = set()
    allposits = allposit_seed[s]
    posisubunit_id = posisubunit_id_seed[s]

    ids = sorted(posisubunit_id.keys())
    setup_seed(s)
    ids = random.sample(ids, N_DECOY_COMPLEXES)

    # The decoy gene drawn from a complex does not depend on the positive
    # triplet: the generator is reseeded immediately before each draw and the
    # candidate list is sorted. Draw it once per complex instead of once per
    # (triplet, complex) pair; the picks are identical.
    decoy_pool = []
    for i in ids:
        posisubunits = sorted(posisubunit_id[i])
        setup_seed(s)
        decoy_pool.append((set(posisubunits), random.sample(posisubunits, 1)[0]))

    for t in allposits:
        members = set(t)
        negasubunits = sorted({
            decoy
            for complex_genes, decoy in decoy_pool
            if not (members & complex_genes)
        })

        setup_seed(s)
        # Cap the sample size so a triplet with fewer candidate decoys than
        # requested contributes what it has instead of raising ValueError.
        negasubunits = random.sample(
            negasubunits, min(N_DECOYS_PER_TRIPLET, len(negasubunits)))

        for subunit in negasubunits:
            negat.add(tuple(sorted([t[0], t[1], subunit])))
            negat.add(tuple(sorted([t[0], t[2], subunit])))
            negat.add(tuple(sorted([t[1], t[2], subunit])))

    negat = negat - positpis_all

    negat = sorted(negat)
    setup_seed(s)
    random.shuffle(negat)
    negat_DecoyNeg_seed[s] = tuple(negat)

##### 2. Extraction of feature matrices for model training

In [None]:
# Column order of every feature table produced below.
FEATURE_COLUMNS = [
    'SCCLs_number', 'pearson_corr_gg12', 'pearson_corr_gg23', 'pearson_corr_gg13',
    'pearson_p_gg12', 'pearson_p_gg23', 'pearson_p_gg13',
    'ols_coeff_0_gg12', 'ols_coeff_0_gg23', 'ols_coeff_0_gg13',
    'ols_coeff_1_gg12', 'ols_coeff_1_gg23', 'ols_coeff_1_gg13',
    'ols_se_0_gg12', 'ols_se_0_gg23', 'ols_se_0_gg13',
    'ols_se_1_gg12', 'ols_se_1_gg23', 'ols_se_1_gg13',
    'g1_essen', 'g2_essen', 'g3_essen',
    'g1_essen_all', 'g2_essen_all', 'g3_essen_all',
    'g1_mean', 'g2_mean', 'g3_mean', 'g1_std', 'g2_std', 'g3_std',
    'g1_25', 'g1_50', 'g1_75', 'g2_25', 'g2_50', 'g2_75', 'g3_25', 'g3_50', 'g3_75',
    'g1_min', 'g2_min', 'g3_min', 'g1_max', 'g2_max', 'g3_max',
    'g1_mean_all', 'g2_mean_all', 'g3_mean_all', 'g1_std_all', 'g2_std_all', 'g3_std_all',
    'g1_25_all', 'g1_50_all', 'g1_75_all', 'g2_25_all', 'g2_50_all', 'g2_75_all',
    'g3_25_all', 'g3_50_all', 'g3_75_all',
    'g1_min_all', 'g2_min_all', 'g3_min_all', 'g1_max_all', 'g2_max_all', 'g3_max_all',
]

# Minimum number of shared index labels required to compute features.
MIN_SHARED_LABELS = 5

# Previously saved positive feature tables (see the note where it is loaded).
POSI_FEATURES_PKL = 'PCorr_TPI_datasets/ft_posi_seed_key.pkl'


def _pair_stats(x, y):
    """Pearson correlation and simple OLS fit (y ~ const + x) for two aligned Series.

    Returns a dict with the correlation coefficient and p-value, plus the
    intercept (``*_0``) and slope (``*_1``) estimates with their standard
    errors. ``.iloc`` is used because the fitted ``params`` / ``bse`` are
    label-indexed Series; integer keys on such a Series rely on a deprecated
    positional fallback.
    """
    corr = pearsonr(x, y)
    fit = sm.OLS(y, sm.add_constant(x)).fit()
    return {
        'pearson_corr': corr[0],
        'pearson_p': corr[1],
        'ols_coeff_0': fit.params.iloc[0],
        'ols_se_0': fit.bse.iloc[0],
        'ols_coeff_1': fit.params.iloc[1],
        'ols_se_1': fit.bse.iloc[1],
    }


def _profile_stats(values):
    """Summary statistics of one value vector (a Series or a list).

    'essen' is the Euclidean norm of the values; 'std' is whatever
    ``np.std`` returns for the given container.
    """
    return {
        'essen': math.sqrt(np.sum([np.power((v - 0), 2) for v in values])),
        'mean': np.mean(values),
        'std': np.std(values),
        'min': min(values),
        '25': np.percentile(values, 25),
        '50': np.percentile(values, 50),
        '75': np.percentile(values, 75),
        'max': max(values),
    }


def _triplet_features(triangle, gene_matrix, cutoff):
    """Compute the feature row of one gene triplet.

    Parameters
    ----------
    triangle : iterable of three gene names; all must be columns of ``gene_matrix``.
    gene_matrix : DataFrame with one column per gene.
        NOTE(review): rows are presumably cell lines -- confirm.
    cutoff : only values strictly below this threshold count as "shared".

    Returns
    -------
    list ordered like ``FEATURE_COLUMNS``, or ``None`` when fewer than
    ``MIN_SHARED_LABELS`` index labels are below ``cutoff`` for all three genes.
    """
    genes = sorted(triangle)
    profiles = [gene_matrix[g].dropna() for g in genes]
    below_cut = [p[p < cutoff] for p in profiles]

    # Sorted so the value order (and therefore floating-point summation order)
    # is the same on every run; plain set iteration order of strings changes
    # between interpreter sessions.
    shared = sorted(
        set(below_cut[0].index) & set(below_cut[1].index) & set(below_cut[2].index))
    if len(shared) < MIN_SHARED_LABELS:
        return None

    selected = [b.loc[shared] for b in below_cut]

    feats = {'SCCLs_number': len(shared)}

    # Pairwise Pearson correlation and OLS model (second gene regressed on the first).
    for tag, a, b in (('gg12', 0, 1), ('gg23', 1, 2), ('gg13', 0, 2)):
        for key, value in _pair_stats(selected[a], selected[b]).items():
            feats[key + '_' + tag] = value

    for n, (subset, full) in enumerate(zip(selected, profiles), start=1):
        prefix = 'g' + str(n) + '_'
        # Statistics over the shared labels only.
        for key, value in _profile_stats(subset).items():
            feats[prefix + key] = value
        # "_all" statistics: every non-missing value of the gene, with positive
        # values replaced by 0.
        clipped = [0 if k > 0 else k for k in full]
        for key, value in _profile_stats(clipped).items():
            feats[prefix + key + '_all'] = value

    return [feats[column] for column in FEATURE_COLUMNS]


# positive examples
ft_posi_seed_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    ft_posi_seed_key[ge_key] = {}
    # Set membership is O(1); the gene list has to be probed three times per triplet.
    genes_all = set(genes_all_key[ge_key])
    for s in seeds:
        triangles_list = []
        ft_all_list = []
        # sorted() makes the row order reproducible (allposit_seed[s] is a set).
        for triangle in tqdm(sorted(allposit_seed[s])):
            if not all(g in genes_all for g in triangle):
                continue
            features = _triplet_features(triangle, ge_matrix[ge_key], ge_cut[ge_key])
            if features is None:
                continue
            triangles_list.append(triangle)
            ft_all_list.append(features)

        ft_posi_seed_key[ge_key][s] = pd.DataFrame(
            ft_all_list, index=triangles_list, columns=FEATURE_COLUMNS)

# negative examples --- RanNeg
# NOTE(review): when the pickle exists it replaces the tables computed just
# above, as before. No visible cell writes this file, so when it is missing
# the freshly computed tables are used instead of raising FileNotFoundError.
# pickle can execute arbitrary code while loading -- trusted files only.
if os.path.exists(POSI_FEATURES_PKL):
    with open(POSI_FEATURES_PKL, 'rb') as posi_file:
        ft_posi_seed_key = pickle.load(posi_file)

ft_RanNeg_seed_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    ft_posi_seed = ft_posi_seed_key[ge_key]
    ft_RanNeg_seed_key[ge_key] = {}
    genes_all = set(genes_all_key[ge_key])
    for s in seeds:
        ft_posi = ft_posi_seed[s]

        # Fixed: this previously read the undefined name `negat_seed`.
        negat = negat_RanNeg_seed[s]

        triangles_list = []
        ft_all_list = []
        for triangle in tqdm(negat):
            if not all(g in genes_all for g in triangle):
                continue
            features = _triplet_features(triangle, ge_matrix[ge_key], ge_cut[ge_key])
            if features is None:
                continue
            triangles_list.append(triangle)
            ft_all_list.append(features)

            # Stop once there are as many negatives as positives for this seed.
            if len(ft_all_list) == len(ft_posi):
                print(len(ft_all_list))
                break

        ft_RanNeg_seed_key[ge_key][s] = pd.DataFrame(
            ft_all_list, index=triangles_list, columns=FEATURE_COLUMNS)

# negative examples --- DecoyNeg
ft_DecoyNeg_seed_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    ft_posi_seed = ft_posi_seed_key[ge_key]
    ft_DecoyNeg_seed_key[ge_key] = {}
    for s in seeds:
        ft_posi = ft_posi_seed[s]

        negat = negat_seed[s]

        triangles_list = []
        ft_all_list = []
        for triangle in tqdm(negat):
            pros = sorted(list(triangle))
            g1 = pros[0]
            g2 = pros[1]
            g3 = pros[2]

            genes_all = genes_all_key[ge_key]
            if g1 in genes_all and g2 in genes_all and g3 in genes_all:
                ge_1 = ge_matrix[ge_key][g1].dropna()
                ge_2 = ge_matrix[ge_key][g2].dropna()
                ge_3 = ge_matrix[ge_key][g3].dropna()

                ge_f_1 = ge_1[ge_1 < ge_cut[ge_key]]
                ge_f_2 = ge_2[ge_2 < ge_cut[ge_key]]
                ge_f_3 = ge_3[ge_3 < ge_cut[ge_key]]

                ccls_inter = list(set(ge_f_1.index) & set(ge_f_2.index) & set(ge_f_3.index))

                ge_final_1 = ge_f_1[ccls_inter]
                ge_final_2 = ge_f_2[ccls_inter]
                ge_final_3 = ge_f_3[ccls_inter]

                if len(ccls_inter) >= 5:
                    # pearson相关性系数
                    pearson_r_gg12 = pearsonr(ge_final_1, ge_final_2)
                    pearson_r_gg23 = pearsonr(ge_final_2, ge_final_3)
                    pearson_r_gg13 = pearsonr(ge_final_1, ge_final_3)

                    pearson_corr_gg12 = pearson_r_gg12[0]
                    pearson_corr_gg23 = pearson_r_gg23[0]
                    pearson_corr_gg13 = pearson_r_gg13[0]

                    pearson_p_gg12 = pearson_r_gg12[1]
                    pearson_p_gg23 = pearson_r_gg23[1]
                    pearson_p_gg13 = pearson_r_gg13[1]

                    # ols模型
                    X_with_intercept = sm.add_constant(ge_final_1)
                    model_ols = sm.OLS(ge_final_2, X_with_intercept)
                    ols_r_gg12 = model_ols.fit()

                    X_with_intercept = sm.add_constant(ge_final_2)
                    model_ols = sm.OLS(ge_final_3, X_with_intercept)
                    ols_r_gg23 = model_ols.fit()

                    X_with_intercept = sm.add_constant(ge_final_1)
                    model_ols = sm.OLS(ge_final_3, X_with_intercept)
                    ols_r_gg13 = model_ols.fit()

                    ols_coeff_0_gg12 = ols_r_gg12.params[0]
                    ols_se_0_gg12 = ols_r_gg12.bse[0]
                    ols_coeff_1_gg12 = ols_r_gg12.params[1]
                    ols_se_1_gg12 = ols_r_gg12.bse[1]

                    ols_coeff_0_gg23 = ols_r_gg23.params[0]
                    ols_se_0_gg23 = ols_r_gg23.bse[0]
                    ols_coeff_1_gg23 = ols_r_gg23.params[1]
                    ols_se_1_gg23 = ols_r_gg23.bse[1]

                    ols_coeff_0_gg13 = ols_r_gg13.params[0]
                    ols_se_0_gg13 = ols_r_gg13.bse[0]
                    ols_coeff_1_gg13 = ols_r_gg13.params[1]
                    ols_se_1_gg13 = ols_r_gg13.bse[1]

                    # Per-gene summary of the ge_final_* score vectors (presumably the
                    # scores restricted to the shared cell lines -- confirm upstream):
                    # root of the summed squares, mean, population std (np.std default),
                    # extremes and quartiles.

                    # gene 1
                    g1_essen = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_final_1]))
                    g1_mean, g1_std = np.mean(ge_final_1), np.std(ge_final_1)
                    g1_min, g1_max = min(ge_final_1), max(ge_final_1)
                    g1_25, g1_50, g1_75 = (np.percentile(ge_final_1, pct) for pct in (25, 50, 75))

                    # gene 2
                    g2_essen = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_final_2]))
                    g2_mean, g2_std = np.mean(ge_final_2), np.std(ge_final_2)
                    g2_min, g2_max = min(ge_final_2), max(ge_final_2)
                    g2_25, g2_50, g2_75 = (np.percentile(ge_final_2, pct) for pct in (25, 50, 75))

                    # gene 3
                    g3_essen = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_final_3]))
                    g3_mean, g3_std = np.mean(ge_final_3), np.std(ge_final_3)
                    g3_min, g3_max = min(ge_final_3), max(ge_final_3)
                    g3_25, g3_50, g3_75 = (np.percentile(ge_final_3, pct) for pct in (25, 50, 75))

                    # Cap each gene's ge_* score vector at zero: positive entries are
                    # replaced by the literal 0, every other entry is kept as-is.
                    ge_e_1 = [0 if score > 0 else score for score in ge_1]
                    ge_e_2 = [0 if score > 0 else score for score in ge_2]
                    ge_e_3 = [0 if score > 0 else score for score in ge_3]

                    # Same summary as for the ge_final_* vectors, now on the capped
                    # vectors: root of the summed squares, mean, population std,
                    # extremes and quartiles.

                    # gene 1
                    g1_essen_all = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_e_1]))
                    g1_mean_all, g1_std_all = np.mean(ge_e_1), np.std(ge_e_1)
                    g1_min_all, g1_max_all = min(ge_e_1), max(ge_e_1)
                    g1_25_all, g1_50_all, g1_75_all = (np.percentile(ge_e_1, pct) for pct in (25, 50, 75))

                    # gene 2
                    g2_essen_all = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_e_2]))
                    g2_mean_all, g2_std_all = np.mean(ge_e_2), np.std(ge_e_2)
                    g2_min_all, g2_max_all = min(ge_e_2), max(ge_e_2)
                    g2_25_all, g2_50_all, g2_75_all = (np.percentile(ge_e_2, pct) for pct in (25, 50, 75))

                    # gene 3
                    g3_essen_all = math.sqrt(np.sum([np.power((score - 0), 2) for score in ge_e_3]))
                    g3_mean_all, g3_std_all = np.mean(ge_e_3), np.std(ge_e_3)
                    g3_min_all, g3_max_all = min(ge_e_3), max(ge_e_3)
                    g3_25_all, g3_50_all, g3_75_all = (np.percentile(ge_e_3, pct) for pct in (25, 50, 75))

                    features = [len(ccls_inter), pearson_corr_gg12, pearson_corr_gg23, pearson_corr_gg13,
                                pearson_p_gg12, pearson_p_gg23, pearson_p_gg13,
                                ols_coeff_0_gg12, ols_coeff_0_gg23, ols_coeff_0_gg13,
                                ols_coeff_1_gg12, ols_coeff_1_gg23, ols_coeff_1_gg13,
                                ols_se_0_gg12, ols_se_0_gg23, ols_se_0_gg13,
                                ols_se_1_gg12, ols_se_1_gg23, ols_se_1_gg13,
                                g1_essen, g2_essen, g3_essen,
                                g1_essen_all, g2_essen_all, g3_essen_all,
                                g1_mean, g2_mean, g3_mean, g1_std, g2_std, g3_std,
                                g1_25, g1_50, g1_75, g2_25, g2_50, g2_75, g3_25, g3_50, g3_75,
                                g1_min, g2_min, g3_min, g1_max, g2_max, g3_max,
                                g1_mean_all, g2_mean_all, g3_mean_all, g1_std_all, g2_std_all, g3_std_all,
                                g1_25_all, g1_50_all, g1_75_all, g2_25_all, g2_50_all, g2_75_all, g3_25_all, g3_50_all,
                                g3_75_all,
                                g1_min_all, g2_min_all, g3_min_all, g1_max_all, g2_max_all, g3_max_all,
                                ]
                    triangles_list.append(triangle)
                    ft_all_list.append(features)

                    if len(ft_all_list) == len(ft_posi):
                        print(len(ft_all_list))
                        break

        # Column labels for the collected feature rows, grouped the same way (and in
        # the same order) as the values inside each row of ft_all_list.
        pairwise_cols = [
            'pearson_corr_gg12', 'pearson_corr_gg23', 'pearson_corr_gg13',
            'pearson_p_gg12', 'pearson_p_gg23', 'pearson_p_gg13',
            'ols_coeff_0_gg12', 'ols_coeff_0_gg23', 'ols_coeff_0_gg13',
            'ols_coeff_1_gg12', 'ols_coeff_1_gg23', 'ols_coeff_1_gg13',
            'ols_se_0_gg12', 'ols_se_0_gg23', 'ols_se_0_gg13',
            'ols_se_1_gg12', 'ols_se_1_gg23', 'ols_se_1_gg13',
        ]
        essen_cols = [
            'g1_essen', 'g2_essen', 'g3_essen',
            'g1_essen_all', 'g2_essen_all', 'g3_essen_all',
        ]
        shared_stat_cols = [
            'g1_mean', 'g2_mean', 'g3_mean',
            'g1_std', 'g2_std', 'g3_std',
            'g1_25', 'g1_50', 'g1_75',
            'g2_25', 'g2_50', 'g2_75',
            'g3_25', 'g3_50', 'g3_75',
            'g1_min', 'g2_min', 'g3_min',
            'g1_max', 'g2_max', 'g3_max',
        ]
        all_stat_cols = [
            'g1_mean_all', 'g2_mean_all', 'g3_mean_all',
            'g1_std_all', 'g2_std_all', 'g3_std_all',
            'g1_25_all', 'g1_50_all', 'g1_75_all',
            'g2_25_all', 'g2_50_all', 'g2_75_all',
            'g3_25_all', 'g3_50_all', 'g3_75_all',
            'g1_min_all', 'g2_min_all', 'g3_min_all',
            'g1_max_all', 'g2_max_all', 'g3_max_all',
        ]
        decoy_feature_columns = (['SCCLs_number'] + pairwise_cols + essen_cols
                                 + shared_stat_cols + all_stat_cols)

        # One table per (ge_key, seed): rows are indexed by the collected triplets.
        ft_DecoyNeg_seed_key[ge_key][s] = pd.DataFrame(
            ft_all_list,
            index=triangles_list,
            columns=decoy_feature_columns,
        )