In [3]:
import pandas as pd
import pickle
import random
import os
import itertools
from tqdm import tqdm
import numpy as np
from scipy.stats import pearsonr
import statsmodels.api as sm
import math
import warnings
warnings.filterwarnings("ignore")

In [None]:
def setup_seed(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to `random.seed` and `np.random.seed`; its string form is
        written to the `PYTHONHASHSEED` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)

In [4]:
# read datasets
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


ctrp_matrix = _load_pickle('../general_datasets/ctrp_matrix.pkl')
gdsc_matrix = _load_pickle('../general_datasets/gdsc_matrix.pkl')
prism_matrix = _load_pickle('../general_datasets/prism_matrix.pkl')
drug_ccl_db = _load_pickle('../general_datasets/drug_ccl_db.pkl')

# Every later lookup uses the name `ge_matrix_dict`, which this notebook never
# assigned (NameError on a fresh kernel); the pickle is now bound to that name.
# NOTE(review): assumes ge_matrix.pkl holds a dict keyed by 'CRISPR' / 'RNAi',
# as the lookups below require -- confirm.
ge_matrix_dict = _load_pickle('../general_datasets/ge_matrix.pkl')
ge_matrix = ge_matrix_dict  # original name kept as an alias

# Per gene-effect type cut-off; later cells keep only values below it (`ge < ge_cut[ge_key]`).
ge_cut = {'CRISPR': -0.237, 'RNAi': -0.325}

# Column labels of each gene-effect table.
genes_all_key = {ge_key: list(ge_matrix_dict[ge_key].columns) for ge_key in ('CRISPR', 'RNAi')}

dgs_idf_key = _load_pickle('PCorr_DGI_datasets/dgs_idf_key.pkl')

corum_human = pd.read_table('../general_datasets/corum_humanComplexes.txt')
corum_human.index = corum_human['complex_id']

# Seeds used for the repeated negative sampling in later cells.
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

In [2]:
# all known drug-target interactions
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
with open('../general_datasets/drug_target.pkl', 'rb') as handle:
    drug_target = pickle.load(handle)
with open('../general_datasets/target_drug.pkl', 'rb') as handle:
    target_drug = pickle.load(handle)

# Flatten the drug -> targets mapping into a set of (drug, target) pairs.
dgs_posi = {(drug, target)
            for drug, targets in drug_target.items()
            for target in targets}

In [None]:
# feature matrices extraction --- positive (RanNeg model)

# Names of the feature columns, in the order `_pair_features` returns them.
FEATURE_COLUMNS = [
    'SCCLs_number', 'pearson_coeff', 'pearson_p',
    'ols_coeff_0', 'ols_se_0',
    'ols_coeff_1', 'ols_se_1',
    'dr_sens', 'ge_essen',
    'dr_sens_all', 'ge_essen_all',
    'dr_mean', 'ge_mean', 'dr_std', 'ge_std',
    'dr_25', 'dr_50', 'dr_75', 'ge_25', 'ge_50', 'ge_75',
    'dr_min', 'dr_max', 'ge_min', 'ge_max',
    'dr_mean_all', 'ge_mean_all', 'dr_std_all', 'ge_std_all',
    'dr_25_all', 'dr_50_all', 'dr_75_all', 'ge_25_all', 'ge_50_all', 'ge_75_all',
    'dr_min_all', 'dr_max_all', 'ge_min_all', 'ge_max_all',
    'd_key']

# A database entry is only scored when at least this many shared samples remain.
MIN_SHARED_SAMPLES = 10


def _describe(values):
    """Return (mean, std, min, 25th/50th/75th percentile, max) of `values`."""
    return (np.mean(values), np.std(values), min(values),
            np.percentile(values, 25), np.percentile(values, 50),
            np.percentile(values, 75), max(values))


def _distance_to(values, reference):
    """Return the Euclidean distance of `values` from the constant `reference`."""
    return math.sqrt(np.sum([np.power(v - reference, 2) for v in values]))


def _pair_features(drug, gene, ge_key):
    """Build the feature row of one (drug, gene) pair for one gene-effect type.

    Every entry of `drug_ccl_db` that contains `drug` is tried. For each entry
    the gene's values below `ge_cut[ge_key]` are matched, by index label, with
    the entry's `['dr'][ge_key]` series; entries with fewer than
    `MIN_SHARED_SAMPLES` matches are skipped. The entry with the largest
    Pearson correlation supplies ALL features, including the sample count
    (ties go to the entry visited last).

    Returns
    -------
    list or None
        Values in `FEATURE_COLUMNS` order, or None when `gene` has no column in
        `ge_matrix_dict[ge_key]` or no entry qualifies.
    """
    ge_table = ge_matrix_dict[ge_key]
    if gene not in ge_table.columns:
        return None

    ge = ge_table[gene].dropna()
    ge_f = ge[ge < ge_cut[ge_key]]

    best_corr = None
    best_row = None
    for drug_db_key, drug_db in drug_ccl_db.items():
        if drug not in drug_db:
            continue
        dr_key = drug_db[drug]['dr'][ge_key]

        # Index.intersection gives a reproducible sample order; a set does not.
        ge_final = ge_f.loc[ge_f.index.intersection(dr_key.index)]
        dr_final = dr_key.loc[ge_final.index]
        if len(dr_final) < MIN_SHARED_SAMPLES:
            continue

        corr, p_value = pearsonr(dr_final, ge_final)
        # A NaN correlation is skipped: kept in a running maximum it would
        # prevent every later entry from being selected.
        if np.isnan(corr) or (best_corr is not None and corr < best_corr):
            continue
        best_corr = corr

        # OLS of the gene values on the `dr` values, with an intercept.
        ols_fit = sm.OLS(ge_final, sm.add_constant(dr_final)).fit()
        # Read by position through arrays; integer keys on a label-indexed
        # Series are deprecated in pandas.
        ols_coeff = np.asarray(ols_fit.params)
        ols_se = np.asarray(ols_fit.bse)

        # Gene values with everything above zero replaced by zero.
        ge_e = [0 if k > 0 else k for k in ge]

        dr_mean, dr_std, dr_min, dr_25, dr_50, dr_75, dr_max = _describe(dr_final)
        ge_mean, ge_std, ge_min, ge_25, ge_50, ge_75, ge_max = _describe(ge_final)
        (dr_mean_all, dr_std_all, dr_min_all,
         dr_25_all, dr_50_all, dr_75_all, dr_max_all) = _describe(dr_key)
        (ge_mean_all, ge_std_all, ge_min_all,
         ge_25_all, ge_50_all, ge_75_all, ge_max_all) = _describe(ge_e)

        # The sample count is captured here, together with the other features,
        # so it always belongs to the selected entry.
        best_row = [len(dr_final), corr, p_value,
                    ols_coeff[0], ols_se[0],
                    ols_coeff[1], ols_se[1],
                    _distance_to(dr_final, 1), _distance_to(ge_final, 0),
                    _distance_to(dr_key, 1), _distance_to(ge_e, 0),
                    dr_mean, ge_mean, dr_std, ge_std,
                    dr_25, dr_50, dr_75, ge_25, ge_50, ge_75,
                    dr_min, dr_max, ge_min, ge_max,
                    dr_mean_all, ge_mean_all, dr_std_all, ge_std_all,
                    dr_25_all, dr_50_all, dr_75_all, ge_25_all, ge_50_all, ge_75_all,
                    dr_min_all, dr_max_all, ge_min_all, ge_max_all,
                    drug_db_key]
    return best_row


ft_posiRanNeg_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    dgs_list = []
    ft_all_list = []
    for dg in tqdm(dgs_posi):
        drug, gene = dg[0], dg[1]
        features = _pair_features(drug, gene, ge_key)
        if features is not None:
            dgs_list.append((drug, gene))
            ft_all_list.append(features)

    ft_posiRanNeg_key[ge_key] = pd.DataFrame(ft_all_list, index=dgs_list,
                                             columns=FEATURE_COLUMNS)

In [None]:
# feature matrices extraction --- negative (RanNeg model)

# Group the positive pairs that received features by their drug.
drug_dg_posi_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    drug_dg_posi = {}
    for dg in ft_posiRanNeg_key[ge_key].index:
        drug_dg_posi.setdefault(dg[0], set()).add(dg)
    drug_dg_posi_key[ge_key] = drug_dg_posi

# `drug_target_posi_key` was read below but never assigned in this notebook.
# NOTE(review): it is derived here as drug -> set of genes of the drug's positive
# pairs; confirm this matches the intended definition.
drug_target_posi_key = {
    ge_key: {drug: {dg[1] for dg in dgs}
             for drug, dgs in drug_dg_posi_key[ge_key].items()}
    for ge_key in ['CRISPR', 'RNAi']}

# complex id -> set of subunit names (the column holds ';'-separated names).
complex_subunit = {}
for c in tqdm(corum_human.index):
    complex_subunit[c] = set(corum_human['subunits_gene_name'][c].split(';'))

# Per drug: pairs with the non-target subunits of every complex that contains
# one of the drug's positive targets, restricted to the pairs in `dgs_idf_key`.
# Stored under its own name: the original reused `drug_dg_nega_key`, emptied it
# again further down and then read from the empty dict (KeyError).
drug_dg_nega_corum_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    dgs_idf = dgs_idf_key[ge_key]
    drug_target_posi = drug_target_posi_key[ge_key]
    drug_dg_nega_corum = {}
    for drug, targets_posi in tqdm(drug_target_posi.items()):
        dgs_nega = set()
        for subunits in complex_subunit.values():
            if subunits & targets_posi:
                dgs_nega.update((drug, gene) for gene in subunits - targets_posi)
        dgs_nega = dgs_nega & dgs_idf
        if len(dgs_nega) > 0:
            drug_dg_nega_corum[drug] = dgs_nega

    print(len(drug_dg_nega_corum))
    drug_dg_nega_corum_key[ge_key] = drug_dg_nega_corum

# Group every pair of `dgs_idf_key` by its drug.
drug_dg_all_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    drug_dg_all = {}
    for dg in tqdm(dgs_idf_key[ge_key]):
        drug_dg_all.setdefault(dg[0], set()).add(dg)
    drug_dg_all_key[ge_key] = drug_dg_all

# Per drug: sorted tuple of all its pairs that are neither positive nor among
# the complex-derived pairs above; later cells sample negatives from it.
drug_dg_nega_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    drug_dg_nega_corum = drug_dg_nega_corum_key[ge_key]
    drug_dg_all = drug_dg_all_key[ge_key]
    drug_dg_posi = drug_dg_posi_key[ge_key]
    drug_dg_nega = {}
    for drug, posi_pairs in tqdm(drug_dg_posi.items()):
        corum_pairs = drug_dg_nega_corum.get(drug, set())
        drug_dg_nega[drug] = tuple(sorted(drug_dg_all[drug] - posi_pairs - corum_pairs))
    drug_dg_nega_key[ge_key] = drug_dg_nega

# Names of the feature columns, in the order `_pair_features` returns them.
FEATURE_COLUMNS = [
    'SCCLs_number', 'pearson_coeff', 'pearson_p',
    'ols_coeff_0', 'ols_se_0',
    'ols_coeff_1', 'ols_se_1',
    'dr_sens', 'ge_essen',
    'dr_sens_all', 'ge_essen_all',
    'dr_mean', 'ge_mean', 'dr_std', 'ge_std',
    'dr_25', 'dr_50', 'dr_75', 'ge_25', 'ge_50', 'ge_75',
    'dr_min', 'dr_max', 'ge_min', 'ge_max',
    'dr_mean_all', 'ge_mean_all', 'dr_std_all', 'ge_std_all',
    'dr_25_all', 'dr_50_all', 'dr_75_all', 'ge_25_all', 'ge_50_all', 'ge_75_all',
    'dr_min_all', 'dr_max_all', 'ge_min_all', 'ge_max_all',
    'd_key']

# A database entry is only scored when at least this many shared samples remain.
MIN_SHARED_SAMPLES = 10


def _describe(values):
    """Return (mean, std, min, 25th/50th/75th percentile, max) of `values`."""
    return (np.mean(values), np.std(values), min(values),
            np.percentile(values, 25), np.percentile(values, 50),
            np.percentile(values, 75), max(values))


def _distance_to(values, reference):
    """Return the Euclidean distance of `values` from the constant `reference`."""
    return math.sqrt(np.sum([np.power(v - reference, 2) for v in values]))


def _pair_features(drug, gene, ge_key):
    """Build the feature row of one (drug, gene) pair for one gene-effect type.

    Every entry of `drug_ccl_db` that contains `drug` is tried. For each entry
    the gene's values below `ge_cut[ge_key]` are matched, by index label, with
    the entry's `['dr'][ge_key]` series; entries with fewer than
    `MIN_SHARED_SAMPLES` matches are skipped. The entry with the largest
    Pearson correlation supplies ALL features, including the sample count
    (ties go to the entry visited last).

    Returns
    -------
    list or None
        Values in `FEATURE_COLUMNS` order, or None when `gene` has no column in
        `ge_matrix_dict[ge_key]` or no entry qualifies.
    """
    ge_table = ge_matrix_dict[ge_key]
    if gene not in ge_table.columns:
        return None

    ge = ge_table[gene].dropna()
    ge_f = ge[ge < ge_cut[ge_key]]

    best_corr = None
    best_row = None
    for drug_db_key, drug_db in drug_ccl_db.items():
        if drug not in drug_db:
            continue
        dr_key = drug_db[drug]['dr'][ge_key]

        # Index.intersection gives a reproducible sample order; a set does not.
        ge_final = ge_f.loc[ge_f.index.intersection(dr_key.index)]
        dr_final = dr_key.loc[ge_final.index]
        if len(dr_final) < MIN_SHARED_SAMPLES:
            continue

        corr, p_value = pearsonr(dr_final, ge_final)
        # A NaN correlation is skipped: kept in a running maximum it would
        # prevent every later entry from being selected.
        if np.isnan(corr) or (best_corr is not None and corr < best_corr):
            continue
        best_corr = corr

        # OLS of the gene values on the `dr` values, with an intercept.
        ols_fit = sm.OLS(ge_final, sm.add_constant(dr_final)).fit()
        # Read by position through arrays; integer keys on a label-indexed
        # Series are deprecated in pandas.
        ols_coeff = np.asarray(ols_fit.params)
        ols_se = np.asarray(ols_fit.bse)

        # Gene values with everything above zero replaced by zero.
        ge_e = [0 if k > 0 else k for k in ge]

        dr_mean, dr_std, dr_min, dr_25, dr_50, dr_75, dr_max = _describe(dr_final)
        ge_mean, ge_std, ge_min, ge_25, ge_50, ge_75, ge_max = _describe(ge_final)
        (dr_mean_all, dr_std_all, dr_min_all,
         dr_25_all, dr_50_all, dr_75_all, dr_max_all) = _describe(dr_key)
        (ge_mean_all, ge_std_all, ge_min_all,
         ge_25_all, ge_50_all, ge_75_all, ge_max_all) = _describe(ge_e)

        # The sample count is captured here, together with the other features,
        # so it always belongs to the selected entry.
        best_row = [len(dr_final), corr, p_value,
                    ols_coeff[0], ols_se[0],
                    ols_coeff[1], ols_se[1],
                    _distance_to(dr_final, 1), _distance_to(ge_final, 0),
                    _distance_to(dr_key, 1), _distance_to(ge_e, 0),
                    dr_mean, ge_mean, dr_std, ge_std,
                    dr_25, dr_50, dr_75, ge_25, ge_50, ge_75,
                    dr_min, dr_max, ge_min, ge_max,
                    dr_mean_all, ge_mean_all, dr_std_all, ge_std_all,
                    dr_25_all, dr_50_all, dr_75_all, ge_25_all, ge_50_all, ge_75_all,
                    dr_min_all, dr_max_all, ge_min_all, ge_max_all,
                    drug_db_key]
    return best_row


# For every seed: draw, per drug, as many negatives as the drug has positives,
# then compute the feature frame of the drawn pairs.
ft_negaRanNeg_seed_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    ft_negaRanNeg_seed_key[ge_key] = {}
    drug_dg_posi = drug_dg_posi_key[ge_key]
    drug_dg_nega = drug_dg_nega_key[ge_key]
    for s in seeds:
        dgs_nega_all = set()
        for drug, posi_pairs in drug_dg_posi.items():
            # Reseeded before every draw (as in the original), so a drug's
            # sample depends only on the seed and the drug's own pool.
            setup_seed(s)
            dgs_nega_all.update(random.sample(drug_dg_nega[drug], len(posi_pairs)))
        print(len(dgs_nega_all))

        dgs_list = []
        ft_all_list = []
        for dg in tqdm(dgs_nega_all):
            drug, gene = dg[0], dg[1]
            features = _pair_features(drug, gene, ge_key)
            if features is not None:
                dgs_list.append((drug, gene))
                ft_all_list.append(features)

        ft_negaRanNeg_seed_key[ge_key][s] = pd.DataFrame(ft_all_list, index=dgs_list,
                                                         columns=FEATURE_COLUMNS)

In [None]:
# feature matrices extraction --- positive (DecoyNeg model)

# Names of the feature columns, in the order `_pair_features` returns them.
FEATURE_COLUMNS = [
    'SCCLs_number', 'pearson_coeff', 'pearson_p',
    'ols_coeff_0', 'ols_se_0',
    'ols_coeff_1', 'ols_se_1',
    'dr_sens', 'ge_essen',
    'dr_sens_all', 'ge_essen_all',
    'dr_mean', 'ge_mean', 'dr_std', 'ge_std',
    'dr_25', 'dr_50', 'dr_75', 'ge_25', 'ge_50', 'ge_75',
    'dr_min', 'dr_max', 'ge_min', 'ge_max',
    'dr_mean_all', 'ge_mean_all', 'dr_std_all', 'ge_std_all',
    'dr_25_all', 'dr_50_all', 'dr_75_all', 'ge_25_all', 'ge_50_all', 'ge_75_all',
    'dr_min_all', 'dr_max_all', 'ge_min_all', 'ge_max_all',
    'd_key']

# A database entry is only scored when at least this many shared samples remain.
MIN_SHARED_SAMPLES = 10


def _describe(values):
    """Return (mean, std, min, 25th/50th/75th percentile, max) of `values`."""
    return (np.mean(values), np.std(values), min(values),
            np.percentile(values, 25), np.percentile(values, 50),
            np.percentile(values, 75), max(values))


def _distance_to(values, reference):
    """Return the Euclidean distance of `values` from the constant `reference`."""
    return math.sqrt(np.sum([np.power(v - reference, 2) for v in values]))


def _pair_features(drug, gene, ge_key):
    """Build the feature row of one (drug, gene) pair for one gene-effect type.

    Every entry of `drug_ccl_db` that contains `drug` is tried. For each entry
    the gene's values below `ge_cut[ge_key]` are matched, by index label, with
    the entry's `['dr'][ge_key]` series; entries with fewer than
    `MIN_SHARED_SAMPLES` matches are skipped. The entry with the largest
    Pearson correlation supplies ALL features, including the sample count
    (ties go to the entry visited last).

    Returns
    -------
    list or None
        Values in `FEATURE_COLUMNS` order, or None when `gene` has no column in
        `ge_matrix_dict[ge_key]` or no entry qualifies.
    """
    ge_table = ge_matrix_dict[ge_key]
    if gene not in ge_table.columns:
        return None

    ge = ge_table[gene].dropna()
    ge_f = ge[ge < ge_cut[ge_key]]

    best_corr = None
    best_row = None
    for drug_db_key, drug_db in drug_ccl_db.items():
        if drug not in drug_db:
            continue
        dr_key = drug_db[drug]['dr'][ge_key]

        # Index.intersection gives a reproducible sample order; a set does not.
        ge_final = ge_f.loc[ge_f.index.intersection(dr_key.index)]
        dr_final = dr_key.loc[ge_final.index]
        if len(dr_final) < MIN_SHARED_SAMPLES:
            continue

        # Pearson correlation coefficient
        corr, p_value = pearsonr(dr_final, ge_final)
        # A NaN correlation is skipped: kept in a running maximum it would
        # prevent every later entry from being selected.
        if np.isnan(corr) or (best_corr is not None and corr < best_corr):
            continue
        best_corr = corr

        # OLS model of the gene values on the `dr` values, with an intercept.
        ols_fit = sm.OLS(ge_final, sm.add_constant(dr_final)).fit()
        # Read by position through arrays; integer keys on a label-indexed
        # Series are deprecated in pandas.
        ols_coeff = np.asarray(ols_fit.params)
        ols_se = np.asarray(ols_fit.bse)

        # Gene values with everything above zero replaced by zero.
        ge_e = [0 if k > 0 else k for k in ge]

        dr_mean, dr_std, dr_min, dr_25, dr_50, dr_75, dr_max = _describe(dr_final)
        ge_mean, ge_std, ge_min, ge_25, ge_50, ge_75, ge_max = _describe(ge_final)
        (dr_mean_all, dr_std_all, dr_min_all,
         dr_25_all, dr_50_all, dr_75_all, dr_max_all) = _describe(dr_key)
        (ge_mean_all, ge_std_all, ge_min_all,
         ge_25_all, ge_50_all, ge_75_all, ge_max_all) = _describe(ge_e)

        # The sample count is captured here, together with the other features,
        # so it always belongs to the selected entry.
        best_row = [len(dr_final), corr, p_value,
                    ols_coeff[0], ols_se[0],
                    ols_coeff[1], ols_se[1],
                    _distance_to(dr_final, 1), _distance_to(ge_final, 0),
                    _distance_to(dr_key, 1), _distance_to(ge_e, 0),
                    dr_mean, ge_mean, dr_std, ge_std,
                    dr_25, dr_50, dr_75, ge_25, ge_50, ge_75,
                    dr_min, dr_max, ge_min, ge_max,
                    dr_mean_all, ge_mean_all, dr_std_all, ge_std_all,
                    dr_25_all, dr_50_all, dr_75_all, ge_25_all, ge_50_all, ge_75_all,
                    dr_min_all, dr_max_all, ge_min_all, ge_max_all,
                    drug_db_key]
    return best_row


# Positive pairs of every drug that has an entry in `drug_dg_nega_key`.
# NOTE(review): `drug_dg_nega_key` is last assigned by the RanNeg cell, where it
# holds each drug's pool of random negatives; confirm that this, and not the
# CORUM-derived decoy sets, is what the DecoyNeg model should be keyed on.
dgs_posi_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    drug_dg_posi = drug_dg_posi_key[ge_key]
    posi_pairs = set()
    for drug in drug_dg_nega_key[ge_key].keys():
        posi_pairs.update(drug_dg_posi[drug])
    dgs_posi_key[ge_key] = posi_pairs

ft_posiDecoyNeg_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    dgs_list = []
    ft_all_list = []
    for dg in tqdm(dgs_posi_key[ge_key]):
        drug, gene = dg[0], dg[1]
        features = _pair_features(drug, gene, ge_key)
        if features is not None:
            dgs_list.append((drug, gene))
            ft_all_list.append(features)

    ft_posiDecoyNeg_key[ge_key] = pd.DataFrame(ft_all_list, index=dgs_list,
                                               columns=FEATURE_COLUMNS)

# feature matrices extraction --- negative (DecoyNeg model)

# Names of the feature columns, in the order `_pair_features` returns them.
FEATURE_COLUMNS = [
    'SCCLs_number', 'pearson_coeff', 'pearson_p',
    'ols_coeff_0', 'ols_se_0',
    'ols_coeff_1', 'ols_se_1',
    'dr_sens', 'ge_essen',
    'dr_sens_all', 'ge_essen_all',
    'dr_mean', 'ge_mean', 'dr_std', 'ge_std',
    'dr_25', 'dr_50', 'dr_75', 'ge_25', 'ge_50', 'ge_75',
    'dr_min', 'dr_max', 'ge_min', 'ge_max',
    'dr_mean_all', 'ge_mean_all', 'dr_std_all', 'ge_std_all',
    'dr_25_all', 'dr_50_all', 'dr_75_all', 'ge_25_all', 'ge_50_all', 'ge_75_all',
    'dr_min_all', 'dr_max_all', 'ge_min_all', 'ge_max_all',
    'd_key']

# A database entry is only scored when at least this many shared samples remain.
MIN_SHARED_SAMPLES = 10


def _describe(values):
    """Return (mean, std, min, 25th/50th/75th percentile, max) of `values`."""
    return (np.mean(values), np.std(values), min(values),
            np.percentile(values, 25), np.percentile(values, 50),
            np.percentile(values, 75), max(values))


def _distance_to(values, reference):
    """Return the Euclidean distance of `values` from the constant `reference`."""
    return math.sqrt(np.sum([np.power(v - reference, 2) for v in values]))


def _pair_features(drug, gene, ge_key):
    """Build the feature row of one (drug, gene) pair for one gene-effect type.

    Every entry of `drug_ccl_db` that contains `drug` is tried. For each entry
    the gene's values below `ge_cut[ge_key]` are matched, by index label, with
    the entry's `['dr'][ge_key]` series; entries with fewer than
    `MIN_SHARED_SAMPLES` matches are skipped. The entry with the largest
    Pearson correlation supplies ALL features, including the sample count
    (ties go to the entry visited last).

    Returns
    -------
    list or None
        Values in `FEATURE_COLUMNS` order, or None when `gene` has no column in
        `ge_matrix_dict[ge_key]` or no entry qualifies.
    """
    ge_table = ge_matrix_dict[ge_key]
    if gene not in ge_table.columns:
        return None

    ge = ge_table[gene].dropna()
    ge_f = ge[ge < ge_cut[ge_key]]

    best_corr = None
    best_row = None
    for drug_db_key, drug_db in drug_ccl_db.items():
        if drug not in drug_db:
            continue
        dr_key = drug_db[drug]['dr'][ge_key]

        # Index.intersection gives a reproducible sample order; a set does not.
        ge_final = ge_f.loc[ge_f.index.intersection(dr_key.index)]
        dr_final = dr_key.loc[ge_final.index]
        if len(dr_final) < MIN_SHARED_SAMPLES:
            continue

        # Pearson correlation coefficient
        corr, p_value = pearsonr(dr_final, ge_final)
        # A NaN correlation is skipped: kept in a running maximum it would
        # prevent every later entry from being selected.
        if np.isnan(corr) or (best_corr is not None and corr < best_corr):
            continue
        best_corr = corr

        # OLS model of the gene values on the `dr` values, with an intercept.
        ols_fit = sm.OLS(ge_final, sm.add_constant(dr_final)).fit()
        # Read by position through arrays; integer keys on a label-indexed
        # Series are deprecated in pandas.
        ols_coeff = np.asarray(ols_fit.params)
        ols_se = np.asarray(ols_fit.bse)

        # Gene values with everything above zero replaced by zero.
        ge_e = [0 if k > 0 else k for k in ge]

        dr_mean, dr_std, dr_min, dr_25, dr_50, dr_75, dr_max = _describe(dr_final)
        ge_mean, ge_std, ge_min, ge_25, ge_50, ge_75, ge_max = _describe(ge_final)
        (dr_mean_all, dr_std_all, dr_min_all,
         dr_25_all, dr_50_all, dr_75_all, dr_max_all) = _describe(dr_key)
        (ge_mean_all, ge_std_all, ge_min_all,
         ge_25_all, ge_50_all, ge_75_all, ge_max_all) = _describe(ge_e)

        # The sample count is captured here, together with the other features,
        # so it always belongs to the selected entry.
        best_row = [len(dr_final), corr, p_value,
                    ols_coeff[0], ols_se[0],
                    ols_coeff[1], ols_se[1],
                    _distance_to(dr_final, 1), _distance_to(ge_final, 0),
                    _distance_to(dr_key, 1), _distance_to(ge_e, 0),
                    dr_mean, ge_mean, dr_std, ge_std,
                    dr_25, dr_50, dr_75, ge_25, ge_50, ge_75,
                    dr_min, dr_max, ge_min, ge_max,
                    dr_mean_all, ge_mean_all, dr_std_all, ge_std_all,
                    dr_25_all, dr_50_all, dr_75_all, ge_25_all, ge_50_all, ge_75_all,
                    dr_min_all, dr_max_all, ge_min_all, ge_max_all,
                    drug_db_key]
    return best_row


# For every seed: draw, per drug, as many negatives as the drug has positives
# (all of them when the drug has fewer), then compute their feature frame.
# NOTE(review): `drug_dg_nega_key` is last assigned by the RanNeg cell, where it
# holds each drug's pool of random negatives; confirm that this, and not the
# CORUM-derived decoy sets, is what the DecoyNeg model should sample from.
ft_negaDecoyNeg_seed_key = {}
for ge_key in ['CRISPR', 'RNAi']:
    ft_negaDecoyNeg_seed_key[ge_key] = {}
    drug_dg_posi = drug_dg_posi_key[ge_key]
    drug_dg_nega = drug_dg_nega_key[ge_key]
    for s in seeds:
        n = 0  # running shortfall: positives that could not be matched by a negative
        dgs_nega_all = set()
        for drug in drug_dg_nega.keys():
            n_posi = len(drug_dg_posi[drug])
            # Reseeded before every draw (as in the original), so a drug's
            # sample depends only on the seed and the drug's own candidates.
            setup_seed(s)
            candidates = sorted(drug_dg_nega[drug])
            if len(candidates) >= n_posi:
                candidates = random.sample(candidates, n_posi)
            else:
                # Too few candidates: keep them all and report the shortfall.
                n = n + n_posi - len(candidates)
                print(n)
            dgs_nega_all.update(candidates)
        print(len(dgs_nega_all))

        dgs_list = []
        ft_all_list = []
        for dg in tqdm(dgs_nega_all):
            drug, gene = dg[0], dg[1]
            features = _pair_features(drug, gene, ge_key)
            if features is not None:
                dgs_list.append((drug, gene))
                ft_all_list.append(features)

        ft_negaDecoyNeg_seed_key[ge_key][s] = pd.DataFrame(ft_all_list, index=dgs_list,
                                                           columns=FEATURE_COLUMNS)