In [11]:
import itertools
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

In [9]:
def setup_seed(seed):
    """Reset the global random state used by the sampling cells.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, then stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): the interpreter reads ``PYTHONHASHSEED`` at start-up, so
    writing it here presumably does not change string hashing (and therefore
    set iteration order) in the already-running kernel -- confirm if set
    ordering is relied upon anywhere.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [13]:
# preparing datasets
# CORUM human complexes, re-indexed by complex id so single rows can be
# looked up by id further down.
corum_human = pd.read_table('../general_datasets/corum_humanComplexes.txt')
corum_human.index = corum_human['complex_id']

cl_lst = ['A375','HCT116','HL60','MCF7','K562']

# (workbook, sheet) holding the table of each cell line.
_cl_sheet = {
    'A375': ('TPCA_TPI_datasets/aan0346__Tables_S19_to_S27.xlsx', 'Table S19'),
    'HCT116': ('TPCA_TPI_datasets/aan0346__Tables_S19_to_S27.xlsx', 'Table S20'),
    'HL60': ('TPCA_TPI_datasets/aan0346__Tables_S19_to_S27.xlsx', 'Table S22'),
    'MCF7': ('TPCA_TPI_datasets/aan0346__Tables_S19_to_S27.xlsx', 'Table S23'),
    'K562': ('TPCA_TPI_datasets/aan0346_Tables_S1_to_S18.xlsx', 'Table S7'),
}

# Per cell line: skip the two leading rows of the sheet and keep the first
# 19 columns only.
cl_data = {}
for cl in cl_lst:
    _workbook, _sheet = _cl_sheet[cl]
    cl_data[cl] = pd.read_excel(_workbook, sheet_name=_sheet, skiprows=2).iloc[:, :19]

# cl_gene_pro[cell line][gene name] -> set of 'Accession' values listed for
# that gene name (rows without a gene name are ignored).
cl_gene_pro = {}
for cl in cl_lst:
    table = cl_data[cl]
    cl_gene_pro[cl] = {
        gene: set(table.loc[table['GeneName'] == gene, 'Accession'])
        for gene in set(table['GeneName'].dropna())
    }

# Gene names present in every cell line.
genes_ms = set.intersection(*(set(cl_gene_pro[cl].keys()) for cl in cl_lst[:5]))

# subunit_id[complex id] -> alphabetically sorted tuple of the complex's
# subunit gene names that are present in every cell line.
subunit_id = {
    cid: tuple(sorted(set(corum_human['subunits_gene_name'][cid].split(';')) & genes_ms))
    for cid in corum_human.index
}

##### 1. Benchmark datasets for functional ternary protein interactions

In [None]:
# sampling positive gene triplets
# NOTE(review): `seeds` (the iterable of random seeds) is not defined in the
# visible cells; it must exist before this cell is run.

# Complexes with at most this many usable subunits contribute all of their
# triplets; larger complexes contribute a random subset.
MAX_SUBUNITS_FOR_ALL_TRIPLETS = 5
# Number of triplets drawn per subunit for the larger complexes.
TRIPLETS_PER_SUBUNIT = 3


def _gene_triplets(subunits):
    """Return every 3-gene combination of ``subunits``.

    Each triplet is an alphabetically ordered tuple and the returned list is
    itself sorted, so the list handed to ``random.sample`` does not depend on
    the order of ``subunits``.
    """
    return sorted(tuple(sorted(combination))
                  for combination in itertools.combinations(subunits, 3))


# posit_id[complex id] -> all triplets of that complex (complexes with fewer
# than three usable subunits are left out); posit_all is their union.
posit_id = {}
posit_all = set()
for i in tqdm(subunit_id.keys()):
    subunits = subunit_id[i]
    if len(subunits) >= 3:
        posits = set(_gene_triplets(subunits))
        posit_id[i] = posits
        posit_all |= posits

# Per seed:
#   allposit_seed[s]          -> set of all positive triplets selected for seed s
#   posit_id_seed[s][i]       -> triplets selected from complex i
#   posisubunit_id_seed[s][i] -> genes of complex i that occur in those triplets
allposit_seed = {}
posit_id_seed = {}
posisubunit_id_seed = {}
for i in tqdm(subunit_id.keys()):
    subunits = subunit_id[i]
    if len(subunits) < 3:
        continue
    combinations = _gene_triplets(subunits)
    for s in seeds:
        if len(subunits) <= MAX_SUBUNITS_FOR_ALL_TRIPLETS:
            # Small complex: keep every triplet (no randomness involved).
            chosen = combinations
        else:
            # Large complex: reseed, then draw a fixed number of triplets.
            setup_seed(s)
            chosen = random.sample(combinations, len(subunits) * TRIPLETS_PER_SUBUNIT)

        triangles = set(chosen)
        posisubunits = set().union(*chosen)

        # setdefault creates the per-seed container on first use; this
        # replaces the former bare `except:` which would also have hidden
        # unrelated errors.
        allposit_seed.setdefault(s, set()).update(triangles)
        posit_id_seed.setdefault(s, {})[i] = triangles
        posisubunit_id_seed.setdefault(s, {})[i] = posisubunits

In [None]:
# sampling negative gene triplets --- RanNeg
# A RanNeg triplet takes one gene from each of three different complexes and
# is kept only if no complex in `subunit_id` contains two or more of its genes.

# Number of complexes drawn from each of the three groups
# (-> N**3 candidate complex triples per seed).
N_COMPLEXES_PER_GROUP = 50

# Subunit sets of all complexes, built once instead of once per candidate.
# Complexes with fewer than two subunits can never share more than one gene
# with a triplet, so they are skipped.
_complex_subunit_sets = [set(members) for members in subunit_id.values() if len(members) > 1]

negat_RanNeg_seed = {}
for s in seeds:
    posisubunit_id = posisubunit_id_seed[s]
    ids = sorted(list(posisubunit_id.keys()))

    setup_seed(s)
    random.shuffle(ids)

    # Split the shuffled complex ids into three disjoint groups; the last
    # group takes the remainder.
    third = len(ids) // 3
    list1 = ids[:third]
    list2 = ids[third:2 * third]
    list3 = ids[2 * third:]

    # Raises ValueError if a group holds fewer than N_COMPLEXES_PER_GROUP ids.
    setup_seed(s)
    g1 = random.sample(list1, N_COMPLEXES_PER_GROUP)
    g2 = random.sample(list2, N_COMPLEXES_PER_GROUP)
    g3 = random.sample(list3, N_COMPLEXES_PER_GROUP)

    ids_ = sorted(list(itertools.product(g1, g2, g3)))

    negat = set()
    for i in tqdm(ids_):
        c1, c2, c3 = sorted(i)
        subunits_1 = sorted(posisubunit_id[c1])
        subunits_2 = sorted(posisubunit_id[c2])
        subunits_3 = sorted(posisubunit_id[c3])

        # NOTE(review): the generator is reseeded for every candidate, so the
        # three draws below always start from the same random state. Kept
        # unchanged so the generated benchmark stays identical -- confirm
        # this is intended.
        setup_seed(s)
        s_1 = random.choice(subunits_1)
        s_2 = random.choice(subunits_2)
        s_3 = random.choice(subunits_3)

        t = {s_1, s_2, s_3}
        if len(t) != 3:
            continue
        # Reject the triplet as soon as one complex holds two of its genes.
        if any(len(members & t) > 1 for members in _complex_subunit_sets):
            continue
        negat.add(tuple(sorted(t)))

    # Sort first so the seeded shuffle always starts from the same order.
    negat = sorted(negat)
    setup_seed(s)
    random.shuffle(negat)
    negat = tuple(negat)
    negat_RanNeg_seed[s] = negat

# sampling negative gene triplets --- DecoyNeg
# A DecoyNeg triplet keeps two genes of a positive triplet and replaces the
# third by a "decoy" gene taken from a complex that shares no gene with the
# positive triplet. Triplets that are positives themselves are removed.

# Number of complexes used as decoy sources per seed.
N_DECOY_COMPLEXES = 100
# Number of decoy genes combined with each positive triplet.
N_DECOYS_PER_TRIPLET = 5

negat_DecoyNeg_seed = {}
for s in seeds:
    negat = set()
    allposits = allposit_seed[s]
    posisubunit_id = posisubunit_id_seed[s]

    # Raises ValueError if fewer than N_DECOY_COMPLEXES complexes exist.
    ids = sorted(list(posisubunit_id.keys()))
    setup_seed(s)
    ids = random.sample(ids, N_DECOY_COMPLEXES)

    # The generator is reseeded before every single-gene draw, so the gene
    # drawn from a given complex is the same for every positive triplet.
    # Draw it once per complex instead of once per (triplet, complex) pair.
    decoy_of = {}
    for i in ids:
        setup_seed(s)
        decoy_of[i] = random.sample(sorted(posisubunit_id[i]), 1)[0]

    for t in tqdm(allposits):
        members = set(t)
        # Decoy genes from all source complexes disjoint from this triplet.
        negasubunits = sorted({decoy_of[i] for i in ids
                               if members.isdisjoint(posisubunit_id[i])})
        # Raises ValueError if fewer than N_DECOYS_PER_TRIPLET decoys remain.
        setup_seed(s)
        negasubunits = random.sample(negasubunits, N_DECOYS_PER_TRIPLET)

        # Pair every decoy with each of the three gene pairs of the triplet.
        for subunit in negasubunits:
            for pair in itertools.combinations(t, 2):
                negat.add(tuple(sorted(pair + (subunit,))))

    negat = negat-posit_all

    # Sort first so the seeded shuffle always starts from the same order.
    negat = sorted(list(negat))
    setup_seed(s)
    random.shuffle(negat)
    negat = tuple(negat)

    negat_DecoyNeg_seed[s] = negat

##### 2. Extraction of feature matrices for model training

In [None]:
# positive examples
# NOTE(review): `functions` (which provides `complex_dist`) and `seeds` are
# not imported/defined in the visible cells; both must exist before this
# cell is run.

# posiprot_genet_seed_cl[cell line][seed][gene triplet] -> set of
# ';'-joined, alphabetically ordered protein triplets (one protein per gene,
# all three distinct).
posiprot_genet_seed_cl = {}
for cl in cl_lst:
    gene_pro = cl_gene_pro[cl]
    posiprot_genet_seed_cl[cl] = {}
    for s in tqdm(seeds):
        posiprot_genet_seed_cl[cl][s] = {}
        # allposit_seed[s] is a set of string tuples; its iteration order can
        # change from one kernel session to the next. Iterating in sorted
        # order makes the row order of ft_posi_seed reproducible.
        for t in sorted(allposit_seed[s]):
            genes = sorted(t)
            # Use the triplet only if all three genes are known in this cell line.
            if len({g for g in genes if g in gene_pro}) == 3:
                posiprot_genet_seed_cl[cl][s][t] = {
                    ';'.join(sorted(combination))
                    for combination in itertools.product(*(gene_pro[g] for g in genes))
                    if len(set(combination)) == 3
                }

# posippi_seed[seed] -> set of the gene pairs contained in the positive triplets.
posippi_seed = {}
for s in seeds:
    posippi_seed[s] = {pair
                       for t in allposit_seed[s]
                       for pair in itertools.combinations(t, 2)}

# proppi_seed_cl[cell line][seed]         -> DataFrame with one column,
#                                            'subunits_protein', listing all protein pairs
# proppi_geneppi_seed_cl[cell line][seed] -> {gene pair: its protein pairs}
proppi_seed_cl = {}
proppi_geneppi_seed_cl = {}
for cl in cl_lst:
    gene_pro = cl_gene_pro[cl]
    proppi_seed = {}
    proppi_geneppi_seed = {}
    for s in tqdm(seeds):
        proppi_geneppi = {}
        proppi_set = set()
        for ppi in posippi_seed[s]:
            genes = sorted(ppi)
            if len({g for g in genes if g in gene_pro}) == 2:
                proppi_set_ = {
                    ';'.join(sorted(combination))
                    for combination in itertools.product(gene_pro[genes[0]], gene_pro[genes[1]])
                    if len(set(combination)) == 2
                }
                proppi_set |= proppi_set_
                proppi_geneppi[ppi] = proppi_set_

        proppi_df = pd.DataFrame()
        proppi_df['subunits_protein'] = sorted(proppi_set)

        proppi_seed[s] = proppi_df
        proppi_geneppi_seed[s] = proppi_geneppi
    proppi_seed_cl[cl] = proppi_seed
    proppi_geneppi_seed_cl[cl] = proppi_geneppi_seed

# Distance table per cell line and seed. The result of complex_dist is used
# below as a DataFrame with 'subunits_protein' and 'Avg_Dist' columns.
eucl_seed_cl = {}
for cl in cl_lst:
    eucl_seed = {}
    for s in seeds:
        eucl = functions.complex_dist(cl_data[cl], proppi_seed_cl[cl][s], method='euclidean')
        print(len(eucl))
        eucl_seed[s] = eucl
    eucl_seed_cl[cl] = eucl_seed

# posieucl_geneppi_seed_cl[cell line][seed][gene pair] -> mean 'Avg_Dist'
# over the protein pairs of that gene pair.
posieucl_geneppi_seed_cl = {}
for cl in cl_lst:
    posieucl_geneppi_seed_cl[cl] = {}
    for s in tqdm(seeds):
        eucl = eucl_seed_cl[cl][s]
        posieucl_geneppi_seed_cl[cl][s] = {
            ppi: np.mean(eucl[eucl['subunits_protein'].isin(proppi)]['Avg_Dist'])
            for ppi, proppi in proppi_geneppi_seed_cl[cl][s].items()
        }

# Feature table per seed: one row per positive triplet and, for every cell
# line, the three pairwise distances followed by their mean.
ft_cols = [template.format(cl)
           for cl in cl_lst
           for template in ('ppi1_{}', 'ppi2_{}', 'ppi3_{}', 'dist_{}')]

ft_posi_seed = {}
for s in seeds:
    # Rows are the triplets registered for the first cell line.
    posits = posiprot_genet_seed_cl[cl_lst[0]][s]

    ts_list = list(posits)
    ft_all_list = []
    for t in tqdm(ts_list):
        ft_all_list_ = []
        for cl in cl_lst:
            dists = [posieucl_geneppi_seed_cl[cl][s][pair]
                     for pair in itertools.combinations(t, 2)]
            ft_all_list_.extend(dists)
            ft_all_list_.append(np.mean(dists))
        ft_all_list.append(ft_all_list_)

    ft_posi_seed[s] = pd.DataFrame(ft_all_list, index=ts_list, columns=ft_cols)

In [None]:
# negative examples --- RanNeg and DecoyNeg
# NOTE(review): `functions` (which provides `complex_dist`) and `seeds` are
# not imported/defined in the visible cells; both must exist before this
# cell is run.


def _negative_feature_tables(negat_seed):
    """Build the distance features for one family of negative gene triplets.

    The same pipeline is applied to RanNeg and DecoyNeg negatives. It reads
    the notebook globals ``cl_lst``, ``seeds``, ``cl_gene_pro``, ``cl_data``
    and ``allposit_seed``.

    Parameters
    ----------
    negat_seed : dict
        Maps every seed in ``seeds`` to a sequence of negative gene triplets
        (3-tuples of gene names).

    Returns
    -------
    tuple
        ``(negaprot_genet_seed_cl, negappi_seed, proppi_seed_cl,
        proppi_geneppi_seed_cl, eucl_seed_cl, negaeucl_geneppi_seed_cl,
        ft_seed)`` where

        * ``negaprot_genet_seed_cl[cl][s][triplet]`` is the set of ';'-joined
          protein triplets of a sampled negative triplet,
        * ``negappi_seed[s]`` is the set of gene pairs of all negatives,
        * ``proppi_seed_cl[cl][s]`` is a DataFrame with a single
          'subunits_protein' column,
        * ``proppi_geneppi_seed_cl[cl][s]`` maps gene pair -> protein pairs,
        * ``eucl_seed_cl[cl][s]`` is the result of ``functions.complex_dist``,
        * ``negaeucl_geneppi_seed_cl[cl][s][gene pair]`` is the mean 'Avg_Dist',
        * ``ft_seed[s]`` is the feature DataFrame (one row per sampled triplet).

    Raises
    ------
    ValueError
        From ``random.sample`` if a seed has fewer negative triplets than
        positive triplets.
    """
    # 1) Sample as many negatives as there are positives and expand each
    #    sampled gene triplet into its protein triplets.
    negaprot_genet_seed_cl = {}
    for cl in cl_lst:
        gene_pro = cl_gene_pro[cl]
        negaprot_genet_seed_cl[cl] = {}
        for s in tqdm(seeds):
            setup_seed(s)
            sampled = random.sample(negat_seed[s], len(allposit_seed[s]))

            prot_by_triplet = {}
            for t in sampled:
                genes = sorted(t)
                # Use the triplet only if all three genes are known in this cell line.
                if len({g for g in genes if g in gene_pro}) == 3:
                    prot_by_triplet[t] = {
                        ';'.join(sorted(combination))
                        for combination in itertools.product(*(gene_pro[g] for g in genes))
                        if len(set(combination)) == 3
                    }
            negaprot_genet_seed_cl[cl][s] = prot_by_triplet

    # 2) Gene pairs contained in all (not only the sampled) negative triplets.
    negappi_seed = {}
    for s in seeds:
        negappis = {pair
                    for t in negat_seed[s]
                    for pair in itertools.combinations(t, 2)}
        print(len(negappis))
        negappi_seed[s] = negappis

    # 3) Expand the gene pairs into protein pairs.
    proppi_seed_cl = {}
    proppi_geneppi_seed_cl = {}
    for cl in cl_lst:
        gene_pro = cl_gene_pro[cl]
        proppi_seed_cl[cl] = {}
        proppi_geneppi_seed_cl[cl] = {}
        for s in tqdm(seeds):
            proppi_geneppi = {}
            proppi_set = set()
            for ppi in negappi_seed[s]:
                genes = sorted(ppi)
                if len({g for g in genes if g in gene_pro}) == 2:
                    proppi_set_ = {
                        ';'.join(sorted(combination))
                        for combination in itertools.product(gene_pro[genes[0]], gene_pro[genes[1]])
                        if len(set(combination)) == 2
                    }
                    proppi_set |= proppi_set_
                    proppi_geneppi[ppi] = proppi_set_

            proppi_df = pd.DataFrame()
            proppi_df['subunits_protein'] = sorted(proppi_set)

            proppi_seed_cl[cl][s] = proppi_df
            proppi_geneppi_seed_cl[cl][s] = proppi_geneppi

    # 4) Distance table per cell line and seed. The result is used below as
    #    a DataFrame with 'subunits_protein' and 'Avg_Dist' columns.
    eucl_seed_cl = {}
    for cl in cl_lst:
        eucl_seed_cl[cl] = {}
        for s in seeds:
            eucl = functions.complex_dist(cl_data[cl], proppi_seed_cl[cl][s], method='euclidean')
            print(len(eucl))
            eucl_seed_cl[cl][s] = eucl

    # 5) Mean distance per gene pair over its protein pairs.
    negaeucl_geneppi_seed_cl = {}
    for cl in cl_lst:
        negaeucl_geneppi_seed_cl[cl] = {}
        for s in tqdm(seeds):
            eucl = eucl_seed_cl[cl][s]
            negaeucl_geneppi_seed_cl[cl][s] = {
                ppi: np.mean(eucl[eucl['subunits_protein'].isin(proppi)]['Avg_Dist'])
                for ppi, proppi in proppi_geneppi_seed_cl[cl][s].items()
            }

    # 6) Feature table: per cell line the three pairwise distances of the
    #    triplet followed by their mean.
    columns = [template.format(cl)
               for cl in cl_lst
               for template in ('ppi1_{}', 'ppi2_{}', 'ppi3_{}', 'dist_{}')]
    ft_seed = {}
    for s in seeds:
        # Rows are the triplets registered for the first cell line.
        negats = negaprot_genet_seed_cl[cl_lst[0]][s]

        ts_list = list(negats)
        ft_all_list = []
        for t in tqdm(ts_list):
            ft_all_list_ = []
            for cl in cl_lst:
                dists = [negaeucl_geneppi_seed_cl[cl][s][pair]
                         for pair in itertools.combinations(t, 2)]
                ft_all_list_.extend(dists)
                ft_all_list_.append(np.mean(dists))
            ft_all_list.append(ft_all_list_)

        ft_seed[s] = pd.DataFrame(ft_all_list, index=ts_list, columns=columns)

    return (negaprot_genet_seed_cl, negappi_seed, proppi_seed_cl,
            proppi_geneppi_seed_cl, eucl_seed_cl, negaeucl_geneppi_seed_cl,
            ft_seed)


# Column layout shared by the feature tables.
ft_cols = [template.format(cl)
           for cl in cl_lst
           for template in ('ppi1_{}', 'ppi2_{}', 'ppi3_{}', 'dist_{}')]

# negative examples --- RanNeg
# The intermediate names are (re)assigned as before, so they hold the RanNeg
# tables here and are overwritten by the DecoyNeg tables below.
(negaprot_genet_seed_cl, negappi_seed, proppi_seed_cl, proppi_geneppi_seed_cl,
 eucl_seed_cl, negaeucl_RanNeg_geneppi_seed_cl,
 ft_RanNeg_seed) = _negative_feature_tables(negat_RanNeg_seed)

# negative examples --- DecoyNeg
(negaprot_genet_seed_cl, negappi_seed, proppi_seed_cl, proppi_geneppi_seed_cl,
 eucl_seed_cl, negaeucl_DecoyNeg_geneppi_seed_cl,
 ft_DecoyNeg_seed) = _negative_feature_tables(negat_DecoyNeg_seed)