In [1]:
import os
import numpy as np
import pandas as pd
import torch

In [2]:
# Per-gene feature table used by the filtering functions in this file: rows are
# indexed by the file's first column and the first line supplies column names.
gene_features = pd.read_table(
    '../data/pan/mut_features_miRNA_sub_du.txt',
    sep="\t",
    header=0,
    index_col=0,
)

In [3]:
#load PPI network
def load_network(file_path):
    """Read a headerless two-column TSV edge list and list its distinct nodes.

    Args:
        file_path: Location of the tab-separated file; its two columns are
            read as strings and named ``source`` and ``target``.

    Returns:
        A ``(ppi, ppi_nodes)`` tuple. ``ppi`` is the edge table as read.
        ``ppi_nodes`` is a one-column (``nodes``) DataFrame holding every
        node once, in order of first appearance (all sources are scanned
        before all targets), with a fresh 0..n-1 index.
    """
    edges = pd.read_table(
        filepath_or_buffer=file_path,
        sep='\t',
        header=None,
        index_col=None,
        names=['source', 'target'],
        dtype='str',
    )
    # Stack both endpoint columns into one long column before de-duplicating.
    endpoints = pd.concat([edges['source'], edges['target']], ignore_index=True)
    unique_nodes = (
        endpoints.to_frame(name='nodes')
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return edges, unique_nodes

#obtain genes that appear simultaneously in the gene set and PPI network
def get_intersection_genes(g_lst, ppi_nodes):
    """Return the genes of ``g_lst`` that also occur as PPI network nodes.

    The number of distinct network nodes is printed as a progress diagnostic.

    Args:
        g_lst: Iterable of gene identifiers (hashable values).
        ppi_nodes: DataFrame whose first column holds the network's nodes.

    Returns:
        A list of the shared genes, each listed once, in the order in which
        they first appear in ``g_lst``. The order is therefore reproducible
        from run to run instead of depending on set iteration order.
    """
    network_nodes = set(ppi_nodes.iloc[:, 0].values.tolist())
    print(len(network_nodes))
    # dict.fromkeys de-duplicates g_lst while keeping first-seen order.
    return [gene for gene in dict.fromkeys(g_lst) if gene in network_nodes]

#retain the interactions where both source and target nodes are present in the gene list
def filter_ppi_with_intersect_nodes(nodes_lst, ppi_df):
    """Keep only the interactions whose two endpoints are both in ``nodes_lst``.

    Args:
        nodes_lst: Iterable of node identifiers to retain (hashable values);
            repeated entries have no effect on the result.
        ppi_df: Interaction table with ``source`` and ``target`` columns.

    Returns:
        A new DataFrame with the same columns as ``ppi_df``, containing the
        retained rows in their original order and a fresh 0..n-1 index.
        ``ppi_df`` itself is not modified.
    """
    wanted = set(nodes_lst)
    # A boolean mask selects rows directly: unlike a left-merge followed by
    # dropna, it cannot duplicate an interaction when a node is listed twice
    # and it never discards a row because of a missing value elsewhere.
    both_present = ppi_df['source'].isin(wanted) & ppi_df['target'].isin(wanted)
    return ppi_df.loc[both_present].reset_index(drop=True)

#obtain biological features of the filtered genes
def select_features(p_nodes, features=None,
                    out_path='../data/pan/string_850/mut_features_miRNA_sub_du_2.0.txt'):
    """Write the feature rows of the genes listed in ``p_nodes`` to a TSV file.

    The number of genes in ``p_nodes`` is printed as a progress diagnostic.

    Args:
        p_nodes: DataFrame whose first column holds the genes to keep.
        features: Feature table indexed by gene. Defaults to the module-level
            ``gene_features`` table.
        out_path: Destination of the tab-separated output file.

    Returns:
        None. The selected rows are written to ``out_path`` with a header
        line and the gene names as the first column, in the row order of the
        feature table. When no gene matches, only the header is written.
    """
    if features is None:
        features = gene_features
    g_lst = p_nodes.iloc[:, 0].tolist()
    print(len(g_lst))
    # One vectorized membership test replaces a per-gene concat loop; it also
    # copes with an empty selection and with repeated index labels.
    selected = features.loc[features.index.isin(g_lst)]
    # Clear the index name so the header row's first cell stays blank, which
    # is the layout this function has always written.
    selected = selected.rename_axis(None)
    selected.to_csv(path_or_buf=out_path, sep='\t', header=True, index=True)

#filter gene and PPI
def data_filter_mut():
    """Run the gene/PPI filtering pipeline for the mutation feature table.

    Loads the STRING network, intersects its nodes with the genes indexing
    the module-level ``gene_features`` table, filters the interactions with
    that intersection, writes the remaining interactions to
    ``mut_PPI_2.0.txt`` (no header, no index) and passes the nodes they
    still touch to ``select_features``. Sizes are printed along the way.
    """
    network, network_nodes = load_network('../data/pan/string_850/STRING_850.txt')
    print(network_nodes.shape)

    feature_genes = list(gene_features.index.values)
    print(len(feature_genes))

    shared_genes = get_intersection_genes(feature_genes, network_nodes)
    print(len(shared_genes))

    kept_edges = filter_ppi_with_intersect_nodes(shared_genes, network)
    kept_edges.to_csv(
        path_or_buf='../data/pan/string_850/mut_PPI_2.0.txt',
        sep='\t',
        header=False,
        index=False,
    )

    # Nodes that still take part in at least one retained interaction.
    endpoints = pd.concat(
        [kept_edges['source'], kept_edges['target']], ignore_index=True
    )
    surviving_nodes = (
        endpoints.to_frame(name='nodes')
        .drop_duplicates()
        .reset_index(drop=True)
    )
    select_features(surviving_nodes)

In [None]:
# Run the pipeline: loads the STRING network and writes the filtered PPI and
# feature tables under ../data/pan/string_850/.
data_filter_mut()