# Prize-Collecting Steiner Trees
## Imports

In [2]:
# Import standard libraries
import csv
import os
import pickle
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import OmicsIntegrator as oi
import pandas as pd
import scipy.stats as ss
import tqdm

In [47]:
# File paths
# The defaults are the original locations. Set PCST_DATA_DIR / PCST_SAVE_DIR in
# the environment to run the notebook elsewhere without editing this cell
# (an unset or empty variable falls back to the default).
# os.path.join(path, "") guarantees the trailing separator that the string
# concatenations further down (data_dir + '...', save_dir + '...') rely on.
data_dir = os.path.join(os.environ.get("PCST_DATA_DIR") or "/home/braunger/masterthesis/data/", "")
save_dir = os.path.join(os.environ.get("PCST_SAVE_DIR") or "/home/braunger/masterthesis/save/pcst/", "")

## Get prize list for DE genes

In [62]:
# Read the differential-expression table and keep only the rows whose 'time'
# column equals "fc_0_10"; preview the first rows.
terminal_df = (
    pd.read_csv(data_dir + 'de_data/fc_top1.csv')
    .loc[lambda de_table: de_table['time'] == "fc_0_10"]
)
terminal_df.head()

Unnamed: 0,time,gene,FPKM_t0,FPKM_t1,log2_fc,abs_log2_fc
0,fc_0_10,ABI3BP,27.39505,13.021286,-1.073043,1.073043
1,fc_0_10,ACAN,95.64075,14.554714,-2.716139,2.716139
2,fc_0_10,ACTA2,742.29195,282.090643,-1.395828,1.395828
3,fc_0_10,ACTG2,28.0149,1.564571,-4.162355,4.162355
4,fc_0_10,ADAMTS15,3.15795,1.367214,-1.207749,1.207749


In [63]:
# Two-column prize table: the gene symbol becomes 'name' and the absolute
# log2 fold change becomes 'prize'.
prizes_data = terminal_df[['gene', 'abs_log2_fc']].rename(
    columns={'gene': 'name', 'abs_log2_fc': 'prize'}
)
prizes_data.head()

Unnamed: 0,name,prize
0,ABI3BP,1.073043
1,ACAN,2.716139
2,ACTA2,1.395828
3,ACTG2,4.162355
4,ADAMTS15,1.207749


In [64]:
# Save prizes df to tsv
# Plain, unquoted, tab-separated output with a header row and no index column.
# No escapechar is passed on purpose: the former escapechar='\t' was identical
# to the separator, so a tab inside a value would have been "escaped" with a
# second tab and silently shifted the columns. With QUOTE_NONE and no escape
# character the csv writer raises an error instead, so malformed names are
# caught here rather than when the prize file is read back.
prizes_data.to_csv(
    save_dir + 'terminals_ppi_analysis.tsv',
    sep='\t',
    header=True,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [65]:
# Terminals are the entries of the 'name' column of the prize table;
# n_terminals is simply how many of them there are.
terminals = prizes_data['name'].tolist()
n_terminals = len(terminals)
print(f'Number of terminals: {n_terminals}')

Number of terminals: 132


## Run PCST

In [66]:
# Input files for the PCST run
# STRING interactome (read from data_dir)
interactome_file_name = f'{data_dir}ppi_data/PPI_string_processed.csv'
# Prize file (the TSV written to save_dir earlier in this notebook)
prize_file_name = f'{save_dir}terminals_ppi_analysis.tsv'

In [76]:
# Graph hyperparameters
# Passed unchanged to oi.Graph when the graph is built.
# NOTE(review): the meaning of w / b / g is not visible in this notebook;
# presumably they are the OmicsIntegrator omega / beta / gamma settings —
# confirm against the OmicsIntegrator documentation before tuning.
graph_params = dict(
    w=1,
    b=5,
    g=0,
    noise=0.0,
    dummy_mode="terminals",
    exclude_terminals=False,
    seed=1,  # fixed seed so repeated runs use the same random state
    pruning='strong',
    verbosity_level=0,
)

In [77]:
# Build graph
# Construct the OmicsIntegrator graph from the interactome file using the
# hyperparameters in graph_params, then attach the prizes from the prize file.
# prepare_prizes logs duplicated gene symbols and prize genes that are not
# present in the interactome (see the cell output).
graph = oi.Graph(interactome_file_name, graph_params)
graph.prepare_prizes(prize_file_name)

12:09:52 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
12:09:52 - OI2: INFO - []
12:09:52 - OI2: INFO - Members of the prize file not present in the interactome:
12:09:52 - OI2: INFO - ['ACTA2', 'ALX4', 'CHAC1', 'FOXC2']


In [78]:
# Run PCSF
# pcsf() returns the selected vertex and edge indices; these are then
# converted into two networkx graphs, `forest` and `augmented_forest`.
# NOTE(review): presumably `augmented_forest` is the forest plus the other
# interactome edges among the selected nodes — confirm in the OmicsIntegrator
# docs. All analysis cells below use `augmented_forest`.
vertex_indices, edge_indices = graph.pcsf()
forest, augmented_forest = graph.output_forest_as_networkx(vertex_indices, edge_indices)

## Analysis of the results

In [79]:
# Node and edge counts of the solution network
# NOTE(review): the counts are taken from augmented_forest, while the message
# says "Steiner tree"; with far more edges than nodes this graph is not a
# tree — consider reporting `forest` or rewording the message.
n_nodes = augmented_forest.number_of_nodes()
n_edges = augmented_forest.number_of_edges()
print(f'The Steiner tree has {n_nodes} nodes and {n_edges} edges.')

The Steiner tree has 359 nodes and 9253 edges.


In [80]:
# Connected components of the augmented forest: their sizes in descending
# order, and how many components there are (one pass over the components).
size_components = sorted(
    (len(component) for component in nx.connected_components(augmented_forest)),
    reverse=True,
)
number_components = len(size_components)

# Per-node table of the solution network
network_df = oi.get_networkx_graph_as_dataframe_of_nodes(augmented_forest)

# Number of nodes flagged as terminal divided by the number of entries in the
# prize table. Despite the variable name this is a fraction in [0, 1], not a
# percentage.
percentage_terminals = network_df['terminal'].sum() / n_terminals
percentage_terminals

0.9696969696969697

In [81]:
# Preview the first rows of the per-node table of the solution network
network_df.head()

Unnamed: 0,prize,terminal,type,degree,betweenness,louvain_clusters,location,general_process,specific_process,general_function,specific_function
A1BG,0.0,False,protein,678,0.000674,2,extracellular,cellular process,secretion by cell,,
AANAT,0.0,False,protein,1068,0.00051,1,cytoplasm,cellular process,cellular response to stimulus,catalytic,transferase
ABHD5,0.0,False,protein,1300,0.000234,1,cytoplasm,biological regulation,regulation of biological process,catalytic,transferase
ABI1,0.0,False,protein,1528,0.001293,2,plasma_membrane,cellular process,cell communication,binding,protein binding
ABI2,0.0,False,protein,1424,0.000922,2,plasma_membrane,cellular process,cellular component organization,binding,protein binding


In [82]:
# Save selected network to file
# Two exports of the augmented forest are written under save_dir: an
# interactive HTML view and a pickle. The pickle call stays last so that its
# return value (the written path) is what the cell displays.
oi.output_networkx_graph_as_interactive_html(augmented_forest, filename=f"{save_dir}test_network.html")
oi.output_networkx_graph_as_pickle(augmented_forest, filename=f"{save_dir}test_network.pickle")

PosixPath('/home/braunger/masterthesis/save/pcst/test_network.pickle')

In [84]:
# Edge 'cost' attribute of every edge in the augmented forest, as a dict
# keyed by (node, node) tuples.
# NOTE(review): this displays one entry per edge (thousands of lines);
# consider showing only a slice or a summary to keep the notebook readable.
nx.get_edge_attributes(augmented_forest,'cost')

{('MYOCD', 'DCN'): 0.8488488488488488,
 ('MYOCD', 'KCNQ1'): 0.8428428428428428,
 ('MYOCD', 'TGFBR3'): 0.8048048048048049,
 ('MYOCD', 'TIMP1'): 0.8288288288288288,
 ('MYOCD', 'MMP2'): 0.7267267267267268,
 ('MYOCD', 'SERPINE1'): 0.8068068068068068,
 ('MYOCD', 'CCL2'): 0.7627627627627628,
 ('MYOCD', 'MGP'): 0.7087087087087087,
 ('MYOCD', 'TGFB3'): 0.7907907907907907,
 ('MYOCD', 'LRP1'): 0.6826826826826826,
 ('MYOCD', 'BMP4'): 0.5665665665665666,
 ('MYOCD', 'LIF'): 0.8068068068068068,
 ('MYOCD', 'JAG1'): 0.6586586586586587,
 ('MYOCD', 'NOTCH3'): 0.6726726726726726,
 ('MYOCD', 'MCAM'): 0.8118118118118118,
 ('MYOCD', 'FGF2'): 0.6706706706706707,
 ('MYOCD', 'WNT5A'): 0.7467467467467468,
 ('MYOCD', 'SRF'): 0.0,
 ('MYOCD', 'MEIS1'): 0.8188188188188188,
 ('MYOCD', 'KCNMB1'): 0.6546546546546547,
 ('MYOCD', 'TJP1'): 0.8158158158158157,
 ('MYOCD', 'WNT3A'): 0.7877877877877878,
 ('MYOCD', 'GNAQ'): 0.8418418418418419,
 ('MYOCD', 'PPARG'): 0.7677677677677678,
 ('MYOCD', 'TAL1'): 0.7827827827827828,
 (