In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import os
from scipy.sparse import csr_matrix, lil_matrix, eye
from scipy.sparse.linalg import spsolve

In [2]:
dir_name = os.path.join("..", "Data")
df_human = pd.read_csv(os.path.join(dir_name, "9606.protein.links.full.v12.0.txt"), sep=" ")
df_rvfv = pd.read_csv(os.path.join(dir_name, "string_interactions.tsv"), sep="\t")


In [3]:
df_human

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,0,0,0,0,0,0,45,0,134,0,0,0,81,173
1,9606.ENSP00000000233,9606.ENSP00000427567,0,0,0,0,0,0,0,0,128,0,0,0,70,154
2,9606.ENSP00000000233,9606.ENSP00000253413,0,0,0,0,0,49,111,0,49,0,0,0,69,151
3,9606.ENSP00000000233,9606.ENSP00000493357,0,0,0,0,0,56,0,0,53,0,0,433,81,471
4,9606.ENSP00000000233,9606.ENSP00000324127,0,0,0,0,0,0,0,0,46,0,0,153,91,201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13715399,9606.ENSP00000501317,9606.ENSP00000475489,0,0,0,0,0,60,0,0,99,0,0,126,0,195
13715400,9606.ENSP00000501317,9606.ENSP00000370447,0,0,0,0,0,0,55,0,111,0,0,0,79,158
13715401,9606.ENSP00000501317,9606.ENSP00000312272,0,0,0,0,0,0,0,0,0,0,0,187,88,226
13715402,9606.ENSP00000501317,9606.ENSP00000402092,0,0,0,0,0,0,0,0,67,0,0,146,0,169


In [4]:
unique_prots = set(df_human["protein1"]) | set(df_human["protein2"])
#print(len(unique_prots))
unique_prots = list(unique_prots)
prot_map = {p:i for i,p in enumerate(unique_prots)}
prot_map


{'9606.ENSP00000282406': 0,
 '9606.ENSP00000287844': 1,
 '9606.ENSP00000432487': 2,
 '9606.ENSP00000461879': 3,
 '9606.ENSP00000309565': 4,
 '9606.ENSP00000441405': 5,
 '9606.ENSP00000358238': 6,
 '9606.ENSP00000434511': 7,
 '9606.ENSP00000380252': 8,
 '9606.ENSP00000364912': 9,
 '9606.ENSP00000320663': 10,
 '9606.ENSP00000263934': 11,
 '9606.ENSP00000438455': 12,
 '9606.ENSP00000356224': 13,
 '9606.ENSP00000478893': 14,
 '9606.ENSP00000291971': 15,
 '9606.ENSP00000320084': 16,
 '9606.ENSP00000399457': 17,
 '9606.ENSP00000469970': 18,
 '9606.ENSP00000376609': 19,
 '9606.ENSP00000396935': 20,
 '9606.ENSP00000331791': 21,
 '9606.ENSP00000359339': 22,
 '9606.ENSP00000369965': 23,
 '9606.ENSP00000295981': 24,
 '9606.ENSP00000493000': 25,
 '9606.ENSP00000385814': 26,
 '9606.ENSP00000280701': 27,
 '9606.ENSP00000276651': 28,
 '9606.ENSP00000278222': 29,
 '9606.ENSP00000252840': 30,
 '9606.ENSP00000298705': 31,
 '9606.ENSP00000327145': 32,
 '9606.ENSP00000372005': 33,
 '9606.ENSP00000229729':

In [5]:
# Generate list of unique host proteins that interact with NSs
rvfv_prot_list = set(pd.concat([df_rvfv["node1_external_id"], df_rvfv["node2_external_id"]], axis=0, ignore_index=True))
rvfv_interactor_list = {p for p in rvfv_prot_list if "9606.ENSP" in p}

# Create dictionary mapping NSs interactor protein names to their array index
s_array = np.zeros(len(unique_prots))
for prot in rvfv_interactor_list:
    if prot in prot_map:
        s_array[prot_map[prot]] = 1.0
    else:
        print(f'Protein {prot} does not interact with any human proteins.')


Protein 9606.ENSP00000384144 does not interact with any human proteins.
Protein 9606.ENSP00000353622 does not interact with any human proteins.
Protein 9606.ENSP00000356348 does not interact with any human proteins.
Protein 9606.ENSP00000360286 does not interact with any human proteins.


In [None]:
s_array.sum()


6.0

In [None]:
# Pair human protein columns, map protein names to matrix coordinates, change matrix value to one at that position (and inverse position)
w_sparse_matrix = lil_matrix((len(prot_map), len(prot_map)))

for prot1, prot2 in zip(df_human['protein1'], df_human['protein2']):
    w_sparse_matrix[prot_map[prot1], prot_map[prot2]] = 1
    w_sparse_matrix[prot_map[prot2], prot_map[prot1]] = 1

w_sparse_matrix = w_sparse_matrix.tocsr()




In [8]:
D = np.sqrt(w_sparse_matrix.sum(-1))
D[D < 1] = 1
w_sparse_matrix = w_sparse_matrix.multiply(1.0/D)
w_sparse_matrix = w_sparse_matrix.multiply(1.0/D.T)

#w_sparse_matrix = w_sparse_matrix / D / D.T

In [9]:
alpha = 0.2
M = (1 + alpha) * eye(len(unique_prots)) - (w_sparse_matrix * alpha)

In [10]:
#y_array = spsolve(M,s_array)

In [14]:
# Generating another sparse matrix for reducing size of connectivity network

w_sparse_red = lil_matrix((len(prot_map), len(prot_map)))

for prot1, prot2 in zip(df_human['protein1'], df_human['protein2']):
    w_sparse_red[prot_map[prot1], prot_map[prot2]] = 1
    w_sparse_red[prot_map[prot2], prot_map[prot1]] = 1

w_sparse_red = w_sparse_red.tocsr()

In [43]:
first_round_matrix = w_sparse_red.multiply(s_array).sum(axis=0).T
#new_vector = first_round_matrix.sum(axis=0)
#new_vector = new_vector.T


first_round_matrix.sum()

12352.0

In [None]:
# Build a function that trims the network to a certain number of rounds of connection
def network_subset(starting_nodes, connection_network, iterations):
    if iterations > 0:
        new_nodes = connection_network.multiply(starting_nodes).sum(axis=0).T
        return network_subset(new_nodes, connection_network, iterations - 1)
    else:
        return starting_nodes
        

In [76]:
nodes1 = network_subset(s_array, w_sparse_red, 2)
nodes1

matrix([[   0.],
        [7476.],
        [   0.],
        ...,
        [1832.],
        [   0.],
        [   0.]])

In [75]:
# Create a new adjacency matrix based on the subset selected by network_subset()
#def new_network(subset_array, protein_map):

new_prot_list = [key for key in prot_map]
new_nodes_list = list(nodes1)
new_nodes_list
#new_network_dict = dict(zip(new_prot_list, nodes1))
#new_network_dict

[matrix([[0.]]),
 matrix([[7476.]]),
 matrix([[0.]]),
 matrix([[8628.]]),
 matrix([[2293.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[7015.]]),
 matrix([[0.]]),
 matrix([[5183.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[7015.]]),
 matrix([[7476.]]),
 matrix([[0.]]),
 matrix([[5183.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[7015.]]),
 matrix([[7476.]]),
 matrix([[1832.]]),
 matrix([[0.]]),
 matrix([[1613.]]),
 matrix([[2293.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[5183.]]),
 matrix([[7476.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[6115.]]),
 matrix([[0.]]),
 matrix([[9089.]]),
 matrix([[7476.]]),
 matrix([[932.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[1613.]]),
 matrix([[0.]]),
 matrix([[9560.]]),
 matrix([[2293.]]),
 matrix([[0.]]),
 matrix([[0.]]),
 matrix([[5183.]]