In [222]:
import pandas as pd
import numpy as np
import scipy as sp
import os
from scipy.sparse import csr_matrix, lil_matrix, eye
from scipy.sparse.linalg import spsolve
from openpyxl import load_workbook

In [3]:
dir_name = os.path.join("..", "Data")
df_human = pd.read_csv(os.path.join(dir_name, "9606.protein.links.full.v12.0.txt"), sep=" ")
df_rvfv = pd.read_csv(os.path.join(dir_name, "string_interactions.tsv"), sep="\t")


In [4]:
df_human

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,0,0,0,0,0,0,45,0,134,0,0,0,81,173
1,9606.ENSP00000000233,9606.ENSP00000427567,0,0,0,0,0,0,0,0,128,0,0,0,70,154
2,9606.ENSP00000000233,9606.ENSP00000253413,0,0,0,0,0,49,111,0,49,0,0,0,69,151
3,9606.ENSP00000000233,9606.ENSP00000493357,0,0,0,0,0,56,0,0,53,0,0,433,81,471
4,9606.ENSP00000000233,9606.ENSP00000324127,0,0,0,0,0,0,0,0,46,0,0,153,91,201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13715399,9606.ENSP00000501317,9606.ENSP00000475489,0,0,0,0,0,60,0,0,99,0,0,126,0,195
13715400,9606.ENSP00000501317,9606.ENSP00000370447,0,0,0,0,0,0,55,0,111,0,0,0,79,158
13715401,9606.ENSP00000501317,9606.ENSP00000312272,0,0,0,0,0,0,0,0,0,0,0,187,88,226
13715402,9606.ENSP00000501317,9606.ENSP00000402092,0,0,0,0,0,0,0,0,67,0,0,146,0,169


In [5]:
unique_prots = set(df_human["protein1"]) | set(df_human["protein2"])
#print(len(unique_prots))
unique_prots = list(unique_prots)
prot_map = {p:i for i,p in enumerate(unique_prots)}
prot_map


{'9606.ENSP00000497613': 0,
 '9606.ENSP00000496625': 1,
 '9606.ENSP00000369822': 2,
 '9606.ENSP00000418001': 3,
 '9606.ENSP00000341843': 4,
 '9606.ENSP00000368144': 5,
 '9606.ENSP00000298532': 6,
 '9606.ENSP00000305556': 7,
 '9606.ENSP00000497759': 8,
 '9606.ENSP00000457168': 9,
 '9606.ENSP00000374234': 10,
 '9606.ENSP00000441875': 11,
 '9606.ENSP00000288050': 12,
 '9606.ENSP00000263093': 13,
 '9606.ENSP00000348912': 14,
 '9606.ENSP00000385163': 15,
 '9606.ENSP00000364110': 16,
 '9606.ENSP00000477875': 17,
 '9606.ENSP00000399585': 18,
 '9606.ENSP00000256261': 19,
 '9606.ENSP00000342143': 20,
 '9606.ENSP00000474090': 21,
 '9606.ENSP00000365075': 22,
 '9606.ENSP00000259782': 23,
 '9606.ENSP00000358963': 24,
 '9606.ENSP00000260323': 25,
 '9606.ENSP00000300658': 26,
 '9606.ENSP00000362014': 27,
 '9606.ENSP00000243314': 28,
 '9606.ENSP00000308351': 29,
 '9606.ENSP00000315743': 30,
 '9606.ENSP00000279392': 31,
 '9606.ENSP00000300249': 32,
 '9606.ENSP00000354982': 33,
 '9606.ENSP00000357915':

In [None]:
# Generate list of unique host proteins that interact with NSs
rvfv_prot_list = set(pd.concat([df_rvfv["node1_external_id"], df_rvfv["node2_external_id"]], axis=0, ignore_index=True))
rvfv_interactor_list = {p for p in rvfv_prot_list if "9606.ENSP" in p}


# Create dictionary mapping NSs interactor protein names to their array index
s_array = np.zeros(len(unique_prots))
for prot in rvfv_interactor_list:
    if prot in prot_map:
        s_array[prot_map[prot]] = 1.0
    else:
        print(f'Protein {prot} does not interact with any human proteins.')


Protein 9606.ENSP00000384144 does not interact with any human proteins.
Protein 9606.ENSP00000356348 does not interact with any human proteins.
Protein 9606.ENSP00000353622 does not interact with any human proteins.
Protein 9606.ENSP00000360286 does not interact with any human proteins.


In [None]:
rvfv_interactor_list

{'9606.ENSP00000265651',
 '9606.ENSP00000335153',
 '9606.ENSP00000353622',
 '9606.ENSP00000356348',
 '9606.ENSP00000360154',
 '9606.ENSP00000360286',
 '9606.ENSP00000369581',
 '9606.ENSP00000377958',
 '9606.ENSP00000384144',
 '9606.ENSP00000451560'}

In [7]:
s_array.sum()


6.0

In [8]:
# Pair human protein columns, map protein names to matrix coordinates, change matrix value to one at that position (and inverse position)
w_sparse_matrix = lil_matrix((len(prot_map), len(prot_map)))

for prot1, prot2 in zip(df_human['protein1'], df_human['protein2']):
    w_sparse_matrix[prot_map[prot1], prot_map[prot2]] = 1
    w_sparse_matrix[prot_map[prot2], prot_map[prot1]] = 1

w_sparse_matrix = w_sparse_matrix.tocsr()




In [9]:
D = np.sqrt(w_sparse_matrix.sum(-1))
D[D < 1] = 1
w_sparse_matrix = w_sparse_matrix.multiply(1.0/D)
w_sparse_matrix = w_sparse_matrix.multiply(1.0/D.T)

#w_sparse_matrix = w_sparse_matrix / D / D.T

In [10]:
alpha = 0.2
M = (1 + alpha) * eye(len(unique_prots)) - (w_sparse_matrix * alpha)

In [11]:
#y_array = spsolve(M,s_array)

In [213]:
# Build a function that trims the network to a certain number of rounds of connection
def select_network_subset(starting_nodes, full_network, iterations, s_array=None):
    if s_array is None:
        s_array = starting_nodes
    
    # Build out network for given number of cycles
    if iterations > 0:
        new_starting_nodes = (full_network@starting_nodes) + starting_nodes
        return select_network_subset(new_starting_nodes, full_network, iterations - 1, s_array=s_array)
    
    else:
        starting_nodes[starting_nodes > 0.0] = 1.0
        print(f"Starting nodes: {starting_nodes.sum()}")
        
        trimmed_network_list = [p for p,m in zip(unique_prots, starting_nodes) if m > 0]

        starting_mask = starting_nodes > 0

        trimmed_network = full_network[starting_mask,:][:,starting_mask]

        trimmed_s_array = s_array[starting_mask]
        

        return trimmed_s_array, trimmed_network, trimmed_network_list


        

In [214]:
trimmed_s_array, trimmed_network, trimmed_network_list = select_network_subset(s_array, w_sparse_matrix.tocsr(), 3)

Starting nodes: 19622.0


In [215]:
trimmed_D = np.sqrt(trimmed_network.sum(-1))
trimmed_D[trimmed_D < 1] = 1

trimmed_matrix = trimmed_network.multiply(1.0/trimmed_D)
trimmed_matrix = trimmed_matrix.multiply(1.0/trimmed_D.T)

In [216]:
alpha = 0.2
trimmed_M = (1 + alpha) * eye(len(trimmed_network_list)) - (trimmed_matrix * alpha)

In [217]:
trimmed_y_array = spsolve(trimmed_M,trimmed_s_array)

In [218]:
trimmed_y_array

array([3.93869597e-06, 5.21913452e-05, 6.93698118e-05, ...,
       9.85350840e-05, 5.96572827e-06, 8.24787110e-06])

In [219]:
score_list = trimmed_y_array.tolist()

In [228]:
prots_and_scores = list(zip(score_list, trimmed_network_list))

prots_and_scores_sorted = sorted(prots_and_scores, key=lambda x: x[0], reverse=True)
prots_and_scores_sorted

[(0.8335602971622981, '9606.ENSP00000265651'),
 (0.8335128820599044, '9606.ENSP00000369581'),
 (0.8334944914851523, '9606.ENSP00000335153'),
 (0.8334422063511657, '9606.ENSP00000451560'),
 (0.8334354377535339, '9606.ENSP00000377958'),
 (0.8333708059828596, '9606.ENSP00000360154'),
 (0.0008737266432618309, '9606.ENSP00000355432'),
 (0.0008188426553775318, '9606.ENSP00000368296'),
 (0.0008094999659934098, '9606.ENSP00000452245'),
 (0.0006558871962877353, '9606.ENSP00000263071'),
 (0.0006088252102139335, '9606.ENSP00000356355'),
 (0.0005905566452023638, '9606.ENSP00000229330'),
 (0.0005902492859102442, '9606.ENSP00000292853'),
 (0.000570019249437017, '9606.ENSP00000453012'),
 (0.0005528554602458615, '9606.ENSP00000357588'),
 (0.0005522938564901998, '9606.ENSP00000281623'),
 (0.0005424794668687708, '9606.ENSP00000368356'),
 (0.000514423826474119, '9606.ENSP00000379025'),
 (0.0005082219017757979, '9606.ENSP00000292852'),
 (0.0005067059321498804, '9606.ENSP00000359221'),
 (0.0004968456910866

In [221]:
prot_score_df = pd.DataFrame(prots_and_scores_sorted)
excel_path = 'C:\\Users\\socce\\OneDrive\\Documents\\Kehn-Hall Lab Data\\Python\\viral_interact\\Data\\interaction_results.xlsx'
prot_score_df.to_excel(excel_path, sheet_name='Sheet2', index=False)

In [None]:
def write_to_excel(file_path, df):
    # Try to load the existing workbook
    writer = pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='new')
    workbook = load_workbook(file_path)
    writer.book = workbook

    # Generate a default sheet name if none is provided
    sheet_name = f"Sheet{len(workbook.sheetnames) + 1}"
    
    # Write the DataFrame to the new sheet
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    
    # Save the workbook
    writer.save()
    print(f"Data written to sheet: {sheet_name}")

    writer.close()

In [227]:
file_path = 'C:\\Users\\socce\\OneDrive\\Documents\\Kehn-Hall Lab Data\\Python\\viral_interact\\Data\\interaction_results.xlsx'
write_to_excel(file_path, prot_score_df)


TypeError: save() takes 1 positional argument but 2 were given