In [1]:
import pandas as pd

import networkx as nx

In [2]:
protein_list = pd.read_csv('protein_list.csv')
protein_list

Unnamed: 0.1,Unnamed: 0,Protein1_ID
0,0,P03428
1,1,P03431
2,2,P03433
3,3,P03452
4,4,P03466
...,...,...
15680,15680,Q6NUS8
15681,15681,P12018
15682,15682,Q96IU2
15683,15683,Q6PEW1


In [4]:
protein_class =  pd.read_csv('protein_class.csv')
protein_class

Unnamed: 0.1,Unnamed: 0,Protein1_ID,class,IAV Segment,IAV Strain
0,0,P03428,IAV,PB2,PR8
1,1,P03431,IAV,PB1,PR8
2,2,P03433,IAV,PA,PR8
3,3,P03452,IAV,HA,PR8
4,4,P03466,IAV,NP,PR8
...,...,...,...,...,...
15680,15680,Q6NUS8,Human,,
15681,15681,P12018,Human,,
15682,15682,Q96IU2,Human,,
15683,15683,Q6PEW1,Human,,


In [5]:
## Read dataframe
preds = pd.read_csv('hvppi_preds/concat_all_predictions.csv').sort_values(by=['Score'], ascending=False)
preds

Unnamed: 0,Pro1ID,Pro2ID,Score,Interaction,Segment,IAV Strain,human_pname_raw
390327,Q99729,P03496,0.990,yes,NS1,PR8,ROAA_HUMAN
113249,P51991,P03496,0.988,yes,NS1,PR8,ROA3_HUMAN
453098,Q14103,P03496,0.988,yes,NS1,PR8,HNRPD_HUMAN
423947,Q12906,P03496,0.985,yes,NS1,PR8,ILF3_HUMAN
42114,P09651,P03496,0.983,yes,NS1,PR8,ROA1_HUMAN
...,...,...,...,...,...,...,...
44556,Q0VDI3,P03437,0.000,no,HA,Aichi,TM267_HUMAN
160340,P54849,P03437,0.000,no,HA,Aichi,EMP1_HUMAN
629585,Q96RI0,P03437,0.000,no,HA,Aichi,PAR4_HUMAN
556031,Q9Y5I7,P03437,0.000,no,HA,Aichi,CLD16_HUMAN


In [6]:
pos = preds[preds['Score'] >= 0.143].reset_index(drop=True)
neg = preds[preds['Score'] < 0.143].reset_index(drop=True)

print(len(pos))
print(len(neg))

91217
550187


In [7]:
t1 = pos[pos['Score'] >= 0.375].reset_index(drop=True)
t2 = pos[(pos['Score'] >= 0.212) & (pos['Score'] < 0.375)].reset_index(drop=True)
t3 = pos[(pos['Score'] >= 0.143) & (pos['Score'] < 0.212)].reset_index(drop=True)

print(len(t1))
print(len(t2))
print(len(t3))

3738
26044
61435


In [8]:
preds_f = pos[['Pro1ID', 'Pro2ID', 'Score']]
preds_f = preds_f.rename(columns = {'Pro1ID': 'Protein1_ID', 'Pro2ID': 'Protein2_ID'}).sort_values(by=['Score'], ascending=False)
preds_f

Unnamed: 0,Protein1_ID,Protein2_ID,Score
0,Q99729,P03496,0.990
2,Q14103,P03496,0.988
1,P51991,P03496,0.988
3,Q12906,P03496,0.985
4,P09651,P03496,0.983
...,...,...,...
89799,O15162,I6TAH8,0.143
89798,Q7Z460,C3W5R9,0.143
89797,Q9UI30,P03431,0.143
89796,P42262,P03508,0.143


In [9]:
## Map to index
def map_to_index(ip_df):
    
    m_df = pd.merge(ip_df, protein_list, on=['Protein1_ID'])
    m_df = pd.merge(m_df, protein_list, left_on=['Protein2_ID'], right_on=['Protein1_ID'])

    m_df = m_df[['Unnamed: 0_x', 'Unnamed: 0_y']]
    m_df.columns = ['Protein1_ID', 'Protein2_ID']
    
    return m_df

In [10]:
## All positives
preds_idx = map_to_index(preds_f)
preds_idx

Unnamed: 0,Protein1_ID,Protein2_ID
0,9561,7
1,11092,7
2,2803,7
3,10381,7
4,1068,7
...,...,...
91212,11443,30
91213,1201,30
91214,4141,30
91215,3817,30


In [11]:
print(len(preds_idx['Protein2_ID'].unique()))
print(len(preds_idx['Protein1_ID'].unique()))

num_nodes = len(preds_idx['Protein1_ID'].unique()) + len(preds_idx['Protein2_ID'].unique())
print(num_nodes)

41
12397
12438


In [12]:
## create edgelist
edge_arr = preds_idx.to_numpy()
edge_arr

array([[ 9561,     7],
       [11092,     7],
       [ 2803,     7],
       ...,
       [ 4141,    30],
       [ 3817,    30],
       [  178,    30]], dtype=int64)

In [13]:
## networkx
G = nx.from_edgelist(edge_arr)

my_degrees = G.degree()
degree_values = dict(my_degrees).values()

print(list(my_degrees))

[(9561, 40), (7, 10985), (11092, 39), (2803, 39), (10381, 40), (1068, 40), (6280, 41), (13956, 40), (8653, 41), (1665, 30), (1249, 41), (1841, 35), (6973, 31), (10363, 40), (10138, 41), (1641, 38), (11386, 26), (6779, 36), (2002, 39), (3833, 26), (15351, 32), (5171, 28), (6259, 37), (13380, 25), (478, 41), (15256, 38), (1382, 39), (9575, 26), (1400, 40), (6162, 39), (10814, 38), (4583, 23), (7756, 25), (5375, 36), (1248, 41), (15262, 37), (5763, 23), (1844, 37), (8326, 15), (15263, 37), (791, 36), (4596, 39), (951, 39), (5885, 39), (10928, 24), (759, 34), (3041, 23), (4597, 39), (14994, 33), (10790, 31), (8381, 34), (2056, 36), (1379, 35), (2055, 40), (1754, 28), (1921, 22), (294, 22), (3925, 36), (1126, 41), (6763, 41), (4879, 20), (3907, 41), (15101, 21), (1540, 26), (7748, 39), (8310, 17), (3087, 33), (10755, 41), (952, 40), (11924, 15), (12976, 39), (743, 41), (6868, 41), (953, 14), (8961, 41), (2556, 22), (12680, 11), (10742, 24), (9011, 20), (3410, 20), (1983, 23), (3906, 25), (9

In [14]:
print(list(my_degrees)[1])

(7, 10985)


In [15]:
degree_values

dict_values([40, 10985, 39, 39, 40, 40, 41, 40, 41, 30, 41, 35, 31, 40, 41, 38, 26, 36, 39, 26, 32, 28, 37, 25, 41, 38, 39, 26, 40, 39, 38, 23, 25, 36, 41, 37, 23, 37, 15, 37, 36, 39, 39, 39, 24, 34, 23, 39, 33, 31, 34, 36, 35, 40, 28, 22, 22, 36, 41, 41, 20, 41, 21, 26, 39, 17, 33, 41, 40, 15, 39, 41, 41, 14, 41, 22, 11, 24, 20, 20, 23, 25, 40, 38, 36, 22, 27, 40, 22, 20, 11, 15, 30, 41, 40, 40, 18, 25, 18, 39, 40, 36, 40, 35, 37, 32, 39, 33, 25, 34, 39, 36, 16, 12, 35, 26, 40, 36, 35, 21, 17, 38, 29, 21, 41, 40, 26, 20, 37, 33, 38, 24, 41, 41, 37, 25, 39, 18, 36, 26, 22, 37, 20, 38, 22, 15, 23, 19, 28, 25, 39, 15, 22, 22, 37, 16, 20, 23, 39, 24, 17, 41, 37, 30, 25, 40, 20, 28, 22, 36, 19, 20, 34, 26, 28, 6, 36, 29, 41, 40, 39, 35, 25, 23, 31, 13, 37, 17, 19, 40, 39, 15, 35, 13, 36, 41, 21, 38, 23, 20, 39, 19, 10, 40, 17, 22, 33, 32, 41, 18, 32, 6, 11, 30, 15, 39, 9, 41, 23, 39, 41, 41, 10, 12, 40, 24, 40, 18, 21, 39, 32, 15, 20, 26, 25, 41, 41, 41, 33, 12, 12, 18, 9, 6, 32, 34, 40, 1

In [16]:
list_degree_values = list(degree_values)
list_degree_values

[40,
 10985,
 39,
 39,
 40,
 40,
 41,
 40,
 41,
 30,
 41,
 35,
 31,
 40,
 41,
 38,
 26,
 36,
 39,
 26,
 32,
 28,
 37,
 25,
 41,
 38,
 39,
 26,
 40,
 39,
 38,
 23,
 25,
 36,
 41,
 37,
 23,
 37,
 15,
 37,
 36,
 39,
 39,
 39,
 24,
 34,
 23,
 39,
 33,
 31,
 34,
 36,
 35,
 40,
 28,
 22,
 22,
 36,
 41,
 41,
 20,
 41,
 21,
 26,
 39,
 17,
 33,
 41,
 40,
 15,
 39,
 41,
 41,
 14,
 41,
 22,
 11,
 24,
 20,
 20,
 23,
 25,
 40,
 38,
 36,
 22,
 27,
 40,
 22,
 20,
 11,
 15,
 30,
 41,
 40,
 40,
 18,
 25,
 18,
 39,
 40,
 36,
 40,
 35,
 37,
 32,
 39,
 33,
 25,
 34,
 39,
 36,
 16,
 12,
 35,
 26,
 40,
 36,
 35,
 21,
 17,
 38,
 29,
 21,
 41,
 40,
 26,
 20,
 37,
 33,
 38,
 24,
 41,
 41,
 37,
 25,
 39,
 18,
 36,
 26,
 22,
 37,
 20,
 38,
 22,
 15,
 23,
 19,
 28,
 25,
 39,
 15,
 22,
 22,
 37,
 16,
 20,
 23,
 39,
 24,
 17,
 41,
 37,
 30,
 25,
 40,
 20,
 28,
 22,
 36,
 19,
 20,
 34,
 26,
 28,
 6,
 36,
 29,
 41,
 40,
 39,
 35,
 25,
 23,
 31,
 13,
 37,
 17,
 19,
 40,
 39,
 15,
 35,
 13,
 36,
 41,
 21,
 38,
 23,
 20

In [18]:
get_max = max(degree_values)
get_min = min(degree_values)

print(get_max, get_min)

## Get index
idx_max = list_degree_values.index(get_max)
idx_min = list_degree_values.index(get_min)
print(idx_max, idx_min)

print(list((my_degrees))[idx_max], list((my_degrees))[idx_min])

10985 1
1 3019
(7, 10985) (7998, 1)


In [23]:
get = list((my_degrees))[idx_max]

## Retrieve protein UniProt ID
protein_list[protein_list['Unnamed: 0'] == get[0]]

Unnamed: 0.1,Unnamed: 0,Protein1_ID
7,7,P03496


In [24]:
## Average degree

print(sum(degree_values))
print(len(degree_values))

avg_degree = sum(degree_values)/len(degree_values)
avg_degree

182434
12438


14.667470654446053

In [None]:
# edgelist = [(0,1)] # single edge (0,1)
# G = nx.from_edgelist(edgelist)

# G.degree[0]  # node 0 has degree 1

# list(G.degree([0, 1, 2]))

## Edgelist

In [25]:
## All edges
tt_df = pd.read_csv('./hvppi_preds/hvppi_edgelist_idx.csv').sort_values(by=['Score'], ascending=False)
tt_df

Unnamed: 0.1,Unnamed: 0,Protein1_ID,Protein2_ID,Score,Class,Label,Class_2,Class_3,Label_3,Label_2
0,0,9561,7,0.990,0.99,2,0.99,pos,1,5
1,1,11092,7,0.988,0.99,2,0.99,pos,1,5
2,2,2803,7,0.988,0.99,2,0.99,pos,1,5
3,3,10381,7,0.985,0.99,2,0.99,pos,1,5
4,4,1068,7,0.983,0.99,2,0.99,pos,1,5
...,...,...,...,...,...,...,...,...,...,...
271999,271999,14710,30,0.001,neg,0,<=0.047,neg,0,0
271998,271998,4733,11,0.001,neg,0,<=0.047,neg,0,0
271997,271997,1422,11,0.001,neg,0,<=0.047,neg,0,0
271996,271996,14299,11,0.001,neg,0,<=0.047,neg,0,0


In [26]:
e_pos = tt_df[tt_df['Score'] >= 0.143]
e_pos

Unnamed: 0.1,Unnamed: 0,Protein1_ID,Protein2_ID,Score,Class,Label,Class_2,Class_3,Label_3,Label_2
0,0,9561,7,0.990,0.99,2,0.99,pos,1,5
1,1,11092,7,0.988,0.99,2,0.99,pos,1,5
2,2,2803,7,0.988,0.99,2,0.99,pos,1,5
3,3,10381,7,0.985,0.99,2,0.99,pos,1,5
4,4,1068,7,0.983,0.99,2,0.99,pos,1,5
...,...,...,...,...,...,...,...,...,...,...
48300,48300,8841,9,0.143,0.90,3,0.90,pos,1,3
48301,48301,7721,9,0.143,0.90,3,0.90,pos,1,3
48302,48302,3274,9,0.143,0.90,3,0.90,pos,1,3
48303,48303,3431,9,0.143,0.90,3,0.90,pos,1,3


In [None]:
print(len(e_pos['Protein2_ID'].unique()))
print(len(e_pos['Protein1_ID'].unique()))

num_nodes = len(e_pos['Protein1_ID'].unique()) + len(e_pos['Protein2_ID'].unique())
print(num_nodes)

In [None]:
t1 = e_pos[e_pos['Score'] >= 0.375].reset_index(drop=True)
t2 = e_pos[(e_pos['Score'] >= 0.212) & (e_pos['Score'] < 0.375)].reset_index(drop=True)
t3 = e_pos[(e_pos['Score'] >= 0.143) & (e_pos['Score'] < 0.212)].reset_index(drop=True)

print(len(t1))
print(len(t2))
print(len(t3))

In [None]:
e_neg = tt_df[tt_df['Score'] < 0.143].reset_index(drop=True)
e_neg

In [28]:
edgelist = e_pos[['Protein1_ID', 'Protein2_ID']].to_numpy()
edgelist

array([[ 9561,     7],
       [11092,     7],
       [ 2803,     7],
       ...,
       [ 3274,     9],
       [ 3431,     9],
       [ 3191,     9]], dtype=int64)

In [32]:
## networkx
print(len(edgelist))
G = nx.from_edgelist(edgelist)

my_degrees = G.degree()
degree_values = dict(my_degrees).values()

avg_degree = sum(degree_values)/len(degree_values)
print(avg_degree)

48882
7.860738120125432


In [34]:
degree_values

dict_values([13, 10982, 12, 14, 16, 16, 15, 15, 15, 11, 16, 13, 10, 13, 16, 6411, 12, 10, 13, 13, 13, 11, 14, 12, 19, 10, 14, 13, 13, 13, 14, 15, 10, 11, 17, 14, 15, 11, 12, 6, 15, 10, 14, 14, 13, 12, 12, 10, 14, 13, 12, 13, 12, 11, 12, 17, 12, 12, 9, 15, 14, 14, 8, 8, 9, 15, 9, 13, 17, 1487, 15, 6, 16, 17, 17, 6, 15, 11, 9, 5, 9, 7, 10, 9, 11, 17, 14, 12, 8, 9, 8, 13, 6, 4, 11, 16, 17, 7, 15, 10, 9, 14, 12, 15, 15, 16, 14, 14, 14, 13, 8, 14, 15, 13, 8, 5, 11, 11, 16, 18, 13, 9721, 9, 6, 4978, 13, 11, 8, 15, 15, 13, 8, 11, 12, 11, 14, 14, 16, 12, 18, 10, 12, 6, 10, 8, 13, 9, 6, 15, 7, 7, 10, 10, 8, 6, 12, 11, 9, 12, 8, 6, 1741, 9, 18, 12, 5, 7, 14, 11, 16, 13, 8, 7205, 11, 9, 13, 6, 9, 10, 12, 5, 9, 17, 12, 18, 13, 14, 14, 10, 7, 13, 11, 7, 7, 17, 7, 14, 8, 15, 5, 16, 17, 8, 14, 13, 14, 8, 9, 4, 872, 13, 7, 8, 692, 11, 12, 10, 17, 14, 3, 5, 7, 12, 13, 16, 5, 9, 15, 15, 5, 18, 4, 15, 14, 11, 7, 11, 16, 7, 9, 10, 10, 9, 16, 14, 13, 14, 7, 6, 9, 3, 11, 13, 4, 15, 8, 6, 14, 7, 13, 9, 5, 6,

In [36]:
list_degree_values = list(degree_values)
# list_degree_values

In [37]:
get_max = max(degree_values)
get_min = min(degree_values)

print(get_max, get_min)

## Get index
idx_max = list_degree_values.index(get_max)
idx_min = list_degree_values.index(get_min)
print(idx_max, idx_min)

print(list((my_degrees))[idx_max], list((my_degrees))[idx_min])

10982 1
1 381
(7, 10982) (10106, 1)


In [None]:
degree_values

In [None]:
print(len(degree_values))

In [None]:
avg_degree = sum(degree_values)/len(degree_values)
avg_degree

In [None]:
# test_df = pd.read_csv('edges/Sept_2022_new/balanced/idx/b_test_idx.csv') ## Network Reconstruction
test_df = pd.read_csv('edges/Experimentally_Verified_Test/testset_1.csv')
test_df

In [None]:
p1 = test_df[test_df['label'] == 0]
p1

In [None]:
nodes = list(p1['Protein1_ID'].unique()) + list(p1['Protein2_ID'].unique())
nodes

In [None]:
nodes_f = set(nodes)
len(nodes_f)

In [None]:
neg_nodes = pd.concat

In [None]:
print(len(p1['Protein1_ID'].unique()), len(p1['Protein2_ID'].unique()))

In [None]:
len(p1['Protein1_ID'].unique()) + len(p1['Protein2_ID'].unique())

In [None]:
pos_org = pd.read_csv('edges/Sept_2022_new/balanced/idx/b_pos_idx.csv')
pos_org

In [None]:
print(len(pos_org['Protein1_ID'].unique()), len(pos_org['Protein2_ID'].unique()))

In [None]:
len(pos_org['Protein1_ID'].unique()) + len(pos_org['Protein2_ID'].unique())

In [None]:
test_df = pd.read_csv('edges/Sept_2022_new/balanced/idx/b_test_idx.csv') ## Network Reconstruction
# test_df = pd.read_csv('edges/Experimentally_Verified_Test/testset_1.csv')
test_df

In [None]:
test_df_pos = test_df[test_df['label'] == 1]
test_df_pos

In [None]:
edgelist = test_df_pos[['Protein1_ID', 'Protein2_ID']].to_numpy()
edgelist

In [None]:
## networkx
G = nx.from_edgelist(edgelist)

my_degrees = G.degree()
degree_values = dict(my_degrees).values()
# print(degree_values)
print(len(degree_values))

avg_degree = sum(degree_values)/len(degree_values)
avg_degree

In [None]:
pos_t = pd.read_csv('./edges/unbalanced_dataset/Dec_2022/NEW_pos_test.csv')
pos_t

In [None]:
pos_raw = pd.read_csv('./edges/unbalanced_dataset/Dec_2022/NEW_pos_train_val.csv')
pos_raw

In [None]:
edgelist = pos_t[['Protein1_ID', 'Protein2_ID']].to_numpy()
edgelist

In [None]:
## networkx
G = nx.from_edgelist(edgelist)

my_degrees = G.degree()
degree_values = dict(my_degrees).values()
# print(degree_values)
print(len(degree_values))

avg_degree = sum(degree_values)/len(degree_values)
avg_degree