In [2]:
# All imports
import pandas as pd
import glob
import os
import networkx as nx
import pickle
import numpy as np
from scipy import sparse
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [3]:
# read data from CSV files
edgesFile = 'Raw/edges.csv'
nodesFile = 'Raw/nodes.csv'

edgesDf = pd.read_csv(edgesFile, sep=',')
edgesDf = edgesDf[["source", "sink"]]
print('edgesDf shape', edgesDf.shape)

nodesDf = pd.read_csv(nodesFile, sep=',')
print('nodesDf shape', nodesDf.shape)

edgesDf shape (559147, 2)
nodesDf shape (7751, 18)


In [4]:
# convert pval to float and use NaN for non numeric values
nodesDf['pval'] = pd.to_numeric(nodesDf['pval'], errors='coerce')

# remove nodes with empty or non-numeric pval
# nodesPvalDf = nodesDf[nodesDf['pval'].notna()]
# print('nodesPvalDf shape', nodesPvalDf.shape)

# construct dataframe with the selected features
features = nodesDf[['id', 'jaccard_similarity', 'pval', 'chromosome']]

# filter out nan chromosome and jaccard_similarity
features = features[features['chromosome'].notna()]
features = features[features['jaccard_similarity'].notna()]

print('features shape', features.shape)
print(features)

features shape (5253, 4)
        id  jaccard_similarity          pval chromosome
3        3            0.004432           NaN          2
4        4            0.002947  4.000000e-08         16
5        5            0.013490  2.000000e-18         11
7        7            0.012022  1.000000e-09         15
10      10            0.000247  1.000000e-58  GCST00732
...    ...                 ...           ...        ...
7713  7713            0.002464  3.000000e-08         18
7714  7714            0.001723  5.000000e-08         19
7715  7715            0.001719  8.000000e-11  GCST00732
7716  7716            0.013771  1.000000e-18          1
7717  7717            0.000493  2.000000e-10          1

[5253 rows x 4 columns]


In [6]:
print('chromosome count \n', features.chromosome.value_counts())

# clean up invalid chromosome

features.loc[features['chromosome'] == 'X', 'chromosome'] = '23'
features.loc[features['chromosome'] == 'Y', 'chromosome'] = '24'

# convert chromosome to number and use NaN for non numeric values
features['chromosome'] = pd.to_numeric(features['chromosome'], errors='coerce')
# remove nodes with empty or non-numeric chromosome
features = features[features['chromosome'].notna()]
print('features.shape', features.shape)

# count number of rows by chromosome
print('chromosome count after filter \n', features.groupby('chromosome')['id'].nunique())

chromosome count 
 14.0    442
1.0     441
19.0    375
11.0    298
2.0     296
21.0    293
7.0     290
3.0     260
10.0    257
17.0    243
5.0     236
6.0     220
12.0    214
15.0    190
8.0     174
4.0     160
9.0     141
16.0    135
13.0     87
20.0     80
18.0     63
22.0     59
23.0     26
24.0      2
Name: chromosome, dtype: int64
features.shape (4982, 4)
chromosome count after filter 
 chromosome
1.0     441
2.0     296
3.0     260
4.0     160
5.0     236
6.0     220
7.0     290
8.0     174
9.0     141
10.0    257
11.0    298
12.0    214
13.0     87
14.0    442
15.0    190
16.0    135
17.0    243
18.0     63
19.0    375
20.0     80
21.0    293
22.0     59
23.0     26
24.0      2
Name: id, dtype: int64


In [7]:
# Replace NaN with 2 temporarily
features['pval'] = features['pval'].fillna(2)

# remove the outliers
features = features[features['pval'] >= 1e-10]
print(features)

# replace all two's of 'pval' with NaN values
features['pval'] = features['pval'].replace(2, np.nan)
print(features)

        id  jaccard_similarity          pval  chromosome
3        3            0.004432  2.000000e+00         2.0
4        4            0.002947  4.000000e-08        16.0
7        7            0.012022  1.000000e-09        15.0
20      20            0.004910  2.000000e+00         6.0
21      21            0.005878  2.000000e+00         6.0
...    ...                 ...           ...         ...
7707  7707            0.000737  3.000000e-08         7.0
7711  7711            0.004177  2.000000e-09         7.0
7713  7713            0.002464  3.000000e-08        18.0
7714  7714            0.001723  5.000000e-08        19.0
7717  7717            0.000493  2.000000e-10         1.0

[4780 rows x 4 columns]
        id  jaccard_similarity          pval  chromosome
3        3            0.004432           NaN         2.0
4        4            0.002947  4.000000e-08        16.0
7        7            0.012022  1.000000e-09        15.0
20      20            0.004910           NaN         6.0
21    

In [8]:
# sort by node_id
features = features.sort_values(by=['id'])

# seperate predicting attribute from input feature
predicting_attr = features['pval']
features = features.drop('pval', axis=1)

# one hot encode chromosome
oneHot = pd.get_dummies(features['chromosome'], prefix = 'chromosome')
features = features.join(oneHot)
features = features.drop('chromosome', axis = 1)

In [9]:
print(predicting_attr)

3                NaN
4       4.000000e-08
7       1.000000e-09
20               NaN
21               NaN
            ...     
7707    3.000000e-08
7711    2.000000e-09
7713    3.000000e-08
7714    5.000000e-08
7717    2.000000e-10
Name: pval, Length: 4780, dtype: float64


In [10]:
print(features)

        id  jaccard_similarity  chromosome_1.0  chromosome_2.0  \
3        3            0.004432               0               1   
4        4            0.002947               0               0   
7        7            0.012022               0               0   
20      20            0.004910               0               0   
21      21            0.005878               0               0   
...    ...                 ...             ...             ...   
7707  7707            0.000737               0               0   
7711  7711            0.004177               0               0   
7713  7713            0.002464               0               0   
7714  7714            0.001723               0               0   
7717  7717            0.000493               1               0   

      chromosome_3.0  chromosome_4.0  chromosome_5.0  chromosome_6.0  \
3                  0               0               0               0   
4                  0               0               0           

In [11]:
nodeList = features["id"].values.tolist()
print('nodeList len', len(nodeList))

# filter edges that are not in selected nodes
edgesDfFiltered = edgesDf[edgesDf['source'].isin(nodeList)]
edgesDfFiltered = edgesDfFiltered[edgesDfFiltered['sink'].isin(nodeList)]
print('edgesDfFiltered shape', edgesDfFiltered.shape)

# build adj matrix
numNodes = len(nodeList)
adj = np.zeros((numNodes, numNodes))

for index, row in edgesDfFiltered.iterrows():
    u = nodeList.index(row['source'])
    v = nodeList.index(row['sink'])
    adj[u][v] = 1
    adj[v][u] = 1

print('adj shape', adj.shape)
adj = sparse.csr_matrix(adj)
print('adj sparse shape', adj.shape)
print(adj)

nodeList len 4780
edgesDfFiltered shape (440517, 2)
adj shape (4780, 4780)
adj sparse shape (4780, 4780)
  (0, 4)	1.0
  (0, 256)	1.0
  (0, 257)	1.0
  (0, 258)	1.0
  (0, 322)	1.0
  (0, 535)	1.0
  (0, 717)	1.0
  (0, 718)	1.0
  (0, 719)	1.0
  (0, 730)	1.0
  (0, 808)	1.0
  (0, 825)	1.0
  (0, 918)	1.0
  (0, 1163)	1.0
  (0, 1170)	1.0
  (0, 1519)	1.0
  (0, 1905)	1.0
  (0, 1934)	1.0
  (0, 2031)	1.0
  (0, 2049)	1.0
  (0, 2054)	1.0
  (0, 2196)	1.0
  (0, 2276)	1.0
  (0, 2281)	1.0
  (0, 2282)	1.0
  :	:
  (4776, 3261)	1.0
  (4776, 3284)	1.0
  (4776, 3372)	1.0
  (4776, 3382)	1.0
  (4776, 3391)	1.0
  (4776, 3401)	1.0
  (4776, 3443)	1.0
  (4776, 3632)	1.0
  (4776, 3639)	1.0
  (4776, 3641)	1.0
  (4776, 4099)	1.0
  (4776, 4172)	1.0
  (4776, 4306)	1.0
  (4777, 923)	1.0
  (4777, 1257)	1.0
  (4777, 1258)	1.0
  (4777, 1418)	1.0
  (4777, 1607)	1.0
  (4777, 1625)	1.0
  (4778, 1224)	1.0
  (4778, 2053)	1.0
  (4778, 3012)	1.0
  (4778, 3016)	1.0
  (4778, 3017)	1.0
  (4778, 3019)	1.0


In [12]:
# drop node_id column
features = features.drop('id', axis=1).reset_index(drop=True)
features = sparse.csr_matrix(features.to_numpy())

data_name = 'Alzheimer'
f1 = 'Processed/' + data_name + '_features.pickle'
a1 = 'Processed/' + data_name + '_adj.pickle'
p1 = 'Processed/' + data_name + '_pval.pickle'
n1 = 'Processed/' + data_name + '_node_id_list.pickle'
print('Storing in ', f1, 'and ', a1, 'and', p1, 'and', n1)

with open(f1, 'wb') as handle: pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(a1, 'wb') as handle: pickle.dump(adj, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(p1, 'wb') as handle: pickle.dump(predicting_attr, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(n1, 'wb') as handle: pickle.dump(nodeList, handle, protocol=pickle.HIGHEST_PROTOCOL)

Storing in  Processed/Alzheimer_features.pickle and  Processed/Alzheimer_adj.pickle and Processed/Alzheimer_pval.pickle and Processed/Alzheimer_node_id_list.pickle


In [15]:
np.isnan(nodeList).any()

False