In [46]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [47]:
# All imports
import pandas as pd
import glob
import os
import networkx as nx
import pickle
import numpy as np
from scipy import sparse
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [48]:
# read data from CSV files
edgesFile = 'Raw/edges.csv'
nodesFile = 'Raw/nodes.csv'

edgesDf = pd.read_csv(edgesFile, sep=',')
edgesDf = edgesDf[["source", "sink"]]
print('edgesDf shape', edgesDf.shape)

nodesDf = pd.read_csv(nodesFile, sep=',')
print('nodesDf shape', nodesDf.shape)

edgesDf shape (559147, 2)
nodesDf shape (7751, 18)


In [49]:
# convert pval to float and use NaN for non numeric values
nodesDf['pval'] = pd.to_numeric(nodesDf['pval'], errors='coerce')

# remove nodes with empty or non-numeric pval
# nodesPvalDf = nodesDf[nodesDf['pval'].notna()]
# print('nodesPvalDf shape', nodesPvalDf.shape)

# construct dataframe with the selected features
tmp = nodesDf[['id', 'jaccard_similarity', 'pval', 'chromosome']]
tmp = tmp[tmp['chromosome'].notna()]
all_data = tmp[tmp['jaccard_similarity'].notna()]
print('all_data shape', all_data.shape)
print(all_data.chromosome.value_counts())

all_data shape (5253, 4)
14                                                                            442
1                                                                             441
19                                                                            375
11                                                                            298
2                                                                             296
21                                                                            293
7                                                                             290
3                                                                             260
10                                                                            257
17                                                                            243
5                                                                             236
6                                                                        

In [50]:
# count number of rows by chromosome
# TODO clean up invalid chromosome

all_data.loc[all_data['chromosome'] == 'X', 'chromosome'] = '23'
all_data.loc[all_data['chromosome'] == 'Y', 'chromosome'] = '24'

# convert pval to float and use NaN for non numeric values
all_data['chromosome'] = pd.to_numeric(all_data['chromosome'], errors='coerce')
# remove nodes with empty or non-numeric pval
all_data = all_data[all_data['chromosome'].notna()]
print(all_data.shape)


temp = all_data.groupby('chromosome')['id'].nunique()
print(temp)

(4982, 4)
chromosome
1.0     441
2.0     296
3.0     260
4.0     160
5.0     236
6.0     220
7.0     290
8.0     174
9.0     141
10.0    257
11.0    298
12.0    214
13.0     87
14.0    442
15.0    190
16.0    135
17.0    243
18.0     63
19.0    375
20.0     80
21.0    293
22.0     59
23.0     26
24.0      2
Name: id, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['chromosome'] = pd.to_numeric(all_data['chromosome'], errors='coerce')


In [51]:
# Apply the function
all_data['pval'] = all_data['pval'].fillna(2)

# remove the outliers
all_data = all_data[all_data['pval'] >= 1e-10]
print(all_data)

# replace all two's of 'pval' with NaN values
# all_data[['pval']].replace(2, np.nan, inplace=True)
all_data['pval'] = all_data['pval'].replace(2, np.nan)
print(all_data)

        id  jaccard_similarity          pval  chromosome
3        3            0.004432  2.000000e+00         2.0
4        4            0.002947  4.000000e-08        16.0
7        7            0.012022  1.000000e-09        15.0
20      20            0.004910  2.000000e+00         6.0
21      21            0.005878  2.000000e+00         6.0
...    ...                 ...           ...         ...
7707  7707            0.000737  3.000000e-08         7.0
7711  7711            0.004177  2.000000e-09         7.0
7713  7713            0.002464  3.000000e-08        18.0
7714  7714            0.001723  5.000000e-08        19.0
7717  7717            0.000493  2.000000e-10         1.0

[4780 rows x 4 columns]
        id  jaccard_similarity          pval  chromosome
3        3            0.004432           NaN         2.0
4        4            0.002947  4.000000e-08        16.0
7        7            0.012022  1.000000e-09        15.0
20      20            0.004910           NaN         6.0
21    

In [52]:
# seperate predicting attribute from input feature
preicting_attr = all_data['pval']
all_data = all_data.drop('pval', axis=1)

# one hot encode chromosome
oneHot = pd.get_dummies(all_data['chromosome'], prefix = 'chromosome')
all_data = all_data.join(oneHot)
all_data = all_data.drop('chromosome', axis = 1)
print(all_data)

        id  jaccard_similarity  chromosome_1.0  chromosome_2.0  \
3        3            0.004432               0               1   
4        4            0.002947               0               0   
7        7            0.012022               0               0   
20      20            0.004910               0               0   
21      21            0.005878               0               0   
...    ...                 ...             ...             ...   
7707  7707            0.000737               0               0   
7711  7711            0.004177               0               0   
7713  7713            0.002464               0               0   
7714  7714            0.001723               0               0   
7717  7717            0.000493               1               0   

      chromosome_3.0  chromosome_4.0  chromosome_5.0  chromosome_6.0  \
3                  0               0               0               0   
4                  0               0               0           

In [53]:
print(all_data)

        id  jaccard_similarity  chromosome_1.0  chromosome_2.0  \
3        3            0.004432               0               1   
4        4            0.002947               0               0   
7        7            0.012022               0               0   
20      20            0.004910               0               0   
21      21            0.005878               0               0   
...    ...                 ...             ...             ...   
7707  7707            0.000737               0               0   
7711  7711            0.004177               0               0   
7713  7713            0.002464               0               0   
7714  7714            0.001723               0               0   
7717  7717            0.000493               1               0   

      chromosome_3.0  chromosome_4.0  chromosome_5.0  chromosome_6.0  \
3                  0               0               0               0   
4                  0               0               0           

In [54]:
# sort by node_id
nodesSelectedCols = all_data.sort_values(by=['id'])
nodeList = nodesSelectedCols["id"].values.tolist()
print('nodeList len', len(nodeList))

# filter edges that are not in selected nodes
edgesDfFiltered = edgesDf[edgesDf['source'].isin(nodeList)]
edgesDfFiltered = edgesDfFiltered[edgesDfFiltered['sink'].isin(nodeList)]
print('edgesDfFiltered shape', edgesDfFiltered.shape)

# build adj matrix
numNodes = len(nodeList)
adj = np.zeros((numNodes, numNodes))

for index, row in edgesDfFiltered.iterrows():
    u = nodeList.index(row['source'])
    v = nodeList.index(row['sink'])
    adj[u][v] = 1
    adj[v][u] = 1

print(adj.shape)
adj = sparse.csr_matrix(adj)
print('adj shape', adj.shape)
# print(adj)

nodeList len 4780
edgesDfFiltered shape (440517, 2)
(4780, 4780)
adj shape (4780, 4780)


In [55]:
# drop node_id column
all_data = all_data.drop('id', axis=1).reset_index(drop=True)
print(all_data)
all_data.to_numpy()

featuresOneHot = sparse.csr_matrix(all_data)

data_name = 'Alzheimer'
f1 = 'Processed/' + data_name + '_features_all_hop_' + str(0) + '.pickle'
a1 = 'Processed/' + data_name + '_adj_all_hop_' + str(0) + '.pickle'
p1 = 'Processed/' + data_name + '_pval_all_hop_' + str(0) + '.pickle'
print('Stored in ', f1, 'and ', a1, 'and', p1)
with open(f1, 'wb') as handle: pickle.dump(featuresOneHot, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(a1, 'wb') as handle: pickle.dump(adj, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(p1, 'wb') as handle: pickle.dump(preicting_attr, handle, protocol=pickle.HIGHEST_PROTOCOL)

      jaccard_similarity  chromosome_1.0  chromosome_2.0  chromosome_3.0  \
0               0.004432               0               1               0   
1               0.002947               0               0               0   
2               0.012022               0               0               0   
3               0.004910               0               0               0   
4               0.005878               0               0               0   
...                  ...             ...             ...             ...   
4775            0.000737               0               0               0   
4776            0.004177               0               0               0   
4777            0.002464               0               0               0   
4778            0.001723               0               0               0   
4779            0.000493               1               0               0   

      chromosome_4.0  chromosome_5.0  chromosome_6.0  chromosome_7.0  \
0              

In [56]:
# print('All data shape', all_data.shape)
# # sort by node_id
# nodesSelectedCols = data_pval.sort_values(by=['id'])

# nodeList = nodesSelectedCols["id"].values.tolist()
# print('nodeList len', len(nodeList))

# # seperate data according to non-empty and empty pval
# data_pval = all_data[all_data['pval'].notna()]
# print('non-empty pval:', data_pval.shape)

# data_emptyPval = all_data[all_data['pval'].isnull()]
# print('empty pval:', data_emptyPval.shape)