In [1]:
import json, pandas as pd, copy, functools, itertools, collections, operator, time, urllib.request
import concurrent.futures
import numpy as np
from scipy import sparse
import hetio.readwrite
from hetmech.dwpc import *

## Load the graph

In [2]:
%%time
url = 'data/hetionet-v1.0.json'
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

CPU times: user 53.4 s, sys: 1.19 s, total: 54.6 s
Wall time: 54.6 s


## Load metapaths

In [3]:
metapaths_url = 'https://raw.githubusercontent.com/dhimmel/learn/{}/all-features/data/metapaths.json'.format(
    'ef5f7a6b76b6a01499d65b95e3d7ca93ac5aba57')

with urllib.request.urlopen(metapaths_url) as data_file:
    metapaths = json.loads(data_file.read().decode())

metapaths.sort(key=lambda x: x['join_complexities'][0])

len(metapaths)

1206

# Extract the actual metapaths from the list of metapath dictionaries

In [4]:
abbrevs = [metapath['abbreviation'] for metapath in metapaths]

## Categorize the metapaths

In [5]:
types = []
for metapath in abbrevs:
    m_path = metagraph.metapath_from_abbrev(metapath)
    cat = categorize(m_path)
    types.append([metapath, cat])
types = pd.DataFrame(types, columns=("Metapath", "Category"))

In [6]:
types.head()

Unnamed: 0,Metapath,Category
0,CpDpCpD,BABA
1,CpDpCtD,BABA
2,CpDtCpD,BABA
3,CtDpCpD,BABA
4,CiPCiCpD,short_repeat


In [7]:
len(types[types.Category == 'BABA'])

278

In [8]:
frequency = []
for path_type in set(list(types.Category)):
    frequency.append([path_type, len(types[types.Category == path_type])])
frequency = pd.DataFrame(frequency, columns=('PathType', 'Number'))

In [9]:
frequency

Unnamed: 0,PathType,Number
0,no_repeats,18
1,long_repeat,4
2,disjoint,131
3,BABA,278
4,other,32
5,BAAB,144
6,short_repeat,599


## ALL but OTHER and Long_repeat

In [10]:
all_strings = types[(types['Category'] != 'other')]
all_strings = all_strings[all_strings['Category'] != 'long_repeat']
all_strings = list(all_strings.Metapath)

alls = [metagraph.metapath_from_abbrev(s) for s in all_strings]

In [11]:
%%time
all_arrays = {}
all_times = []
n=0
for i, m_path in enumerate(alls):
    row, col, mat, timed = dwpc(graph, m_path, damping=0.4, sparse_threshold=1)
    st = all_strings[i]
    print(f'{n}; metapath: {st}; time: {timed:.3}')
    all_arrays[st] = mat
    all_times.append([st, timed])
    n += 1

0; metapath: CpDpCpD; time: 0.202
1; metapath: CpDpCtD; time: 0.197
2; metapath: CpDtCpD; time: 0.201
3; metapath: CtDpCpD; time: 0.196
4; metapath: CiPCiCpD; time: 0.214
5; metapath: CpDtCtD; time: 0.196
6; metapath: CtDpCtD; time: 0.203
7; metapath: CtDtCpD; time: 0.198
8; metapath: CiPCiCtD; time: 0.216
9; metapath: CbGbCpD; time: 0.814
10; metapath: CtDtCtD; time: 0.196
11; metapath: CbGuCpD; time: 0.82
12; metapath: CuGbCpD; time: 0.82
13; metapath: CbGdCpD; time: 0.966
14; metapath: CdGbCpD; time: 0.967
15; metapath: CbGdDpCpD; time: 0.948
16; metapath: CpDdGbCpD; time: 1.01
17; metapath: CpDpCbGdD; time: 0.831
18; metapath: CbGuDpCpD; time: 0.887
19; metapath: CpDpCbGuD; time: 0.858
20; metapath: CpDuGbCpD; time: 0.957
21; metapath: CpDrD; time: 0.113
22; metapath: CbGbCtD; time: 0.816
23; metapath: CrCpD; time: 0.191
24; metapath: CbGdD; time: 0.705
25; metapath: CuGuCpD; time: 0.88
26; metapath: CbGuD; time: 0.8
27; metapath: CdGuCpD; time: 0.87
28; metapath: CuGdCpD; time: 0.

224; metapath: CdGcGdCpD; time: 7.06
225; metapath: CbGbCdGdD; time: 4.59
226; metapath: CbGdCbGdD; time: 4.53
227; metapath: CdGbCbGdD; time: 4.54
228; metapath: CbGbCdGuD; time: 4.61
229; metapath: CbGdCbGuD; time: 4.53
230; metapath: CdGbCbGuD; time: 4.55
231; metapath: CbG<rGuCpD; time: 7.66
232; metapath: CbGr>GuCpD; time: 7.38
233; metapath: CuG<rGbCpD; time: 7.41
234; metapath: CuGr>GbCpD; time: 7.39
235; metapath: CbGdDrD; time: 0.759
236; metapath: CbGcGdCtD; time: 7.02
237; metapath: CdGcGbCtD; time: 7.06
238; metapath: CuGuCpDrD; time: 0.893
239; metapath: CbGuDrD; time: 0.75
240; metapath: CdGaDtCtD; time: 0.872
241; metapath: CtDaGdCtD; time: 0.89
242; metapath: CtDtCdGaD; time: 0.874
243; metapath: CrCbGdD; time: 0.818
244; metapath: CrCuGuCpD; time: 0.991
245; metapath: CuGuCrCpD; time: 0.992
246; metapath: CrCbGuD; time: 0.824
247; metapath: CbG<rGdCpD; time: 7.35
248; metapath: CbGr>GdCpD; time: 7.36
249; metapath: CdG<rGbCpD; time: 7.39
250; metapath: CdGr>GbCpD; time

446; metapath: CuGiGdD; time: 4.69
447; metapath: CdG<rGuD; time: 4.66
448; metapath: CdGr>GuD; time: 4.65
449; metapath: CuGiGuD; time: 4.74
450; metapath: CbGiGaD; time: 4.79
451; metapath: CpDdG<rGdD; time: 7.28
452; metapath: CpDdGr>GdD; time: 7.26
453; metapath: CdGaDrD; time: 0.795
454; metapath: CpDdG<rGuD; time: 7.3
455; metapath: CpDdGr>GuD; time: 7.25
456; metapath: CpDuG<rGdD; time: 7.28
457; metapath: CpDuGr>GdD; time: 7.24
458; metapath: CpDuG<rGuD; time: 7.3
459; metapath: CpDuGr>GuD; time: 7.56
460; metapath: CuG<rGuCtD; time: 7.82
461; metapath: CuGr>GuCtD; time: 7.81
462; metapath: CpDaGdDrD; time: 0.865
463; metapath: CpDdGaDrD; time: 0.818
464; metapath: CpDrDaGdD; time: 0.827
465; metapath: CpDrDdGaD; time: 0.836
466; metapath: CpDaGuDrD; time: 0.802
467; metapath: CpDrDaGuD; time: 0.816
468; metapath: CpDrDuGaD; time: 0.801
469; metapath: CpDuGaDrD; time: 0.809
470; metapath: CdGdCdGdD; time: 4.86
471; metapath: CrCdGaD; time: 0.847
472; metapath: CdGdCdGuD; time: 

666; metapath: CtDrDaGaD; time: 0.808
667; metapath: CrCuGcGdD; time: 4.38
668; metapath: CrCuGcGuD; time: 4.38
669; metapath: CrCbGcGaD; time: 4.38
670; metapath: CrCtDaGaD; time: 0.883
671; metapath: CbGcGiGdD; time: 10.9
672; metapath: CbGiGcGdD; time: 10.9
673; metapath: CdGcGdDrD; time: 4.32
674; metapath: CbGcGiGuD; time: 10.9
675; metapath: CbGiGcGuD; time: 10.9
676; metapath: CtDaGiGdD; time: 7.48
677; metapath: CtDdGiGaD; time: 7.39
678; metapath: CdGcGuDrD; time: 4.33
679; metapath: CtDaGiGuD; time: 7.4
680; metapath: CtDuGiGaD; time: 7.41
681; metapath: CrCdGcGdD; time: 4.38
682; metapath: CuGaDaGdD; time: 4.17
683; metapath: CuGaDdGaD; time: 4.17
684; metapath: CuGdDaGaD; time: 4.21
685; metapath: CrCdGcGuD; time: 4.38
686; metapath: CuGaDaGuD; time: 4.16
687; metapath: CuGaDuGaD; time: 4.14
688; metapath: CuGuDaGaD; time: 4.13
689; metapath: CbGaDaGaD; time: 4.16
690; metapath: CuGcGcGaD; time: 10.6
691; metapath: CrCtDrDrD; time: 0.287
692; metapath: CrCrCtDrD; time: 0.36

884; metapath: CrCuGr>GaD; time: 4.8
885; metapath: CdGpMFpGdD; time: 5.08
886; metapath: CdGpMFpGuD; time: 5.06
887; metapath: CdG<rG<rGdD; time: 11.7
888; metapath: CdG<rGr>GdD; time: 12.0
889; metapath: CdGr>G<rGdD; time: 12.3
890; metapath: CdGr>Gr>GdD; time: 11.6
891; metapath: CuG<rGiGdD; time: 11.9
892; metapath: CuGiG<rGdD; time: 11.6
893; metapath: CuGiGr>GdD; time: 11.6
894; metapath: CuGr>GiGdD; time: 11.6
895; metapath: CdG<rG<rGuD; time: 11.8
896; metapath: CdG<rGr>GuD; time: 11.9
897; metapath: CdGr>G<rGuD; time: 12.2
898; metapath: CdGr>Gr>GuD; time: 11.6
899; metapath: CuG<rGiGuD; time: 11.8
900; metapath: CuGiG<rGuD; time: 11.6
901; metapath: CuGiGr>GuD; time: 11.6
902; metapath: CuGr>GiGuD; time: 11.6
903; metapath: CbG<rGiGaD; time: 11.5
904; metapath: CbGiG<rGaD; time: 11.5
905; metapath: CbGiGr>GaD; time: 11.5
906; metapath: CbGr>GiGaD; time: 11.5
907; metapath: CuGpCCpGdD; time: 4.88
908; metapath: CuGpPWpGdD; time: 4.91
909; metapath: CdG<rGaDrD; time: 4.74
910; 

1100; metapath: CtDaGeAlD; time: 1.99
1101; metapath: CtDlAeGaD; time: 2.25
1102; metapath: CdGdAdGaD; time: 10.1
1103; metapath: CcSEcCdGdD; time: 1.73
1104; metapath: CdGpBPpGaD; time: 9.31
1105; metapath: CcSEcCdGuD; time: 1.74
1106; metapath: CbGeAlDrD; time: 2.0
1107; metapath: CrCbGeAlD; time: 2.07
1108; metapath: CcSEcCuGaD; time: 1.74
1109; metapath: CuGcGeAlD; time: 5.56
1110; metapath: CbGeAuGdD; time: 12.3
1111; metapath: CbGuAeGdD; time: 15.0
1112; metapath: CbGeAuGuD; time: 12.5
1113; metapath: CbGuAeGuD; time: 15.0
1114; metapath: CbGdAeGdD; time: 14.8
1115; metapath: CbGeAdGdD; time: 12.1
1116; metapath: CbGdAeGuD; time: 14.8
1117; metapath: CbGeAdGuD; time: 12.1
1118; metapath: CcSEcCdGaD; time: 1.77
1119; metapath: CdGcGeAlD; time: 5.61
1120; metapath: CbG<rGeAlD; time: 5.93
1121; metapath: CbGr>GeAlD; time: 5.93
1122; metapath: CuGeAlDrD; time: 2.02
1123; metapath: CrCuGeAlD; time: 2.08
1124; metapath: CbGiGeAlD; time: 5.99
1125; metapath: CdGeAlDrD; time: 2.02
1126; 

In [12]:
np.save('data/all_times', all_times)
np.save('data/all_arrays', all_arrays)

#### Save DWPC times as a .tsv file.

In [13]:
times = np.load('data/all_times.npy')

times_df = pd.DataFrame(times, columns=('Metapath', 'Time'))

times_df.to_csv(path_or_buf='data/all_times.tsv', sep='\t', float_format='%.6g', index=False)

#### Save DWPC matrices as outputs in Compound-Disease pairs

In [14]:
arrs = np.load('data/all_arrays.npy')
arrs = arrs.tolist()

In [15]:
arrs

{'CpDpCpD': matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'CpDpCtD': matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'CpDtCpD': matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'CtDpCpD': matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  

In [16]:
row, col, mat = metaedge_to_adjacency_matrix(graph, 'CpD')

In [17]:
comp_disease_pairs = numpy.array([[comp, disease] for comp in row for disease in col])

In [18]:
mat_dict = {'compound': comp_disease_pairs[:,0], 'disease': comp_disease_pairs[:,1]}

In [19]:
mat_dict = dict()
for meta, mat in arrs.items():
    if type(mat) == numpy.matrix:
        mat = mat.A
    elif sparse.issparse(mat):
        mat = mat.toarray()
    assert type(mat) == numpy.ndarray
    mat = mat.flatten(order='C').tolist()
    mat_dict[meta] = mat
mat_dict['disease'] = comp_disease_pairs[:,1]
mat_dict['compound'] = comp_disease_pairs[:,0]

In [20]:
dwpc_matrices = pd.DataFrame.from_dict(mat_dict)

In [21]:
colnames = list(reversed(list(dwpc_matrices)))
colnames = [colnames.pop(1)] + colnames

In [22]:
dwpc_matrices = dwpc_matrices.ix[:, colnames]
dwpc_matrices

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':


Unnamed: 0,compound,disease,CuGuDuGuD,CuGuDuGdD,CuGuDuGaD,CuGuDtCtD,CuGuDtCpD,CuGuDrDrD,CuGuDrD,CuGuDpSpD,...,CbG<rGcGuD,CbG<rGcGdD,CbG<rGcGaD,CbG<rGbCtD,CbG<rGbCpD,CbG<rGaDrD,CbG<rGaD,CbG<rG<rGuD,CbG<rG<rGdD,CbG<rG<rGaD
0,DB00014,DOID:0050156,0.001791,0.000662,0.000216,0.000000,0.002885,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00014,DOID:0050425,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DB00014,DOID:0050741,0.001160,0.000211,0.000000,0.000000,0.000000,0.000000,0.000000,0.000413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DB00014,DOID:0050742,0.001002,0.000816,0.000000,0.000000,0.008050,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DB00014,DOID:0060073,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,DB00014,DOID:0060119,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,DB00014,DOID:10021,0.000000,0.000000,0.000000,0.000000,0.000000,0.001642,0.000000,0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,DB00014,DOID:10153,0.000000,0.000000,0.000000,0.000000,0.000000,0.001915,0.003420,0.000622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,DB00014,DOID:1024,0.000000,0.000000,0.000283,0.000000,0.000000,0.000000,0.000000,0.000466,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,DB00014,DOID:10283,0.000196,0.000000,0.001891,0.000273,0.002448,0.000000,0.000000,0.000736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
%%time
dwpc_matrices.to_csv(path_or_buf='data/dwpc_data.tsv', sep='\t', float_format='%.6g', index=False)

CPU times: user 3min 49s, sys: 2.61 s, total: 3min 52s
Wall time: 3min 56s
