-
Notifications
You must be signed in to change notification settings - Fork 148
/
dataCenter.py
113 lines (91 loc) · 3.61 KB
/
dataCenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import sys
import os
from collections import defaultdict
import numpy as np
class DataCenter(object):
	"""Loads citation-network datasets ('cora', 'pubmed') and exposes them as attributes.

	After ``load_dataSet(name)`` the instance carries:
		<name>_feats     : np.ndarray, one feature vector per node
		<name>_labels    : np.int64 array of dense class-label ids
		<name>_adj_lists : defaultdict(set) mapping node index -> neighbor indices
		<name>_test / <name>_val / <name>_train : index arrays from a random split
	"""

	def __init__(self, config):
		"""config: mapping providing the 'file_path.*' entries for the dataset files."""
		super(DataCenter, self).__init__()
		self.config = config

	def load_dataSet(self, dataSet='cora'):
		"""Parse the named dataset from disk and attach its arrays to self.

		Raises:
			ValueError: if dataSet is neither 'cora' nor 'pubmed'.
		"""
		if dataSet == 'cora':
			feat_data, labels, adj_lists = self._load_cora()
		elif dataSet == 'pubmed':
			feat_data, labels, adj_lists = self._load_pubmed()
		else:
			raise ValueError('unknown dataset: %s' % dataSet)

		# Every node must have features, a label, and at least one neighbor.
		assert len(feat_data) == len(labels) == len(adj_lists)
		test_indexs, val_indexs, train_indexs = self._split_data(feat_data.shape[0])

		# Register everything under '<dataSet>_'-prefixed attribute names.
		setattr(self, dataSet+'_test', test_indexs)
		setattr(self, dataSet+'_val', val_indexs)
		setattr(self, dataSet+'_train', train_indexs)
		setattr(self, dataSet+'_feats', feat_data)
		setattr(self, dataSet+'_labels', labels)
		setattr(self, dataSet+'_adj_lists', adj_lists)

	def _load_cora(self):
		"""Parse the cora content/cite files; return (feat_data, labels, adj_lists)."""
		cora_content_file = self.config['file_path.cora_content']
		cora_cite_file = self.config['file_path.cora_cite']

		feat_data = []
		labels = []    # dense label id per node, in file order
		node_map = {}  # raw node id -> dense node index
		label_map = {}  # raw label string -> dense label id
		with open(cora_content_file) as fp:
			for i, line in enumerate(fp):
				info = line.strip().split()
				# Columns: node-id, feature values..., label string.
				feat_data.append([float(x) for x in info[1:-1]])
				node_map[info[0]] = i
				if info[-1] not in label_map:
					label_map[info[-1]] = len(label_map)
				labels.append(label_map[info[-1]])
		feat_data = np.asarray(feat_data)
		labels = np.asarray(labels, dtype=np.int64)

		adj_lists = defaultdict(set)
		with open(cora_cite_file) as fp:
			for line in fp:
				info = line.strip().split()
				assert len(info) == 2
				paper1 = node_map[info[0]]
				paper2 = node_map[info[1]]
				# The citation graph is treated as undirected.
				adj_lists[paper1].add(paper2)
				adj_lists[paper2].add(paper1)
		return feat_data, labels, adj_lists

	def _load_pubmed(self):
		"""Parse the pubmed paper/cites files; return (feat_data, labels, adj_lists)."""
		pubmed_content_file = self.config['file_path.pubmed_paper']
		pubmed_cite_file = self.config['file_path.pubmed_cites']

		feat_data = []
		labels = []    # dense label id per node (raw label is 1-based, shifted to 0-based)
		node_map = {}  # raw node id -> dense node index
		with open(pubmed_content_file) as fp:
			fp.readline()  # skip the first header line
			# Second line names the columns; map each word to its feature index.
			feat_map = {entry.split(":")[1]: i-1 for i, entry in enumerate(fp.readline().split("\t"))}
			for i, line in enumerate(fp):
				info = line.split("\t")
				node_map[info[0]] = i
				labels.append(int(info[1].split("=")[1])-1)
				# NOTE(review): len(feat_map)-2 mirrors the original code; presumably
				# the last two feat_map entries are non-feature columns — confirm.
				tmp_list = np.zeros(len(feat_map)-2)
				for word_info in info[2:-1]:
					word_info = word_info.split("=")
					tmp_list[feat_map[word_info[0]]] = float(word_info[1])
				feat_data.append(tmp_list)
		feat_data = np.asarray(feat_data)
		labels = np.asarray(labels, dtype=np.int64)

		adj_lists = defaultdict(set)
		with open(pubmed_cite_file) as fp:
			fp.readline()  # skip the two header lines
			fp.readline()
			for line in fp:
				info = line.strip().split("\t")
				# Edge endpoints look like 'paper:<id>'; keep only the id part.
				paper1 = node_map[info[1].split(":")[1]]
				paper2 = node_map[info[-1].split(":")[1]]
				adj_lists[paper1].add(paper2)
				adj_lists[paper2].add(paper1)
		return feat_data, labels, adj_lists

	def _split_data(self, num_nodes, test_split=3, val_split=6):
		"""Randomly partition node indices into test/val/train sets.

		test gets num_nodes // test_split indices, val gets
		num_nodes // val_split, and train gets the remainder.
		Returns (test_indexs, val_indexs, train_indexs).
		"""
		rand_indices = np.random.permutation(num_nodes)
		test_size = num_nodes // test_split
		val_size = num_nodes // val_split
		test_indexs = rand_indices[:test_size]
		val_indexs = rand_indices[test_size:(test_size+val_size)]
		train_indexs = rand_indices[(test_size+val_size):]
		return test_indexs, val_indexs, train_indexs