In [4]:
import torch
import pickle
import os
import ipdb
import numpy as np
import pandas as pd
import scipy.sparse as sp
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset

from torch_sparse import coalesce



In [5]:
# Seed every random number generator used below (NumPy and PyTorch, CPU and
# CUDA) so that the noisy features and the random splits are repeatable.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
# Seed the generators of all GPUs as well.
torch.cuda.manual_seed_all(seed)

In [6]:
'''
[[ -V- | -E- ]      [ -E- | -V- ]]
'''

def load_LE_dataset(path, dataset):
    """Load a hypergraph stored as ``<dataset>.content`` / ``<dataset>.edges``.

    The returned ``edge_index`` uses the bipartite node/hyperedge layout

        [[ -V- | -E- ]
         [ -E- | -V- ]]

    i.e. every (node, hyperedge) incidence appears once in each direction.

    File layout as read by this function:
        - ``{dataset}.content``: one row per id; column 0 is the id, the last
          column is the label and everything in between is the feature vector.
        - ``{dataset}.edges``: integer pairs; after re-indexing, the first
          column must hold node ids and the second hyperedge ids, with the
          hyperedge ids starting right after the largest node id.

    Parameters
    ----------
    path : str
        Directory containing the two files.
    dataset : str
        Base name of the files (without extension).

    Returns
    -------
    torch_geometric.data.Data
        ``x`` and ``y`` restricted to the first ``num_nodes`` rows, a coalesced
        (sorted, duplicate-free) ``edge_index``, plus the extra attributes
        ``num_features``, ``num_classes``, ``num_nodes`` and ``num_hyperedges``.

    Raises
    ------
    AssertionError
        If node ids and hyperedge ids are not two adjacent, gap-free ranges.
    """
    print('Loading {} dataset...'.format(dataset))

    # --- features and labels -------------------------------------------------
    p2idx_features_labels = os.path.join(path, f'{dataset}.content')
    idx_features_labels = np.genfromtxt(p2idx_features_labels,
                                        dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    # Labels are parsed through float first so values such as "3.0" are accepted.
    labels = torch.LongTensor(idx_features_labels[:, -1].astype(float))

    print ('load features')

    # --- incidence list ------------------------------------------------------
    # Map the raw ids of the .content file to their row position.
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}

    p2edges_unordered = os.path.join(path, f'{dataset}.edges')
    edges_unordered = np.genfromtxt(p2edges_unordered,
                                    dtype=np.int32)

    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)

    print ('load edges')

    # NOTE: the original code also built an unused dense copy of the complete
    # feature matrix here; it was removed because only the node rows are needed.

    # Row 0: node ids, row 1: hyperedge ids.
    edge_index = edges.T
    assert edge_index[0].max() == edge_index[1].min() - 1, \
        'hyperedge ids must start right after the largest node id'

    # Check that the ids are consecutive, i.e. no node_id/he_id is missing.
    assert len(np.unique(edge_index)) == edge_index.max() + 1, \
        'node/hyperedge ids are not consecutive'

    num_nodes = edge_index[0].max() + 1
    num_he = edge_index[1].max() - num_nodes + 1

    # Append the reversed incidences (hyperedge -> node).
    edge_index = np.hstack((edge_index, edge_index[::-1, :]))

    # Build the torch data object; only the first num_nodes rows of the
    # feature/label tables are kept.
    data = Data(
            x = torch.FloatTensor(np.array(features[:num_nodes].todense())),
            edge_index = torch.LongTensor(edge_index),
            y = labels[:num_nodes])

    # data.coalesce() is not used: there might be errors if
    # edge_index.max() != num_nodes, so the size is passed explicitly.
    # coalesce also sorts edge_index and removes duplicates.
    total_num_node_id_he_id = len(np.unique(edge_index))
    data.edge_index, data.edge_attr = coalesce(data.edge_index,
            None,
            total_num_node_id_he_id,
            total_num_node_id_he_id)

    data.num_features = data.x.shape[-1]
    data.num_classes = len(np.unique(labels[:num_nodes].numpy()))
    data.num_nodes = num_nodes
    data.num_hyperedges = num_he

    return data

In [None]:
# Load one of the .content/.edges datasets with load_LE_dataset.
path='./20newsW100/' # 67, succeeded (original author's note; what "67" refers to is not stated)
dataset='20newsW100'
# zoo/Mushroom/20newsW100/NTU2012/  (presumably the other dataset names usable here; verify)
data = load_LE_dataset(path,dataset)
print(data)

In [7]:
def load_cornell_dataset(path, dataset, feature_noise = 0.1, feature_dim = None):
    '''
    Read a Cornell-format hypergraph dataset from its source files and convert
    its incidence list to the bipartite edge_index layout
    [[ -V- | -E- ]
     [ -E- | -V- ]]

    Files read from `path`:
        - node-labels-{dataset}.txt : one integer label per line, one line per node.
        - hyperedges-{dataset}.txt  : one hyperedge per line, given as a
          comma-separated list of node ids. Blank lines are ignored.

    node features:
        - the dataset has no features of its own; each node gets its one-hot
          encoded label (optionally zero-padded to `feature_dim` columns) plus
          gaussian noise with sigma = feature_noise.

    node label:
        - the labels from the file, shifted so that the smallest label is 0.

    Parameters:
        path (str): directory containing the two files.
        dataset (str): dataset name used in the file names.
        feature_noise (float): standard deviation of the gaussian noise.
        feature_dim (int | None): total feature width; None keeps one column
            per label value.

    Returns:
        torch_geometric.data.Data with x, y, a coalesced edge_index and the
        extra attributes num_features, num_classes, num_nodes, num_hyperedges.

    Raises:
        ValueError: if feature_dim is smaller than the one-hot width, if a
            hyperedge line contains a non-integer entry, or if the hyperedge
            file contains no hyperedge.
    '''
    print(f'Loading hypergraph dataset from cornell: {dataset}')

    # first load node labels
    df_labels = pd.read_csv(os.path.join(path, f'node-labels-{dataset}.txt'), names = ['node_label'])
    num_nodes = df_labels.shape[0]
    labels = df_labels.values.flatten()

    # then create node features.
    num_classes = df_labels.values.max()
    features = np.zeros((num_nodes, num_classes))

    # NOTE(review): the one-hot column is `label - 1`, so this assumes labels
    # start at 1; a label of 0 would wrap around to the last column.
    features[np.arange(num_nodes), labels - 1] = 1
    if feature_dim is not None:
        num_row, num_col = features.shape
        if feature_dim < num_col:
            # np.zeros would fail with an opaque "negative dimensions" ValueError.
            raise ValueError(
                f'feature_dim ({feature_dim}) must be >= the one-hot width ({num_col})')
        zero_col = np.zeros((num_row, feature_dim - num_col), dtype = features.dtype)
        features = np.hstack((features, zero_col))

    # The (padded) one-hot matrix is the mean of the gaussian noise.
    features = np.random.normal(features, feature_noise, features.shape)
    print(f'number of nodes:{num_nodes}, feature dimension: {features.shape[1]}')

    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels)
    labels = labels - labels.min() # shift label to 0

    # The last, load hypergraph.
    # Cornell datasets are stored in lines of hyperedges. Each line is the set of nodes for that edge.
    p2hyperedge_list = os.path.join(path, f'hyperedges-{dataset}.txt')
    node_list = []
    he_list = []
    # Hyperedge ids are numbered after the node ids.
    he_id = num_nodes

    with open(p2hyperedge_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a hyperedge; previously
                # it crashed on int('').
                continue
            cur_set = [int(x) for x in line.split(',')]

            node_list += cur_set
            he_list += [he_id] * len(cur_set)
            he_id += 1

    if not node_list:
        raise ValueError(f'no hyperedges found in {p2hyperedge_list}')

    # shift node_idx to start with 0.
    node_idx_min = np.min(node_list)
    node_list = [x - node_idx_min for x in node_list]

    # Both directions of every incidence: node -> hyperedge and hyperedge -> node.
    edge_index = [node_list + he_list,
                  he_list + node_list]

    edge_index = torch.LongTensor(edge_index)

    data = Data(x = features,
                edge_index = edge_index,
                y = labels)
    assert data.y.min().item() == 0

    # data.coalesce() is not used: there might be errors if
    # edge_index.max() != num_nodes, so the size is passed explicitly.
    # coalesce also sorts edge_index and removes duplicates.
    # The size is converted to a plain int instead of passing a 0-d tensor.
    total_num_node_id_he_id = int(edge_index.max()) + 1
    data.edge_index, data.edge_attr = coalesce(data.edge_index,
            None,
            total_num_node_id_he_id,
            total_num_node_id_he_id)

    data.num_features = features.shape[-1]
    data.num_classes = len(np.unique(labels.numpy()))
    data.num_nodes = num_nodes
    data.num_hyperedges = he_id - num_nodes

    return data

In [8]:
# Load the house-committees hypergraph with load_cornell_dataset; `path` and
# `data` are reused by the split-generation cells further down.
path='./house-committees/' # 67, succeeded (original author's note; what "67" refers to is not stated)
dataset='house-committees'
# zoo/Mushroom/20newsW100/NTU2012/  (presumably other dataset names; verify which loader they need)
data = load_cornell_dataset(path,dataset)
print(data)

Loading hypergraph dataset from cornell: house-committees
number of nodes:1290, feature dimension: 2
Data(x=[1290, 2], edge_index=[2, 23686], y=[1290], num_features=2, num_classes=2, num_nodes=1290, num_hyperedges=341)


In [None]:
def get_binary_mask(total_size, indices):
    """Return a uint8 tensor of size ``total_size`` that is 1 at ``indices`` and 0 elsewhere."""
    flags = torch.zeros(total_size, dtype=torch.uint8)
    flags[indices] = 1
    return flags

In [57]:
def rand_train_test_idx(label, train_prop=.5, valid_prop=.25, ignore_negative=True, balance=False):
    """Randomly split node indices into train / valid / test parts.

    Adapted from https://github.com/CUAI/Non-Homophily-Benchmarks

    Args:
        label: 1-D label tensor; -1 marks an unlabeled node.
        train_prop: fraction of the candidate nodes used for training.
        valid_prop: fraction of the candidate nodes used for validation.
        ignore_negative: if True, only positions whose label is not -1 are
            candidates; if False, ``label`` itself is used as the candidate pool.
        balance: if True, take the same number of training nodes from every
            class 0..label.max().

    Returns:
        A dict with keys 'train', 'valid', 'test' holding index tensors.
        Exception: with ``balance=False`` and ``ignore_negative=False`` a tuple
        ``(train, valid, test)`` of positions in the permutation is returned.
    """
    labeled_nodes = torch.where(label != -1)[0] if ignore_negative else label
    n = labeled_nodes.shape[0]

    if balance:
        # Shuffle the members of every class independently.
        num_classes = label.max() + 1
        shuffled_per_class = []
        for cls in range(num_classes):
            members = torch.where((label == cls))[0].view(-1)
            shuffled_per_class.append(members[torch.randperm(members.size(0))])

        # Same training quota for each class; the quota is computed with the
        # tensor-valued class count, exactly as in the reference code.
        percls_trn = int(train_prop / num_classes * len(labeled_nodes))
        val_lb = int(valid_prop * len(labeled_nodes))

        train_idx = torch.cat([m[:percls_trn] for m in shuffled_per_class], dim=0)
        leftover = torch.cat([m[percls_trn:] for m in shuffled_per_class], dim=0)
        # Mix the classes again before cutting validation from test.
        leftover = leftover[torch.randperm(leftover.size(0))]
        return {'train': train_idx,
                'valid': leftover[:val_lb],
                'test': leftover[val_lb:]}

    # Unbalanced split: cut one random permutation at two points.
    first_cut = int(n * train_prop)
    second_cut = first_cut + int(n * valid_prop)
    perm = torch.as_tensor(np.random.permutation(n))
    train_pos = perm[:first_cut]
    valid_pos = perm[first_cut:second_cut]
    test_pos = perm[second_cut:]

    if not ignore_negative:
        return train_pos, valid_pos, test_pos

    return {'train': labeled_nodes[train_pos],
            'valid': labeled_nodes[valid_pos],
            'test': labeled_nodes[test_pos]}

In [12]:
def rand_train_test_idx_pure(label, train_prop=.5, ignore_negative=True, balance=False):
    """Randomly split node indices into train / test parts (no validation set).

    Adapted from https://github.com/CUAI/Non-Homophily-Benchmarks

    Args:
        label: 1-D label tensor; -1 marks an unlabeled node.
        train_prop: fraction of the candidate nodes used for training.
        ignore_negative: if True, only positions whose label is not -1 are
            candidates; if False, ``label`` itself is used as the candidate pool.
        balance: if True, take the same number of training nodes from every
            class 0..label.max(); every remaining node of those classes goes
            to the test set.

    Returns:
        A dict with keys 'train' and 'test' holding index tensors.
        Exception: with ``balance=False`` and ``ignore_negative=False`` a tuple
        ``(train, test)`` of positions in the permutation is returned.
    """
    if ignore_negative:
        labeled_nodes = torch.where(label != -1)[0]
    else:
        labeled_nodes = label

    n = labeled_nodes.shape[0]

    if not balance:
        train_num = int(n * train_prop)

        perm = torch.as_tensor(np.random.permutation(n))

        train_indices = perm[:train_num]
        test_indices = perm[train_num:]

        if not ignore_negative:
            return train_indices, test_indices

        train_idx = labeled_nodes[train_indices]
        test_idx = labeled_nodes[test_indices]
    else:
        # Shuffle the members of every class independently.
        num_classes = label.max() + 1
        indices = []
        for i in range(num_classes):
            index = torch.where((label == i))[0].view(-1)
            index = index[torch.randperm(index.size(0))]
            indices.append(index)

        # Per-class training quota (kept in tensor arithmetic so the train
        # size is unchanged from the previous implementation).
        percls_trn = int(train_prop / num_classes * len(labeled_nodes))
        train_idx = torch.cat([i[:percls_trn] for i in indices], dim=0)
        rest_index = torch.cat([i[percls_trn:] for i in indices], dim=0)
        rest_index = rest_index[torch.randperm(rest_index.size(0))]
        # Bug fix: everything that is not a training node is a test node.
        # The previous `rest_index[percls_trn:]` silently discarded the first
        # percls_trn remaining nodes (a leftover of the removed validation slice).
        test_idx = rest_index

    split_idx = {'train': train_idx,
                 'test': test_idx}
    return split_idx

In [9]:
# Show which dataset directory is currently active before writing splits into it.
path

'./house-committees/'

In [10]:
# Directory that receives the pickled splits. The trailing slash is kept
# because later cells build file names by plain string concatenation.
split_path = path+'splits/'
# exist_ok=True replaces the separate isdir() check (no check-then-create gap).
os.makedirs(split_path, exist_ok=True)

In [13]:
# Draw 10 random train/test splits of data.y (default train_prop) and store
# them as <split_path>1.pickle ... <split_path>10.pickle.
for i in range(1,11):
    idx_list = rand_train_test_idx_pure(data.y)
    # The index tensors are converted to plain Python lists before pickling.
    split_idxs = {'train': idx_list['train'].tolist(),'test': idx_list['test'].tolist()}
    with open(split_path+str(i)+".pickle", "wb") as fp:   #Pickling
        pickle.dump(split_idxs, fp)

In [14]:
# Read the first split file back in.
# NOTE: pickle.load executes arbitrary code from the file; only load split
# files that this notebook (or another trusted source) produced.
split_name = split_path+'1.pickle'
Splits=None
with open(split_name, 'rb') as H: 
    Splits = pickle.load(H)

In [15]:
# Display the loaded split dict (this prints every index, one per line).
Splits

{'train': [893,
  990,
  1241,
  222,
  265,
  169,
  88,
  287,
  1119,
  503,
  1047,
  1074,
  756,
  473,
  862,
  1246,
  627,
  825,
  1060,
  846,
  642,
  486,
  383,
  330,
  472,
  443,
  39,
  896,
  49,
  730,
  316,
  991,
  936,
  341,
  532,
  134,
  935,
  427,
  163,
  423,
  1108,
  290,
  386,
  748,
  1038,
  798,
  764,
  84,
  742,
  662,
  1012,
  363,
  937,
  527,
  1098,
  784,
  1210,
  728,
  184,
  467,
  979,
  268,
  74,
  1256,
  375,
  1143,
  1112,
  704,
  982,
  669,
  820,
  1229,
  111,
  930,
  809,
  441,
  1201,
  1034,
  376,
  1117,
  311,
  21,
  626,
  1002,
  668,
  638,
  883,
  71,
  1071,
  1076,
  464,
  391,
  147,
  941,
  188,
  947,
  390,
  26,
  551,
  148,
  465,
  580,
  346,
  1141,
  540,
  415,
  884,
  1070,
  961,
  335,
  1169,
  1248,
  515,
  200,
  468,
  1155,
  1258,
  656,
  226,
  479,
  605,
  1109,
  1097,
  744,
  116,
  213,
  251,
  332,
  1046,
  130,
  240,
  1178,
  187,
  1279,
  878,
  136,
  1285,
  783,


In [62]:
# Size of the training split.
# NOTE(review): the recorded value below (140) does not match what the cells
# above would give for house-committees (1290 nodes, default train_prop=.5
# -> 645), and the execution count is out of order; the output looks stale.
# Re-run the notebook top to bottom to confirm.
len(Splits['train'])

140

In [67]:
# Keys stored in a split file.
Splits.keys()

dict_keys(['train', 'test'])