In [1]:
import numpy as np
from sklearn.metrics import normalized_mutual_info_score as NMI, adjusted_mutual_info_score as AMI, adjusted_rand_score as ARI
import scipy.sparse as sp

In [2]:
# File names (relative to ``data_root``) of the datasets stored as a single
# ``.npz`` archive holding CSR components.
_NPZ_DATASET_FILES = {
    "amazon-photo": "amazon_electronics_photo.npz",
    "amazon-computers": "amazon_electronics_computers.npz",
    "cora-full": "cora_full.npz",
}


def _load_planetoid(dataset, data_root):
    """Load a pickled citation dataset ("cora", "citeseer" or "pubmed")."""
    import os
    import pickle as pkl
    import networkx as nx

    base = os.path.join(data_root, dataset)

    objects = []
    for name in ('x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'):
        with open(os.path.join(base, "ind.{}.{}".format(dataset, name)), 'rb') as rf:
            # The files were pickled under Python 2; 'latin1' lets Python 3
            # decode the embedded numpy arrays.
            # NOTE(review): unpickling can execute arbitrary code -- only
            # point data_root at trusted files.
            objects.append(pkl.load(rf, encoding='latin1'))
    x, y, tx, ty, allx, ally, graph = objects

    # The test index used to be resolved relative to the working directory
    # while every other file used the absolute data directory; it is now read
    # from the same dataset folder as the pickles.
    with open(os.path.join(base, "ind.{}.test.index".format(dataset))) as f:
        test_idx_reorder = [int(line.strip()) for line in f]
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph):
        # find isolated nodes, add them as zero-vecs into the right position.
        first_test_idx = min(test_idx_reorder)
        n_test_full = max(test_idx_reorder) + 1 - first_test_idx
        tx_extended = sp.lil_matrix((n_test_full, x.shape[1]))
        tx_extended[test_idx_range - first_test_idx, :] = tx
        tx = tx_extended
        ty_extended = np.zeros((n_test_full, y.shape[1]))
        ty_extended[test_idx_range - first_test_idx, :] = ty
        ty = ty_extended

    # Stack train and test rows, then copy the test rows (stored in sorted
    # order) to the positions listed in the index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # float32 matches the precision of the torch.FloatTensor round-trip this
    # replaces, without requiring torch.
    features = sp.coo_matrix(features.toarray().astype(np.float32))

    # Keep the adjacency sparse end-to-end instead of materialising a dense
    # N x N array first.
    adj = sp.coo_matrix(nx.adjacency_matrix(nx.from_dict_of_lists(graph)))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    labels = labels.argmax(1)  # label matrix rows -> class index

    return adj, features, labels


def _load_wiki(data_root):
    """Load the wiki dataset from its graph/group/tfidf text files."""
    import os
    from sklearn import preprocessing

    base = os.path.join(data_root, "wiki")

    # graph.txt: the first two columns of each line are the edge endpoints.
    edges = []
    with open(os.path.join(base, "graph.txt"), 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:  # tolerate blank lines
                continue
            edges.append([int(tokens[0]), int(tokens[1])])

    # group.txt: the second column of each line is the raw group id.
    label = []
    with open(os.path.join(base, "group.txt"), 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            label.append(int(tokens[1]))

    # Symmetrise the edge list and drop duplicate pairs.
    edges = np.array(edges)
    edges = np.vstack((edges, edges[:, [1, 0]]))
    edges = np.unique(edges, axis=0)

    # Re-index the raw group ids to 0..K-1.
    labelset = np.unique(label)
    labeldict = dict(zip(labelset, range(len(labelset))))
    label = np.array([labeldict[x] for x in label])
    n = len(label)
    adj = sp.coo_matrix((np.ones(len(edges)), (edges[:, 0], edges[:, 1])), shape=(n, n))

    # tfidf.txt: "<row> <col> <value>" triples.
    fea_idx, fea = [], []
    with open(os.path.join(base, "tfidf.txt"), 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            fea_idx.append([int(tokens[0]), int(tokens[1])])
            fea.append(float(tokens[2]))

    fea_idx = np.array(fea_idx)
    # The feature dimension (4973) is hard-coded for this dataset.
    features = sp.coo_matrix((fea, (fea_idx[:, 0], fea_idx[:, 1])), shape=(n, 4973)).toarray()
    # Rescale every feature column to [0, 1].
    features = preprocessing.MinMaxScaler().fit_transform(features)
    features = sp.coo_matrix(features)

    return adj, features, label


def _load_ogb(dataset):
    """Load "ogbn-arxiv" / "ogbn-products" through DglNodePropPredDataset."""
    # NOTE(review): DglNodePropPredDataset is not imported anywhere in the
    # visible notebook, so this branch raises NameError unless the name is
    # brought into scope elsewhere (presumably from ogb.nodeproppred -- confirm).
    ogb_data = DglNodePropPredDataset(name="{}".format(dataset))
    g, labels = ogb_data[0]
    edge_indices = g.adj_sparse(fmt="coo")
    n, m = labels.shape[0], edge_indices[0].shape[0]
    adj = sp.coo_matrix((np.ones(m), (edge_indices[0].numpy(), edge_indices[1].numpy())), shape=(n, n))
    features = sp.coo_matrix(g.ndata["feat"])

    # Collapse an (n, 1) column or a multi-column label matrix to a 1-D vector.
    if labels.ndim > 1:
        if labels.shape[1] == 1:
            labels = labels.view(-1)
        else:
            labels = labels.argmax(1)
    labels = labels.numpy()
    return adj, features, labels


def _load_npz(dataset, data_root):
    """Load a dataset stored as CSR components inside one ``.npz`` archive."""
    import os

    path = os.path.join(data_root, _NPZ_DATASET_FILES[dataset])
    # The context manager closes the archive once the arrays have been read.
    with np.load(path) as data:
        adj = sp.csr_matrix(
            (data["adj_data"], data["adj_indices"], data["adj_indptr"]),
            shape=tuple(int(d) for d in data["adj_shape"]),
        ).tocoo()
        features = sp.csr_matrix(
            (data["attr_data"], data["attr_indices"], data["attr_indptr"]),
            shape=tuple(int(d) for d in data["attr_shape"]),
        ).tocoo()
        labels = data["labels"]

    # Collapse an (n, 1) column or a multi-column label matrix to a 1-D vector.
    if labels.ndim > 1:
        if labels.shape[1] == 1:
            labels = labels.reshape(-1)
        else:
            labels = labels.argmax(1)

    return adj, features, labels


def load_assortative(dataset="cora", data_root="/data/liuyue/New/SBM/mySBM/data"):
    """Load a graph dataset as ``(adj, features, labels)``.

    Parameters
    ----------
    dataset : str
        One of "cora", "citeseer", "pubmed", "wiki", "ogbn-arxiv",
        "ogbn-products", "amazon-photo", "amazon-computers", "cora-full".
    data_root : str
        Directory that holds the dataset files. The default is the location
        that was previously hard-coded. It is not used by the ogbn datasets.

    Returns
    -------
    adj : scipy.sparse.coo_matrix
        Adjacency matrix.
    features : scipy.sparse.coo_matrix
        Node feature matrix.
    labels : numpy.ndarray
        One class index per node.

    Raises
    ------
    NotImplementedError
        If ``dataset`` is not one of the supported names.

    Notes
    -----
    Heavy optional dependencies (networkx, sklearn) are imported only by the
    branch that needs them. The train/val/test masks the citation branch used
    to build were never returned and relied on ``np.bool``, which current
    NumPy no longer provides; they have been dropped.
    """
    if dataset in ["cora", "citeseer", "pubmed"]:
        return _load_planetoid(dataset, data_root)
    if dataset == "wiki":
        return _load_wiki(data_root)
    if dataset in ["ogbn-arxiv", "ogbn-products"]:
        return _load_ogb(dataset)
    if dataset in _NPZ_DATASET_FILES:
        return _load_npz(dataset, data_root)
    raise NotImplementedError("unsupported dataset: {!r}".format(dataset))
def load_cora_full_diff_cls(nclass=10, seed=None, data_dir="/data/liuyue/New/SBM/mySBM/data_diff_cls"):
    """Load a stored cora-full subset and check it against the full graph.

    Parameters
    ----------
    nclass : int
        Class-count component of the archive file name.
    seed : int or None
        Seed component of the archive file name.
    data_dir : str
        Directory holding the ``cora-full_{nclass}_{seed}.npz`` archives. The
        default is the location that was previously hard-coded.

    Returns
    -------
    adj_load : scipy.sparse.coo_matrix
        Adjacency of the subset, built from the stored data/row/col arrays.
    features_load : numpy.ndarray
        Stored feature array of the subset.
    labels_load : numpy.ndarray
        Stored labels of the subset.
    mask : numpy.ndarray
        Stored node selector, applied here to the rows/columns of the full
        cora-full graph.

    Raises
    ------
    AssertionError
        If the stored adjacency or features differ from the full graph
        restricted to ``mask``.
    """
    import os

    filename = os.path.join(data_dir, "cora-full_{}_{}.npz".format(nclass, seed))
    # The context manager closes the archive once the arrays have been read.
    with np.load(filename) as data:
        adj_data, adj_row, adj_col = data["data"], data["row"], data["col"]
        features_load, labels_load, mask = data["features"], data["labels"], data["mask"]

    adj_raw, features_raw, _ = load_assortative("cora-full")

    n = labels_load.shape[0]
    adj_load = sp.coo_matrix((adj_data, (adj_row, adj_col)), shape=(n, n))

    # Restrict the full graph to the selected nodes while it is still sparse,
    # so the full N x N adjacency is never materialised.
    adj_mask = adj_raw.tocsr()[mask, :][:, mask]
    # Compare absolute differences: a signed sum lets positive and negative
    # mismatches cancel out and hide a corrupted file.
    adj_diff = abs(adj_mask - adj_load.tocsr())
    assert adj_diff.nnz == 0 or adj_diff.max() < 1e-7, \
        "stored adjacency does not match the masked cora-full adjacency"

    features_mask = features_raw.tocsr()[mask].toarray()
    assert np.allclose(features_mask, features_load, atol=1e-7), \
        "stored features do not match the masked cora-full features"

    return adj_load, features_load, labels_load, mask

In [3]:
# Experiment grid: the evaluated model, the class counts of the cora-full
# subsets, and the seeds each configuration was run with.
model = "MVGRL"
nclasses = np.array(range(5, 30, 5), dtype=int)
seeds = np.array(range(0, 3, 1), dtype=int)

In [8]:
# Seed-averaged NMI / AMI / ARI for every class count in `nclasses`,
# using the `model`, `nclasses` and `seeds` set in an earlier cell.
nmi_m, ami_m, ari_m = dict(), dict(), dict()
for nclass in nclasses:
    nmis, amis, aris = list(), list(), list()
    for seed in seeds:
        print(nclass, seed)

        # Ground-truth labels of this subset.
        adj, features, labels, mask = load_cora_full_diff_cls(nclass, seed)

        # Cluster assignments saved for this (nclass, seed) run.
        preds_path = "Cluster/{}/lo_cora-full_preds_{:d}_{:d}.npz".format(model, nclass, seed)
        data = np.load(preds_path)
        preds = data["preds"]

        for per_seed, metric in ((nmis, NMI), (amis, AMI), (aris, ARI)):
            per_seed.append(metric(labels, preds))

    for means, per_seed in ((nmi_m, nmis), (ami_m, amis), (ari_m, aris)):
        means[nclass] = np.mean(per_seed)

print(ari_m.values())
        

5 0
5 1
5 2
10 0
10 1
10 2
15 0
15 1
15 2
20 0
20 1
20 2
25 0
25 1
25 2
dict_values([0.684763233893273, 0.5615929873994042, 0.42745470833124966, 0.28558800050804667, 0.2622590953339026])


In [9]:
# Same sweep as the ARI report, this time printing the seed-averaged AMI
# for every class count in `nclasses`.
nmi_m, ami_m, ari_m = dict(), dict(), dict()
for nclass in nclasses:
    nmis, amis, aris = list(), list(), list()
    for seed in seeds:
        print(nclass, seed)

        # Ground-truth labels of this subset.
        adj, features, labels, mask = load_cora_full_diff_cls(nclass, seed)

        # Cluster assignments saved for this (nclass, seed) run.
        preds_path = "Cluster/{}/lo_cora-full_preds_{:d}_{:d}.npz".format(model, nclass, seed)
        data = np.load(preds_path)
        preds = data["preds"]

        for per_seed, metric in ((nmis, NMI), (amis, AMI), (aris, ARI)):
            per_seed.append(metric(labels, preds))

    for means, per_seed in ((nmi_m, nmis), (ami_m, amis), (ari_m, aris)):
        means[nclass] = np.mean(per_seed)

print(ami_m.values())
        

5 0
5 1
5 2
10 0
10 1
10 2
15 0
15 1
15 2
20 0
20 1
20 2
25 0
25 1
25 2
dict_values([0.6895115180624249, 0.6498257368682231, 0.5984047566740517, 0.502911088009132, 0.5070790600237952])


In [10]:
# Seed-averaged scores for the prediction files carrying the "_20m" suffix,
# over class counts 10, 15, 20, 25.
model = "MVGRL"
nclasses = np.array(range(10, 30, 5), dtype=int)
seeds = np.array(range(0, 3, 1), dtype=int)

nmi_m, ami_m, ari_m = dict(), dict(), dict()
for nclass in nclasses:
    nmis, amis, aris = list(), list(), list()
    for seed in seeds:
        print(nclass, seed)

        # Ground-truth labels of this subset.
        adj, features, labels, mask = load_cora_full_diff_cls(nclass, seed)

        # Cluster assignments saved for this (nclass, seed) run.
        preds_path = "Cluster/{}/lo_cora-full_preds_{:d}_{:d}_20m.npz".format(model, nclass, seed)
        data = np.load(preds_path)
        preds = data["preds"]

        for per_seed, metric in ((nmis, NMI), (amis, AMI), (aris, ARI)):
            per_seed.append(metric(labels, preds))

    for means, per_seed in ((nmi_m, nmis), (ami_m, amis), (ari_m, aris)):
        means[nclass] = np.mean(per_seed)

print(ari_m.values())
        

10 0
10 1
10 2
15 0
15 1
15 2
20 0
20 1
20 2
25 0
25 1
25 2
dict_values([0.520174650272346, 0.3951582340722349, 0.2963118914818221, 0.24482040317321627])


In [11]:
# Seed-averaged ARI over the small class counts 2, 4, 6, 8, 10.
model = "MVGRL"
nclasses = np.array(range(2, 12, 2), dtype=int)
seeds = np.array(range(0, 3, 1), dtype=int)

nmi_m, ami_m, ari_m = dict(), dict(), dict()
for nclass in nclasses:
    nmis, amis, aris = list(), list(), list()
    for seed in seeds:
        print(nclass, seed)

        # Ground-truth labels of this subset.
        adj, features, labels, mask = load_cora_full_diff_cls(nclass, seed)

        # Cluster assignments saved for this (nclass, seed) run.
        preds_path = "Cluster/{}/lo_cora-full_preds_{:d}_{:d}.npz".format(model, nclass, seed)
        data = np.load(preds_path)
        preds = data["preds"]

        for per_seed, metric in ((nmis, NMI), (amis, AMI), (aris, ARI)):
            per_seed.append(metric(labels, preds))

    for means, per_seed in ((nmi_m, nmis), (ami_m, amis), (ari_m, aris)):
        means[nclass] = np.mean(per_seed)

print(ari_m.values())
        

2 0
2 1
2 2
4 0
4 1
4 2
6 0
6 1
6 2
8 0
8 1
8 2
10 0
10 1
10 2
dict_values([0.7686434049993044, 0.7547663270464376, 0.6755129409229564, 0.5658450546388248, 0.5399687429222942])


In [12]:
# Seed-averaged AMI over the small class counts 2, 4, 6, 8, 10.
model = "MVGRL"
nclasses = np.array(range(2, 12, 2), dtype=int)
seeds = np.array(range(0, 3, 1), dtype=int)

nmi_m, ami_m, ari_m = dict(), dict(), dict()
for nclass in nclasses:
    nmis, amis, aris = list(), list(), list()
    for seed in seeds:
        print(nclass, seed)

        # Ground-truth labels of this subset.
        adj, features, labels, mask = load_cora_full_diff_cls(nclass, seed)

        # Cluster assignments saved for this (nclass, seed) run.
        preds_path = "Cluster/{}/lo_cora-full_preds_{:d}_{:d}.npz".format(model, nclass, seed)
        data = np.load(preds_path)
        preds = data["preds"]

        for per_seed, metric in ((nmis, NMI), (amis, AMI), (aris, ARI)):
            per_seed.append(metric(labels, preds))

    for means, per_seed in ((nmi_m, nmis), (ami_m, amis), (ari_m, aris)):
        means[nclass] = np.mean(per_seed)

print(ami_m.values())
        

2 0
2 1
2 2
4 0
4 1
4 2
6 0
6 1
6 2
8 0
8 1
8 2
10 0
10 1
10 2
dict_values([0.6965557337232374, 0.7271197258025127, 0.7133557899835897, 0.6400073151098621, 0.6480345678867704])
