In [1]:
from __future__ import print_function, division
import argparse
import random
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.nn import Linear
from collections import Counter

from sdcn.utils import load_data, load_graph
from sdcn.GNN import GNNLayer
from sdcn.evaluation import eva

import time
import networkx as nx
from sdcn.evaluation import modularity
from sdcn.pretrain import AE
from myutils import load_ordered_graph
from mnn.decoder import BerpoDecoder

# torch.cuda.set_device(1)

In [2]:

batch_norm = True       # whether to use batch norm
max_epochs = 200        # number of epochs to train
display_step = 100       # how often to compute validation loss
stochastic_loss = True  # whether to use stochastic or full-batch training
batch_size = 20000      # batch size (only for stochastic training)


In [3]:
# cuda
cuda = torch.cuda.is_available()
device = torch.device( "cuda" if cuda else "cpu")
print("use cuda: {}".format( cuda))

'''
 'bio72' not ok, almost all in one community
'''
# Graph matrix (coo_matrix, csr_matrix, dense )
name = 'dpwk'
A, A_csr, A_dense = load_graph( name )
A = A.cuda()
# AttributeError: cuda not found
# A_csr = A_csr.cuda()
A_dense = torch.Tensor(A_dense).to(device)
# Features
dataset = load_data( name )
N, K = len(dataset.x), len(dataset.x[0])
print('features shape {}(nodes) * {}(columns) '.format(N,K))
# Loss class
decoder = BerpoDecoder(N, A_csr.nnz, balance_loss=False)
print('graph has {}(edges) '.format(A_csr.nnz))
# pretrain AutoEncoder data
pretrain_path = './pretrain/' + name + '.pkl'
# sdcn epochs
epochs = 50 # 30 # 100 # 200

use cuda: True
training graph(./mygraph/dpwk_edgelist.txt) reading...
reading dpwk.txt, please wait for about a thousand years....
features shape 25023(nodes) * 128(columns) 
graph has 4624939(edges) 


In [4]:
# deep learning model 
class SDCN(nn.Module):

    def __init__(self, n_input, n_1, n_2, n_3, n_z, n_d3, n_d2, n_d1, n_clusters, v=1):
        super(SDCN, self).__init__()

        # TODO  not separately pretrain but altogether
        
        # autoencoder for intra information    #  symmetric 
        self.ae = AE( args.n_input, args.n_1, args.n_2, args.n_3, 
                     args.n_z, args.n_3, args.n_2, args.n_1 )
        self.ae.load_state_dict(torch.load( pretrain_path, map_location='cpu'))

        # GCN for inter information
        self.gnn_1 = GNNLayer(n_input, n_1)
        self.gnn_2 = GNNLayer(n_1, n_2)
        self.gnn_3 = GNNLayer(n_2, n_3)
        self.gnn_4 = GNNLayer(n_3, n_z)
        self.gnn_5 = GNNLayer(n_z, n_clusters)

        # cluster layer
        self.cluster_layer = Parameter(torch.Tensor(n_clusters, n_z))
        torch.nn.init.xavier_normal_(self.cluster_layer.data)

        # degree
        self.v = v

    def forward(self, x, A):
        # DNN Module
        x_bar, tra1, tra2, tra3, z = self.ae(x)

        # GCN Module
        h1 = self.gnn_1(x, A)
        h2 = self.gnn_2(h1+tra1, A)
        h3 = self.gnn_3(h2+tra2, A)
        h4 = self.gnn_4(h3+tra3, A)
        h5 = self.gnn_5(h4+z, A, active=False)
        predict = F.softmax(h5, dim=1)
        # z_sdcn = F.relu(h5)
        
        '''
        # Dual Self-supervised Module
        q = 1.0 / (1.0 + torch.sum(torch.pow(z.unsqueeze(1) - self.cluster_layer, 2), 2) / self.v)
        q = q.pow((self.v + 1.0) / 2.0)
        q = (q.t() / torch.sum(q, 1)).t()
        '''
       
        return x_bar, predict, z # , z_sdcn


In [5]:
def target_distribution(q):
    weight = q**2 / q.sum(0)
    return (weight.t() / weight.sum(1)).t()


def train_sdcn(dataset, args):
    # initial dimensions 500,500,2000,10,2000,500,500
    model = SDCN(   n_input=args.n_input, n_1 = args.n_1, n_2 = args.n_2, n_3 = args.n_3, 
                    n_z=args.n_z, n_d3 = args.n_3, n_d2 = args.n_2, n_d1 = args.n_1, 
                    n_clusters=args.n_clusters, v=1.0).to(device)
    print(model)

    optimizer = Adam(model.parameters(), lr=args.lr)

    # X
    data = torch.Tensor(dataset.x).to(device)
    
    '''
    # TODO only AutoEncoder result
    
    with torch.no_grad():
        _, _, _, _, z = model.ae(data)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    y_pred_last = y_pred
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).to(device)
    # eva(y, y_pred, 'pae')
    '''
    
    # training...
    for epoch in range(epochs):
        
        x_bar, pred, z  = model(data, A)
        
        '''
        # my initial naive loss
        re_loss = F.mse_loss(x_bar, data)
        # 社区发现重构边损失
        A_rec = torch.mm( z_sdcn, z_sdcn.T ).cuda()     
        cd_loss = F.mse_loss( A_rec, A_dense )
        # 72维features重构、25023*25023图社区划分结果重构损失加权
        loss =  0.001 * cd_loss + re_loss          
        '''
        
        # predict result
        res = pred.data.cpu().numpy().argmax(1)   #Z
        # BernoulliDecoder loss
        loss = decoder.loss_full( pred, A_csr )
        print('loss of epoch {}:{} '.format(epoch,loss))
        
        
        # last modularity evaluation 
        if epoch == epochs-1:    
            # G = nx.read_adjlist( './mygraph/bio30_ps.adjlist' )    # my graph
            graph_path = ''
            if  name == 'bio72':
                graph_path = './mygraph/bio30_ps.adjlist'
            elif args.name in ['dpwk','line','lle','n2v'] :
                graph_path = './od2graphs/' +  name + '.adjlist'
            print('evaluating graph({}) reading...'.format(graph_path))
            G = load_ordered_graph( graph_path )

            pred_dic = {}
            for idx,pred_label in enumerate(res):
                pred_dic[idx] = pred_label
            pred_modul = modularity(G, pred_dic)
            print('epoch {}  modularity {:.4f}'.format(epoch,pred_modul))
        
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

In [6]:
# 矩阵25023 * 25023 内存不够，彻底失败
if __name__ == "__main__":
    print('begin time:{}'.format(time.asctime(time.localtime(time.time()))))    # time 
    parser = argparse.ArgumentParser(
        description='train',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--name', type=str, default= name )
    parser.add_argument('--k', type=int, default=3)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--pretrain_path', type=str, default='./pretrain/bio72.pkl')
    parser.add_argument('--data_path', type=str, default = './mydata/bio72.csv')    # redundant......
    
    parser.add_argument('--n_input', default=72, type=int)
    parser.add_argument('--n_1', default=500, type=int)             # dimensions 
    parser.add_argument('--n_2', default=500, type=int)             # n_in  ->  n_1  ->  n_2  ->  n_3  ->  n_z
    parser.add_argument('--n_3', default=2000, type=int)            # n_in  <-  d_1  <-  d_2  <-  d_3  <-  n_z
    parser.add_argument('--n_z', default=10, type=int)
    parser.add_argument('--n_clusters', default=8, type=int)

    args = parser.parse_known_args()[0]


    if args.name == 'bio72':
        args.n_input = 72
        args.pretrain_path = './pretrain/bio72.pkl'
        args.data_path = './mydata/bio72.csv'
    
    if args.name in ['dpwk','line','lle','n2v'] :
        args.lr = 1e-3
        args.n_input = 128
        
    if args.name == 'biomat':      # totally failed
        args.n_input = 25023
        args.pretrain_path = './pretrain/biomat.pkl'
        args.data_path = './mydata/biomat.txt'
    
    
    print(args)
    train_sdcn(dataset, args)
    print('end time:{}'.format(time.asctime(time.localtime(time.time()))))    # time 


begin time:Sun Apr 18 16:01:18 2021
Namespace(data_path='./mydata/bio72.csv', k=3, lr=0.001, n_1=500, n_2=500, n_3=2000, n_clusters=8, n_input=128, n_z=10, name='dpwk', pretrain_path='./pretrain/bio72.pkl')
SDCN(
  (ae): AE(
    (enc_1): Linear(in_features=128, out_features=500, bias=True)
    (enc_2): Linear(in_features=500, out_features=500, bias=True)
    (enc_3): Linear(in_features=500, out_features=2000, bias=True)
    (z_layer): Linear(in_features=2000, out_features=10, bias=True)
    (dec_3): Linear(in_features=10, out_features=2000, bias=True)
    (dec_2): Linear(in_features=2000, out_features=500, bias=True)
    (dec_1): Linear(in_features=500, out_features=500, bias=True)
    (x_bar_layer): Linear(in_features=500, out_features=128, bias=True)
  )
  (gnn_1): GNNLayer()
  (gnn_2): GNNLayer()
  (gnn_3): GNNLayer()
  (gnn_4): GNNLayer()
  (gnn_5): GNNLayer()
)
loss of epoch 0:0.14422288537025452 
loss of epoch 1:0.19342853128910065 
loss of epoch 2:0.14399345219135284 
loss of ep