**Lemane**

This paper builds on STRAP. Its main contribution is the way it learns the alpha values. The paper first uses unsupervised learning to learn the alphas, which serve as the coefficients of the power iteration used to compute PPR. For small graphs, the authors train on the whole graph, whereas for big graphs they use BFS to sample a subgraph on which the alphas are learned. After that, they run PPR on the entire graph with these alphas to generate an embedding for each node for the downstream tasks.

Although the source code contains some mistakes, the authors still beat the SOTA.

In [None]:
!git clone https://github.com/Josiah96Zhang/Lemane.git

Cloning into 'Lemane'...
remote: Enumerating objects: 309, done.[K
remote: Counting objects: 100% (309/309), done.[K
remote: Compressing objects: 100% (211/211), done.[K
remote: Total 309 (delta 145), reused 240 (delta 78), pack-reused 0[K
Receiving objects: 100% (309/309), 22.21 MiB | 18.85 MiB/s, done.
Resolving deltas: 100% (145/145), done.


In [None]:
import torch.optim as optim
import numpy as np
import random
import torch
import time
import os
import torch.nn.functional as F
import scipy.sparse as sp
import networkx as nx
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn as nn
from torch.nn.parameter import Parameter
import math

In [None]:
# Turn on autograd anomaly detection for every backward pass run in this notebook.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fa245a12050>

In [None]:
# Run configuration read as module-level globals by the cells below.
nepoch = 10                   # number of iterations of the training loop
lr = 0.01                     # learning rate handed to the SGD optimizer
wdecay1 = 0.01                # weight decay applied to the model's `params1`
nhop = 15                     # passed as `niter` to the model, which holds nhop + 1 coefficients
sample = 10                   # negative-sample graph density is sample / (number of sampled nodes)
sample_sup_label_set = False  # only referenced by a disabled snippet in the setup cell
patience = 10                 # stop after this many epochs without a new minimum loss
data = 'wiki'                 # dataset name; the setup cell accepts BlogCatalog, wiki, tweibo, orkut
dist = 'p'                    # coefficient initialisation: 'p' (Poisson-derived) or 'g' (constant)
param = 5                     # argument of the chosen initialisation
beta = 1.0                    # weight of class_loss_fn1 in the training loss
gamma = 1.0                   # weight of class_loss_fn2 in the training loss
seed = 1629459462             # RNG seed; set_seed replaces 0 with the current time
task = 'cl'                   # 'cl' or 'lp'; selects file paths and return shapes below
sample_size = 2000            # nodes sampled per epoch; also the size of the identity matrix

In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

  Args:
    seed: Integer seed. ``0`` is replaced by the current time in seconds.

  Returns:
    The seed that was actually applied.
  """
  if seed == 0:
    seed = int(time.time())
  random.seed(seed)
  np.random.seed(seed)
  # The original also evaluated `np.random.RandomState(seed)` and threw the
  # object away, which seeds nothing; that dead statement is dropped.
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  torch.backends.cudnn.enabled = False
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  return seed

In [None]:
def load_info(dataset):
  """Read the graph summary stored in ``Lemane/data/<dataset>/attr.txt``.

  The first two lines are parsed as ``<key>=<int>`` and the third line is
  compared with the literal ``directed``.

  Args:
    dataset: Dataset name used to build the file path.

  Returns:
    Tuple ``(n, m, directed)``: the two integers from the file and a bool.

  Raises:
    FileNotFoundError: If the attribute file is missing. This is a subclass
      of ``Exception``, which the original raised.
  """
  attr_path = "Lemane/data/" + dataset + "/attr.txt"
  if not os.path.exists(attr_path):
    raise FileNotFoundError("graph attr file does not exist!")
  # The context manager closes the file; no explicit close() is needed.
  with open(attr_path) as fin:
    n = int(fin.readline().split("=")[1])
    m = int(fin.readline().split("=")[1])
    directed = fin.readline().strip() == "directed"
  print("graph name: {}".format(dataset))
  return n, m, directed

In [None]:
def sup_label_set(dataset,n):
  """Draw a 5% supervised node set and write it to ``Lemane/label/<dataset>_sup.txt``.

  For "tweibo" and "orkut" the candidates are the node ids found in the
  first column of ``Lemane/label/<dataset>.txt``; for every other dataset
  the candidates are ``range(n)``. The chosen ids are written sorted, one
  per line.

  Args:
    dataset: Dataset name used to build both file paths.
    n: Size of the candidate range for datasets other than tweibo/orkut.

  Raises:
    FileNotFoundError: If the label file needed for tweibo/orkut is missing
      (a subclass of ``Exception``, which the original raised).
  """
  label_path = "Lemane/label/" + dataset + ".txt"
  sup_label_path = "Lemane/label/" + dataset + "_sup.txt"
  if dataset in ["tweibo", "orkut"]:
    if not os.path.exists(label_path):
      raise FileNotFoundError("node label file does not exist!")
    with open(label_path) as f:
      labeled_list = [int(line.split()[0]) for line in f]
    # Sampling the ids directly picks the same positions as sampling indices
    # into the list and looking them up, for the same RNG state.
    sup_list = random.sample(labeled_list, int(0.05 * len(labeled_list)))
  else:
    sup_list = random.sample(range(n), int(0.05 * n))
  sup_list.sort()
  # `with` guarantees the output file is closed even if a write fails.
  with open(sup_label_path, "w") as fout:
    fout.writelines(str(i) + "\n" for i in sup_list)

In [None]:
def show_info(G):
  """Print node count, edge count, average degree and directedness of G.

  Returns:
    The value of ``G.number_of_edges()``.
  """
  node_count = G.number_of_nodes()
  edge_count = G.number_of_edges()
  avg_degree = edge_count * 2. / node_count
  summary = 'Num of nodes: %d, num of edges: %d, Avg degree: %f, Directed:%s' % (
    node_count, edge_count, avg_degree, str(nx.is_directed(G)))
  print(summary)
  return edge_count

In [None]:
def load_edge(dataset, n, directed, task):
  """Build a NetworkX graph and its adjacency matrix from an edge-list file.

  The file is ``Lemane/lp_data/train_graph/<dataset>.txt`` when ``task`` is
  "lp" and ``Lemane/data/<dataset>.txt`` otherwise. Its first line is
  skipped; each remaining line supplies two integer node ids. Every id in
  ``range(n)`` that no edge mentioned is added with a self-loop.

  Args:
    dataset: Dataset name used to build the path.
    n: Number of node ids that must be present in the graph.
    directed: Build a ``DiGraph`` when true, a ``Graph`` otherwise.
    task: "lp" selects the link-prediction training graph.

  Returns:
    Tuple ``(G, adj, m)``: the graph, its adjacency matrix with nodes in
    sorted order, and the edge count reported by ``show_info``.

  Raises:
    Exception: If the edge-list file does not exist.
  """
  prefix = "Lemane/lp_data/train_graph/" if task == "lp" else "Lemane/data/"
  edgelist_path = prefix + dataset + ".txt"
  if not (edgelist_path and os.path.exists(edgelist_path)):
    raise Exception("edgelist file does not exist!")

  load_start = time.time()
  with open(edgelist_path, 'r') as f:
    G = nx.DiGraph() if directed else nx.Graph()
    f.readline()  # the first line is not an edge
    for line in f:
      fields = line.strip().split()
      src, dst = int(fields[0]), int(fields[1])
      G.add_edge(src, dst)
  # Ids that never appeared in an edge get a self-loop so they exist in G.
  for node in range(n):
    if node not in G.nodes():
      G.add_edge(node, node)
  load_end = time.time()
  print('%fs taken for loading graph' % (load_end - load_start))

  m = show_info(G)
  adj = nx.adjacency_matrix(G, sorted(G.nodes()))
  print('%fs taken for generating adj matrix' % (time.time() - load_end))
  return G, adj, m

In [None]:
def get_sample_neg_laplacian(sample, num_sample):
  """Build the Laplacian of a fixed-seed random sparse graph.

  A ``num_sample`` x ``num_sample`` sparse matrix with density
  ``sample / num_sample`` and all stored values equal to one is drawn with
  a hard-coded random state. Its Laplacian is the diagonal matrix of row
  sums minus the matrix itself.

  Args:
    sample: Numerator of the density.
    num_sample: Side length of the matrix and denominator of the density.

  Returns:
    Tuple ``(laplacian, adj_neg)``: a float64 sparse COO torch tensor and
    the SciPy sparse matrix it was derived from.
  """
  adj_neg = sp.random(num_sample, num_sample, density=sample/num_sample, data_rvs=np.ones, random_state=1628837069)
  row_degrees = np.array(adj_neg.sum(1)).flatten()
  laplacian = (sp.diags(row_degrees) - adj_neg).tocoo().astype(np.float64)

  coords = np.vstack((laplacian.row, laplacian.col)).astype(np.int64)
  lap_tensor = torch.sparse.DoubleTensor(
    torch.from_numpy(coords),
    torch.from_numpy(laplacian.data),
    torch.Size(laplacian.shape))
  return lap_tensor, adj_neg


In [None]:
def get_trans_prob_mat(adj, sample_list, sample_size, task):
  """Row-normalise the adjacency matrix restricted to the sampled nodes.

  When ``sample_size`` is 0 the full ``adj`` is used; otherwise rows and
  columns are restricted to ``sample_list``. Each row is divided by its sum,
  with sums below 1 replaced by 1 so empty rows stay all-zero.

  Args:
    adj: SciPy sparse adjacency matrix.
    sample_list: Node indices to keep; its length must equal ``sample_size``.
    sample_size: Number of sampled nodes (0 means "use the whole matrix").
    task: 'cl' or 'lp'.

  Returns:
    For 'cl', the float64 sparse COO torch tensor of transition
    probabilities. For 'lp', a tuple of that tensor, the sampled adjacency
    matrix and a DoubleTensor of unclamped row sums. ``None`` for any other
    ``task``.
  """
  assert len(sample_list) == sample_size
  if sample_size != 0:
    picked_rows = adj[sample_list]
    picked = sp.csc_matrix(picked_rows)[:,sample_list]
    sample_adj = sp.coo_matrix(picked)
  else:
    sample_adj = adj

  degree = np.array(sample_adj.sum(1)).flatten()
  # Clamp the divisor so rows without entries do not divide by zero.
  row_sum = np.array(sample_adj.sum(1))
  row_sum[row_sum < 1] = 1

  prob = sp.coo_matrix(sample_adj / row_sum).astype(np.float64)
  coords = torch.from_numpy(np.vstack((prob.row, prob.col)).astype(np.int64))
  weights = torch.from_numpy(prob.data)
  size = torch.Size(prob.shape)

  if task == 'lp':
    return torch.sparse.DoubleTensor(coords, weights, size), sample_adj, torch.DoubleTensor(degree)
  if task == 'cl':
    return torch.sparse.DoubleTensor(coords, weights, size)

In [None]:
def load_label(dataset, n):
  """Load the labels of the supervised nodes of a dataset.

  The supervised node ids come from ``Lemane/label/<dataset>_sup.txt``; that
  file is generated with ``sup_label_set`` first if it does not exist.
  Labels are then read from ``Lemane/label/<dataset>.txt``, where each line
  is a node id followed by its integer labels. For "tweibo" and "orkut"
  every label is shifted by +1.

  Args:
    dataset: Dataset name used to build both file paths.
    n: Length of the returned list; node ids index into it.

  Returns:
    A list of ``n`` lists. Entry ``i`` holds the integer labels of node
    ``i`` if it is supervised and listed in the label file, else ``[]``.

  Raises:
    FileNotFoundError: If the label file is missing (a subclass of
      ``Exception``, which the original raised).
  """
  sup_label_path = "Lemane/label/" + dataset + "_sup.txt"
  if not os.path.exists(sup_label_path):
    sup_label_set(dataset, n)
  # A set makes the per-line membership test O(1); the original scanned a list.
  with open(sup_label_path) as f:
    sup_nodes = {int(line.split()[0]) for line in f}

  label_path = "Lemane/label/" + dataset + ".txt"
  if not os.path.exists(label_path):
    raise FileNotFoundError("node label file does not exist!")
  # The two original branches differed only in this offset.
  offset = 1 if dataset in ["tweibo", "orkut"] else 0
  Node_Label = [[] for _ in range(n)]
  with open(label_path) as f:
    for line in f:
      vec = line.strip().split()
      i = int(vec[0])
      if i in sup_nodes:
        Node_Label[i] = [int(label) + offset for label in vec[1:]]
  return Node_Label

In [None]:
def get_sample_label_list(Node_label, sample_list,  num_label):
  """Group sampled nodes by label and build their multi-hot label matrix.

  Args:
    Node_label: Per-node lists of integer labels, indexed by node id.
    sample_list: Node ids to keep, in order; an empty list keeps all nodes.
    num_label: Largest label value; the binarizer uses classes 1..num_label.

  Returns:
    Tuple ``(label_list, node_label)``. ``label_list`` has one non-empty
    list per label that occurs, holding positions within the sample (not
    node ids). ``node_label`` is a tensor built from the
    ``MultiLabelBinarizer`` output for the sampled label lists.
  """
  if len(sample_list) == 0:
    node_label = Node_label
  else:
    node_label = [Node_label[node] for node in sample_list]

  # One bucket per possible label value 0..num_label.
  buckets = [[] for _ in range(num_label + 1)]
  for position, labels in enumerate(node_label):
    for label in labels:
      buckets[label].append(position)
  label_list = [members for members in buckets if len(members) > 0]

  binarizer = MultiLabelBinarizer(sparse_output=False, classes=list(range(1,num_label+1)))
  node_label = torch.tensor(binarizer.fit_transform(node_label))
  return label_list, node_label

In [None]:
def get_label_laplacian(L):
  """Build one sparse matrix per label group.

  For a group of size ``n`` the matrix is ``n * I - 1`` (``n - 1`` on the
  diagonal, ``-1`` elsewhere), stored as a float64 sparse COO torch tensor.

  Args:
    L: List of groups; only the length of each group is used.

  Returns:
    List of sparse tensors, one per group, in the same order as ``L``.
  """
  def _group_laplacian(size):
    dense = np.eye(size) * size - 1
    coo = sp.coo_matrix(dense).tocoo().astype(np.float64)
    coords = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    weights = torch.from_numpy(coo.data)
    return torch.sparse.DoubleTensor(coords, weights, torch.Size(coo.shape))

  return [_group_laplacian(len(group)) for group in L]

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def simple_randomized_torch_svd(B, k, task):
  """Randomised rank-``k`` SVD of ``B``, returned as scaled factor matrices.

  ``B`` is projected onto ``k`` random directions, the projection is
  orthonormalised with a reduced QR, and the SVD of the small matrix
  ``Q^T B`` is lifted back with ``Q``. Both singular-vector matrices are
  scaled by the square roots of the singular values.

  Args:
    B: 2-D float64 tensor located on the module-level ``device``.
    k: Number of random directions, i.e. the target rank.
    task: 'lp' or 'cl'.

  Returns:
    For 'lp', the tuple ``(U * sqrt(s), V * sqrt(s))``. For 'cl', the two
    factors concatenated along dim 1, which needs ``B`` to have as many
    rows as columns. ``None`` for any other ``task``.
  """
  _, n = B.size()
  rand_matrix = torch.rand((n,k), dtype=torch.float64).to(device)
  # torch.qr is deprecated; torch.linalg.qr defaults to the same reduced mode.
  Q, _ = torch.linalg.qr(B @ rand_matrix)
  # The original's bare `Q.to(device)` / `U_hat.to(device)` discarded their
  # results; everything here already lives on the device of B.
  smaller_matrix = Q.transpose(0, 1) @ B
  U_hat, s, V = torch.svd(smaller_matrix, True)                   # matrix decomposition
  U = Q @ U_hat

  sqrt_s = s.pow(0.5).diag()
  if task == 'lp':
    return U @ sqrt_s, V @ sqrt_s                                 # for link prediction
  if task == 'cl':
    return torch.cat((U @ sqrt_s, V @ sqrt_s), 1)                 # for node classification

In [None]:
class ComputeProximity4SVD(nn.Module):
  """Learnable-coefficient proximity matrix followed by a randomised SVD.

  The module owns ``niter + 1`` float64 coefficients (``params1``) and one
  linear layer mapping 256 features to ``nclass`` outputs.

  Args:
    ngraph: Stored on the instance; not read by this class.
    niter: Number of propagation steps in ``forward``.
    dist: 'p' for the Poisson-derived initialisation, 'g' for a constant one.
    param: Poisson rate for 'p', or the constant value for 'g'.
    nclass: Output width of the linear layer.

  Raises:
    ValueError: If ``dist`` is neither 'p' nor 'g'.
  """

  def __init__(self, ngraph, niter, dist, param, nclass=1):
    super(ComputeProximity4SVD, self).__init__()
    self.ngraph = ngraph
    self.niter = niter
    self.nclass = nclass
    # Zero-filled placeholder; init_params overwrites every entry.
    self.params1 = Parameter(torch.zeros(self.niter + 1, dtype=torch.float64))
    self.init_params(dist, param)
    self.fcs = nn.ModuleList()
    # 256 matches the two 128-column factors concatenated in forward().
    self.fcs.append(nn.Linear(256, nclass).double())
    torch.nn.init.xavier_uniform_(self.fcs[0].weight)

  # compute the teleport probabilities of the random walks generated from Poisson distribution
  def poisson_dist(self, t):
    """Return ``stay[i] = P(X = i) / (1 - P(X < i))`` for ``i = 0..niter``.

    ``X`` is Poisson with rate ``t``. The tables hold 101 terms, so
    ``niter`` must not exceed 100.
    """
    K = 100
    poisson = [0]*(K+1)
    poissonsum = [0]*(K+2)
    stay = torch.zeros(self.niter+1, dtype=torch.float64)
    poisson[0] = 1.0/math.exp(t)
    poissonsum[0] = 0.
    poissonsum[1] =  poissonsum[0] + poisson[0]
    for i in range(1, K+1):
      poisson[i] = poisson[i-1] * t * 1.0 / i
      poissonsum[i+1] = poissonsum[i] + poisson[i]

    for i in range(self.niter+1):
      stay[i] = poisson[i]/(1.0 - poissonsum[i])
    return stay

  # teleport probabilities initialization
  def init_params(self, dist, param):
    """Fill ``params1`` according to ``dist``; see the class docstring."""
    if dist == 'p':
      self.params1.data = self.poisson_dist(param)
    elif dist == 'g':
      # Keep float64: a default-dtype tensor would silently change the
      # parameter's dtype away from the one it was declared with.
      self.params1.data = torch.ones(self.niter + 1, dtype=torch.float64) * param
    else:
      raise ValueError("unknown dist {!r}; expected 'p' or 'g'".format(dist))

  # forward propagation process
  def forward(self, prob, identity, threshold, task):
    """Accumulate the weighted proximity matrix and factorise its log.

    Starting from ``identity``, each step left-multiplies by ``prob`` and
    scales by ``1 - params1[i]``; the running sum adds that result scaled
    by ``params1[i + 1]``. The sum is divided by ``threshold``, entries
    below 1 are raised to 1, and the element-wise log is passed to
    ``simple_randomized_torch_svd`` with rank 128.

    Returns:
      For 'lp', the two factor matrices. For 'cl', the concatenated factors
      passed through the linear layer. ``None`` for any other ``task``.
    """
    hi = identity
    prx_mat = hi * self.params1[0]
    for i in range(self.niter):
      hi = (prob @ hi) * (1 - self.params1[i])
      prx_mat = prx_mat + hi * self.params1[i+1]
    prx_mat = prx_mat / threshold
    # Clamping at 1 makes the log non-negative (zero for small entries).
    prx_mat[prx_mat < 1] = 1.
    prx_mat_log = prx_mat.log()
    if task == 'lp':
      U, V = simple_randomized_torch_svd(prx_mat_log, 128, task)
      return U, V
    if task == 'cl':
      embds_svd = simple_randomized_torch_svd(prx_mat_log, 128, task)
      embds = self.fcs[0](embds_svd)
      return embds

In [None]:
def train():
  """Run one optimisation step using the module-level training state.

  Reads the globals ``model``, ``optimizer``, ``prob``, ``identity``,
  ``threshold``, ``task``, ``Label_list``, ``Lap_label``, ``Lap_neg``,
  ``node_Label``, ``bfs_sample_node_list``, ``data``, ``beta``, ``gamma``,
  ``nhop`` and ``device``.

  Returns:
    Tuple ``(loss, temp_dist, length_flag)``: the scalar loss, a NumPy copy
    of ``model.params1`` taken before the optimizer step, and ``False``
    when fewer than ``nhop + 1`` coefficients lie in [0, 1] after the step.
  """
  model.train()
  optimizer.zero_grad()
  logits = model(prob, identity, threshold, task)
  log_probs = F.log_softmax(logits, dim=1)
  embeds = torch.nn.functional.normalize(logits)

  def quadratic_form(rows, laplacian):
    # trace(X^T L X) for the given rows X and sparse matrix L
    return torch.trace(torch.transpose(rows, 0, 1) @ (laplacian @ rows))

  first_group = embeds[Label_list[0]]
  neg_loss = quadratic_form(embeds, Lap_neg)

  class_loss_fn1 = quadratic_form(first_group, Lap_label[0])
  for idx in range(1, len(Label_list)):
    class_loss_fn1 = class_loss_fn1 + quadratic_form(embeds[Label_list[idx]], Lap_label[idx])

  if data == "orkut":
    #derivative of alphas on orkut is too small, or we can select larger learning rate, beta and gamma.
    class_loss_fn1 = class_loss_fn1  / neg_loss
    class_loss_fn2 = - torch.mul(log_probs, node_Label.to(device)).sum()
  else:
    # "tweibo" and every other dataset share the same normalisation.
    class_loss_fn1 = class_loss_fn1  / (neg_loss * len(Label_list))
    class_loss_fn2 = - torch.mul(log_probs, node_Label.to(device)).sum() / len(bfs_sample_node_list)

  loss_fn = class_loss_fn1 * beta + class_loss_fn2 * gamma
  loss_fn.backward()

  # Snapshot the coefficients that produced this loss, before they move.
  temp_dist = model.params1.clone().cpu().detach().numpy()
  optimizer.step()

  non_negative = model.params1.data[model.params1.data>=0]
  within_unit = non_negative[non_negative<=1]
  length_flag = not (len(within_unit) < (nhop+1))
  if not length_flag:
    print("Some teleport probabilities are out of range!")
  return loss_fn.item(), temp_dist, length_flag

In [None]:
def dfs_sampling(G, max_sample):
  """Collect up to ``max_sample`` nodes by depth-first traversals.

  Traversals start from integer seeds drawn uniformly from
  ``[0, G.number_of_nodes() - 1]`` (so those ids must exist in ``G.adj``)
  and restart from a new seed whenever the stack empties.

  Args:
    G: Graph exposing ``number_of_nodes()`` and ``adj[node]``.
    max_sample: Number of nodes wanted; capped at the number of nodes in G.

  Returns:
    List of distinct sampled nodes.
  """
  # NOTE (translated from the original comment): the senior labmate's code
  # seems to have been written incorrectly.
  # The original outer bound was a hard-coded 5000: it returned None once a
  # component ran out with 5000 <= samples < max_sample, and never finished
  # when max_sample exceeded the node count.
  num_nodes = G.number_of_nodes()
  target = min(max_sample, num_nodes)
  sample_list = set()
  while len(sample_list) < target:
    seed = random.randint(0, num_nodes - 1)
    stack = [seed]
    while stack:
      temp_node = stack.pop()
      if temp_node not in sample_list:
        sample_list.add(temp_node)
        stack.extend(list(set(G.adj[temp_node]) - sample_list))
      if len(sample_list) >= target:
        return list(sample_list)
  return list(sample_list)

def neighbor_sampling(G, max_sample):
  """Collect up to ``max_sample`` nodes by breadth-first expansion over a
  random subset of each node's unvisited neighbours.

  For every newly visited node, its unvisited neighbours are shuffled and
  only the first ``int(len(neighbours) * max_sample / number_of_nodes)``
  are queued. Seeds are integers drawn uniformly from
  ``[0, G.number_of_nodes() - 1]``, so those ids must exist in ``G.adj``.

  Args:
    G: Graph exposing ``number_of_nodes()`` and ``adj[node]``.
    max_sample: Number of nodes wanted; capped at the number of nodes in G.

  Returns:
    List of distinct sampled nodes.
  """
  from collections import deque

  # The original outer bound was a hard-coded 5000: it returned None once
  # the queue ran dry with 5000 <= samples < max_sample, and never finished
  # when max_sample exceeded the node count.
  num_nodes = G.number_of_nodes()
  target = min(max_sample, num_nodes)
  sample_list = set()
  while len(sample_list) < target:
    seed = random.randint(0, num_nodes - 1)
    queue = deque([seed])
    while queue:
      temp_node = queue.popleft()
      if temp_node not in sample_list:
        sample_list.add(temp_node)
        nextneighs = list(set(G.adj[temp_node]) - sample_list)
        random.shuffle(nextneighs)
        nextneighs = nextneighs[:int(len(nextneighs) * max_sample / num_nodes)]
        queue.extend(nextneighs)
      if len(sample_list) >= target:
        return list(sample_list)
  return list(sample_list)

def bfs_sampling(G, max_sample):
  """Collect up to ``max_sample`` nodes by breadth-first traversals.

  Traversals start from integer seeds drawn uniformly from
  ``[0, G.number_of_nodes() - 1]`` (so those ids must exist in ``G.adj``)
  and restart from a new seed whenever the queue empties.

  Args:
    G: Graph exposing ``number_of_nodes()`` and ``adj[node]``.
    max_sample: Number of nodes wanted; capped at the number of nodes in G.

  Returns:
    List of distinct sampled nodes.
  """
  from collections import deque

  # The original outer bound was a hard-coded 5000: it returned None once a
  # component ran out with 5000 <= samples < max_sample, and never finished
  # when max_sample exceeded the node count.
  num_nodes = G.number_of_nodes()
  target = min(max_sample, num_nodes)
  sample_list = set()
  while len(sample_list) < target:
    seed = random.randint(0, num_nodes - 1)
    queue = deque([seed])  # popleft() is O(1), unlike list.pop(0)
    while queue:
      temp_node = queue.popleft()
      if temp_node not in sample_list:
        sample_list.add(temp_node)
        queue.extend(list(set(G.adj[temp_node]) - sample_list))
      if len(sample_list) >= target:
        return list(sample_list)
  return list(sample_list)

def node2vec_sampling(G, max_sample):
  """Sample nodes with a random walk that may stay put or jump back.

  At each step a random unvisited neighbour of the current node is added to
  the sample; with probability 0.5 the walk moves to it, otherwise it stays.
  When the current node has no unvisited neighbour, the walk jumps to a
  random node picked from those recorded so far.

  Args:
    G: Graph exposing ``number_of_nodes()`` and ``adj[node]``; the start
      node is an integer drawn from ``[0, G.number_of_nodes() - 1]``.
    max_sample: The loop runs until at least this many nodes are collected.

  Returns:
    List of the collected nodes.
  """
  # NOTE(review): the loop has no exit if every recorded node runs out of
  # unvisited neighbours before max_sample is reached - confirm that callers
  # only pass graphs where this cannot happen.
  move_prob = 0.5

  visited = set()
  current = random.randint(0,G.number_of_nodes()-1)
  walk = [current]
  while len(visited) < max_sample:
    candidates = list(set(G.adj[current]) - visited)
    if candidates:
      chosen = random.choice(candidates)
      walk.append(chosen)
      visited |= set(walk)
      if random.random() < move_prob:
        current = chosen
    else:
      current = random.choice(walk)

  return list(visited)

In [None]:
# ---- Training setup: seed the RNGs, load the graph and labels, build model and optimizer ----
seed = set_seed(seed)
# Number of label classes for each supported dataset.
if data == 'BlogCatalog':
  num_label = 39
elif data == 'wiki':
  num_label = 40
elif data == "tweibo":
    num_label = 100
elif data == "orkut":
    num_label = 100
else:
  raise Exception("A wrong graph name!")

# Only the node count and the directed flag are used; the edge count is discarded here.
n, _, directed = load_info(data)

# The string literal below is disabled code kept from an earlier version; it has no effect.
'''if sample_sup_label_set == True:
  sup_label_set(data,n)
alphafile = "Lemane/alpha/" + data + "_class.txt"'''

G, adj, m = load_edge(data, n, directed, task)

# Parsed as ("cuda:" + str(0)) if CUDA is available, else "cpu".
cudaid = "cuda:" + str(0) if torch.cuda.is_available() else "cpu"
device = torch.device(cudaid)
# sample_size x sample_size identity passed to the model on every forward call.
identity = torch.eye(sample_size,dtype=torch.float64).to(device)

Node_Label = load_label(data, n)

# The proximity matrix is divided by this value inside the model's forward pass.
threshold = 1e-5
model = ComputeProximity4SVD(ngraph=sample_size, niter=nhop, dist=dist, param=param, nclass=num_label).to(device)
# Only model.params1 is handed to the optimizer; the linear layer's weights are not in its parameter list.
optimizer = torch.optim.SGD([{'params':model.params1,'weight_decay':wdecay1}],lr=lr)

# best_dist[-1] is the coefficient vector kept as the result; it starts with the initial coefficients.
best_dist = []
best_dist.append(model.params1.clone().cpu().detach().numpy())
min_loss = 999999999
best_epoch = 0
bad_count = 0          # consecutive epochs without a new minimum loss
length_flag = True
train_begin = time.time()

from collections import defaultdict
# `lines` is referenced only by the disabled analysis code that follows this cell section.
f = open("Lemane/label/" + data + ".txt", "r")
lines = f.readlines()
f.close()

"""counts = defaultdict(set)
labels = defaultdict(set)
for line in lines:
  vec = line.strip().split()
  key = int(vec[0])
  for item in vec[1:]:
    counts[key].add(int(item))
    labels[int(item)].add(key)

total = 0
for key in labels:
  total += len(labels[key])

globaldistribution = {}
for key in labels:
  globaldistribution[key] = len(labels[key]) / total

globaldistribution = sorted(globaldistribution.items(), key=lambda x:x[1], reverse=True)

print("global top 5 labels:")
print(globaldistribution[:5])

sortedlabel = []
for item in globaldistribution:
  sortedlabel.append(item[0])
print("global sorted labels:")
print(sortedlabel)

samerates = []"""

# ---- Training loop: one freshly sampled subgraph and one optimizer step per epoch ----
for epoch in range(nepoch):
  begin_time = time.time()

  # train() reads bfs_sample_node_list as a global to normalise the supervised loss.
  bfs_sample_node_list = bfs_sampling(G, sample_size)
  #neighbor_sample_node_list = neighbor_sampling(G, sample_size)
  #dfs_sample_node_list = dfs_sampling(G, sample_size)
  #node2vec_sample_node_list = node2vec_sampling(G, sample_size)
  

  samplelist = bfs_sample_node_list

  # With task == 'cl' get_trans_prob_mat returns a single sparse tensor, so .to(device) applies directly.
  prob = get_trans_prob_mat(adj, samplelist, sample_size, task).to(device)
  Lap_neg, adj_neg = get_sample_neg_laplacian(sample, len(samplelist))
  Lap_neg = Lap_neg.to(device)
  Label_list, node_Label = get_sample_label_list(Node_Label, samplelist, num_label)


  # The string literal below is disabled analysis code; it has no effect at run time.
  """samplelabels = defaultdict(set)
  for node in samplelist:
    for la in counts[node]:
      samplelabels[la].add(node)

  total = 0
  for key in samplelabels:
    total += len(samplelabels[key])

  localdistribution = {}
  for key in samplelabels:
    localdistribution[key] = len(samplelabels[key]) / total

  localdistribution = sorted(localdistribution.items(), key=lambda x:x[1], reverse=True)
  print("epoch: " + str(epoch))
  print("local top 5 labels:")
  print(localdistribution[:5])

  sortedlabel = []
  for item in localdistribution:
    sortedlabel.append(item[0])
  print("local sorted labels:")
  print(sortedlabel)

  ind = 0
  index2node = {}
  for node in samplelist:
    index2node[ind] = node
    ind += 1


  samelabelnum = 0
  adjmatrix = adj_neg.todense()
  for i in range(len(samplelist)):
    for j in range(len(samplelist)):
      if adjmatrix[i, j]:
        node1 = index2node[i]
        node2 = index2node[j]
        if counts[node1] & counts[node2]:
          samelabelnum += 1

  samerates.append(samelabelnum / np.sum(adjmatrix))"""


  # One sparse matrix per label group present in this sample, moved to the training device.
  Lap_label = get_label_laplacian(Label_list)
  
  for i in range(len(Lap_label)):
    Lap_label[i] = Lap_label[i].to(device)

  loss_train, temp_dist, length_flag = train()
  # train() reports False when some coefficient left [0, 1]; training stops at once.
  if length_flag == False:
    break
  if loss_train < min_loss:
    min_loss = loss_train
    best_epoch = epoch+1
    # temp_dist holds the coefficients from before this epoch's optimizer step.
    best_dist.append(temp_dist)
    bad_count = 0
  else:
    bad_count += 1

  # Progress is printed on every 10th epoch only.
  if(epoch+1)%10 == 0:
    print('Epoch:{:03d}'.format(epoch+1),'train_loss:{:.5f}'.format(loss_train),'time_spent:{:.5f}s'.format(time.time() - begin_time))
    print("----------------------------------------------------------------")
  # Early stopping after `patience` consecutive epochs without improvement.
  if bad_count == patience:
    break

"""print("negative sample rate:")
print(sorted(samerates, reverse=True))"""

graph name: wiki
1.093647s taken for loading graph
Num of nodes: 4777, num of edges: 184812, Avg degree: 77.375759, Directed:True
1.218156s taken for generating adj matrix


The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:1980.)
  """


Some teleport probabilities are out of range!


'print("negative sample rate:")\nprint(sorted(samerates, reverse=True))'

In [None]:
# ---- Report the training summary and save the selected coefficients ----
print("----------------------------------------------------------------")
print("Training time cost: {:.4f}s".format(time.time() - train_begin))
print("Best epoch:{}th".format(best_epoch))
print("Best distribution: {}".format(best_dist[-1]))
print("----------------------------------------------------------------")
# The output directory is relative to the current working directory, not to the cloned Lemane/ folder.
if not os.path.exists("alpha"):
  os.mkdir("alpha")
alphafile = "alpha/" + data + "_class.txt"
# One coefficient per line, four decimal places.
np.savetxt(alphafile, best_dist[-1],fmt="%.4f",delimiter="\n")

----------------------------------------------------------------
Training time cost: 67.0506s
Best epoch:4th
Best distribution: [0.007231   0.03295404 0.08826171 0.16038287 0.23866818 0.31352434
 0.38063343 0.43905137 0.48929756 0.53242423 0.56953991 0.60164604
 0.62958962 0.65406759 0.67564693 0.69478766]
----------------------------------------------------------------
