In [1]:
import logging
import os
import warnings

# Silence every Python warning for the rest of the session. The second,
# narrower filter (UserWarning coming from gensim.models) is already covered
# by the blanket rule; it is kept so the intent stays explicit.
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module="gensim.models")

# Request ERROR-or-higher logging from PySpark through the environment.
# NOTE(review): the Spark output later in this notebook still contains WARN
# lines, so this variable may not be honoured -- confirm.
os.environ['PYSPARK_LOG_LEVEL'] = 'ERROR'

# Let only ERROR (and above) records from the 'node2vec' logger through.
node2vec_logger = logging.getLogger('node2vec')
node2vec_logger.setLevel(logging.ERROR)

In [2]:
### Generate a random benchmark graph with networkx's LFR_benchmark_graph
import networkx as nx
from networkx.generators.community import LFR_benchmark_graph

# --- LFR parameters -------------------------------------------------------
n = 1000         # first positional argument of LFR_benchmark_graph
tau1 = 2         # power-law exponent for the degree distribution
tau2 = 1.1       # power-law exponent for the community size distribution (should be > 1)
mu = 0.05        # mixing parameter
avg_deg = 25     # average degree
max_deg = 100    # max degree
min_commu = 50   # min community size
max_commu = 100  # max community size

# Fixed seed so the same benchmark graph is generated on every run.
G = LFR_benchmark_graph(
    n,
    tau1,
    tau2,
    mu,
    average_degree=avg_deg,
    max_degree=max_deg,
    min_community=min_commu,
    max_community=max_commu,
    seed=2,
)

# Optional: draw the graph generated above
# nx.draw(G, pos=nx.spring_layout(G),node_color='r', node_size=3, edge_color='b')

In [3]:
### Remove multi-edges and self-loops from G
# Rebuild G as a plain nx.Graph (removes multi-edges).
G = nx.Graph(G)

# Collect the self-loop edges into a list first, then delete them in one call.
selfloop_edges = [edge for edge in nx.selfloop_edges(G)]
G.remove_edges_from(selfloop_edges)

In [4]:
### An LFR graph has a built-in community structure: each node's community is
### stored as a set in its "community" attribute.
# Every node gets an integer label derived from that structure -- its
# intrinsic membership. For convenience the labels are first kept in a dict.
from clusim.clustering import Clustering

# Distinct communities, frozen so they can live inside a set.
intrinsic_communities = {frozenset(G.nodes[v]["community"]) for v in G}

# Invert the communities: member -> index of the first community (in the
# set's iteration order) that contains it. setdefault keeps the first match.
community_index_of = {}
for index, inner_set in enumerate(intrinsic_communities):
    for member in inner_set:
        community_index_of.setdefault(member, index)

# Insert nodes in the order 0, 1, ..., n-1 so that the dict's order (and
# therefore intrinsic_list below) follows the integer node ids.
intrinsic_membership = {
    node: community_index_of[node]
    for node in range(G.number_of_nodes())
    if node in community_index_of
}

# Keep list- and Clustering-shaped copies so later cells need no conversions.
intrinsic_list = [label for label in intrinsic_membership.values()]
intrinsic_clustering = Clustering(
    elm2clu_dict={node: [label] for node, label in intrinsic_membership.items()}
)

In [5]:
### Community detection with the Louvain algorithm
# louvain_membership is returned as a dict.
from community import community_louvain

# Louvain is a randomised heuristic: without a seed the partition can differ
# between runs, so the scores computed from it would not be reproducible.
# Pin the random state (same value as the seed passed to LFR_benchmark_graph).
louvain_membership = community_louvain.best_partition(G, random_state=2)

In [6]:
### Community detection with the Infomap algorithm
# The resulting membership is a list.

import igraph as ig

# Convert the NetworkX graph to an igraph graph.
iG = ig.Graph.from_networkx(G)

# Run Infomap clustering in igraph, then read the membership off the result.
infomap_clustering = iG.community_infomap()
infomap_membership = infomap_clustering.membership  # a list
#print(f"Infomap algorithm gives {max(infomap_membership)+1} communities.")

In [7]:
### Import the NMI and ECSim helpers (the author's own wrappers)
from auxpack.evaluate_clustering import NMI, ECSim as ECS

### Usage example
# Louvain partition vs. the intrinsic labels, scored with NMI.
louvain_nmi = NMI(louvain_membership, intrinsic_membership)
print(louvain_nmi)

# Infomap partition vs. the intrinsic labels, scored with ECSim.
infomap_ecs = ECS(infomap_membership, intrinsic_membership)
print(infomap_ecs)

1.0
1.0


In [8]:
### Import the graph-embedding evaluation function (the author's own wrapper)
from auxpack.evaluate_embedding import evaluate_embedding as EE

In [9]:
%%time
### 1 HOPE method
from gem.embedding.hope import HOPE

# A higher value of beta places more emphasis on capturing higher-order proximities
hope_model = HOPE(d=30, beta=0.01)

# Learn the embedding, then evaluate it against the intrinsic labels.
hope_embd = hope_model.learn_embedding(graph=G, is_weighted=False, no_python=True)
hope_scores = EE(intrinsic_list, intrinsic_clustering, hope_embd)
print(hope_scores)

SVD error (low rank): 1.249493


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/24 22:30:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/24 22:30:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/24 22:30:13 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/06/24 22:30:13 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/06/24 22:30:13 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/06/24 22:30:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


[0.9436771465718152, 0.9418075240728796, 0.800055110496287, 0.8216228269085412]
CPU times: user 4.1 s, sys: 6.61 s, total: 10.7 s
Wall time: 15.4 s


In [10]:
%%time
### 2 Laplacian Eigenmaps method
from gem.embedding.lap import LaplacianEigenmaps

lap_model = LaplacianEigenmaps(d=20)

# NOTE(review): is_weighted=True here, whereas the HOPE cell passes
# is_weighted=False for the same graph -- confirm this is intended.
lap_embd = lap_model.learn_embedding(graph=G, is_weighted=True, no_python=True)

# Evaluate the embedding against the intrinsic labels.
lap_scores = EE(intrinsic_list, intrinsic_clustering, lap_embd)
print(lap_scores)

Laplacian matrix recon. error (low rank): 32.016612
[0.8980805051395988, 0.9392077818157224, 0.71992, 0.8058916122233929]
CPU times: user 4.34 s, sys: 16.9 s, total: 21.2 s
Wall time: 5.67 s


In [11]:
%%time
### 3 MNMF method

from karateclub import MNMF

# Hyperparameters for the MNMF model, gathered in one place.
# NOTE(review): clusters=14 is hard-coded; presumably it should match the
# number of intrinsic communities -- confirm.
mnmf_settings = dict(
    dimensions=64,
    clusters=14,
    lambd=0.2,
    alpha=0.05,
    beta=0.05,
    iterations=100,
    lower_control=1e-15,
    eta=5.0,
    seed=42,
)

# Create the model and fit it to the graph.
MNMF_model = MNMF(**mnmf_settings)
MNMF_model.fit(G)

# Obtain the embedding and evaluate it against the intrinsic labels.
MNMF_embd = MNMF_model.get_embedding()
mnmf_scores = EE(intrinsic_list, intrinsic_clustering, MNMF_embd)
print(mnmf_scores)

[0.9721863798808087, 0.9771770912977618, 0.9144221634131068, 0.9271471953132777]
CPU times: user 1min 44s, sys: 6min 22s, total: 8min 7s
Wall time: 13.1 s


In [12]:
%%time
### 4 Node2Vec method

from node2vec import Node2Vec

# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
# (pass temp_folder=... for big graphs)
node2vec_model = Node2Vec(
    G,
    dimensions=64,
    walk_length=30,
    num_walks=50,
    workers=32,
)

# Embed nodes. Any keyword accepted by gensim.Word2Vec can be passed here;
# `dimensions` and `workers` are forwarded automatically from the Node2Vec
# constructor.
node2vec_fit = node2vec_model.fit(window=10, min_count=1, batch_words=4096)

# Collect one vector per node, in the order 0..n-1; the fitted model is
# looked up by the string form of the node id.
node2vec_embd = [node2vec_fit.wv[str(i)] for i in range(G.number_of_nodes())]
print(EE(intrinsic_list, intrinsic_clustering, node2vec_embd))

Computing transition probabilities:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2/2 [00:00<00:00,  5.64it/s]
Generating walks (CPU: 2): 100%|██████████| 2/2 [00:00<00:00,  5.68it/s]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:00<00:00,  5.66it/s]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:00<00:00,  5.64it/s]
Generating walks (CPU: 5): 100%|██████████| 2/2 [00:00<00:00,  5.66it/s]
Generating walks (CPU: 6): 100%|██████████| 2/2 [00:00<00:00,  5.72it/s]
Generating walks (CPU: 7): 100%|██████████| 2/2 [00:00<00:00,  5.68it/s]
Generating walks (CPU: 8): 100%|██████████| 2/2 [00:00<00:00,  5.55it/s]
Generating walks (CPU: 9): 100%|██████████| 2/2 [00:00<00:00,  5.73it/s]
Generating walks (CPU: 10): 100%|██████████| 2/2 [00:00<00:00,  5.52it/s]
Generating walks (CPU: 11): 100%|██████████| 2/2 [00:00<00:00,  5.59it/s]
Generating walks (CPU: 12): 100%|██████████| 2/2 [00:00<00:00,  5.60it/s]
Generating walks (CPU: 13): 100%|██████████| 2/2 [00:00<00:00,  5.61it/s]
Generating walks (CPU: 14): 100%|██████████| 2/

[1.0, 0.9317446102555503, 1.0, 0.7618462880719485]
CPU times: user 5min 7s, sys: 383 ms, total: 5min 7s
Wall time: 35.5 s


In [13]:
%%time
### 5 DeepWalk method
from karateclub import DeepWalk

# Build the model and fit it to the graph.
model = DeepWalk(dimensions=64, walk_length=30, window_size=10)
model.fit(G)

# Obtain the embedding and evaluate it against the intrinsic labels.
deepwalk_embd = model.get_embedding()
deepwalk_scores = EE(intrinsic_list, intrinsic_clustering, deepwalk_embd)
print(deepwalk_scores)



[0.9698507705119342, 0.9766813035012658, 0.891169291877825, 0.9133832442067736]
CPU times: user 1.82 s, sys: 58.8 ms, total: 1.88 s
Wall time: 6.42 s


In [14]:
%%time
### 6 LINE method
from ge import LINE

model = LINE(G, embedding_size=60, order='first')
model.train(batch_size=1024, epochs=50, verbose=0)  # train model

# get_embeddings() returns a mapping of embedding vectors
# (presumably keyed by node id, as in the GraphEmbedding implementation).
LINE_embd_by_node = model.get_embeddings()

# Look the vectors up node by node, in the same order as intrinsic_list
# (which follows the key order of intrinsic_membership). The previous
# list(...values()) relied on the library's dict order happening to match the
# label order; an explicit lookup cannot silently misalign vectors and labels,
# and a missing node raises KeyError instead.
LINE_embd = [LINE_embd_by_node[node] for node in intrinsic_membership]
print(EE(intrinsic_list, intrinsic_clustering, LINE_embd))

2023-06-24 22:31:27.772208: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-24 22:31:34.114569: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 683 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:02:00.0, compute capability: 8.0
2023-06-24 22:31:34.115901: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 37596 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0
2023-06-24 22:31:34.215980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and

[1.0, 0.9451257533725198, 1.0, 0.8065274495463591]
CPU times: user 20.8 s, sys: 3.7 s, total: 24.5 s
Wall time: 20.6 s


In [15]:
%%time
### 7 LLE method
from auxpack.lle import lle

# Second argument passed to lle (presumably the embedding dimension -- confirm).
D = 15

# Compute the embedding and evaluate it against the intrinsic labels.
lle_embd = lle(G, D)
lle_scores = EE(intrinsic_list, intrinsic_clustering, lle_embd)
print(lle_scores)

[0.9331044589042856, 0.9688817515840863, 0.7892344406987435, 0.8852461538461539]
CPU times: user 20.2 s, sys: 40.6 s, total: 1min
Wall time: 5.72 s
