In [1]:
import warnings
# Install a global "ignore" filter so warnings raised by later cells
# are not shown in the cell outputs.
warnings.filterwarnings('ignore')

In [2]:
### Generate a random benchmark graph with networkx's LFR_benchmark_graph
import networkx as nx
from networkx.generators.community import LFR_benchmark_graph

n = 1000  # First positional argument of LFR_benchmark_graph (presumably the node count — confirm against the networkx docs)
tau1 = 2  # Power-law exponent for the degree distribution
tau2 = 1.1 # Power-law exponent for the community size distribution 
            # Should be > 1
mu = 0.05 # Mixing parameter
avg_deg = 25 # Average Degree
max_deg = 100 # Max Degree
min_commu = 50 # Min Community Size
max_commu = 100 # Max Community Size


# A fixed seed is passed so the generated graph is the same on every run.
G = LFR_benchmark_graph(
    n, tau1, tau2, mu, average_degree=avg_deg, max_degree=max_deg, min_community=min_commu, max_community=max_commu, 
    seed=2
)

# nx.draw(G, pos=nx.spring_layout(G),node_color='r', node_size=3, edge_color='b')  # Draw the graph generated above

In [3]:
### Remove multi-edges and self-loops from G
# NOTE: this cell rebinds the global G; every later cell works on the cleaned graph.
G = nx.Graph(G) # Remove multi-edges

# Materialise the self-loops as a list before removing them from G.
selfloop_edges = list(nx.selfloop_edges(G)) # a list of self loops

G.remove_edges_from(selfloop_edges) # Remove self-loops

In [4]:
### An LFR graph has a built-in community structure: each node stores its community (a set) in its "community" attribute
# The loop below gives every node a label according to that built-in structure; the label is its intrinsic_membership
# For convenience, intrinsic_membership is initially stored as a dict
intrinsic_communities = {frozenset(G.nodes[v]["community"]) for v in G}
intrinsic_membership = {}
# assumes the node labels are exactly 0..n-1 — TODO confirm
for node in range(G.number_of_nodes()):
    for index, inner_set in enumerate(intrinsic_communities):
        if node in inner_set:
            intrinsic_membership[node] = index
            break  # stop at the first community that contains the node
# intrinsic_membership = list(intrinsic_membership.values())

# Keep copies in list and Clustering form so no type conversion is needed later
# The list follows the dict's insertion order, i.e. node 0, 1, 2, ...
intrinsic_list = list(intrinsic_membership.values())
from clusim.clustering import Clustering
# Each element maps to a one-item list holding its community label.
intrinsic_clustering = Clustering(elm2clu_dict={i: [intrinsic_membership[i]] for i in intrinsic_membership.keys()})

In [5]:
### Detect communities with the Louvain algorithm (the original header also mentions plotting, but nothing is drawn in this cell)
# louvain_membership is returned as a dict
from community import community_louvain

# NOTE(review): no seed is passed here, so the partition may differ between runs — confirm whether a fixed random_state is wanted
louvain_membership = community_louvain.best_partition(G)

# The community count is reported as (largest label + 1); assumes labels are 0..k-1 — TODO confirm
print(f"Louvain algorithm gives {max(louvain_membership.values())+1} communities.")

Louvain algorithm gives 14 communities.


In [6]:
### Detect communities with the Infomap algorithm
# The result is a list

# Convert the NetworkX graph to an igraph graph
import igraph as ig
iG = ig.Graph.from_networkx(G)

# Perform Infomap clustering using igraph, and get the membership as a list
infomap_membership = iG.community_infomap().membership # a list

# The community count is reported as (largest label + 1).
print(f"Infomap algorithm gives {max(infomap_membership)+1} communities.")

Infomap algorithm gives 14 communities.


In [7]:
### Compute NMI
# The helper below accepts either list or dict inputs, for example
# the intrinsic community structure intrinsic_membership and
# the louvain_membership produced by the Louvain algorithm

from sklearn.metrics import normalized_mutual_info_score
def NMI(clus1, clus2): # each argument may be a dict or a list
    """Return the normalized mutual information between two clusterings.

    Parameters
    ----------
    clus1, clus2 : dict or list
        A dict maps node -> community label. It is flattened to the list of
        its labels ordered by sorted node key, so two dicts are compared node
        by node regardless of their insertion order. Any other input is
        passed to scikit-learn unchanged and is assumed to already be ordered
        by node.

    Returns
    -------
    float
        The value of ``sklearn.metrics.normalized_mutual_info_score``.
    """
    # list(some_dict) yields the dict's KEYS (the node ids), not the labels.
    # Node ids are all distinct, so comparing them always scores 1.0; the
    # labels have to be extracted explicitly.
    if isinstance(clus1, dict):
        clus1 = [clus1[node] for node in sorted(clus1)]
    if isinstance(clus2, dict):
        clus2 = [clus2[node] for node in sorted(clus2)]
    return normalized_mutual_info_score(clus1, clus2)

### Usage example
NMI(louvain_membership, intrinsic_membership)

1.0

In [8]:
## 计算 ECSim 的函数

# 这个函数将 list 或者 dict 类型的聚类结果进行类型转换
from clusim.clustering import Clustering

def to_clus(input):
    """Wrap a list- or dict-style clustering result in a clusim ``Clustering``.

    A list is read as "position -> label"; a dict is read as "key -> label".
    Either way each element ends up mapped to a one-item list holding its
    label, which is the mapping handed to ``Clustering``.

    Raises
    ------
    ValueError
        If ``input`` is neither a list nor a dict.
    """
    if isinstance(input, dict):
        labelled = input.items()
    elif isinstance(input, list):
        labelled = enumerate(input)
    else:
        raise ValueError("Input must be a list or a dictionary.")

    return Clustering({element: [label] for element, label in labelled})


import clusim.sim as sim
def ECSim(clus1, clus2): # each argument may be a dict, a list, or a Clustering
    """Return the element-centric similarity of two clusterings.

    Arguments that are not already ``Clustering`` objects are converted with
    ``to_clus`` first; the pair is then scored by ``sim.element_sim`` with
    ``alpha=0.9``.
    """
    prepared = [
        candidate if isinstance(candidate, Clustering) else to_clus(candidate)
        for candidate in (clus1, clus2)
    ]
    return sim.element_sim(*prepared, alpha=0.9)

### Usage example
ECSim(louvain_membership, intrinsic_membership)

1.0

In [9]:
### 聚类算法 输出NMI及 ECSim

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

from pyspark.ml.clustering import KMeans

from sklearn.metrics import normalized_mutual_info_score

def _kmeans_labels(frame, k, distance_measure, seed=None):
    """Cluster the "embd" column of ``frame`` with Spark KMeans and return the predicted labels as a list."""
    params = {"k": k, "featuresCol": "embd", "distanceMeasure": distance_measure}
    if seed is not None:
        # Only override Spark's own default seed when the caller asks for one.
        params["seed"] = seed
    fitted = KMeans(**params).fit(frame)
    predictions = fitted.transform(frame).select("prediction").collect()
    # assumes Spark returns the rows in the order they were inserted — TODO confirm
    return [row["prediction"] for row in predictions]


def evaluate_cluster(intr_list, intr_clus, evala, seed=None):
    """Cluster an embedding with KMeans and score it against the intrinsic communities.

    Parameters
    ----------
    intr_list : list
        Intrinsic community label of every node; ``max(intr_list) + 1`` is
        used as the number of KMeans clusters.
    intr_clus : Clustering
        The same intrinsic partition as a clusim ``Clustering``.
    evala : iterable of vectors
        One embedding vector per node, in the same node order as ``intr_list``.
    seed : int, optional
        Seed forwarded to both KMeans estimators. ``None`` (the default)
        leaves Spark's own default in place, as before.

    Returns
    -------
    list
        ``[NMI (euclidean), NMI (cosine), ECSim (euclidean), ECSim (cosine)]``.
    """
    K = max(intr_list) + 1

    # One session is enough: getOrCreate() returns the already-running session.
    spark = SparkSession.builder.getOrCreate()
    frame = spark.createDataFrame([(Vectors.dense(row),) for row in evala], ["embd"])

    # Run KMeans once per distance measure on the same embedding.
    euclid_membership = _kmeans_labels(frame, K, "euclidean", seed)
    cosine_membership = _kmeans_labels(frame, K, "cosine", seed)

    # Compare both partitions with the intrinsic one: NMI first, then
    # element-centric similarity (alpha=0.9, the value ECSim also uses).
    return [
        normalized_mutual_info_score(euclid_membership, intr_list),
        normalized_mutual_info_score(cosine_membership, intr_list),
        sim.element_sim(intr_clus, to_clus(euclid_membership), alpha=0.9),
        sim.element_sim(intr_clus, to_clus(cosine_membership), alpha=0.9),
    ]

In [10]:
### 1 HOPE method
from gem.embedding.hope import HOPE

hope_model = HOPE(d=20, beta=0.01) 
# A higher value of beta places more emphasis on capturing higher-order proximities

# Learn the embedding of G, then print
# [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine] from evaluate_cluster.
hope_embd = hope_model.learn_embedding(graph=G, is_weighted=False, no_python=True)
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, hope_embd))

SVD error (low rank): 1.446638


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/24 14:31:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/24 14:31:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


[0.9100566850958083, 0.9191008030411808, 0.7332394108689805, 0.823510680567314]


In [11]:
### 2 Laplacian Eigenmaps method
from gem.embedding.lap import LaplacianEigenmaps
lap_model = LaplacianEigenmaps(d=20)

# NOTE(review): is_weighted=True here, while the HOPE cell passes is_weighted=False on the same graph — confirm this is intended
lap_embd = lap_model.learn_embedding(graph=G, is_weighted=True, no_python=True)
# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, lap_embd))

Laplacian matrix recon. error (low rank): 32.016612
[0.9751187229528936, 0.9654938155479856, 0.9082993311036789, 0.8871015384615384]


In [12]:
### 3 MNMF method

from karateclub import MNMF

# Create an instance of the MNMF model
# NOTE(review): clusters = 14 is hard-coded; it equals the community count printed by the
# Louvain and Infomap cells for this graph — confirm whether it should be derived from the data
MNMF_model = MNMF(dimensions = 64, clusters = 14, lambd = 0.2, 
             alpha = 0.05, beta = 0.05, iterations = 100, 
             lower_control = 1e-15, eta = 5.0, seed = 42)

# Fit the model to the graph
MNMF_model.fit(G)

# Obtain the graph embeddings
MNMF_embd = MNMF_model.get_embedding()
# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, MNMF_embd))

[0.9689192297834323, 0.995859467181148, 0.8982627917444431, 0.9940702722363548]


In [13]:
%%time
### 4 Node2Vec method 

from node2vec import Node2Vec

# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec_model = Node2Vec(G, dimensions=64, walk_length=30, num_walks=50, workers=32) #, temp_folder='test' # Use temp_folder for big graphs
 
# Embed nodes 
node2vev_fit = node2vec_model.fit(window=10, min_count=1, batch_words=4096)  
# Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
print("Embedding already generated!!")
# Collect the vectors in node order 0..n-1; the fitted model is looked up by the
# node id rendered as a string (assumes the node labels are 0..n-1 — TODO confirm).
node2vec_embd=[]
for i in range(G.number_of_nodes()):
    node2vec_embd.append(node2vev_fit.wv[f'{i}'])

# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, node2vec_embd))

Computing transition probabilities:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2/2 [00:00<00:00,  5.53it/s]
Generating walks (CPU: 2): 100%|██████████| 2/2 [00:00<00:00,  5.67it/s]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:00<00:00,  5.62it/s]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:00<00:00,  5.54it/s]
Generating walks (CPU: 5): 100%|██████████| 2/2 [00:00<00:00,  5.60it/s]
Generating walks (CPU: 6): 100%|██████████| 2/2 [00:00<00:00,  5.54it/s]
Generating walks (CPU: 7): 100%|██████████| 2/2 [00:00<00:00,  5.65it/s]
Generating walks (CPU: 8): 100%|██████████| 2/2 [00:00<00:00,  5.63it/s]
Generating walks (CPU: 9): 100%|██████████| 2/2 [00:00<00:00,  5.49it/s]
Generating walks (CPU: 10): 100%|██████████| 2/2 [00:00<00:00,  5.49it/s]
Generating walks (CPU: 11): 100%|██████████| 2/2 [00:00<00:00,  5.61it/s]
Generating walks (CPU: 12): 100%|██████████| 2/2 [00:00<00:00,  5.58it/s]
Generating walks (CPU: 13): 100%|██████████| 2/2 [00:00<00:00,  5.47it/s]
Generating walks (CPU: 14): 100%|██████████| 2/

Embedding already generated!!
[0.9718010208240403, 0.9450323949147932, 0.8981909838562291, 0.8047615023814515]
CPU times: user 4min 7s, sys: 563 ms, total: 4min 8s
Wall time: 33.5 s


In [14]:
### 5 DeepWalk method
from karateclub import DeepWalk
# NOTE: the global name `model` is rebound again by the LINE cell below.
model = DeepWalk(dimensions=64, walk_length=30, window_size=10)
model.fit(G)
deepwalk_embd = model.get_embedding()
# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, deepwalk_embd))



[0.9507255113526435, 0.9680239849015438, 0.8234308875642327, 0.8887482938002788]


In [15]:
### 6 LINE method
from ge import LINE
# NOTE: this rebinds the global name `model` that the DeepWalk cell also uses.
model = LINE(G,embedding_size=60,order='first');
model.train(batch_size=1024,epochs=50,verbose=0);# train model
LINE_embd = model.get_embeddings();# get embedding vectors

# get_embeddings() is used as a dict here; only its values are kept.
# assumes the dict's value order matches the node order of intrinsic_list — TODO confirm
LINE_embd = list(LINE_embd.values())
# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, LINE_embd))

2023-06-24 14:32:18.439047: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-24 14:32:24.032666: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38223 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:02:00.0, compute capability: 8.0
2023-06-24 14:32:24.034093: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38223 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0
2023-06-24 14:32:24.326183: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error a

[1.0, 0.9771821781614743, 1.0, 0.9199535201640465]


In [16]:
###############################
### Correct Graph LLE
###############################
import numpy as np
import networkx as nx
from sklearn.preprocessing import normalize
import scipy.linalg as lg

def lle(graph, dim):
    """Embed the nodes of ``graph`` with a graph version of Locally Linear Embedding.

    The weighted adjacency matrix (nodes in sorted order) is row-normalised
    with the L1 norm to give A, and the embedding is read from the
    eigenvectors of ``(I - A).T @ (I - A)``.

    Parameters
    ----------
    graph : networkx graph
        Graph to embed; edge weights are read from the "weight" attribute.
    dim : int
        Number of embedding dimensions.

    Returns
    -------
    numpy.ndarray
        Real array with one row per node (sorted node order). Its columns are
        the eigenvectors belonging to the 2nd through (dim + 1)-th smallest
        eigenvalues; the eigenvector of the smallest eigenvalue is skipped.
    """
    nodelist = sorted(graph.nodes())
    A = nx.to_numpy_array(graph, nodelist=nodelist, weight='weight')
    # Use the returned array rather than relying on in-place mutation.
    A = normalize(A, norm='l1', axis=1)
    residual = np.eye(graph.number_of_nodes()) - A
    gram = residual.T @ residual
    # gram is symmetric, so the symmetric solver applies: eigh returns real
    # eigenvalues in ascending order with real eigenvectors, whereas the
    # general eig returns complex eigenvalues in no particular order.
    _, eigenvectors = lg.eigh(gram)
    return eigenvectors[:, 1:(dim + 1)]

In [17]:
### 7 LLE method
D = 15  # embedding dimension passed to lle()
lle_embd = lle(G, D)
# Prints [NMI euclidean, NMI cosine, ECSim euclidean, ECSim cosine].
print(evaluate_cluster(intrinsic_list, intrinsic_clustering, lle_embd))

[0.9685299604870004, 1.0, 0.8838846153846155, 1.0]
